AArch64ISelLowering.cpp
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future, once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// XOR, OR and CMP all use ALU ports, and the data dependency becomes the
143// bottleneck after this transform on high-end CPUs. So this maximum-leaf-node
144// limit is a guard that the cmp+ccmp transform stays profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148/// Value type used for condition codes.
149static const MVT MVT_CC = MVT::i32;
150
151static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
152 AArch64::X3, AArch64::X4, AArch64::X5,
153 AArch64::X6, AArch64::X7};
154static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
155 AArch64::Q3, AArch64::Q4, AArch64::Q5,
156 AArch64::Q6, AArch64::Q7};
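// These mirror the AAPCS64 parameter-passing registers: up to eight GPR
// arguments are passed in X0-X7 and up to eight FP/SIMD arguments in Q0-Q7
// (the full 128-bit views of V0-V7).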
157
159
161
162static inline EVT getPackedSVEVectorVT(EVT VT) {
163 switch (VT.getSimpleVT().SimpleTy) {
164 default:
165 llvm_unreachable("unexpected element type for vector");
166 case MVT::i8:
167 return MVT::nxv16i8;
168 case MVT::i16:
169 return MVT::nxv8i16;
170 case MVT::i32:
171 return MVT::nxv4i32;
172 case MVT::i64:
173 return MVT::nxv2i64;
174 case MVT::f16:
175 return MVT::nxv8f16;
176 case MVT::f32:
177 return MVT::nxv4f32;
178 case MVT::f64:
179 return MVT::nxv2f64;
180 case MVT::bf16:
181 return MVT::nxv8bf16;
182 }
183}
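// For example, getPackedSVEVectorVT(MVT::f32) returns MVT::nxv4f32: four
// 32-bit lanes exactly fill the 128-bit minimum SVE register size, which is
// what "packed" means here.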
184
185// NOTE: Currently there's only a need to return integer vector types. If this
186// changes then just add an extra "type" parameter.
187static inline MVT getPackedSVEVectorVT(ElementCount EC) {
188 switch (EC.getKnownMinValue()) {
189 default:
190 llvm_unreachable("unexpected element count for vector");
191 case 16:
192 return MVT::nxv16i8;
193 case 8:
194 return MVT::nxv8i16;
195 case 4:
196 return MVT::nxv4i32;
197 case 2:
198 return MVT::nxv2i64;
199 }
200}
201
202static inline EVT getPromotedVTForPredicate(EVT VT) {
203 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
204 "Expected scalable predicate vector type!");
205 switch (VT.getVectorMinNumElements()) {
206 default:
207 llvm_unreachable("unexpected element count for vector");
208 case 2:
209 return MVT::nxv2i64;
210 case 4:
211 return MVT::nxv4i32;
212 case 8:
213 return MVT::nxv8i16;
214 case 16:
215 return MVT::nxv16i8;
216 }
217}
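// For example, an nxv4i1 predicate is promoted to nxv4i32, i.e. the widest
// packed integer data type with the same element count.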
218
219/// Returns true if VT's elements occupy the lowest bit positions of its
220/// associated register class without any intervening space.
221///
222/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
223/// same register class, but only nxv8f16 can be treated as a packed vector.
224static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
226 "Expected legal vector type!");
227 return VT.isFixedLengthVector() ||
229}
230
231// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
232// predicate and end with a passthru value matching the result type.
233static bool isMergePassthruOpcode(unsigned Opc) {
234 switch (Opc) {
235 default:
236 return false;
266 return true;
267 }
268}
269
270// Returns true if inactive lanes are known to be zeroed by construction.
271static bool isZeroingInactiveLanes(SDValue Op) {
272 switch (Op.getOpcode()) {
273 default:
274 return false;
275 // We guarantee i1 splat_vectors to zero the other lanes
279 return true;
281 switch (Op.getConstantOperandVal(0)) {
282 default:
283 return false;
284 case Intrinsic::aarch64_sve_ptrue:
285 case Intrinsic::aarch64_sve_pnext:
286 case Intrinsic::aarch64_sve_cmpeq:
287 case Intrinsic::aarch64_sve_cmpne:
288 case Intrinsic::aarch64_sve_cmpge:
289 case Intrinsic::aarch64_sve_cmpgt:
290 case Intrinsic::aarch64_sve_cmphs:
291 case Intrinsic::aarch64_sve_cmphi:
292 case Intrinsic::aarch64_sve_cmpeq_wide:
293 case Intrinsic::aarch64_sve_cmpne_wide:
294 case Intrinsic::aarch64_sve_cmpge_wide:
295 case Intrinsic::aarch64_sve_cmpgt_wide:
296 case Intrinsic::aarch64_sve_cmplt_wide:
297 case Intrinsic::aarch64_sve_cmple_wide:
298 case Intrinsic::aarch64_sve_cmphs_wide:
299 case Intrinsic::aarch64_sve_cmphi_wide:
300 case Intrinsic::aarch64_sve_cmplo_wide:
301 case Intrinsic::aarch64_sve_cmpls_wide:
302 case Intrinsic::aarch64_sve_fcmpeq:
303 case Intrinsic::aarch64_sve_fcmpne:
304 case Intrinsic::aarch64_sve_fcmpge:
305 case Intrinsic::aarch64_sve_fcmpgt:
306 case Intrinsic::aarch64_sve_fcmpuo:
307 case Intrinsic::aarch64_sve_facgt:
308 case Intrinsic::aarch64_sve_facge:
309 case Intrinsic::aarch64_sve_whilege:
310 case Intrinsic::aarch64_sve_whilegt:
311 case Intrinsic::aarch64_sve_whilehi:
312 case Intrinsic::aarch64_sve_whilehs:
313 case Intrinsic::aarch64_sve_whilele:
314 case Intrinsic::aarch64_sve_whilelo:
315 case Intrinsic::aarch64_sve_whilels:
316 case Intrinsic::aarch64_sve_whilelt:
317 case Intrinsic::aarch64_sve_match:
318 case Intrinsic::aarch64_sve_nmatch:
319 case Intrinsic::aarch64_sve_whilege_x2:
320 case Intrinsic::aarch64_sve_whilegt_x2:
321 case Intrinsic::aarch64_sve_whilehi_x2:
322 case Intrinsic::aarch64_sve_whilehs_x2:
323 case Intrinsic::aarch64_sve_whilele_x2:
324 case Intrinsic::aarch64_sve_whilelo_x2:
325 case Intrinsic::aarch64_sve_whilels_x2:
326 case Intrinsic::aarch64_sve_whilelt_x2:
327 return true;
328 }
329 }
330}
331
332AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
333 const AArch64Subtarget &STI)
334 : TargetLowering(TM), Subtarget(&STI) {
335 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
336 // we have to make something up. Arbitrarily, choose ZeroOrOne.
338 // When comparing vectors the result sets the different elements in the
339 // vector to all-one or all-zero.
341
342 // Set up the register classes.
343 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
344 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
345
346 if (Subtarget->hasLS64()) {
347 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
348 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
350 }
351
352 if (Subtarget->hasFPARMv8()) {
353 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
354 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
355 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
356 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
357 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
358 }
359
360 if (Subtarget->hasNEON()) {
361 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
362 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
363 // Someone set us up the NEON.
364 addDRTypeForNEON(MVT::v2f32);
365 addDRTypeForNEON(MVT::v8i8);
366 addDRTypeForNEON(MVT::v4i16);
367 addDRTypeForNEON(MVT::v2i32);
368 addDRTypeForNEON(MVT::v1i64);
369 addDRTypeForNEON(MVT::v1f64);
370 addDRTypeForNEON(MVT::v4f16);
371 addDRTypeForNEON(MVT::v4bf16);
372
373 addQRTypeForNEON(MVT::v4f32);
374 addQRTypeForNEON(MVT::v2f64);
375 addQRTypeForNEON(MVT::v16i8);
376 addQRTypeForNEON(MVT::v8i16);
377 addQRTypeForNEON(MVT::v4i32);
378 addQRTypeForNEON(MVT::v2i64);
379 addQRTypeForNEON(MVT::v8f16);
380 addQRTypeForNEON(MVT::v8bf16);
381 }
382
383 if (Subtarget->hasSVEorSME()) {
384 // Add legal sve predicate types
385 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
386 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
387 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
388 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
389 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
390
391 // Add legal sve data types
392 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
393 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
394 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
395 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
396
397 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
398 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
399 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
400 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
401 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
402 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
403
404 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
405 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
406 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
407
408 if (Subtarget->useSVEForFixedLengthVectors()) {
411 addRegisterClass(VT, &AArch64::ZPRRegClass);
412
415 addRegisterClass(VT, &AArch64::ZPRRegClass);
416 }
417 }
418
419 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
420 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
421 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
422 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
423
424 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
425 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
426 }
427
428 // Compute derived properties from the register classes
430
431 // Provide all sorts of operation actions
470
474
478
480
481 // Custom lowering hooks are needed for XOR
482 // to fold it into CSINC/CSINV.
485
486 // Virtually no operation on f128 is legal, but LLVM can't expand them when
487 // there's a valid register class, so we need custom operations in most cases.
511 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
512 // aren't handled.
513
514 // Lowering for many of the conversions is actually specified by the non-f128
515 // type. The LowerXXX function will be trivial when f128 isn't involved.
540 if (Subtarget->hasFPARMv8()) {
543 }
546 if (Subtarget->hasFPARMv8()) {
549 }
552
557
558 // Variable arguments.
563
564 // Variable-sized objects.
567
568 // Lowering Funnel Shifts to EXTR
573
575
576 // Constant pool entries
578
579 // BlockAddress
581
582 // AArch64 lacks both left-rotate and popcount instructions.
588 }
589
590 // AArch64 doesn't have i32 MULH{S|U}.
593
594 // AArch64 doesn't have {U|S}MUL_LOHI.
599
600 if (Subtarget->hasCSSC()) {
604
606
610
613
618
623 } else {
627
630
633 }
634
640 }
647
648 // Custom lower Add/Sub/Mul with overflow.
661
670
679 if (Subtarget->hasFullFP16()) {
682 } else {
685 }
686
687 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
695 setOperationAction(Op, MVT::f16, Promote);
696 setOperationAction(Op, MVT::v4f16, Expand);
697 setOperationAction(Op, MVT::v8f16, Expand);
698 setOperationAction(Op, MVT::bf16, Promote);
699 setOperationAction(Op, MVT::v4bf16, Expand);
700 setOperationAction(Op, MVT::v8bf16, Expand);
701 }
702
703 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
704 for (auto Op : {
708 ISD::FADD,
709 ISD::FSUB,
710 ISD::FMUL,
711 ISD::FDIV,
712 ISD::FMA,
742 })
743 setOperationAction(Op, ScalarVT, Promote);
744
745 for (auto Op : {ISD::FNEG, ISD::FABS})
746 setOperationAction(Op, ScalarVT, Legal);
747
748 // Round-to-integer needs custom lowering for fp16, as Promote doesn't work
749 // because the result type is integer.
753 setOperationAction(Op, ScalarVT, Custom);
754
755 // promote v4f16 to v4f32 when that is known to be safe.
756 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
757 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
758 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
759 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
760 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
761 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
762 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
763 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
764 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
765 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
766 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
767 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
768
778
779 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
801 };
802
803 if (!Subtarget->hasFullFP16()) {
804 LegalizeNarrowFP(MVT::f16);
805 }
806 LegalizeNarrowFP(MVT::bf16);
809
810 // AArch64 has implementations of a lot of rounding-like FP operations.
811 for (auto Op :
822 for (MVT Ty : {MVT::f32, MVT::f64})
824 if (Subtarget->hasFullFP16())
825 setOperationAction(Op, MVT::f16, Legal);
826 }
827
828 // Basic strict FP operations are legal
831 for (MVT Ty : {MVT::f32, MVT::f64})
833 if (Subtarget->hasFullFP16())
834 setOperationAction(Op, MVT::f16, Legal);
835 }
836
837 // Strict conversion to a larger type is legal
838 for (auto VT : {MVT::f32, MVT::f64})
840
842
845
847 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
850 } else {
853 }
856
857 // Generate outline atomics library calls only if LSE was not specified for
858 // subtarget
859 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
885#define LCALLNAMES(A, B, N) \
886 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
887 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
888 setLibcallName(A##N##_REL, #B #N "_rel"); \
889 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
890#define LCALLNAME4(A, B) \
891 LCALLNAMES(A, B, 1) \
892 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
893#define LCALLNAME5(A, B) \
894 LCALLNAMES(A, B, 1) \
895 LCALLNAMES(A, B, 2) \
896 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
897 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
898 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
899 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
900 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
901 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
902 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
903#undef LCALLNAMES
904#undef LCALLNAME4
905#undef LCALLNAME5
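  // The macros above expand to the outline-atomics helper names provided by
  // compiler-rt/libgcc, e.g. __aarch64_ldadd4_acq for a 4-byte fetch-and-add
  // with acquire ordering, or __aarch64_cas16_rel for a 16-byte
  // compare-and-swap with release ordering.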
906 }
907
908 if (Subtarget->hasLSE128()) {
909 // Custom lowering because i128 is not legal. Must be replaced by 2x64
910 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
914 }
915
916 // 128-bit loads and stores can be done without expanding
919
920 // Aligned 128-bit loads and stores are single-copy atomic according to the
921 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
922 if (Subtarget->hasLSE2()) {
925 }
926
927 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
928 // custom lowering, as there are no un-paired non-temporal stores and
929 // legalization will break up 256 bit inputs.
931 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
932 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
933 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
938
939 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
940 // custom lowering, as there are no un-paired non-temporal loads and legalization
941 // will break up 256 bit inputs.
942 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
943 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
944 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
945 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
946 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
947 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
948 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
949 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
950
951 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
953
954 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
955 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
956 // Issue __sincos_stret if available.
959 } else {
962 }
963
964 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
965 // MSVCRT doesn't have powi; fall back to pow
966 setLibcallName(RTLIB::POWI_F32, nullptr);
967 setLibcallName(RTLIB::POWI_F64, nullptr);
968 }
969
970 // Make floating-point constants legal for the large code model, so they don't
971 // become loads from the constant pool.
972 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
975 }
976
977 // AArch64 does not have floating-point extending loads, i1 sign-extending
978 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
979 for (MVT VT : MVT::fp_valuetypes()) {
980 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
981 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
982 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
983 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
984 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
985 }
986 for (MVT VT : MVT::integer_valuetypes())
988
989 for (MVT WideVT : MVT::fp_valuetypes()) {
990 for (MVT NarrowVT : MVT::fp_valuetypes()) {
991 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
992 setTruncStoreAction(WideVT, NarrowVT, Expand);
993 }
994 }
995 }
996
997 if (Subtarget->hasFPARMv8()) {
1001 }
1002
1003 // Indexed loads and stores are supported.
1004 for (unsigned im = (unsigned)ISD::PRE_INC;
1006 setIndexedLoadAction(im, MVT::i8, Legal);
1007 setIndexedLoadAction(im, MVT::i16, Legal);
1008 setIndexedLoadAction(im, MVT::i32, Legal);
1009 setIndexedLoadAction(im, MVT::i64, Legal);
1010 setIndexedLoadAction(im, MVT::f64, Legal);
1011 setIndexedLoadAction(im, MVT::f32, Legal);
1012 setIndexedLoadAction(im, MVT::f16, Legal);
1013 setIndexedLoadAction(im, MVT::bf16, Legal);
1014 setIndexedStoreAction(im, MVT::i8, Legal);
1015 setIndexedStoreAction(im, MVT::i16, Legal);
1016 setIndexedStoreAction(im, MVT::i32, Legal);
1017 setIndexedStoreAction(im, MVT::i64, Legal);
1018 setIndexedStoreAction(im, MVT::f64, Legal);
1019 setIndexedStoreAction(im, MVT::f32, Legal);
1020 setIndexedStoreAction(im, MVT::f16, Legal);
1021 setIndexedStoreAction(im, MVT::bf16, Legal);
1022 }
1023
1024 // Trap.
1025 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1028
1029 // We combine OR nodes for bitfield operations.
1031 // Try to create BICs for vector ANDs.
1033
1034 // Vector add and sub nodes may conceal a high-half opportunity.
1035 // Also, try to fold ADD into CSINC/CSINV..
1038
1041
1042 // Try and combine setcc with csel
1044
1046
1053
1055
1057
1059
1063
1065
1067
1069
1071
1075
1077
1078 // In case of strict alignment, avoid an excessive number of byte wide stores.
1081 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1082
1086 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1087
1090
1093 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1094
1096
1098
1099 EnableExtLdPromotion = true;
1100
1101 // Set required alignment.
1103 // Set preferred alignments.
1104
1105 // Don't align loops on Windows. The SEH unwind info generation needs to
1106 // know the exact length of functions before the alignments have been
1107 // expanded.
1108 if (!Subtarget->isTargetWindows())
1112
1113 // Only change the limit for entries in a jump table if specified by
1114 // the subtarget, but not at the command line.
1115 unsigned MaxJT = STI.getMaximumJumpTableSize();
1116 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1118
1120
1122
1124
1125 if (Subtarget->hasNEON()) {
1126 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1127 // silliness like this:
1128 for (auto Op :
1146 setOperationAction(Op, MVT::v1f64, Expand);
1147
1148 for (auto Op :
1153 setOperationAction(Op, MVT::v1i64, Expand);
1154
1155 // AArch64 doesn't have direct vector -> f32 conversion instructions for
1156 // elements smaller than i32, so promote the input to i32 first.
1157 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1158 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1159
1160 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1161 // Nor a direct i32 -> f16 vector conversion. Set it to Custom, so the
1162 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1165 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1167
1168 if (Subtarget->hasFullFP16()) {
1171
1180 } else {
1181 // When AArch64 doesn't have full fp16 support, promote the input
1182 // to i32 first.
1183 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1184 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1185 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1186 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1187 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1188 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1189 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1190 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1191 }
1192
1193 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1194 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1201 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1206 }
1207
1208 // Custom handling for some quad-vector types to detect MULL.
1209 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1210 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1211 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1212 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1213 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1214 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1215
1216 // Saturates
1217 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1218 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1223 }
1224
1225 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1226 MVT::v4i32}) {
1233 }
1234
1235 // Vector reductions
1236 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1237 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1238 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1243
1245 }
1246 }
1247 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1248 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1257 }
1262
1264 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1265 // Likewise, narrowing and extending vector loads/stores aren't handled
1266 // directly.
1269
1270 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1273 } else {
1276 }
1279
1282
1283 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1284 setTruncStoreAction(VT, InnerVT, Expand);
1285 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1286 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1287 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1288 }
1289 }
1290
1291 // AArch64 has implementations of a lot of rounding-like FP operations.
1292 for (auto Op :
1297 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1299 if (Subtarget->hasFullFP16())
1300 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1302 }
1303
1304 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1305
1310
1314
1315 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1316 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1317 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1318 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1319 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1320 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1321
1322 // ADDP custom lowering
1323 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1325 // FADDP custom lowering
1326 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1328 }
1329
1330 if (Subtarget->hasSME()) {
1332 }
1333
1334 // FIXME: Move lowering for more nodes here if those are common between
1335 // SVE and SME.
1336 if (Subtarget->hasSVEorSME()) {
1337 for (auto VT :
1338 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1343 }
1344 }
1345
1346 if (Subtarget->hasSVEorSME()) {
1347 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1390
1396
1405
1410
1411 if (!Subtarget->isLittleEndian())
1413
1414 if (Subtarget->hasSVE2orSME())
1415 // For SLI/SRI.
1417 }
1418
1419 // Illegal unpacked integer vector types.
1420 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1423 }
1424
1425 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1426 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1427 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1429
1430 for (auto VT :
1431 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1432 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1434
1435 for (auto VT :
1436 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1444
1448
1449 // There are no legal MVT::nxv16f## based types.
1450 if (VT != MVT::nxv16i1) {
1453 }
1454 }
1455
1456 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1457 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1458 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1459 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1464 }
1465
1466 // First, exclude all scalable-vector extending loads and truncating stores,
1467 // covering both integer and floating-point scalable vectors.
1469 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1470 setTruncStoreAction(VT, InnerVT, Expand);
1471 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1472 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1473 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1474 }
1475 }
1476
1477 // Then, selectively enable those which we directly support.
1478 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1479 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1480 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1481 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1482 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1483 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1484 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1485 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1486 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1487 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1488 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1489 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1490 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1491 }
1492
1493 // SVE supports truncating stores of 64 and 128-bit vectors
1494 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1495 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1496 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1497 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1498 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1499
1500 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1501 MVT::nxv4f32, MVT::nxv2f64}) {
1537 if (Subtarget->isSVEAvailable())
1542
1556
1568
1569 if (!Subtarget->isLittleEndian())
1571 }
1572
1573 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1580
1581 if (!Subtarget->isLittleEndian())
1583 }
1584
1587
1588 // NEON doesn't support integer divides, but SVE does
1589 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1590 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1593 }
1594
1595 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1596 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1597 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1598
1599 if (Subtarget->isSVEAvailable()) {
1600 // NEON doesn't support across-vector reductions, but SVE does.
1601 for (auto VT :
1602 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1604 }
1605
1606 // NOTE: Currently this has to happen after computeRegisterProperties rather
1607 // than the preferred option of combining it with the addRegisterClass call.
1608 if (Subtarget->useSVEForFixedLengthVectors()) {
1611 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1612 addTypeForFixedLengthSVE(VT);
1613 }
1616 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1617 addTypeForFixedLengthSVE(VT);
1618 }
1619
1620 // 64-bit results can come from an input wider than NEON supports.
1621 for (auto VT : {MVT::v8i8, MVT::v4i16})
1624
1625 // 128-bit results imply an input wider than NEON supports.
1626 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1628 for (auto VT : {MVT::v8f16, MVT::v4f32})
1630
1631 // These operations are not supported on NEON but SVE can do them.
1633 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1634 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1635 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1636 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1637 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1638 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1639 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1640 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1641 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1642 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1643 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1644 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1645 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1646 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1647 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1652
1653 // Int operations with no NEON support.
1654 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1655 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1663 }
1664
1665
1666 // Use SVE for vectors with more than 2 elements.
1667 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1669 }
1670
1671 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1672 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1673 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1674 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1675
1677 }
1678
1679 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1680 // Only required for llvm.aarch64.mops.memset.tag
1682 }
1683
1685
1686 if (Subtarget->hasSVE()) {
1691 }
1692
1693 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1694
1695 IsStrictFPEnabled = true;
1697
1698 if (Subtarget->isWindowsArm64EC()) {
1699 // FIXME: are there intrinsics we need to exclude from this?
1700 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1701 auto code = static_cast<RTLIB::Libcall>(i);
1702 auto libcallName = getLibcallName(code);
1703 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1704 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1705 }
1706 }
1707 }
1708}
1709
1710void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1711 assert(VT.isVector() && "VT should be a vector type");
1712
1713 if (VT.isFloatingPoint()) {
1715 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1716 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1717 }
1718
1719 // Mark vector float intrinsics as expand.
1720 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1730 }
1731
1732 // But we do support custom-lowering for FCOPYSIGN.
1733 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1734 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1735 VT == MVT::v8f16) &&
1736 Subtarget->hasFullFP16()))
1738
1751
1755 for (MVT InnerVT : MVT::all_valuetypes())
1756 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1757
1758 // CNT supports only B element sizes, then use UADDLP to widen.
1759 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1761
1767
1768 for (unsigned Opcode :
1771 setOperationAction(Opcode, VT, Custom);
1772
1773 if (!VT.isFloatingPoint())
1775
1776 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1777 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1778 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1779 setOperationAction(Opcode, VT, Legal);
1780
1781 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1782 // NEON types.
1783 if (VT.isFloatingPoint() &&
1784 VT.getVectorElementType() != MVT::bf16 &&
1785 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1786 for (unsigned Opcode :
1792 setOperationAction(Opcode, VT, Legal);
1793
1794 // Strict fp extend and trunc are legal
1795 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1797 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1799
1800 // FIXME: We could potentially make use of the vector comparison instructions
1801 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1802 // complications:
1803 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1804 // so we would need to expand when the condition code doesn't match the
1805 // kind of comparison.
1806 // * Some kinds of comparison require more than one FCMXY instruction so
1807 // would need to be expanded instead.
1808 // * The lowering of the non-strict versions involves target-specific ISD
1809 // nodes so we would likely need to add strict versions of all of them and
1810 // handle them appropriately.
1813
1814 if (Subtarget->isLittleEndian()) {
1815 for (unsigned im = (unsigned)ISD::PRE_INC;
1819 }
1820 }
1821
1822 if (Subtarget->hasD128()) {
1825 }
1826}
1827
1829 EVT OpVT) const {
1830 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1831 if (!Subtarget->hasSVE())
1832 return true;
1833
1834 // We can only support legal predicate result types. We can use the SVE
1835 // whilelo instruction for generating fixed-width predicates too.
1836 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1837 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1838 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1839 return true;
1840
1841 // The whilelo instruction only works with i32 or i64 scalar inputs.
1842 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1843 return true;
1844
1845 return false;
1846}
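// In practice this keeps @llvm.get.active.lane.mask as an intrinsic only when
// it can map onto a single WHILELO, e.g. an nxv4i1 (or v4i1) result computed
// from i32/i64 bounds; every other combination is expanded generically.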
1847
1849 return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
1850}
1851
1852void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1853 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1854
1855 // By default everything must be expanded.
1856 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1858
1859 if (VT.isFloatingPoint()) {
1869 }
1870
1872 VT == MVT::v1f64 ? Expand : Custom;
1873
1874 // Mark integer truncating stores/extending loads as having custom lowering
1875 if (VT.isInteger()) {
1876 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1877 while (InnerVT != VT) {
1878 setTruncStoreAction(VT, InnerVT, Default);
1879 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
1880 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
1881 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
1882 InnerVT = InnerVT.changeVectorElementType(
1883 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1884 }
1885 }
1886
1887 // Mark floating-point truncating stores/extending loads as having custom
1888 // lowering
1889 if (VT.isFloatingPoint()) {
1890 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1891 while (InnerVT != VT) {
1892 setTruncStoreAction(VT, InnerVT, Custom);
1893 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
1894 InnerVT = InnerVT.changeVectorElementType(
1896 }
1897 }
1898
1899 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
1900 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
1901
1902 // Lower fixed length vector operations to scalable equivalents.
1907 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
1942 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
1943 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
1945 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
1964 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
1990}
1991
1992void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1993 addRegisterClass(VT, &AArch64::FPR64RegClass);
1994 addTypeForNEON(VT);
1995}
1996
1997void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1998 addRegisterClass(VT, &AArch64::FPR128RegClass);
1999 addTypeForNEON(VT);
2000}
2001
2003 LLVMContext &C, EVT VT) const {
2004 if (!VT.isVector())
2005 return MVT::i32;
2006 if (VT.isScalableVector())
2007 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2009}
2010
2011// isIntImmediate - This method tests to see if the node is a constant
2012 // operand. If so, Imm will receive the value.
2013static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2014 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2015 Imm = C->getZExtValue();
2016 return true;
2017 }
2018 return false;
2019}
2020
2021// isOpcWithIntImmediate - This method tests to see if the node is a specific
2022 // opcode and that it has an immediate integer right operand.
2023 // If so, Imm will receive the value.
2024static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2025 uint64_t &Imm) {
2026 return N->getOpcode() == Opc &&
2027 isIntImmediate(N->getOperand(1).getNode(), Imm);
2028}
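// Typical (illustrative) use in a combine: match an operation with a constant
// right operand, e.g.
//   uint64_t ShiftAmt;
//   if (isOpcWithIntImmediate(N, ISD::SRL, ShiftAmt) && ShiftAmt < 32) {
//     ... rewrite N using the known immediate ...
//   }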
2029
2030static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2031 const APInt &Demanded,
2032 TargetLoweringOpt &TLO,
2033 unsigned NewOpc) {
2034 uint64_t OldImm = Imm, NewImm, Enc;
2035 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2036
2037 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2038 // bimm64.
2039 if (Imm == 0 || Imm == Mask ||
2041 return false;
2042
2043 unsigned EltSize = Size;
2044 uint64_t DemandedBits = Demanded.getZExtValue();
2045
2046 // Clear bits that are not demanded.
2047 Imm &= DemandedBits;
2048
2049 while (true) {
2050 // The goal here is to set the non-demanded bits in a way that minimizes
2051 // the number of switching between 0 and 1. In order to achieve this goal,
2052 // we set the non-demanded bits to the value of the preceding demanded bits.
2053 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2054 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2055 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2056 // The final result is 0b11000011.
2057 uint64_t NonDemandedBits = ~DemandedBits;
2058 uint64_t InvertedImm = ~Imm & DemandedBits;
2059 uint64_t RotatedImm =
2060 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2061 NonDemandedBits;
2062 uint64_t Sum = RotatedImm + NonDemandedBits;
2063 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2064 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2065 NewImm = (Imm | Ones) & Mask;
2066
2067 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2068 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2069 // we halve the element size and continue the search.
2070 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2071 break;
2072
2073 // We cannot shrink the element size any further if it is 2-bits.
2074 if (EltSize == 2)
2075 return false;
2076
2077 EltSize /= 2;
2078 Mask >>= EltSize;
2079 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2080
2081 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2082 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2083 return false;
2084
2085 // Merge the upper and lower halves of Imm and DemandedBits.
2086 Imm |= Hi;
2087 DemandedBits |= DemandedBitsHi;
2088 }
2089
2090 ++NumOptimizedImms;
2091
2092 // Replicate the element across the register width.
2093 while (EltSize < Size) {
2094 NewImm |= NewImm << EltSize;
2095 EltSize *= 2;
2096 }
2097
2098 (void)OldImm;
2099 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2100 "demanded bits should never be altered");
2101 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2102
2103 // Create the new constant immediate node.
2104 EVT VT = Op.getValueType();
2105 SDLoc DL(Op);
2106 SDValue New;
2107
2108 // If the new constant immediate is all-zeros or all-ones, let the target
2109 // independent DAG combine optimize this node.
2110 if (NewImm == 0 || NewImm == OrigMask) {
2111 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2112 TLO.DAG.getConstant(NewImm, DL, VT));
2113 // Otherwise, create a machine node so that target independent DAG combine
2114 // doesn't undo this optimization.
2115 } else {
2116 Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
2117 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2118 New = SDValue(
2119 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2120 }
2121
2122 return TLO.CombineTo(Op, New);
2123}
2124
2126 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2127 TargetLoweringOpt &TLO) const {
2128 // Delay this optimization to as late as possible.
2129 if (!TLO.LegalOps)
2130 return false;
2131
2132 if (!EnableOptimizeLogicalImm)
2133 return false;
2134
2135 EVT VT = Op.getValueType();
2136 if (VT.isVector())
2137 return false;
2138
2139 unsigned Size = VT.getSizeInBits();
2140 assert((Size == 32 || Size == 64) &&
2141 "i32 or i64 is expected after legalization.");
2142
2143 // Exit early if we demand all bits.
2144 if (DemandedBits.popcount() == Size)
2145 return false;
2146
2147 unsigned NewOpc;
2148 switch (Op.getOpcode()) {
2149 default:
2150 return false;
2151 case ISD::AND:
2152 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2153 break;
2154 case ISD::OR:
2155 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2156 break;
2157 case ISD::XOR:
2158 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2159 break;
2160 }
2161 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2162 if (!C)
2163 return false;
2164 uint64_t Imm = C->getZExtValue();
2165 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2166}
2167
2168/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2169 /// Mask are known to be either zero or one and return them in Known.
2171 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2172 const SelectionDAG &DAG, unsigned Depth) const {
2173 switch (Op.getOpcode()) {
2174 default:
2175 break;
2176 case AArch64ISD::DUP: {
2177 SDValue SrcOp = Op.getOperand(0);
2178 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2179 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2180 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2181 "Expected DUP implicit truncation");
2182 Known = Known.trunc(Op.getScalarValueSizeInBits());
2183 }
2184 break;
2185 }
2186 case AArch64ISD::CSEL: {
2187 KnownBits Known2;
2188 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2189 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2190 Known = Known.intersectWith(Known2);
2191 break;
2192 }
2193 case AArch64ISD::BICi: {
2194 // Compute the bit cleared value.
2195 uint64_t Mask =
2196 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2197 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2198 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2199 break;
2200 }
2201 case AArch64ISD::VLSHR: {
2202 KnownBits Known2;
2203 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2204 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2205 Known = KnownBits::lshr(Known, Known2);
2206 break;
2207 }
2208 case AArch64ISD::VASHR: {
2209 KnownBits Known2;
2210 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2211 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2212 Known = KnownBits::ashr(Known, Known2);
2213 break;
2214 }
2215 case AArch64ISD::VSHL: {
2216 KnownBits Known2;
2217 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2218 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2219 Known = KnownBits::shl(Known, Known2);
2220 break;
2221 }
2222 case AArch64ISD::MOVI: {
2224 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2225 break;
2226 }
2228 case AArch64ISD::ADDlow: {
2229 if (!Subtarget->isTargetILP32())
2230 break;
2231 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2232 Known.Zero = APInt::getHighBitsSet(64, 32);
2233 break;
2234 }
2236 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2237 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2238 break;
2239 }
2241 Intrinsic::ID IntID =
2242 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2243 switch (IntID) {
2244 default: return;
2245 case Intrinsic::aarch64_ldaxr:
2246 case Intrinsic::aarch64_ldxr: {
2247 unsigned BitWidth = Known.getBitWidth();
2248 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2249 unsigned MemBits = VT.getScalarSizeInBits();
2250 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2251 return;
2252 }
2253 }
2254 break;
2255 }
2257 case ISD::INTRINSIC_VOID: {
2258 unsigned IntNo = Op.getConstantOperandVal(0);
2259 switch (IntNo) {
2260 default:
2261 break;
2262 case Intrinsic::aarch64_neon_uaddlv: {
2263 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2264 unsigned BitWidth = Known.getBitWidth();
2265 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2266 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2267 assert(BitWidth >= Bound && "Unexpected width!");
2269 Known.Zero |= Mask;
2270 }
2271 break;
2272 }
2273 case Intrinsic::aarch64_neon_umaxv:
2274 case Intrinsic::aarch64_neon_uminv: {
2275 // Figure out the datatype of the vector operand. The UMINV instruction
2276 // will zero extend the result, so we can mark as known zero all the
2277 // bits larger than the element datatype. 32-bit or larger doesn't need
2278 // this as those are legal types and will be handled by isel directly.
2279 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2280 unsigned BitWidth = Known.getBitWidth();
2281 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2282 assert(BitWidth >= 8 && "Unexpected width!");
2284 Known.Zero |= Mask;
2285 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2286 assert(BitWidth >= 16 && "Unexpected width!");
2288 Known.Zero |= Mask;
2289 }
2290 break;
2291 } break;
2292 }
2293 }
2294 }
2295}
2296
2298 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2299 unsigned Depth) const {
2300 EVT VT = Op.getValueType();
2301 unsigned VTBits = VT.getScalarSizeInBits();
2302 unsigned Opcode = Op.getOpcode();
2303 switch (Opcode) {
2304 case AArch64ISD::CMEQ:
2305 case AArch64ISD::CMGE:
2306 case AArch64ISD::CMGT:
2307 case AArch64ISD::CMHI:
2308 case AArch64ISD::CMHS:
2309 case AArch64ISD::FCMEQ:
2310 case AArch64ISD::FCMGE:
2311 case AArch64ISD::FCMGT:
2312 case AArch64ISD::CMEQz:
2313 case AArch64ISD::CMGEz:
2314 case AArch64ISD::CMGTz:
2315 case AArch64ISD::CMLEz:
2316 case AArch64ISD::CMLTz:
2317 case AArch64ISD::FCMEQz:
2318 case AArch64ISD::FCMGEz:
2319 case AArch64ISD::FCMGTz:
2320 case AArch64ISD::FCMLEz:
2321 case AArch64ISD::FCMLTz:
2322 // Compares return either 0 or all-ones
2323 return VTBits;
2324 }
2325
2326 return 1;
2327}
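// Since the compare nodes above produce 0 or all-ones per element, every bit
// is a copy of the sign bit, which is why the full element width is returned;
// all other nodes conservatively report a single known sign bit.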
2328
2330 EVT) const {
2331 return MVT::i64;
2332}
2333
2335 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2336 unsigned *Fast) const {
2337 if (Subtarget->requiresStrictAlign())
2338 return false;
2339
2340 if (Fast) {
2341 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2342 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2343 // See comments in performSTORECombine() for more details about
2344 // these conditions.
2345
2346 // Code that uses clang vector extensions can mark that it
2347 // wants unaligned accesses to be treated as fast by
2348 // underspecifying alignment to be 1 or 2.
2349 Alignment <= 2 ||
2350
2351 // Disregard v2i64. Memcpy lowering produces those and splitting
2352 // them regresses performance on micro-benchmarks and olden/bh.
2353 VT == MVT::v2i64;
2354 }
2355 return true;
2356}
2357
2358// Same as above but handling LLTs instead.
2360 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2361 unsigned *Fast) const {
2362 if (Subtarget->requiresStrictAlign())
2363 return false;
2364
2365 if (Fast) {
2366 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2367 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2368 Ty.getSizeInBytes() != 16 ||
2369 // See comments in performSTORECombine() for more details about
2370 // these conditions.
2371
2372 // Code that uses clang vector extensions can mark that it
2373 // wants unaligned accesses to be treated as fast by
2374 // underspecifying alignment to be 1 or 2.
2375 Alignment <= 2 ||
2376
2377 // Disregard v2i64. Memcpy lowering produces those and splitting
2378 // them regresses performance on micro-benchmarks and olden/bh.
2379 Ty == LLT::fixed_vector(2, 64);
2380 }
2381 return true;
2382}
2383
2384FastISel *
2386 const TargetLibraryInfo *libInfo) const {
2387 return AArch64::createFastISel(funcInfo, libInfo);
2388}
2389
2390const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2391#define MAKE_CASE(V) \
2392 case V: \
2393 return #V;
2394 switch ((AArch64ISD::NodeType)Opcode) {
2396 break;
2713 }
2714#undef MAKE_CASE
2715 return nullptr;
2716}
2717
2720 MachineBasicBlock *MBB) const {
2721 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2722 // phi node:
2723
2724 // OrigBB:
2725 // [... previous instrs leading to comparison ...]
2726 // b.ne TrueBB
2727 // b EndBB
2728 // TrueBB:
2729 // ; Fallthrough
2730 // EndBB:
2731 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2732
2733 MachineFunction *MF = MBB->getParent();
2734 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2735 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2736 DebugLoc DL = MI.getDebugLoc();
2738
2739 Register DestReg = MI.getOperand(0).getReg();
2740 Register IfTrueReg = MI.getOperand(1).getReg();
2741 Register IfFalseReg = MI.getOperand(2).getReg();
2742 unsigned CondCode = MI.getOperand(3).getImm();
2743 bool NZCVKilled = MI.getOperand(4).isKill();
2744
2745 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2746 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2747 MF->insert(It, TrueBB);
2748 MF->insert(It, EndBB);
2749
2750 // Transfer rest of current basic-block to EndBB
2751 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2752 MBB->end());
2754
2755 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2756 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2757 MBB->addSuccessor(TrueBB);
2758 MBB->addSuccessor(EndBB);
2759
2760 // TrueBB falls through to the end.
2761 TrueBB->addSuccessor(EndBB);
2762
2763 if (!NZCVKilled) {
2764 TrueBB->addLiveIn(AArch64::NZCV);
2765 EndBB->addLiveIn(AArch64::NZCV);
2766 }
2767
2768 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2769 .addReg(IfTrueReg)
2770 .addMBB(TrueBB)
2771 .addReg(IfFalseReg)
2772 .addMBB(MBB);
2773
2774 MI.eraseFromParent();
2775 return EndBB;
2776}
2777
2779 MachineInstr &MI, MachineBasicBlock *BB) const {
2781 BB->getParent()->getFunction().getPersonalityFn())) &&
2782 "SEH does not use catchret!");
2783 return BB;
2784}
2785
2788 MachineBasicBlock *MBB) const {
2789 MachineFunction &MF = *MBB->getParent();
2790 MachineBasicBlock::iterator MBBI = MI.getIterator();
2792 const AArch64InstrInfo &TII =
2793 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2794 Register TargetReg = MI.getOperand(0).getReg();
2796 TII.probedStackAlloc(MBBI, TargetReg, false);
2797
2798 MI.eraseFromParent();
2799 return NextInst->getParent();
2800}
2801
2803AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2805 MachineBasicBlock *BB) const {
2806 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2807 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2808
2809 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2810 MIB.add(MI.getOperand(1)); // slice index register
2811 MIB.add(MI.getOperand(2)); // slice index offset
2812 MIB.add(MI.getOperand(3)); // pg
2813 MIB.add(MI.getOperand(4)); // base
2814 MIB.add(MI.getOperand(5)); // offset
2815
2816 MI.eraseFromParent(); // The pseudo is gone now.
2817 return BB;
2818}
2819
2822 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2824 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2825
2826 MIB.addReg(AArch64::ZA, RegState::Define);
2827 MIB.add(MI.getOperand(0)); // Vector select register
2828 MIB.add(MI.getOperand(1)); // Vector select offset
2829 MIB.add(MI.getOperand(2)); // Base
2830 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2831
2832 MI.eraseFromParent(); // The pseudo is gone now.
2833 return BB;
2834}
2835
2838 unsigned Opcode,
2839 bool Op0IsDef) const {
2840 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2842
2843 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2844 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2845 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2846 MIB.add(MI.getOperand(I));
2847
2848 MI.eraseFromParent(); // The pseudo is gone now.
2849 return BB;
2850}
2851
2853AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2855 MachineBasicBlock *BB, bool HasTile) const {
2856 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2857 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2858 unsigned StartIdx = 0;
2859
2860 if (HasTile) {
2861 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2862 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2863 StartIdx = 1;
2864 } else
2865 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2866
2867 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2868 MIB.add(MI.getOperand(I));
2869
2870 MI.eraseFromParent(); // The pseudo is gone now.
2871 return BB;
2872}
2873
2876 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2878 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2879 MIB.add(MI.getOperand(0)); // Mask
2880
2881 unsigned Mask = MI.getOperand(0).getImm();
2882 for (unsigned I = 0; I < 8; I++) {
2883 if (Mask & (1 << I))
2884 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2885 }
2886
2887 MI.eraseFromParent(); // The pseudo is gone now.
2888 return BB;
2889}
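// Each set bit in the ZERO_M immediate selects one 64-bit ZA tile, so the
// corresponding ZAD0-ZAD7 registers are listed as implicit defs to keep
// liveness information accurate.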
2890
2892 MachineInstr &MI, MachineBasicBlock *BB) const {
2893
2894 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
2895 if (SMEOrigInstr != -1) {
2896 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2897 uint64_t SMEMatrixType =
2898 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2899 switch (SMEMatrixType) {
2900 case (AArch64::SMEMatrixArray):
2901 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2902 case (AArch64::SMEMatrixTileB):
2903 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2904 case (AArch64::SMEMatrixTileH):
2905 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2906 case (AArch64::SMEMatrixTileS):
2907 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2908 case (AArch64::SMEMatrixTileD):
2909 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2910 case (AArch64::SMEMatrixTileQ):
2911 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2912 }
2913 }
2914
2915 switch (MI.getOpcode()) {
2916 default:
2917#ifndef NDEBUG
2918 MI.dump();
2919#endif
2920 llvm_unreachable("Unexpected instruction for custom inserter!");
2921
2922 case AArch64::F128CSEL:
2923 return EmitF128CSEL(MI, BB);
2924 case TargetOpcode::STATEPOINT:
2925 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
2926 // while bl call instruction (where statepoint will be lowered at the end)
2927 // has implicit def. This def is early-clobber as it will be set at
2928 // the moment of the call and earlier than any use is read.
2929 // Add this implicit dead def here as a workaround.
2930 MI.addOperand(*MI.getMF(),
2931 MachineOperand::CreateReg(
2932 AArch64::LR, /*isDef*/ true,
2933 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2934 /*isUndef*/ false, /*isEarlyClobber*/ true));
2935 [[fallthrough]];
2936 case TargetOpcode::STACKMAP:
2937 case TargetOpcode::PATCHPOINT:
2938 return emitPatchPoint(MI, BB);
2939
2940 case TargetOpcode::PATCHABLE_EVENT_CALL:
2941 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
2942 return BB;
2943
2944 case AArch64::CATCHRET:
2945 return EmitLoweredCatchRet(MI, BB);
2946
2947 case AArch64::PROBED_STACKALLOC_DYN:
2948 return EmitDynamicProbedAlloc(MI, BB);
2949
2950 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2951 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2952 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2953 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2954 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2955 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2956 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2957 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2958 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2959 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2960 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2961 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2962 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2963 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2964 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2965 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2966 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2967 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2968 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2969 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2970 case AArch64::LDR_ZA_PSEUDO:
2971 return EmitFill(MI, BB);
2972 case AArch64::LDR_TX_PSEUDO:
2973 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
2974 case AArch64::STR_TX_PSEUDO:
2975 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
2976 case AArch64::ZERO_M_PSEUDO:
2977 return EmitZero(MI, BB);
2978 case AArch64::ZERO_T_PSEUDO:
2979 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
2980 }
2981}
2982
2983//===----------------------------------------------------------------------===//
2984// AArch64 Lowering private implementation.
2985//===----------------------------------------------------------------------===//
2986
2987//===----------------------------------------------------------------------===//
2988// Lowering Code
2989//===----------------------------------------------------------------------===//
2990
2991// Forward declarations of SVE fixed length lowering helpers
2996 SelectionDAG &DAG);
2999 EVT VT);
3000
3001/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3002static bool isZerosVector(const SDNode *N) {
3003 // Look through a bit convert.
3004 while (N->getOpcode() == ISD::BITCAST)
3005 N = N->getOperand(0).getNode();
3006
3007 if (ISD::isConstantSplatVectorAllZeros(N))
3008 return true;
3009
3010 if (N->getOpcode() != AArch64ISD::DUP)
3011 return false;
3012
3013 auto Opnd0 = N->getOperand(0);
3014 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3015}
3016
3017/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3018/// CC
3019static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3020 switch (CC) {
3021 default:
3022 llvm_unreachable("Unknown condition code!");
3023 case ISD::SETNE:
3024 return AArch64CC::NE;
3025 case ISD::SETEQ:
3026 return AArch64CC::EQ;
3027 case ISD::SETGT:
3028 return AArch64CC::GT;
3029 case ISD::SETGE:
3030 return AArch64CC::GE;
3031 case ISD::SETLT:
3032 return AArch64CC::LT;
3033 case ISD::SETLE:
3034 return AArch64CC::LE;
3035 case ISD::SETUGT:
3036 return AArch64CC::HI;
3037 case ISD::SETUGE:
3038 return AArch64CC::HS;
3039 case ISD::SETULT:
3040 return AArch64CC::LO;
3041 case ISD::SETULE:
3042 return AArch64CC::LS;
3043 }
3044}
3045
3046/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3047static void changeFPCCToAArch64CC(ISD::CondCode CC,
3048 AArch64CC::CondCode &CondCode,
3049 AArch64CC::CondCode &CondCode2) {
3050 CondCode2 = AArch64CC::AL;
3051 switch (CC) {
3052 default:
3053 llvm_unreachable("Unknown FP condition!");
3054 case ISD::SETEQ:
3055 case ISD::SETOEQ:
3056 CondCode = AArch64CC::EQ;
3057 break;
3058 case ISD::SETGT:
3059 case ISD::SETOGT:
3060 CondCode = AArch64CC::GT;
3061 break;
3062 case ISD::SETGE:
3063 case ISD::SETOGE:
3064 CondCode = AArch64CC::GE;
3065 break;
3066 case ISD::SETOLT:
3067 CondCode = AArch64CC::MI;
3068 break;
3069 case ISD::SETOLE:
3070 CondCode = AArch64CC::LS;
3071 break;
3072 case ISD::SETONE:
3073 CondCode = AArch64CC::MI;
3074 CondCode2 = AArch64CC::GT;
3075 break;
3076 case ISD::SETO:
3077 CondCode = AArch64CC::VC;
3078 break;
3079 case ISD::SETUO:
3080 CondCode = AArch64CC::VS;
3081 break;
3082 case ISD::SETUEQ:
3083 CondCode = AArch64CC::EQ;
3084 CondCode2 = AArch64CC::VS;
3085 break;
3086 case ISD::SETUGT:
3087 CondCode = AArch64CC::HI;
3088 break;
3089 case ISD::SETUGE:
3090 CondCode = AArch64CC::PL;
3091 break;
3092 case ISD::SETLT:
3093 case ISD::SETULT:
3094 CondCode = AArch64CC::LT;
3095 break;
3096 case ISD::SETLE:
3097 case ISD::SETULE:
3098 CondCode = AArch64CC::LE;
3099 break;
3100 case ISD::SETNE:
3101 case ISD::SETUNE:
3102 CondCode = AArch64CC::NE;
3103 break;
3104 }
3105}
3106
3107/// Convert a DAG fp condition code to an AArch64 CC.
3108/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3109/// should be AND'ed instead of OR'ed.
3110static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3111 AArch64CC::CondCode &CondCode,
3112 AArch64CC::CondCode &CondCode2) {
3113 CondCode2 = AArch64CC::AL;
3114 switch (CC) {
3115 default:
3116 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3117 assert(CondCode2 == AArch64CC::AL);
3118 break;
3119 case ISD::SETONE:
3120 // (a one b)
3121 // == ((a olt b) || (a ogt b))
3122 // == ((a ord b) && (a une b))
3123 CondCode = AArch64CC::VC;
3124 CondCode2 = AArch64CC::NE;
3125 break;
3126 case ISD::SETUEQ:
3127 // (a ueq b)
3128 // == ((a uno b) || (a oeq b))
3129 // == ((a ule b) && (a uge b))
3130 CondCode = AArch64CC::PL;
3131 CondCode2 = AArch64CC::LE;
3132 break;
3133 }
3134}
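The two decompositions above follow directly from IEEE-754 comparison semantics. As an illustration only (not code from this file; the helper name is made up), a minimal standalone check of the ONE and UEQ identities on ordinary doubles, including NaN inputs:

#include <cassert>
#include <cmath>
#include <limits>

static void checkDecompositions(double a, double b) {
  bool Ord = !std::isnan(a) && !std::isnan(b);     // a ord b
  bool One = Ord && a != b;                        // a one b
  bool Une = !Ord || a != b;                       // a une b
  bool Oeq = Ord && a == b;                        // a oeq b
  bool Ueq = !Ord || a == b;                       // a ueq b
  bool Ule = !Ord || a <= b, Uge = !Ord || a >= b; // a ule b, a uge b
  assert(One == (Ord && Une));  // ONE == ORD && UNE, as used for SETONE above
  assert(Ueq == (Ule && Uge));  // UEQ == ULE && UGE, as used for SETUEQ above
  assert(Ueq == (!Ord || Oeq));
}

int main() {
  double NaN = std::numeric_limits<double>::quiet_NaN();
  checkDecompositions(1.0, 2.0);
  checkDecompositions(2.0, 2.0);
  checkDecompositions(NaN, 2.0);
  checkDecompositions(NaN, NaN);
  return 0;
}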
3135
3136/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3137/// CC usable with the vector instructions. Fewer operations are available
3138/// without a real NZCV register, so we have to use less efficient combinations
3139/// to get the same effect.
3140static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3141 AArch64CC::CondCode &CondCode,
3142 AArch64CC::CondCode &CondCode2,
3143 bool &Invert) {
3144 Invert = false;
3145 switch (CC) {
3146 default:
3147 // Mostly the scalar mappings work fine.
3148 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3149 break;
3150 case ISD::SETUO:
3151 Invert = true;
3152 [[fallthrough]];
3153 case ISD::SETO:
3154 CondCode = AArch64CC::MI;
3155 CondCode2 = AArch64CC::GE;
3156 break;
3157 case ISD::SETUEQ:
3158 case ISD::SETULT:
3159 case ISD::SETULE:
3160 case ISD::SETUGT:
3161 case ISD::SETUGE:
3162 // All of the compare-mask comparisons are ordered, but we can switch
3163 // between the two by a double inversion. E.g. ULE == !OGT.
3164 Invert = true;
3165 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3166 CondCode, CondCode2);
3167 break;
3168 }
3169}
3170
3171static bool isLegalArithImmed(uint64_t C) {
3172 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3173 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3174 LLVM_DEBUG(dbgs() << "Is imm " << C
3175 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3176 return IsLegal;
3177}
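For reference, the encoding accepted here is a 12-bit unsigned immediate, optionally shifted left by 12. A small standalone sketch (illustrative only, not part of this file) exercising the same predicate:

#include <cassert>
#include <cstdint>

// Same predicate as isLegalArithImmed above: imm12, optionally shifted by 12.
static bool isArithImm(uint64_t C) {
  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
}

int main() {
  assert(isArithImm(0));           // #0
  assert(isArithImm(4095));        // largest plain imm12
  assert(isArithImm(0xABC000));    // 0xABC, LSL #12
  assert(!isArithImm(0x1001));     // needs bits in both halves
  assert(!isArithImm(0x1000000));  // shifted field wider than 12 bits
  return 0;
}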
3178
3179// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3180// the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
3181// can be set differently by this operation. It comes down to whether
3182// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3183// everything is fine. If not then the optimization is wrong. Thus general
3184// comparisons are only valid if op2 != 0.
3185//
3186// So, finally, the only LLVM-native comparisons that don't mention C and V
3187// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3188// the absence of information about op2.
3189static bool isCMN(SDValue Op, ISD::CondCode CC) {
3190 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3191 (CC == ISD::SETEQ || CC == ISD::SETNE);
3192}
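To see why the carry flag rules out the unsigned conditions when op2 may be zero, consider a small model of how AArch64 derives C for SUBS and ADDS (a sketch for illustration, not code from this file):

#include <cstdint>
#include <cstdio>

// AArch64 evaluates SUBS a - b as a + ~b + 1; C is the carry out of bit 31.
static bool carryOfSubs(uint32_t a, uint32_t b) {
  return ((uint64_t)a + (uint64_t)(uint32_t)~b + 1) >> 32;
}
// For ADDS a + b, C is simply the carry out of the 32-bit addition.
static bool carryOfAdds(uint32_t a, uint32_t b) {
  return ((uint64_t)a + (uint64_t)b) >> 32;
}

int main() {
  uint32_t Op1 = 5, Op2 = 0;       // op2 == 0 is exactly the problematic case
  uint32_t NegOp2 = 0u - Op2;
  // CMP Op1, -Op2 and CMN Op1, Op2 compute the same value, but not the same C:
  std::printf("SUBS C=%d, ADDS C=%d\n", (int)carryOfSubs(Op1, NegOp2),
              (int)carryOfAdds(Op1, Op2)); // prints "SUBS C=1, ADDS C=0"
  return 0;
}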
3193
3194static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3195 SelectionDAG &DAG, SDValue Chain,
3196 bool IsSignaling) {
3197 EVT VT = LHS.getValueType();
3198 assert(VT != MVT::f128);
3199
3200 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3201
3202 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3203 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3204 {Chain, LHS});
3205 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3206 {LHS.getValue(1), RHS});
3207 Chain = RHS.getValue(1);
3208 VT = MVT::f32;
3209 }
3210 unsigned Opcode =
3211 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3212 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3213}
3214
3215static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3216 const SDLoc &dl, SelectionDAG &DAG) {
3217 EVT VT = LHS.getValueType();
3218 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3219
3220 if (VT.isFloatingPoint()) {
3221 assert(VT != MVT::f128);
3222 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3223 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3224 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3225 VT = MVT::f32;
3226 }
3227 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3228 }
3229
3230 // The CMP instruction is just an alias for SUBS, and representing it as
3231 // SUBS means that it's possible to get CSE with subtract operations.
3232 // A later phase can perform the optimization of setting the destination
3233 // register to WZR/XZR if it ends up being unused.
3234 unsigned Opcode = AArch64ISD::SUBS;
3235
3236 if (isCMN(RHS, CC)) {
3237 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3238 Opcode = AArch64ISD::ADDS;
3239 RHS = RHS.getOperand(1);
3240 } else if (isCMN(LHS, CC)) {
3241 // As we are looking for EQ/NE compares, the operands can be commuted; can
3242 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
3243 Opcode = AArch64ISD::ADDS;
3244 LHS = LHS.getOperand(1);
3245 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3246 if (LHS.getOpcode() == ISD::AND) {
3247 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3248 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3249 // of the signed comparisons.
3250 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3251 DAG.getVTList(VT, MVT_CC),
3252 LHS.getOperand(0),
3253 LHS.getOperand(1));
3254 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3255 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3256 return ANDSNode.getValue(1);
3257 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3258 // Use result of ANDS
3259 return LHS.getValue(1);
3260 }
3261 }
3262
3263 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3264 .getValue(1);
3265}
3266
3267/// \defgroup AArch64CCMP CMP;CCMP matching
3268///
3269/// These functions deal with the formation of CMP;CCMP;... sequences.
3270/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3271/// a comparison. They set the NZCV flags to a predefined value if their
3272/// predicate is false. This makes it possible to express arbitrary conjunctions, for
3273/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3274/// expressed as:
3275/// cmp A
3276/// ccmp B, inv(CB), CA
3277/// check for CB flags
3278///
3279/// This naturally lets us implement chains of AND operations with SETCC
3280/// operands. And we can even implement some other situations by transforming
3281/// them:
3282/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3283/// negating the flags used in a CCMP/FCCMP operations.
3284/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3285/// by negating the flags we test for afterwards. i.e.
3286/// NEG (CMP CCMP CCMP ...) can be implemented.
3287/// - Note that we can only ever negate all previously processed results.
3288/// What we can not implement by flipping the flags to test is a negation
3289/// of two sub-trees (because the negation affects all sub-trees emitted so
3290/// far, so the 2nd sub-tree we emit would also affect the first).
3291/// With those tools we can implement some OR operations:
3292/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3293/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3294/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3295/// elimination rules from earlier to implement the whole thing as a
3296/// CCMP/FCCMP chain.
3297///
3298/// As complete example:
3299/// or (or (setCA (cmp A)) (setCB (cmp B)))
3300/// (and (setCC (cmp C)) (setCD (cmp D)))"
3301/// can be reassociated to:
3302/// or (and (setCC (cmp C)) setCD (cmp D))
3303/// (or (setCA (cmp A)) (setCB (cmp B)))
3304/// can be transformed to:
3305/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3306/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3307/// which can be implemented as:
3308/// cmp C
3309/// ccmp D, inv(CD), CC
3310/// ccmp A, CA, inv(CD)
3311/// ccmp B, CB, inv(CA)
3312/// check for CB flags
3313///
3314/// A counterexample is "or (and A B) (and C D)" which translates to
3315/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3316/// can only implement 1 of the inner (not) operations, but not both!
3317/// @{
3318
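As a concrete illustration of the sequences this matching aims to produce, consider a simple conjunction at the source level. The C++ function below is only an example; the assembly in the comments is what AArch64 compilers typically emit for it, with register names depending on the calling convention:

// A conjunction of two integer comparisons.
bool isInRange(int a, int b) { return a == 3 && b > 7; }
// Typical AArch64 lowering of the conjunction:
//   cmp   w0, #3            // flags for (a == 3)
//   ccmp  w1, #7, #4, eq    // if eq, compare b with 7; else NZCV := 0100 (Z set)
//   cset  w0, gt            // the whole chain is tested with a single GT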
3319/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3320static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3321 ISD::CondCode CC, SDValue CCOp,
3322 AArch64CC::CondCode Predicate,
3323 AArch64CC::CondCode OutCC,
3324 const SDLoc &DL, SelectionDAG &DAG) {
3325 unsigned Opcode = 0;
3326 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3327
3328 if (LHS.getValueType().isFloatingPoint()) {
3329 assert(LHS.getValueType() != MVT::f128);
3330 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3331 LHS.getValueType() == MVT::bf16) {
3332 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3333 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3334 }
3335 Opcode = AArch64ISD::FCCMP;
3336 } else if (RHS.getOpcode() == ISD::SUB) {
3337 SDValue SubOp0 = RHS.getOperand(0);
3338 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3339 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3340 Opcode = AArch64ISD::CCMN;
3341 RHS = RHS.getOperand(1);
3342 }
3343 }
3344 if (Opcode == 0)
3345 Opcode = AArch64ISD::CCMP;
3346
3347 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3348 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3349 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3350 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3351 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3352}
3353
3354/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3355/// expressed as a conjunction. See \ref AArch64CCMP.
3356/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3357/// changing the conditions on the SETCC tests.
3358/// (this means we can call emitConjunctionRec() with
3359/// Negate==true on this sub-tree)
3360/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3361/// cannot do the negation naturally. We are required to
3362/// emit the subtree first in this case.
3363/// \param WillNegate Is true if we are called when the result of this
3364/// subexpression must be negated. This happens when the
3365/// outer expression is an OR. We can use this fact to know
3366/// that we have a double negation (or (or ...) ...) that
3367/// can be implemented for free.
3368static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3369 bool &MustBeFirst, bool WillNegate,
3370 unsigned Depth = 0) {
3371 if (!Val.hasOneUse())
3372 return false;
3373 unsigned Opcode = Val->getOpcode();
3374 if (Opcode == ISD::SETCC) {
3375 if (Val->getOperand(0).getValueType() == MVT::f128)
3376 return false;
3377 CanNegate = true;
3378 MustBeFirst = false;
3379 return true;
3380 }
3381 // Protect against exponential runtime and stack overflow.
3382 if (Depth > 6)
3383 return false;
3384 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3385 bool IsOR = Opcode == ISD::OR;
3386 SDValue O0 = Val->getOperand(0);
3387 SDValue O1 = Val->getOperand(1);
3388 bool CanNegateL;
3389 bool MustBeFirstL;
3390 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3391 return false;
3392 bool CanNegateR;
3393 bool MustBeFirstR;
3394 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3395 return false;
3396
3397 if (MustBeFirstL && MustBeFirstR)
3398 return false;
3399
3400 if (IsOR) {
3401 // For an OR expression we need to be able to naturally negate at least
3402 // one side or we cannot do the transformation at all.
3403 if (!CanNegateL && !CanNegateR)
3404 return false;
3405 // If the result of the OR will be negated and we can naturally negate
3406 // the leaves, then this sub-tree as a whole negates naturally.
3407 CanNegate = WillNegate && CanNegateL && CanNegateR;
3408 // If we cannot naturally negate the whole sub-tree, then this must be
3409 // emitted first.
3410 MustBeFirst = !CanNegate;
3411 } else {
3412 assert(Opcode == ISD::AND && "Must be OR or AND");
3413 // We cannot naturally negate an AND operation.
3414 CanNegate = false;
3415 MustBeFirst = MustBeFirstL || MustBeFirstR;
3416 }
3417 return true;
3418 }
3419 return false;
3420}
3421
3422/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3423/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3424/// Tries to transform the given i1 producing node @p Val to a series of compare
3425/// and conditional compare operations. @returns an NZCV flags producing node
3426/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
3427/// the transformation was not possible.
3428/// \p Negate is true if we want this sub-tree to be negated just by changing
3429/// SETCC conditions.
3430static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3431 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3432 AArch64CC::CondCode Predicate) {
3433 // We're at a tree leaf, produce a conditional comparison operation.
3434 unsigned Opcode = Val->getOpcode();
3435 if (Opcode == ISD::SETCC) {
3436 SDValue LHS = Val->getOperand(0);
3437 SDValue RHS = Val->getOperand(1);
3438 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3439 bool isInteger = LHS.getValueType().isInteger();
3440 if (Negate)
3441 CC = getSetCCInverse(CC, LHS.getValueType());
3442 SDLoc DL(Val);
3443 // Determine OutCC and handle FP special case.
3444 if (isInteger) {
3445 OutCC = changeIntCCToAArch64CC(CC);
3446 } else {
3447 assert(LHS.getValueType().isFloatingPoint());
3448 AArch64CC::CondCode ExtraCC;
3449 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3450 // Some floating point conditions can't be tested with a single condition
3451 // code. Construct an additional comparison in this case.
3452 if (ExtraCC != AArch64CC::AL) {
3453 SDValue ExtraCmp;
3454 if (!CCOp.getNode())
3455 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3456 else
3457 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3458 ExtraCC, DL, DAG);
3459 CCOp = ExtraCmp;
3460 Predicate = ExtraCC;
3461 }
3462 }
3463
3464 // Produce a normal comparison if we are first in the chain
3465 if (!CCOp)
3466 return emitComparison(LHS, RHS, CC, DL, DAG);
3467 // Otherwise produce a ccmp.
3468 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3469 DAG);
3470 }
3471 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3472
3473 bool IsOR = Opcode == ISD::OR;
3474
3475 SDValue LHS = Val->getOperand(0);
3476 bool CanNegateL;
3477 bool MustBeFirstL;
3478 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3479 assert(ValidL && "Valid conjunction/disjunction tree");
3480 (void)ValidL;
3481
3482 SDValue RHS = Val->getOperand(1);
3483 bool CanNegateR;
3484 bool MustBeFirstR;
3485 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3486 assert(ValidR && "Valid conjunction/disjunction tree");
3487 (void)ValidR;
3488
3489 // Swap sub-tree that must come first to the right side.
3490 if (MustBeFirstL) {
3491 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3492 std::swap(LHS, RHS);
3493 std::swap(CanNegateL, CanNegateR);
3494 std::swap(MustBeFirstL, MustBeFirstR);
3495 }
3496
3497 bool NegateR;
3498 bool NegateAfterR;
3499 bool NegateL;
3500 bool NegateAfterAll;
3501 if (Opcode == ISD::OR) {
3502 // Swap the sub-tree that we can negate naturally to the left.
3503 if (!CanNegateL) {
3504 assert(CanNegateR && "at least one side must be negatable");
3505 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3506 assert(!Negate);
3507 std::swap(LHS, RHS);
3508 NegateR = false;
3509 NegateAfterR = true;
3510 } else {
3511 // Negate the left sub-tree if possible, otherwise negate the result.
3512 NegateR = CanNegateR;
3513 NegateAfterR = !CanNegateR;
3514 }
3515 NegateL = true;
3516 NegateAfterAll = !Negate;
3517 } else {
3518 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3519 assert(!Negate && "Valid conjunction/disjunction tree");
3520
3521 NegateL = false;
3522 NegateR = false;
3523 NegateAfterR = false;
3524 NegateAfterAll = false;
3525 }
3526
3527 // Emit sub-trees.
3528 AArch64CC::CondCode RHSCC;
3529 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3530 if (NegateAfterR)
3531 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3532 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3533 if (NegateAfterAll)
3534 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3535 return CmpL;
3536}
3537
3538/// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3539/// In some cases this is even possible with OR operations in the expression.
3540/// See \ref AArch64CCMP.
3541/// \see emitConjunctionRec().
3542static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3543 AArch64CC::CondCode &OutCC) {
3544 bool DummyCanNegate;
3545 bool DummyMustBeFirst;
3546 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3547 return SDValue();
3548
3549 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3550}
3551
3552/// @}
3553
3554/// Returns how profitable it is to fold a comparison's operand's shift and/or
3555/// extension operations.
3556static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3557 auto isSupportedExtend = [&](SDValue V) {
3558 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3559 return true;
3560
3561 if (V.getOpcode() == ISD::AND)
3562 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3563 uint64_t Mask = MaskCst->getZExtValue();
3564 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3565 }
3566
3567 return false;
3568 };
3569
3570 if (!Op.hasOneUse())
3571 return 0;
3572
3573 if (isSupportedExtend(Op))
3574 return 1;
3575
3576 unsigned Opc = Op.getOpcode();
3577 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3578 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3579 uint64_t Shift = ShiftCst->getZExtValue();
3580 if (isSupportedExtend(Op.getOperand(0)))
3581 return (Shift <= 4) ? 2 : 1;
3582 EVT VT = Op.getValueType();
3583 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3584 return 1;
3585 }
3586
3587 return 0;
3588}
3589
3590static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3591 SDValue &AArch64cc, SelectionDAG &DAG,
3592 const SDLoc &dl) {
3593 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3594 EVT VT = RHS.getValueType();
3595 uint64_t C = RHSC->getZExtValue();
3596 if (!isLegalArithImmed(C)) {
3597 // Constant does not fit, try adjusting it by one?
3598 switch (CC) {
3599 default:
3600 break;
3601 case ISD::SETLT:
3602 case ISD::SETGE:
3603 if ((VT == MVT::i32 && C != 0x80000000 &&
3604 isLegalArithImmed((uint32_t)(C - 1))) ||
3605 (VT == MVT::i64 && C != 0x80000000ULL &&
3606 isLegalArithImmed(C - 1ULL))) {
3607 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3608 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3609 RHS = DAG.getConstant(C, dl, VT);
3610 }
3611 break;
3612 case ISD::SETULT:
3613 case ISD::SETUGE:
3614 if ((VT == MVT::i32 && C != 0 &&
3615 isLegalArithImmed((uint32_t)(C - 1))) ||
3616 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3617 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3618 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3619 RHS = DAG.getConstant(C, dl, VT);
3620 }
3621 break;
3622 case ISD::SETLE:
3623 case ISD::SETGT:
3624 if ((VT == MVT::i32 && C != INT32_MAX &&
3625 isLegalArithImmed((uint32_t)(C + 1))) ||
3626 (VT == MVT::i64 && C != INT64_MAX &&
3627 isLegalArithImmed(C + 1ULL))) {
3628 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3629 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3630 RHS = DAG.getConstant(C, dl, VT);
3631 }
3632 break;
3633 case ISD::SETULE:
3634 case ISD::SETUGT:
3635 if ((VT == MVT::i32 && C != UINT32_MAX &&
3636 isLegalArithImmed((uint32_t)(C + 1))) ||
3637 (VT == MVT::i64 && C != UINT64_MAX &&
3638 isLegalArithImmed(C + 1ULL))) {
3639 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3640 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3641 RHS = DAG.getConstant(C, dl, VT);
3642 }
3643 break;
3644 }
3645 }
3646 }
3647
3648 // Comparisons are canonicalized so that the RHS operand is simpler than the
3649 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3650 // can fold some shift+extend operations on the RHS operand, so swap the
3651 // operands if that can be done.
3652 //
3653 // For example:
3654 // lsl w13, w11, #1
3655 // cmp w13, w12
3656 // can be turned into:
3657 // cmp w12, w11, lsl #1
3658 if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) {
3659 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3660
3661 if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3662 std::swap(LHS, RHS);
3663 CC = ISD::getSetCCSwappedOperands(CC);
3664 }
3665 }
3666
3667 SDValue Cmp;
3668 AArch64CC::CondCode AArch64CC;
3669 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3670 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3671
3672 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3673 // For the i8 operand, the largest immediate is 255, so this can be easily
3674 // encoded in the compare instruction. For the i16 operand, however, the
3675 // largest immediate cannot be encoded in the compare.
3676 // Therefore, use a sign extending load and cmn to avoid materializing the
3677 // -1 constant. For example,
3678 // movz w1, #65535
3679 // ldrh w0, [x0, #0]
3680 // cmp w0, w1
3681 // >
3682 // ldrsh w0, [x0, #0]
3683 // cmn w0, #1
3684 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3685 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3686 // ensure both the LHS and RHS are truly zero extended and to make sure the
3687 // transformation is profitable.
3688 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3689 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3690 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3691 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3692 int16_t ValueofRHS = RHS->getAsZExtVal();
3693 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3694 SDValue SExt =
3695 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3696 DAG.getValueType(MVT::i16));
3697 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3698 RHS.getValueType()),
3699 CC, dl, DAG);
3700 AArch64CC = changeIntCCToAArch64CC(CC);
3701 }
3702 }
3703
3704 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3705 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3706 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3707 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3708 }
3709 }
3710 }
3711
3712 if (!Cmp) {
3713 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3714 AArch64CC = changeIntCCToAArch64CC(CC);
3715 }
3716 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3717 return Cmp;
3718}
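The constant adjustments above rely on standard comparison identities, applied only when no wrap-around can occur and the adjusted constant is actually encodable. A small illustrative check of those identities (not code from this file):

#include <cassert>
#include <cstdint>

int main() {
  // The identities behind the rewrites; in the lowering they are applied only
  // when the adjusted constant becomes a legal arithmetic immediate
  // (e.g. 0x1001 is not encodable, but 0x1000 == 0x1001 - 1 is).
  const int64_t C = 0x1001;
  for (int64_t x : {-5LL, 0LL, 0x1000LL, 0x1001LL, 0x1002LL, 0x7fffffffLL}) {
    assert((x < C) == (x <= C - 1));   // SETLT  -> SETLE with C - 1
    assert((x >= C) == (x > C - 1));   // SETGE  -> SETGT with C - 1
    assert((x <= C) == (x < C + 1));   // SETLE  -> SETLT with C + 1
    assert((x > C) == (x >= C + 1));   // SETGT  -> SETGE with C + 1
  }
  const uint64_t UC = 0x1001;
  for (uint64_t x : {0ULL, 1ULL, 0x1000ULL, 0x1001ULL, 0x1002ULL}) {
    assert((x < UC) == (x <= UC - 1)); // SETULT -> SETULE
    assert((x >= UC) == (x > UC - 1)); // SETUGE -> SETUGT
  }
  return 0;
}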
3719
3720static std::pair<SDValue, SDValue>
3721getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3722 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3723 "Unsupported value type");
3724 SDValue Value, Overflow;
3725 SDLoc DL(Op);
3726 SDValue LHS = Op.getOperand(0);
3727 SDValue RHS = Op.getOperand(1);
3728 unsigned Opc = 0;
3729 switch (Op.getOpcode()) {
3730 default:
3731 llvm_unreachable("Unknown overflow instruction!");
3732 case ISD::SADDO:
3733 Opc = AArch64ISD::ADDS;
3734 CC = AArch64CC::VS;
3735 break;
3736 case ISD::UADDO:
3737 Opc = AArch64ISD::ADDS;
3738 CC = AArch64CC::HS;
3739 break;
3740 case ISD::SSUBO:
3741 Opc = AArch64ISD::SUBS;
3742 CC = AArch64CC::VS;
3743 break;
3744 case ISD::USUBO:
3745 Opc = AArch64ISD::SUBS;
3746 CC = AArch64CC::LO;
3747 break;
3748 // Multiply needs a little bit extra work.
3749 case ISD::SMULO:
3750 case ISD::UMULO: {
3751 CC = AArch64CC::NE;
3752 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3753 if (Op.getValueType() == MVT::i32) {
3754 // Extend to 64-bits, then perform a 64-bit multiply.
3755 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3756 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3757 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3758 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3759 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3760
3761 // Check that the result fits into a 32-bit integer.
3762 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3763 if (IsSigned) {
3764 // cmp xreg, wreg, sxtw
3765 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3766 Overflow =
3767 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3768 } else {
3769 // tst xreg, #0xffffffff00000000
3770 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3771 Overflow =
3772 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3773 }
3774 break;
3775 }
3776 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3777 // For the 64 bit multiply
3778 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3779 if (IsSigned) {
3780 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3781 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3782 DAG.getConstant(63, DL, MVT::i64));
3783 // It is important that LowerBits is last, otherwise the arithmetic
3784 // shift will not be folded into the compare (SUBS).
3785 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3786 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3787 .getValue(1);
3788 } else {
3789 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3790 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3791 Overflow =
3792 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3793 DAG.getConstant(0, DL, MVT::i64),
3794 UpperBits).getValue(1);
3795 }
3796 break;
3797 }
3798 } // switch (...)
3799
3800 if (Opc) {
3801 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3802
3803 // Emit the AArch64 operation with overflow check.
3804 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3805 Overflow = Value.getValue(1);
3806 }
3807 return std::make_pair(Value, Overflow);
3808}
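The 32-bit [S|U]MULO handling above widens to 64 bits and then checks whether the product survives truncation back to 32 bits. A standalone model of that check, shown for illustration only (the helper names are made up):

#include <cassert>
#include <cstdint>

// Do the multiply in 64 bits and test whether the product fits in 32 bits.
static bool smulOverflows(int32_t a, int32_t b) {
  int64_t Wide = (int64_t)a * (int64_t)b;
  return Wide != (int64_t)(int32_t)Wide;        // "cmp xreg, wreg, sxtw"
}
static bool umulOverflows(uint32_t a, uint32_t b) {
  uint64_t Wide = (uint64_t)a * (uint64_t)b;
  return (Wide & 0xFFFFFFFF00000000ULL) != 0;   // "tst xreg, #0xffffffff00000000"
}

int main() {
  assert(!smulOverflows(46340, 46340));   // 2147395600 fits in i32
  assert(smulOverflows(46341, 46341));    // 2147488281 does not
  assert(umulOverflows(0x10000u, 0x10000u));
  return 0;
}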
3809
3810SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3811 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
3812 !Subtarget->isNeonAvailable()))
3813 return LowerToScalableOp(Op, DAG);
3814
3815 SDValue Sel = Op.getOperand(0);
3816 SDValue Other = Op.getOperand(1);
3817 SDLoc dl(Sel);
3818
3819 // If the operand is an overflow checking operation, invert the condition
3820 // code and kill the Not operation. I.e., transform:
3821 // (xor (overflow_op_bool, 1))
3822 // -->
3823 // (csel 1, 0, invert(cc), overflow_op_bool)
3824 // ... which later gets transformed to just a cset instruction with an
3825 // inverted condition code, rather than a cset + eor sequence.
3826 if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
3827 // Only lower legal XALUO ops.
3828 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3829 return SDValue();
3830
3831 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3832 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3833 AArch64CC::CondCode CC;
3834 SDValue Value, Overflow;
3835 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3836 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3837 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3838 CCVal, Overflow);
3839 }
3840 // If neither operand is a SELECT_CC, give up.
3841 if (Sel.getOpcode() != ISD::SELECT_CC)
3842 std::swap(Sel, Other);
3843 if (Sel.getOpcode() != ISD::SELECT_CC)
3844 return Op;
3845
3846 // The folding we want to perform is:
3847 // (xor x, (select_cc a, b, cc, 0, -1) )
3848 // -->
3849 // (csel x, (xor x, -1), cc ...)
3850 //
3851 // The latter will get matched to a CSINV instruction.
3852
3853 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3854 SDValue LHS = Sel.getOperand(0);
3855 SDValue RHS = Sel.getOperand(1);
3856 SDValue TVal = Sel.getOperand(2);
3857 SDValue FVal = Sel.getOperand(3);
3858
3859 // FIXME: This could be generalized to non-integer comparisons.
3860 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3861 return Op;
3862
3863 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3864 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3865
3866 // The values aren't constants, this isn't the pattern we're looking for.
3867 if (!CFVal || !CTVal)
3868 return Op;
3869
3870 // We can commute the SELECT_CC by inverting the condition. This
3871 // might be needed to make this fit into a CSINV pattern.
3872 if (CTVal->isAllOnes() && CFVal->isZero()) {
3873 std::swap(TVal, FVal);
3874 std::swap(CTVal, CFVal);
3875 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3876 }
3877
3878 // If the constants line up, perform the transform!
3879 if (CTVal->isZero() && CFVal->isAllOnes()) {
3880 SDValue CCVal;
3881 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3882
3883 FVal = Other;
3884 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3885 DAG.getConstant(-1ULL, dl, Other.getValueType()));
3886
3887 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3888 CCVal, Cmp);
3889 }
3890
3891 return Op;
3892}
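For reference, the fold targeted here shows up for sources like the function below; the assembly in the comments is a typical result (illustrative only, register choices may differ):

// XOR with a 0/-1 select becomes a conditional invert (CSINV).
int selectXor(int x, int a, int b) { return x ^ (a < b ? 0 : -1); }
//   cmp   w1, w2
//   csinv w0, w0, w0, lt    // x when a < b, ~x otherwise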
3893
3894// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3895// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3896// sets 'C' bit to 0.
3897static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
3898 SDLoc DL(Value);
3899 EVT VT = Value.getValueType();
3900 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
3901 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
3902 SDValue Cmp =
3903 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3904 return Cmp.getValue(1);
3905}
3906
3907// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3908// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
3909static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
3910 bool Invert) {
3911 assert(Glue.getResNo() == 1);
3912 SDLoc DL(Glue);
3913 SDValue Zero = DAG.getConstant(0, DL, VT);
3914 SDValue One = DAG.getConstant(1, DL, VT);
3915 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3916 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
3917 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3918}
3919
3920// Value is 1 if 'V' bit of NZCV is 1, else 0
3921static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
3922 assert(Glue.getResNo() == 1);
3923 SDLoc DL(Glue);
3924 SDValue Zero = DAG.getConstant(0, DL, VT);
3925 SDValue One = DAG.getConstant(1, DL, VT);
3926 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3927 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3928}
3929
3930// This lowering is inefficient, but it will get cleaned up by
3931// `foldOverflowCheck`
3932static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
3933 unsigned Opcode, bool IsSigned) {
3934 EVT VT0 = Op.getValue(0).getValueType();
3935 EVT VT1 = Op.getValue(1).getValueType();
3936
3937 if (VT0 != MVT::i32 && VT0 != MVT::i64)
3938 return SDValue();
3939
3940 bool InvertCarry = Opcode == AArch64ISD::SBCS;
3941 SDValue OpLHS = Op.getOperand(0);
3942 SDValue OpRHS = Op.getOperand(1);
3943 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
3944
3945 SDLoc DL(Op);
3946 SDVTList VTs = DAG.getVTList(VT0, VT1);
3947
3948 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3949 OpRHS, OpCarryIn);
3950
3951 SDValue OutFlag =
3952 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
3953 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
3954
3955 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
3956}
3957
3958static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3959 // Let legalize expand this if it isn't a legal type yet.
3960 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3961 return SDValue();
3962
3963 SDLoc dl(Op);
3964 AArch64CC::CondCode CC;
3965 // The actual operation that sets the overflow or carry flag.
3966 SDValue Value, Overflow;
3967 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3968
3969 // We use 0 and 1 as false and true values.
3970 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3971 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3972
3973 // We use an inverted condition, because the conditional select is inverted
3974 // too. This will allow it to be selected to a single instruction:
3975 // CSINC Wd, WZR, WZR, invert(cond).
3976 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3977 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3978 CCVal, Overflow);
3979
3980 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3981 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3982}
3983
3984// Prefetch operands are:
3985// 1: Address to prefetch
3986// 2: bool isWrite
3987// 3: int locality (0 = no locality ... 3 = extreme locality)
3988// 4: bool isDataCache
3989static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3990 SDLoc DL(Op);
3991 unsigned IsWrite = Op.getConstantOperandVal(2);
3992 unsigned Locality = Op.getConstantOperandVal(3);
3993 unsigned IsData = Op.getConstantOperandVal(4);
3994
3995 bool IsStream = !Locality;
3996 // When the locality number is set
3997 if (Locality) {
3998 // The front-end should have filtered out the out-of-range values
3999 assert(Locality <= 3 && "Prefetch locality out-of-range");
4000 // The locality degree is the opposite of the cache speed.
4001 // Put the number the other way around.
4002 // The encoding starts at 0 for level 1
4003 Locality = 3 - Locality;
4004 }
4005
4006 // Build the mask value encoding the expected behavior.
4007 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4008 (!IsData << 3) | // IsDataCache bit
4009 (Locality << 1) | // Cache level bits
4010 (unsigned)IsStream; // Stream bit
4011 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4012 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4013 Op.getOperand(1));
4014}
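The immediate assembled above packs {type, cache target, policy} into five bits. A small sketch that mirrors the same bit layout (illustrative only; prfmImm is a made-up helper name, and the __builtin_prefetch mappings in the comments are the usual front-end conventions):

#include <cassert>

// {Load/Store, !IsDataCache, cache level, stream} packed as in LowerPREFETCH.
static unsigned prfmImm(bool IsWrite, unsigned Locality, bool IsData) {
  bool IsStream = Locality == 0;
  unsigned Level = Locality ? 3 - Locality : 0; // locality 3 -> L1, 1 -> L3
  return (IsWrite << 4) | (!IsData << 3) | (Level << 1) | (unsigned)IsStream;
}

int main() {
  // __builtin_prefetch(p, /*rw=*/0, /*locality=*/3) -> PLDL1KEEP, encoding 0.
  assert(prfmImm(false, 3, true) == 0);
  // __builtin_prefetch(p, /*rw=*/1, /*locality=*/0) -> PSTL1STRM.
  assert(prfmImm(true, 0, true) == ((1u << 4) | 1u));
  return 0;
}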
4015
4016SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4017 SelectionDAG &DAG) const {
4018 EVT VT = Op.getValueType();
4019 if (VT.isScalableVector())
4020 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4021
4022 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4023 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4024
4025 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4026 return SDValue();
4027}
4028
4029SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4030 SelectionDAG &DAG) const {
4031 EVT VT = Op.getValueType();
4032 if (VT.isScalableVector())
4033 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4034
4035 bool IsStrict = Op->isStrictFPOpcode();
4036 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4037 EVT SrcVT = SrcVal.getValueType();
4038 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4039
4040 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4041 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4042
4043 // Expand cases where the result type is BF16 but we don't have hardware
4044 // instructions to lower it.
4045 if (VT.getScalarType() == MVT::bf16 &&
4046 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4047 Subtarget->hasBF16())) {
4048 SDLoc dl(Op);
4049 SDValue Narrow = SrcVal;
4050 SDValue NaN;
4051 EVT I32 = SrcVT.changeElementType(MVT::i32);
4052 EVT F32 = SrcVT.changeElementType(MVT::f32);
4053 if (SrcVT.getScalarType() == MVT::f32) {
4054 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4055 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4056 if (!NeverSNaN) {
4057 // Set the quiet bit.
4058 NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4059 DAG.getConstant(0x400000, dl, I32));
4060 }
4061 } else if (SrcVT.getScalarType() == MVT::f64) {
4062 Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4063 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4064 } else {
4065 return SDValue();
4066 }
4067 if (!Trunc) {
4068 SDValue One = DAG.getConstant(1, dl, I32);
4069 SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4070 DAG.getShiftAmountConstant(16, I32, dl));
4071 Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4072 SDValue RoundingBias =
4073 DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4074 Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4075 }
4076
4077 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4078 // 0x80000000.
4079 if (NaN) {
4080 SDValue IsNaN = DAG.getSetCC(
4081 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4082 SrcVal, SrcVal, ISD::SETUO);
4083 Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4084 }
4085
4086 // Now that we have rounded, shift the bits into position.
4087 Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4088 DAG.getShiftAmountConstant(16, I32, dl));
4089 if (VT.isVector()) {
4090 EVT I16 = I32.changeVectorElementType(MVT::i16);
4091 Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4092 return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4093 }
4094 Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4095 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4096 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4097 : Result;
4098 }
4099
4100 if (SrcVT != MVT::f128) {
4101 // Expand cases where the input is a vector bigger than NEON.
4103 return SDValue();
4104
4105 // It's legal except when f128 is involved
4106 return Op;
4107 }
4108
4109 return SDValue();
4110}
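A scalar model of the f32 -> bf16 expansion above may make the bit manipulation easier to follow. This is only a sketch of the same round-to-nearest-even-with-quiet-NaN scheme, not code from this file, and floatToBF16 is a made-up name:

#include <cassert>
#include <cstdint>
#include <cstring>

// Add a rounding bias of 0x7fff plus the LSB of the truncated result (round to
// nearest, ties to even), keep NaNs quiet, then take the high 16 bits.
static uint16_t floatToBF16(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  bool IsNaN = (Bits & 0x7fffffff) > 0x7f800000;
  if (IsNaN)
    return (uint16_t)((Bits | 0x400000) >> 16); // set the quiet bit, no rounding
  uint32_t Lsb = (Bits >> 16) & 1;
  return (uint16_t)((Bits + 0x7fff + Lsb) >> 16);
}

int main() {
  assert(floatToBF16(1.0f) == 0x3f80);
  assert(floatToBF16(3.1415926f) == 0x4049); // 3.140625 in bf16
  return 0;
}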
4111
4112SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4113 SelectionDAG &DAG) const {
4114 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4115 // Any additional optimization in this function should be recorded
4116 // in the cost tables.
4117 bool IsStrict = Op->isStrictFPOpcode();
4118 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4119 EVT VT = Op.getValueType();
4120
4121 if (VT.isScalableVector()) {
4122 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4123 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4124 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4125 return LowerToPredicatedOp(Op, DAG, Opcode);
4126 }
4127
4128 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4129 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4130 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4131
4132 unsigned NumElts = InVT.getVectorNumElements();
4133
4134 // f16 conversions are promoted to f32 when full fp16 is not supported.
4135 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4136 InVT.getVectorElementType() == MVT::bf16) {
4137 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4138 SDLoc dl(Op);
4139 if (IsStrict) {
4140 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4141 {Op.getOperand(0), Op.getOperand(1)});
4142 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4143 {Ext.getValue(1), Ext.getValue(0)});
4144 }
4145 return DAG.getNode(
4146 Op.getOpcode(), dl, Op.getValueType(),
4147 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4148 }
4149
4150 uint64_t VTSize = VT.getFixedSizeInBits();
4151 uint64_t InVTSize = InVT.getFixedSizeInBits();
4152 if (VTSize < InVTSize) {
4153 SDLoc dl(Op);
4154 if (IsStrict) {
4155 InVT = InVT.changeVectorElementTypeToInteger();
4156 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4157 {Op.getOperand(0), Op.getOperand(1)});
4158 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4159 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4160 }
4161 SDValue Cv =
4162 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4163 Op.getOperand(0));
4164 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4165 }
4166
4167 if (VTSize > InVTSize) {
4168 SDLoc dl(Op);
4169 MVT ExtVT =
4172 if (IsStrict) {
4173 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4174 {Op.getOperand(0), Op.getOperand(1)});
4175 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4176 {Ext.getValue(1), Ext.getValue(0)});
4177 }
4178 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4179 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4180 }
4181
4182 // Use a scalar operation for conversions between single-element vectors of
4183 // the same size.
4184 if (NumElts == 1) {
4185 SDLoc dl(Op);
4186 SDValue Extract = DAG.getNode(
4187 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4188 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4189 EVT ScalarVT = VT.getScalarType();
4190 if (IsStrict)
4191 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4192 {Op.getOperand(0), Extract});
4193 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4194 }
4195
4196 // Type changing conversions are illegal.
4197 return Op;
4198}
4199
4200SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4201 SelectionDAG &DAG) const {
4202 bool IsStrict = Op->isStrictFPOpcode();
4203 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4204
4205 if (SrcVal.getValueType().isVector())
4206 return LowerVectorFP_TO_INT(Op, DAG);
4207
4208 // f16 conversions are promoted to f32 when full fp16 is not supported.
4209 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4210 SrcVal.getValueType() == MVT::bf16) {
4211 SDLoc dl(Op);
4212 if (IsStrict) {
4213 SDValue Ext =
4214 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4215 {Op.getOperand(0), SrcVal});
4216 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4217 {Ext.getValue(1), Ext.getValue(0)});
4218 }
4219 return DAG.getNode(
4220 Op.getOpcode(), dl, Op.getValueType(),
4221 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4222 }
4223
4224 if (SrcVal.getValueType() != MVT::f128) {
4225 // It's legal except when f128 is involved
4226 return Op;
4227 }
4228
4229 return SDValue();
4230}
4231
4232SDValue
4233AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4234 SelectionDAG &DAG) const {
4235 // AArch64 FP-to-int conversions saturate to the destination element size, so
4236 // we can lower common saturating conversions to simple instructions.
4237 SDValue SrcVal = Op.getOperand(0);
4238 EVT SrcVT = SrcVal.getValueType();
4239 EVT DstVT = Op.getValueType();
4240 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4241
4242 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4243 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4244 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4245 assert(SatWidth <= DstElementWidth &&
4246 "Saturation width cannot exceed result width");
4247
4248 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4249 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4250 // types, so this is hard to reach.
4251 if (DstVT.isScalableVector())
4252 return SDValue();
4253
4254 EVT SrcElementVT = SrcVT.getVectorElementType();
4255
4256 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4257 if ((SrcElementVT == MVT::f16 &&
4258 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4259 SrcElementVT == MVT::bf16) {
4260 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4261 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
4262 SrcVT = F32VT;
4263 SrcElementVT = MVT::f32;
4264 SrcElementWidth = 32;
4265 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4266 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4267 return SDValue();
4268
4269 SDLoc DL(Op);
4270 // Cases that we can emit directly.
4271 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4272 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4273 DAG.getValueType(DstVT.getScalarType()));
4274
4275 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4276 // result. This is only valid if the legal cvt is larger than the saturate
4277 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4278 // (at least until sqxtn is selected).
4279 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4280 return SDValue();
4281
4282 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4283 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4284 DAG.getValueType(IntVT.getScalarType()));
4285 SDValue Sat;
4286 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4287 SDValue MinC = DAG.getConstant(
4288 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4289 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4290 SDValue MaxC = DAG.getConstant(
4291 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4292 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4293 } else {
4294 SDValue MinC = DAG.getConstant(
4295 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4296 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4297 }
4298
4299 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4300}
4301
4302SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4303 SelectionDAG &DAG) const {
4304 // AArch64 FP-to-int conversions saturate to the destination register size, so
4305 // we can lower common saturating conversions to simple instructions.
4306 SDValue SrcVal = Op.getOperand(0);
4307 EVT SrcVT = SrcVal.getValueType();
4308
4309 if (SrcVT.isVector())
4310 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4311
4312 EVT DstVT = Op.getValueType();
4313 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4314 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4315 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4316 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4317
4318 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4319 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4320 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4321 SrcVT = MVT::f32;
4322 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4323 SrcVT != MVT::bf16)
4324 return SDValue();
4325
4326 SDLoc DL(Op);
4327 // Cases that we can emit directly.
4328 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4329 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4330 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4331 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4332 DAG.getValueType(DstVT));
4333
4334 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4335 // result. This is only valid if the legal cvt is larger than the saturate
4336 // width.
4337 if (DstWidth < SatWidth)
4338 return SDValue();
4339
4340 SDValue NativeCvt =
4341 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4342 SDValue Sat;
4343 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4344 SDValue MinC = DAG.getConstant(
4345 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4346 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4347 SDValue MaxC = DAG.getConstant(
4348 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4349 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4350 } else {
4351 SDValue MinC = DAG.getConstant(
4352 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4353 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4354 }
4355
4356 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4357}
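When the saturation width is narrower than the legal conversion, the code above clamps the wide result with SMIN/SMAX (or UMIN for the unsigned case). A scalar illustration of the signed case follows; it is only a sketch, with the plain cast standing in for fcvtzs and assuming an input that fits in i32:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Model of the "convert wide, then clamp" path for fp -> saturating i16 done
// through an i32 conversion (SatWidth = 16 < DstWidth = 32).
static int16_t fpToI16Sat(float F) {
  int32_t Wide = (int32_t)F;                   // stands in for fcvtzs to i32
  Wide = std::min(Wide, (int32_t)INT16_MAX);   // SMIN with  32767
  Wide = std::max(Wide, (int32_t)INT16_MIN);   // SMAX with -32768
  return (int16_t)Wide;                        // TRUNCATE
}

int main() {
  assert(fpToI16Sat(1234.7f) == 1234);
  assert(fpToI16Sat(1.0e6f) == INT16_MAX);
  assert(fpToI16Sat(-1.0e6f) == INT16_MIN);
  return 0;
}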
4358
4359SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4360 SelectionDAG &DAG) const {
4361 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4362 // Any additional optimization in this function should be recorded
4363 // in the cost tables.
4364 bool IsStrict = Op->isStrictFPOpcode();
4365 EVT VT = Op.getValueType();
4366 SDLoc dl(Op);
4367 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4368 EVT InVT = In.getValueType();
4369 unsigned Opc = Op.getOpcode();
4370 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4371
4372 if (VT.isScalableVector()) {
4373 if (InVT.getVectorElementType() == MVT::i1) {
4374 // We can't directly extend an SVE predicate; extend it first.
4375 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4376 EVT CastVT = getPromotedVTForPredicate(InVT);
4377 In = DAG.getNode(CastOpc, dl, CastVT, In);
4378 return DAG.getNode(Opc, dl, VT, In);
4379 }
4380
4381 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4382 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4383 return LowerToPredicatedOp(Op, DAG, Opcode);
4384 }
4385
4386 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4387 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4388 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4389
4390 // Promote bf16 conversions to f32.
4391 if (VT.getVectorElementType() == MVT::bf16) {
4392 EVT F32 = VT.changeElementType(MVT::f32);
4393 if (IsStrict) {
4394 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
4395 {Op.getOperand(0), In});
4396 return DAG.getNode(
4397 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4398 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4399 }
4400 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4401 DAG.getNode(Op.getOpcode(), dl, F32, In),
4402 DAG.getIntPtrConstant(0, dl));
4403 }
4404
4405 uint64_t VTSize = VT.getFixedSizeInBits();
4406 uint64_t InVTSize = InVT.getFixedSizeInBits();
4407 if (VTSize < InVTSize) {
4408 MVT CastVT =
4410 InVT.getVectorNumElements());
4411 if (IsStrict) {
4412 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4413 {Op.getOperand(0), In});
4414 return DAG.getNode(
4415 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4416 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4417 }
4418 In = DAG.getNode(Opc, dl, CastVT, In);
4419 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4420 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4421 }
4422
4423 if (VTSize > InVTSize) {
4424 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4426 In = DAG.getNode(CastOpc, dl, CastVT, In);
4427 if (IsStrict)
4428 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4429 return DAG.getNode(Opc, dl, VT, In);
4430 }
4431
4432 // Use a scalar operation for conversions between single-element vectors of
4433 // the same size.
4434 if (VT.getVectorNumElements() == 1) {
4435 SDValue Extract = DAG.getNode(
4437 In, DAG.getConstant(0, dl, MVT::i64));
4438 EVT ScalarVT = VT.getScalarType();
4439 if (IsStrict)
4440 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4441 {Op.getOperand(0), Extract});
4442 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4443 }
4444
4445 return Op;
4446}
4447
4448SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4449 SelectionDAG &DAG) const {
4450 if (Op.getValueType().isVector())
4451 return LowerVectorINT_TO_FP(Op, DAG);
4452
4453 bool IsStrict = Op->isStrictFPOpcode();
4454 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4455
4456 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4457 Op->getOpcode() == ISD::SINT_TO_FP;
4458
4459 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4460 SDLoc dl(Op);
4461 if (IsStrict) {
4462 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
4463 {Op.getOperand(0), SrcVal});
4464 return DAG.getNode(
4465 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4466 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4467 }
4468 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4469 DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
4470 DAG.getIntPtrConstant(0, dl));
4471 };
4472
4473 if (Op.getValueType() == MVT::bf16) {
4474 unsigned MaxWidth = IsSigned
4475 ? DAG.ComputeMaxSignificantBits(SrcVal)
4476 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
4477 // Values with at most 24 significant bits convert to f32 exactly, so promote to f32.
4478 if (MaxWidth <= 24) {
4479 return IntToFpViaPromotion(MVT::f32);
4480 }
4481
4482 // Values with at most 53 significant bits convert to f64 exactly, so promote to f64.
4483 if (MaxWidth <= 53) {
4484 return IntToFpViaPromotion(MVT::f64);
4485 }
4486
4487 // We need to be careful about i64 -> bf16.
4488 // Consider the value 22216703.
4489 // This number cannot be represented exactly as an f32, so an itofp will
4490 // turn it into 22216704.0; an fptrunc to bf16 then turns this into 22282240.0.
4491 // However, the correct bf16 result is 22151168.0.
4492 // We need to use sticky rounding to get this right.
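// To make the double rounding concrete (a bit-level sketch recomputed here
// for illustration):
//   22216703             = 1.0101'0010'1111'1111'1111'1111 * 2^24 (25 significant bits)
//   rounded to f32       = 1.0101'0011'0 * 2^24 = 22216704.0 (exactly halfway, ties to even)
//   that f32 to bf16     = 1.0101010 * 2^24 = 22282240.0 (again halfway, rounds up)
//   22216703 direct bf16 = 1.0101001 * 2^24 = 22151168.0 (below halfway, rounds down)
// The first rounding turns "just below halfway" into "exactly halfway", which
// is precisely the information a sticky bit preserves.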
4493 if (SrcVal.getValueType() == MVT::i64) {
4494 SDLoc DL(Op);
4495 // This algorithm is equivalent to the following:
4496 // uint64_t SrcHi = SrcVal & ~0xfffull;
4497 // uint64_t SrcLo = SrcVal & 0xfffull;
4498 // uint64_t Highest = SrcVal >> 53;
4499 // bool HasHighest = Highest != 0;
4500 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4501 // double Rounded = static_cast<double>(ToRound);
4502 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4503 // uint64_t HasLo = SrcLo != 0;
4504 // bool NeedsAdjustment = HasHighest & HasLo;
4505 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4506 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4507 // return static_cast<__bf16>(Adjusted);
4508 //
4509 // Essentially, what happens is that SrcVal either fits perfectly in a
4510 // double-precision value or it is too big. If it is sufficiently small,
4511 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4512 // ensure that u64 -> double has no rounding error by only using the 52
4513 // MSB of the input. The low order bits will get merged into a sticky bit
4514 // which will avoid issues incurred by double rounding.
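// An illustrative trace with a value chosen here (not taken from the comment
// above): for SrcVal = 2^53 + 1, Highest = 1, so ToRound = SrcHi = 2^53,
// which is exact as a double. SrcLo = 1 is non-zero, so RoundedBits is OR'd
// with 1 and Adjusted becomes 2^53 + 2: the discarded low bits survive as a
// sticky contribution before the single rounding down to bf16.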
4515
4516 // Signed conversion is more or less like so:
4517 // copysign((__bf16)abs(SrcVal), SrcVal)
4518 SDValue SignBit;
4519 if (IsSigned) {
4520 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4521 DAG.getConstant(1ull << 63, DL, MVT::i64));
4522 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
4523 }
4524 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4525 DAG.getConstant(~0xfffull, DL, MVT::i64));
4526 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4527 DAG.getConstant(0xfffull, DL, MVT::i64));
4528 SDValue Highest =
4529 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
4530 DAG.getShiftAmountConstant(53, MVT::i64, DL));
4531 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
4532 SDValue ToRound =
4533 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
4534 SDValue Rounded =
4535 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
4536 {Op.getOperand(0), ToRound})
4537 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
4538
4539 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
4540 if (SignBit) {
4541 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
4542 }
4543
4544 SDValue HasHighest = DAG.getSetCC(
4545 DL,
4546 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4547 Highest, Zero64, ISD::SETNE);
4548
4549 SDValue HasLo = DAG.getSetCC(
4550 DL,
4551 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4552 SrcLo, Zero64, ISD::SETNE);
4553
4554 SDValue NeedsAdjustment =
4555 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
4556 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
4557
4558 SDValue AdjustedBits =
4559 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
4560 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
4561 return IsStrict
4562 ? DAG.getNode(ISD::STRICT_FP_ROUND, DL,
4563 {Op.getValueType(), MVT::Other},
4564 {Rounded.getValue(1), Adjusted,
4565 DAG.getIntPtrConstant(0, DL)})
4566 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
4567 DAG.getIntPtrConstant(0, DL, true));
4568 }
4569 }
4570
4571 // f16 conversions are promoted to f32 when full fp16 is not supported.
4572 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4573 return IntToFpViaPromotion(MVT::f32);
4574 }
4575
4576 // i128 conversions are libcalls.
4577 if (SrcVal.getValueType() == MVT::i128)
4578 return SDValue();
4579
4580 // Other conversions are legal, unless it's to the completely software-based
4581 // fp128.
4582 if (Op.getValueType() != MVT::f128)
4583 return Op;
4584 return SDValue();
4585}
4586
4587SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4588 SelectionDAG &DAG) const {
4589 // For iOS, we want to call an alternative entry point: __sincos_stret,
4590 // which returns the values in two S / D registers.
4591 SDLoc dl(Op);
4592 SDValue Arg = Op.getOperand(0);
4593 EVT ArgVT = Arg.getValueType();
4594 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4595
4596 ArgListTy Args;
4597 ArgListEntry Entry;
4598
4599 Entry.Node = Arg;
4600 Entry.Ty = ArgTy;
4601 Entry.IsSExt = false;
4602 Entry.IsZExt = false;
4603 Args.push_back(Entry);
4604
4605 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4606 : RTLIB::SINCOS_STRET_F32;
4607 const char *LibcallName = getLibcallName(LC);
4608 SDValue Callee =
4609 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4610
4611 StructType *RetTy = StructType::get(ArgTy, ArgTy);
4612 TargetLowering::CallLoweringInfo CLI(DAG);
4613 CLI.setDebugLoc(dl)
4614 .setChain(DAG.getEntryNode())
4615 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4616
4617 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4618 return CallResult.first;
4619}
4620
4621static MVT getSVEContainerType(EVT ContentTy);
4622
4623SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4624 SelectionDAG &DAG) const {
4625 EVT OpVT = Op.getValueType();
4626 EVT ArgVT = Op.getOperand(0).getValueType();
4627
4628 if (useSVEForFixedLengthVectorVT(OpVT))
4629 return LowerFixedLengthBitcastToSVE(Op, DAG);
4630
4631 if (OpVT.isScalableVector()) {
4632 // Bitcasting between unpacked vector types of different element counts is
4633 // not a NOP because the live elements are laid out differently.
4634 // 01234567
4635 // e.g. nxv2i32 = XX??XX??
4636 // nxv4f16 = X?X?X?X?
4637 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4638 return SDValue();
4639
4640 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4641 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4642 "Expected int->fp bitcast!");
4643 SDValue ExtResult =
4644 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
4645 Op.getOperand(0));
4646 return getSVESafeBitCast(OpVT, ExtResult, DAG);
4647 }
4648 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4649 }
4650
4651 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4652 return SDValue();
4653
4654 // Bitcasts between f16 and bf16 are legal.
4655 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4656 return Op;
4657
4658 assert(ArgVT == MVT::i16);
4659 SDLoc DL(Op);
4660
4661 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4662 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4663 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
4664}
4665
4666static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4667 if (OrigVT.getSizeInBits() >= 64)
4668 return OrigVT;
4669
4670 assert(OrigVT.isSimple() && "Expecting a simple value type");
4671
4672 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4673 switch (OrigSimpleTy) {
4674 default: llvm_unreachable("Unexpected Vector Type");
4675 case MVT::v2i8:
4676 case MVT::v2i16:
4677 return MVT::v2i32;
4678 case MVT::v4i8:
4679 return MVT::v4i16;
4680 }
4681}
4682
4683 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4684 const EVT &OrigTy,
4685 const EVT &ExtTy,
4686 unsigned ExtOpcode) {
4687 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4688 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4689 // 64-bits we need to insert a new extension so that it will be 64-bits.
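// For example, a v4i8 operand that had been extended straight to v4i32 is
// re-extended here to v4i16 instead, so the value feeding the MULL is a
// 64-bit vector.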
4690 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4691 if (OrigTy.getSizeInBits() >= 64)
4692 return N;
4693
4694 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4695 EVT NewVT = getExtensionTo64Bits(OrigTy);
4696
4697 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4698}
4699
4700// Returns lane if Op extracts from a two-element vector and lane is constant
4701// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4702 static std::optional<uint64_t>
4703 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4704 SDNode *OpNode = Op.getNode();
4705 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4706 return std::nullopt;
4707
4708 EVT VT = OpNode->getOperand(0).getValueType();
4709 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4710 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4711 return std::nullopt;
4712
4713 return C->getZExtValue();
4714}
4715
4716 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
4717 bool isSigned) {
4718 EVT VT = N.getValueType();
4719
4720 if (N.getOpcode() != ISD::BUILD_VECTOR)
4721 return false;
4722
4723 for (const SDValue &Elt : N->op_values()) {
4724 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4725 unsigned EltSize = VT.getScalarSizeInBits();
4726 unsigned HalfSize = EltSize / 2;
4727 if (isSigned) {
4728 if (!isIntN(HalfSize, C->getSExtValue()))
4729 return false;
4730 } else {
4731 if (!isUIntN(HalfSize, C->getZExtValue()))
4732 return false;
4733 }
4734 continue;
4735 }
4736 return false;
4737 }
4738
4739 return true;
4740}
4741
4742 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
4743 EVT VT = N.getValueType();
4744 assert(VT.is128BitVector() && "Unexpected vector MULL size");
4745
4746 unsigned NumElts = VT.getVectorNumElements();
4747 unsigned OrigEltSize = VT.getScalarSizeInBits();
4748 unsigned EltSize = OrigEltSize / 2;
4749 MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
4750
4751 APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize);
4752 if (DAG.MaskedValueIsZero(N, HiBits))
4753 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N);
4754
4755 if (ISD::isExtOpcode(N.getOpcode()))
4756 return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG,
4757 N.getOperand(0).getValueType(), VT,
4758 N.getOpcode());
4759
4760 assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4761 SDLoc dl(N);
4762 SmallVector<SDValue, 8> Ops;
4763 for (unsigned i = 0; i != NumElts; ++i) {
4764 const APInt &CInt = N.getConstantOperandAPInt(i);
4765 // Element types smaller than 32 bits are not legal, so use i32 elements.
4766 // The values are implicitly truncated so sext vs. zext doesn't matter.
4767 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4768 }
4769 return DAG.getBuildVector(TruncVT, dl, Ops);
4770}
4771
4772 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
4773 return N.getOpcode() == ISD::SIGN_EXTEND ||
4774 N.getOpcode() == ISD::ANY_EXTEND ||
4775 isExtendedBUILD_VECTOR(N, DAG, true);
4776}
4777
4778 static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
4779 return N.getOpcode() == ISD::ZERO_EXTEND ||
4780 N.getOpcode() == ISD::ANY_EXTEND ||
4781 isExtendedBUILD_VECTOR(N, DAG, false);
4782}
4783
4784 static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
4785 unsigned Opcode = N.getOpcode();
4786 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4787 SDValue N0 = N.getOperand(0);
4788 SDValue N1 = N.getOperand(1);
4789 return N0->hasOneUse() && N1->hasOneUse() &&
4790 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4791 }
4792 return false;
4793}
4794
4795 static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
4796 unsigned Opcode = N.getOpcode();
4797 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4798 SDValue N0 = N.getOperand(0);
4799 SDValue N1 = N.getOperand(1);
4800 return N0->hasOneUse() && N1->hasOneUse() &&
4801 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4802 }
4803 return false;
4804}
4805
4806SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
4807 SelectionDAG &DAG) const {
4808 // The rounding mode is in bits 23:22 of the FPCR.
4809 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
4810 // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3
4811 // so that the shift + and get folded into a bitfield extract.
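// As a quick check of the formula above: if FPCR[23:22] is 0b01 (round
// towards +Inf), then ((FPCR + (1 << 22)) >> 22) & 3 = (0b01 + 1) & 3 = 2,
// which is the FLT_ROUNDS value for round-towards-+Inf.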
4812 SDLoc dl(Op);
4813
4814 SDValue Chain = Op.getOperand(0);
4815 SDValue FPCR_64 = DAG.getNode(
4816 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4817 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4818 Chain = FPCR_64.getValue(1);
4819 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4820 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4821 DAG.getConstant(1U << 22, dl, MVT::i32));
4822 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4823 DAG.getConstant(22, dl, MVT::i32));
4824 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4825 DAG.getConstant(3, dl, MVT::i32));
4826 return DAG.getMergeValues({AND, Chain}, dl);
4827}
4828
4829SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4830 SelectionDAG &DAG) const {
4831 SDLoc DL(Op);
4832 SDValue Chain = Op->getOperand(0);
4833 SDValue RMValue = Op->getOperand(1);
4834
4835 // The rounding mode is in bits 23:22 of the FPCR.
4836 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
4837 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
4838 // ((arg - 1) & 3) << 22).
4839 //
4840 // The argument of llvm.set.rounding must be within the segment [0, 3], so
4841 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
4842 // code that generated the llvm.set.rounding call to ensure this condition.
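// Worked instances of the mapping: an argument of 0 (towards zero) gives
// ((0 - 1) & 3) = 3, the RZ encoding, and an argument of 1 (to nearest)
// gives ((1 - 1) & 3) = 0, the RN encoding.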
4843
4844 // Calculate new value of FPCR[23:22].
4845 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4846 DAG.getConstant(1, DL, MVT::i32));
4847 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4848 DAG.getConstant(0x3, DL, MVT::i32));
4849 RMValue =
4850 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4851 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4852 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4853
4854 // Get current value of FPCR.
4855 SDValue Ops[] = {
4856 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4857 SDValue FPCR =
4858 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4859 Chain = FPCR.getValue(1);
4860 FPCR = FPCR.getValue(0);
4861
4862 // Put the new rounding mode into FPCR[23:22].
4863 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4864 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4865 DAG.getConstant(RMMask, DL, MVT::i64));
4866 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4867 SDValue Ops2[] = {
4868 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4869 FPCR};
4870 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4871}
4872
4873static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
4874 SDLoc DL, bool &IsMLA) {
4875 bool IsN0SExt = isSignExtended(N0, DAG);
4876 bool IsN1SExt = isSignExtended(N1, DAG);
4877 if (IsN0SExt && IsN1SExt)
4878 return AArch64ISD::SMULL;
4879
4880 bool IsN0ZExt = isZeroExtended(N0, DAG);
4881 bool IsN1ZExt = isZeroExtended(N1, DAG);
4882
4883 if (IsN0ZExt && IsN1ZExt)
4884 return AArch64ISD::UMULL;
4885
4886 // Select SMULL if we can replace zext with sext.
4887 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
4888 !isExtendedBUILD_VECTOR(N0, DAG, false) &&
4889 !isExtendedBUILD_VECTOR(N1, DAG, false)) {
4890 SDValue ZextOperand;
4891 if (IsN0ZExt)
4892 ZextOperand = N0.getOperand(0);
4893 else
4894 ZextOperand = N1.getOperand(0);
4895 if (DAG.SignBitIsZero(ZextOperand)) {
4896 SDValue NewSext =
4897 DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
4898 if (IsN0ZExt)
4899 N0 = NewSext;
4900 else
4901 N1 = NewSext;
4902 return AArch64ISD::SMULL;
4903 }
4904 }
4905
4906 // Select UMULL if we can replace the other operand with an extend.
4907 if (IsN0ZExt || IsN1ZExt) {
4908 EVT VT = N0.getValueType();
4909 APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
4910 VT.getScalarSizeInBits() / 2);
4911 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
4912 return AArch64ISD::UMULL;
4913 }
4914
4915 if (!IsN1SExt && !IsN1ZExt)
4916 return 0;
4917
4918 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4919 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4920 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
4921 IsMLA = true;
4922 return AArch64ISD::SMULL;
4923 }
4924 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
4925 IsMLA = true;
4926 return AArch64ISD::UMULL;
4927 }
4928 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
4929 std::swap(N0, N1);
4930 IsMLA = true;
4931 return AArch64ISD::UMULL;
4932 }
4933 return 0;
4934}
4935
4936SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
4937 EVT VT = Op.getValueType();
4938
4939 bool OverrideNEON = !Subtarget->isNeonAvailable();
4940 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
4941 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4942
4943 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
4944 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
4945 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
4946 "unexpected type for custom-lowering ISD::MUL");
4947 SDValue N0 = Op.getOperand(0);
4948 SDValue N1 = Op.getOperand(1);
4949 bool isMLA = false;
4950 EVT OVT = VT;
4951 if (VT.is64BitVector()) {
4952 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4953 isNullConstant(N0.getOperand(1)) &&
4954 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4955 isNullConstant(N1.getOperand(1))) {
4956 N0 = N0.getOperand(0);
4957 N1 = N1.getOperand(0);
4958 VT = N0.getValueType();
4959 } else {
4960 if (VT == MVT::v1i64) {
4961 if (Subtarget->hasSVE())
4962 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4963 // Fall through to expand this. It is not legal.
4964 return SDValue();
4965 } else
4966 // Other vector multiplications are legal.
4967 return Op;
4968 }
4969 }
4970
4971 SDLoc DL(Op);
4972 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
4973
4974 if (!NewOpc) {
4975 if (VT.getVectorElementType() == MVT::i64) {
4976 // If SVE is available then i64 vector multiplications can also be made
4977 // legal.
4978 if (Subtarget->hasSVE())
4979 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4980 // Fall through to expand this. It is not legal.
4981 return SDValue();
4982 } else
4983 // Other vector multiplications are legal.
4984 return Op;
4985 }
4986
4987 // Legalize to a S/UMULL instruction
4988 SDValue Op0;
4989 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
4990 if (!isMLA) {
4991 Op0 = skipExtensionForVectorMULL(N0, DAG);
4992 assert(Op0.getValueType().is64BitVector() &&
4993 Op1.getValueType().is64BitVector() &&
4994 "unexpected types for extended operands to VMULL");
4995 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
4996 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
4997 DAG.getConstant(0, DL, MVT::i64));
4998 }
4999 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5000 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5001 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
5002 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5003 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5004 EVT Op1VT = Op1.getValueType();
5005 return DAG.getNode(
5006 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5007 DAG.getNode(N0.getOpcode(), DL, VT,
5008 DAG.getNode(NewOpc, DL, VT,
5009 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5010 DAG.getNode(NewOpc, DL, VT,
5011 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5012 DAG.getConstant(0, DL, MVT::i64));
5013}
5014
5015static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5016 int Pattern) {
5017 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5018 return DAG.getConstant(1, DL, MVT::nxv1i1);
5019 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5020 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5021}
5022
5023 static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG,
5024 bool IsSigned, bool IsEqual) {
5025 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5026 !isa<ConstantSDNode>(Op.getOperand(2)))
5027 return SDValue();
5028
5029 SDLoc dl(Op);
5030 APInt X = Op.getConstantOperandAPInt(1);
5031 APInt Y = Op.getConstantOperandAPInt(2);
5032 bool Overflow;
5033 APInt NumActiveElems =
5034 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5035
5036 if (Overflow)
5037 return SDValue();
5038
5039 if (IsEqual) {
5040 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5041 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5042 : NumActiveElems.uadd_ov(One, Overflow);
5043 if (Overflow)
5044 return SDValue();
5045 }
5046
5047 std::optional<unsigned> PredPattern =
5048 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5049 unsigned MinSVEVectorSize = std::max(
5050 DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
5051 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5052 if (PredPattern != std::nullopt &&
5053 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5054 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5055
5056 return SDValue();
5057}
5058
5059// Returns a safe bitcast between two scalable vector predicates, where
5060// any newly created lanes from a widening bitcast are defined as zero.
5061 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5062 SDLoc DL(Op);
5063 EVT InVT = Op.getValueType();
5064
5065 assert(InVT.getVectorElementType() == MVT::i1 &&
5066 VT.getVectorElementType() == MVT::i1 &&
5067 "Expected a predicate-to-predicate bitcast");
5068 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5069 InVT.isScalableVector() &&
5070 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5071 "Only expect to cast between legal scalable predicate types!");
5072
5073 // Return the operand if the cast isn't changing type,
5074 // e.g. <n x 16 x i1> -> <n x 16 x i1>
5075 if (InVT == VT)
5076 return Op;
5077
5078 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5079
5080 // We only have to zero the lanes if new lanes are being defined, e.g. when
5081 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5082 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5083 // we can return here.
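// To illustrate with the example above: viewed as <vscale x 16 x i1>, a
// <vscale x 2 x i1> value only carries meaningful data in lanes 0, 8, 16, ...
// (one predicate bit per 64-bit container); the lanes in between are the
// newly introduced ones that must be zeroed below unless they are already
// known to be zero by construction.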
5084 if (InVT.bitsGT(VT))
5085 return Reinterpret;
5086
5087 // Check if the other lanes are already known to be zeroed by
5088 // construction.
5089 if (isZeroingInactiveLanes(Op))
5090 return Reinterpret;
5091
5092 // Zero the newly introduced lanes.
5093 SDValue Mask = DAG.getConstant(1, DL, InVT);
5094 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5095 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5096}
5097
5098SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5099 SDValue Chain, SDLoc DL,
5100 EVT VT) const {
5101 SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5102 getPointerTy(DAG.getDataLayout()));
5103 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5104 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5105 TargetLowering::ArgListTy Args;
5106 TargetLowering::CallLoweringInfo CLI(DAG);
5107 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5108 CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5109 RetTy, Callee, std::move(Args));
5110 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5111 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5112 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5113 Mask);
5114}
5115
5116// Lower an SME LDR/STR ZA intrinsic
5117// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5118// folded into the instruction
5119// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5120// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5121// and tile slice registers
5122// ldr(%tileslice, %ptr, %vecnum)
5123// ->
5124// %svl = rdsvl
5125// %ptr2 = %ptr + %svl * %vecnum
5126// %tileslice2 = %tileslice + %vecnum
5127// ldr [%tileslice2, 0], [%ptr2, 0]
5128// Case 3: If the vecnum is an immediate out of range, then the same is done as
5129// case 2, but the base and slice registers are modified by the greatest
5130 // multiple of 16 not exceeding the vecnum and the remainder is folded into the
5131 // instruction. This means that successive loads and stores that are offset from
5132 // each other can share the same base and slice register updates.
5133 // ldr(%tileslice, %ptr, 22)
5134 // ldr(%tileslice, %ptr, 23)
5135 // ->
5136 // %svl = rdsvl
5137 // %ptr2 = %ptr + %svl * 16
5138 // %tileslice2 = %tileslice + 16
5139 // ldr [%tileslice2, 6], [%ptr2, 6]
5140 // ldr [%tileslice2, 7], [%ptr2, 7]
5141// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5142// operand and the immediate can be folded into the instruction, like case 2.
5143// ldr(%tileslice, %ptr, %vecnum + 7)
5144// ldr(%tileslice, %ptr, %vecnum + 8)
5145// ->
5146// %svl = rdsvl
5147// %ptr2 = %ptr + %svl * %vecnum
5148// %tileslice2 = %tileslice + %vecnum
5149// ldr [%tileslice2, 7], [%ptr2, 7]
5150// ldr [%tileslice2, 8], [%ptr2, 8]
5151// Case 5: The vecnum being an add of an immediate out of range is also handled,
5152// in which case the same remainder logic as case 3 is used.
5153 static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5154 SDLoc DL(N);
5155
5156 SDValue TileSlice = N->getOperand(2);
5157 SDValue Base = N->getOperand(3);
5158 SDValue VecNum = N->getOperand(4);
5159 int32_t ConstAddend = 0;
5160 SDValue VarAddend = VecNum;
5161
5162 // If the vnum is an add of an immediate, we can fold it into the instruction
5163 if (VecNum.getOpcode() == ISD::ADD &&
5164 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5165 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5166 VarAddend = VecNum.getOperand(0);
5167 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5168 ConstAddend = ImmNode->getSExtValue();
5169 VarAddend = SDValue();
5170 }
5171
5172 int32_t ImmAddend = ConstAddend % 16;
5173 if (int32_t C = (ConstAddend - ImmAddend)) {
5174 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5175 VarAddend = VarAddend
5176 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5177 : CVal;
5178 }
5179
5180 if (VarAddend) {
5181 // Get the vector length that will be multiplied by vnum
5182 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5183 DAG.getConstant(1, DL, MVT::i32));
5184
5185 // Multiply SVL and vnum then add it to the base
5186 SDValue Mul = DAG.getNode(
5187 ISD::MUL, DL, MVT::i64,
5188 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5189 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5190 // Just add vnum to the tileslice
5191 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5192 }
5193
5194 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5195 DL, MVT::Other,
5196 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5197 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5198}
5199
5200SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5201 SelectionDAG &DAG) const {
5202 unsigned IntNo = Op.getConstantOperandVal(1);
5203 SDLoc DL(Op);
5204 switch (IntNo) {
5205 default:
5206 return SDValue(); // Don't custom lower most intrinsics.
5207 case Intrinsic::aarch64_prefetch: {
5208 SDValue Chain = Op.getOperand(0);
5209 SDValue Addr = Op.getOperand(2);
5210
5211 unsigned IsWrite = Op.getConstantOperandVal(3);
5212 unsigned Locality = Op.getConstantOperandVal(4);
5213 unsigned IsStream = Op.getConstantOperandVal(5);
5214 unsigned IsData = Op.getConstantOperandVal(6);
5215 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5216 (!IsData << 3) | // IsDataCache bit
5217 (Locality << 1) | // Cache level bits
5218 (unsigned)IsStream; // Stream bit
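// Two worked encodings of PrfOp (values picked here for illustration): a read
// prefetch of data into L1 with the keep policy (IsWrite=0, IsData=1,
// Locality=0, IsStream=0) packs to 0b00000, i.e. PLDL1KEEP, while IsWrite=1,
// IsData=1, Locality=1, IsStream=1 packs to 0b10011, i.e. PSTL2STRM.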
5219
5220 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5221 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5222 }
5223 case Intrinsic::aarch64_sme_str:
5224 case Intrinsic::aarch64_sme_ldr: {
5225 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5226 }
5227 case Intrinsic::aarch64_sme_za_enable:
5228 return DAG.getNode(
5229 AArch64ISD::SMSTART, DL, MVT::Other,
5230 Op->getOperand(0), // Chain
5231 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5232 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5233 case Intrinsic::aarch64_sme_za_disable:
5234 return DAG.getNode(
5235 AArch64ISD::SMSTOP, DL, MVT::Other,
5236 Op->getOperand(0), // Chain
5237 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5238 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5239 }
5240}
5241
5242SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5243 SelectionDAG &DAG) const {
5244 unsigned IntNo = Op.getConstantOperandVal(1);
5245 SDLoc DL(Op);
5246 switch (IntNo) {
5247 default:
5248 return SDValue(); // Don't custom lower most intrinsics.
5249 case Intrinsic::aarch64_mops_memset_tag: {
5250 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5251 SDValue Chain = Node->getChain();
5252 SDValue Dst = Op.getOperand(2);
5253 SDValue Val = Op.getOperand(3);
5254 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5255 SDValue Size = Op.getOperand(4);
5256 auto Alignment = Node->getMemOperand()->getAlign();
5257 bool IsVol = Node->isVolatile();
5258 auto DstPtrInfo = Node->getPointerInfo();
5259
5260 const auto &SDI =
5261 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5262 SDValue MS =
5263 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
5264 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
5265
5266 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5267 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5268 // LowerOperationWrapper will complain that the number of results has
5269 // changed.
5270 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
5271 }
5272 }
5273}
5274
5275SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5276 SelectionDAG &DAG) const {
5277 unsigned IntNo = Op.getConstantOperandVal(0);
5278 SDLoc dl(Op);
5279 switch (IntNo) {
5280 default: return SDValue(); // Don't custom lower most intrinsics.
5281 case Intrinsic::thread_pointer: {
5282 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5283 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
5284 }
5285 case Intrinsic::aarch64_neon_abs: {
5286 EVT Ty = Op.getValueType();
5287 if (Ty == MVT::i64) {
5288 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
5289 Op.getOperand(1));
5290 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
5291 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
5292 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
5293 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
5294 } else {
5295 report_fatal_error("Unexpected type for AArch64 NEON intrinic");
5296 }
5297 }
5298 case Intrinsic::aarch64_neon_pmull64: {
5299 SDValue LHS = Op.getOperand(1);
5300 SDValue RHS = Op.getOperand(2);
5301
5302 std::optional<uint64_t> LHSLane =
5303 getConstantLaneNumOfExtractHalfOperand(LHS);
5304 std::optional<uint64_t> RHSLane =
5305 getConstantLaneNumOfExtractHalfOperand(RHS);
5306
5307 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5308 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5309
5310 // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
5311 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
5312 // which ISel recognizes better. For example, generate a ldr into d*
5313 // registers as opposed to a GPR load followed by a fmov.
5314 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5315 std::optional<uint64_t> OtherLane,
5316 const SDLoc &dl,
5317 SelectionDAG &DAG) -> SDValue {
5318 // If the operand is a higher half itself, rewrite it to
5319 // extract_high_v2i64; this way aarch64_neon_pmull64 could
5320 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
5321 if (NLane && *NLane == 1)
5322 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5323 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
5324
5325 // Operand N is not a higher half but the other operand is.
5326 if (OtherLane && *OtherLane == 1) {
5327 // If this operand is a lower half, rewrite it to
5328 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5329 // align lanes of two operands. A roundtrip sequence (to move from lane
5330 // 1 to lane 0) is like this:
5331 // mov x8, v0.d[1]
5332 // fmov d0, x8
5333 if (NLane && *NLane == 0)
5334 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5335 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
5336 N.getOperand(0),
5337 DAG.getConstant(0, dl, MVT::i64)),
5338 DAG.getConstant(1, dl, MVT::i64));
5339
5340 // Otherwise just dup from main to all lanes.
5341 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
5342 }
5343
5344 // Neither operand is an extract of higher half, so codegen may just use
5345 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
5346 assert(N.getValueType() == MVT::i64 &&
5347 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5348 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
5349 };
5350
5351 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
5352 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
5353
5354 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
5355 }
5356 case Intrinsic::aarch64_neon_smax:
5357 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
5358 Op.getOperand(1), Op.getOperand(2));
5359 case Intrinsic::aarch64_neon_umax:
5360 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
5361 Op.getOperand(1), Op.getOperand(2));
5362 case Intrinsic::aarch64_neon_smin:
5363 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
5364 Op.getOperand(1), Op.getOperand(2));
5365 case Intrinsic::aarch64_neon_umin:
5366 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
5367 Op.getOperand(1), Op.getOperand(2));
5368 case Intrinsic::aarch64_neon_scalar_sqxtn:
5369 case Intrinsic::aarch64_neon_scalar_sqxtun:
5370 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5371 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5372 if (Op.getValueType() == MVT::i32)
5373 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5374 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
5375 Op.getOperand(0),
5376 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
5377 Op.getOperand(1))));
5378 return SDValue();
5379 }
5380 case Intrinsic::aarch64_sve_whilelo:
5381 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5382 /*IsEqual=*/false);
5383 case Intrinsic::aarch64_sve_whilelt:
5384 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5385 /*IsEqual=*/false);
5386 case Intrinsic::aarch64_sve_whilels:
5387 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5388 /*IsEqual=*/true);
5389 case Intrinsic::aarch64_sve_whilele:
5390 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5391 /*IsEqual=*/true);
5392 case Intrinsic::aarch64_sve_sunpkhi:
5393 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
5394 Op.getOperand(1));
5395 case Intrinsic::aarch64_sve_sunpklo:
5396 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
5397 Op.getOperand(1));
5398 case Intrinsic::aarch64_sve_uunpkhi:
5399 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
5400 Op.getOperand(1));
5401 case Intrinsic::aarch64_sve_uunpklo:
5402 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
5403 Op.getOperand(1));
5404 case Intrinsic::aarch64_sve_clasta_n:
5405 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
5406 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5407 case Intrinsic::aarch64_sve_clastb_n:
5408 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
5409 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5410 case Intrinsic::aarch64_sve_lasta:
5411 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
5412 Op.getOperand(1), Op.getOperand(2));
5413 case Intrinsic::aarch64_sve_lastb:
5414 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
5415 Op.getOperand(1), Op.getOperand(2));
5416 case Intrinsic::aarch64_sve_rev:
5417 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
5418 Op.getOperand(1));
5419 case Intrinsic::aarch64_sve_tbl:
5420 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
5421 Op.getOperand(1), Op.getOperand(2));
5422 case Intrinsic::aarch64_sve_trn1:
5423 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
5424 Op.getOperand(1), Op.getOperand(2));
5425 case Intrinsic::aarch64_sve_trn2:
5426 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
5427 Op.getOperand(1), Op.getOperand(2));
5428 case Intrinsic::aarch64_sve_uzp1:
5429 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
5430 Op.getOperand(1), Op.getOperand(2));
5431 case Intrinsic::aarch64_sve_uzp2:
5432 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
5433 Op.getOperand(1), Op.getOperand(2));
5434 case Intrinsic::aarch64_sve_zip1:
5435 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
5436 Op.getOperand(1), Op.getOperand(2));
5437 case Intrinsic::aarch64_sve_zip2:
5438 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
5439 Op.getOperand(1), Op.getOperand(2));
5440 case Intrinsic::aarch64_sve_splice:
5441 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
5442 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5443 case Intrinsic::aarch64_sve_ptrue:
5444 return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
5445 case Intrinsic::aarch64_sve_clz:
5446 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
5447 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5448 case Intrinsic::aarch64_sme_cntsb:
5449 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5450 DAG.getConstant(1, dl, MVT::i32));
5451 case Intrinsic::aarch64_sme_cntsh: {
5452 SDValue One = DAG.getConstant(1, dl, MVT::i32);
5453 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
5454 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
5455 }
5456 case Intrinsic::aarch64_sme_cntsw: {
5457 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5458 DAG.getConstant(1, dl, MVT::i32));
5459 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5460 DAG.getConstant(2, dl, MVT::i32));
5461 }
5462 case Intrinsic::aarch64_sme_cntsd: {
5463 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5464 DAG.getConstant(1, dl, MVT::i32));
5465 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5466 DAG.getConstant(3, dl, MVT::i32));
5467 }
5468 case Intrinsic::aarch64_sve_cnt: {
5469 SDValue Data = Op.getOperand(3);
5470 // CTPOP only supports integer operands.
5471 if (Data.getValueType().isFloatingPoint())
5472 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
5473 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
5474 Op.getOperand(2), Data, Op.getOperand(1));
5475 }
5476 case Intrinsic::aarch64_sve_dupq_lane:
5477 return LowerDUPQLane(Op, DAG);
5478 case Intrinsic::aarch64_sve_convert_from_svbool:
5479 if (Op.getValueType() == MVT::aarch64svcount)
5480 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
5481 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
5482 case Intrinsic::aarch64_sve_convert_to_svbool:
5483 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
5484 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
5485 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
5486 case Intrinsic::aarch64_sve_fneg:
5487 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5488 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5489 case Intrinsic::aarch64_sve_frintp:
5490 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
5491 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5492 case Intrinsic::aarch64_sve_frintm:
5493 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
5494 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5495 case Intrinsic::aarch64_sve_frinti:
5496 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5497 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5498 case Intrinsic::aarch64_sve_frintx:
5499 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5500 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5501 case Intrinsic::aarch64_sve_frinta:
5502 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
5503 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5504 case Intrinsic::aarch64_sve_frintn:
5505 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
5506 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5507 case Intrinsic::aarch64_sve_frintz:
5508 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
5509 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5510 case Intrinsic::aarch64_sve_ucvtf:
5511 return DAG.getNode(AArch64ISD::UCVTF_MERGE_PASSTHRU, dl,
5512 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5513 Op.getOperand(1));
5514 case Intrinsic::aarch64_sve_scvtf:
5515 return DAG.getNode(AArch64ISD::SCVTF_MERGE_PASSTHRU, dl,
5516 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5517 Op.getOperand(1));
5518 case Intrinsic::aarch64_sve_fcvtzu:
5519 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
5520 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5521 Op.getOperand(1));
5522 case Intrinsic::aarch64_sve_fcvtzs:
5523 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
5524 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5525 Op.getOperand(1));
5526 case Intrinsic::aarch64_sve_fsqrt:
5527 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
5528 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5529 case Intrinsic::aarch64_sve_frecpx:
5530 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
5531 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5532 case Intrinsic::aarch64_sve_frecpe_x:
5533 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
5534 Op.getOperand(1));
5535 case Intrinsic::aarch64_sve_frecps_x:
5536 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
5537 Op.getOperand(1), Op.getOperand(2));
5538 case Intrinsic::aarch64_sve_frsqrte_x:
5539 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
5540 Op.getOperand(1));
5541 case Intrinsic::aarch64_sve_frsqrts_x:
5542 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
5543 Op.getOperand(1), Op.getOperand(2));
5544 case Intrinsic::aarch64_sve_fabs:
5545 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5546 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5547 case Intrinsic::aarch64_sve_abs:
5548 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5549 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5550 case Intrinsic::aarch64_sve_neg:
5551 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5552 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5553 case Intrinsic::aarch64_sve_insr: {
5554 SDValue Scalar = Op.getOperand(2);
5555 EVT ScalarTy = Scalar.getValueType();
5556 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5557 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
5558
5559 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
5560 Op.getOperand(1), Scalar);
5561 }
5562 case Intrinsic::aarch64_sve_rbit:
5563 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
5564 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5565 Op.getOperand(1));
5566 case Intrinsic::aarch64_sve_revb:
5567 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
5568 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5569 case Intrinsic::aarch64_sve_revh:
5570 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
5571 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5572 case Intrinsic::aarch64_sve_revw:
5573 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
5574 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5575 case Intrinsic::aarch64_sve_revd:
5576 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
5577 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5578 case Intrinsic::aarch64_sve_sxtb:
5579 return DAG.getNode(
5580 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5581 Op.getOperand(2), Op.getOperand(3),
5582 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5583 Op.getOperand(1));
5584 case Intrinsic::aarch64_sve_sxth:
5585 return DAG.getNode(
5586 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5587 Op.getOperand(2), Op.getOperand(3),
5588 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5589 Op.getOperand(1));
5590 case Intrinsic::aarch64_sve_sxtw:
5591 return DAG.getNode(
5592 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5593 Op.getOperand(2), Op.getOperand(3),
5594 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5595 Op.getOperand(1));
5596 case Intrinsic::aarch64_sve_uxtb:
5597 return DAG.getNode(
5598 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5599 Op.getOperand(2), Op.getOperand(3),
5600 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5601 Op.getOperand(1));
5602 case Intrinsic::aarch64_sve_uxth:
5603 return DAG.getNode(
5604 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5605 Op.getOperand(2), Op.getOperand(3),
5606 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5607 Op.getOperand(1));
5608 case Intrinsic::aarch64_sve_uxtw:
5609 return DAG.getNode(
5610 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5611 Op.getOperand(2), Op.getOperand(3),
5612 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5613 Op.getOperand(1));
5614 case Intrinsic::localaddress: {
5615 const auto &MF = DAG.getMachineFunction();
5616 const auto *RegInfo = Subtarget->getRegisterInfo();
5617 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5618 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
5619 Op.getSimpleValueType());
5620 }
5621
5622 case Intrinsic::eh_recoverfp: {
5623 // FIXME: This needs to be implemented to correctly handle highly aligned
5624 // stack objects. For now we simply return the incoming FP. Refer D53541
5625 // for more details.
5626 SDValue FnOp = Op.getOperand(1);
5627 SDValue IncomingFPOp = Op.getOperand(2);
5628 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
5629 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
5630 if (!Fn)
5632 "llvm.eh.recoverfp must take a function as the first argument");
5633 return IncomingFPOp;
5634 }
5635
5636 case Intrinsic::aarch64_neon_vsri:
5637 case Intrinsic::aarch64_neon_vsli:
5638 case Intrinsic::aarch64_sve_sri:
5639 case Intrinsic::aarch64_sve_sli: {
5640 EVT Ty = Op.getValueType();
5641
5642 if (!Ty.isVector())
5643 report_fatal_error("Unexpected type for aarch64_neon_vsli");
5644
5645 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5646
5647 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
5648 IntNo == Intrinsic::aarch64_sve_sri;
5649 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5650 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
5651 Op.getOperand(3));
5652 }
5653
5654 case Intrinsic::aarch64_neon_srhadd:
5655 case Intrinsic::aarch64_neon_urhadd:
5656 case Intrinsic::aarch64_neon_shadd:
5657 case Intrinsic::aarch64_neon_uhadd: {
5658 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5659 IntNo == Intrinsic::aarch64_neon_shadd);
5660 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5661 IntNo == Intrinsic::aarch64_neon_urhadd);
5662 unsigned Opcode = IsSignedAdd
5663 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
5664 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
5665 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5666 Op.getOperand(2));
5667 }
5668 case Intrinsic::aarch64_neon_saddlp:
5669 case Intrinsic::aarch64_neon_uaddlp: {
5670 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
5671 ? AArch64ISD::UADDLP
5672 : AArch64ISD::SADDLP;
5673 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
5674 }
5675 case Intrinsic::aarch64_neon_sdot:
5676 case Intrinsic::aarch64_neon_udot:
5677 case Intrinsic::aarch64_sve_sdot:
5678 case Intrinsic::aarch64_sve_udot: {
5679 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5680 IntNo == Intrinsic::aarch64_sve_udot)
5681 ? AArch64ISD::UDOT
5682 : AArch64ISD::SDOT;
5683 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5684 Op.getOperand(2), Op.getOperand(3));
5685 }
5686 case Intrinsic::get_active_lane_mask: {
5687 SDValue ID =
5688 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5689 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
5690 Op.getOperand(1), Op.getOperand(2));
5691 }
5692 case Intrinsic::aarch64_neon_uaddlv: {
5693 EVT OpVT = Op.getOperand(1).getValueType();
5694 EVT ResVT = Op.getValueType();
5695 if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
5696 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
5697 // In order to avoid an insert_subvector, use v4i32 rather than v2i32.
5698 SDValue UADDLV =
5699 DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
5700 SDValue EXTRACT_VEC_ELT =
5701 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
5702 DAG.getConstant(0, dl, MVT::i64));
5703 return EXTRACT_VEC_ELT;
5704 }
5705 return SDValue();
5706 }
5707 case Intrinsic::experimental_cttz_elts: {
5708 SDValue NewCttzElts =
5709 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
5710
5711 return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
5712 }
5713 }
5714}
5715
5716bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
5717 if (VT.getVectorElementType() == MVT::i8 ||
5718 VT.getVectorElementType() == MVT::i16) {
5719 EltTy = MVT::i32;
5720 return true;
5721 }
5722 return false;
5723}
5724
5725bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
5726 EVT DataVT) const {
5727 const EVT IndexVT = Extend.getOperand(0).getValueType();
5728 // SVE only supports implicit extension of 32-bit indices.
5729 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
5730 return false;
5731
5732 // Indices cannot be smaller than the main data type.
5733 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
5734 return false;
5735
5736 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
5737 // element container type, which would violate the previous clause.
5738 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
5739}
5740
5741bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
5742 EVT ExtVT = ExtVal.getValueType();
5743 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
5744 return false;
5745
5746 // It may be worth creating extending masked loads if there are multiple
5747 // masked loads using the same predicate. That way we'll end up creating
5748 // extending masked loads that may then get split by the legaliser. This
5749 // results in just one set of predicate unpacks at the start, instead of
5750 // multiple sets of vector unpacks after each load.
5751 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
5752 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
5753 // Disable extending masked loads for fixed-width for now, since the code
5754 // quality doesn't look great.
5755 if (!ExtVT.isScalableVector())
5756 return false;
5757
5758 unsigned NumExtMaskedLoads = 0;
5759 for (auto *U : Ld->getMask()->uses())
5760 if (isa<MaskedLoadSDNode>(U))
5761 NumExtMaskedLoads++;
5762
5763 if (NumExtMaskedLoads <= 1)
5764 return false;
5765 }
5766 }
5767
5768 return true;
5769}
5770
5771unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
5772 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
5773 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
5774 AArch64ISD::GLD1_MERGE_ZERO},
5775 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
5776 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
5777 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
5778 AArch64ISD::GLD1_MERGE_ZERO},
5779 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
5780 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
5781 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
5782 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5783 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
5784 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
5785 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
5786 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5787 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
5788 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
5789 };
5790 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
5791 return AddrModes.find(Key)->second;
5792}
5793
5794unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
5795 switch (Opcode) {
5796 default:
5797 llvm_unreachable("unimplemented opcode");
5798 return Opcode;
5799 case AArch64ISD::GLD1_MERGE_ZERO:
5800 return AArch64ISD::GLD1S_MERGE_ZERO;
5801 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
5802 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
5803 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
5804 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
5805 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
5806 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
5807 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
5808 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
5809 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
5810 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
5811 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
5812 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
5813 }
5814}
5815
5816SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
5817 SelectionDAG &DAG) const {
5818 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
5819
5820 SDLoc DL(Op);
5821 SDValue Chain = MGT->getChain();
5822 SDValue PassThru = MGT->getPassThru();
5823 SDValue Mask = MGT->getMask();
5824 SDValue BasePtr = MGT->getBasePtr();
5825 SDValue Index = MGT->getIndex();
5826 SDValue Scale = MGT->getScale();
5827 EVT VT = Op.getValueType();
5828 EVT MemVT = MGT->getMemoryVT();
5829 ISD::LoadExtType ExtType = MGT->getExtensionType();
5830 ISD::MemIndexType IndexType = MGT->getIndexType();
5831
5832 // SVE supports zero (and so undef) passthrough values only, everything else
5833 // must be handled manually by an explicit select on the load's output.
5834 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
5835 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
5836 SDValue Load =
5837 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5838 MGT->getMemOperand(), IndexType, ExtType);
5839 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5840 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
5841 }
5842
5843 bool IsScaled = MGT->isIndexScaled();
5844 bool IsSigned = MGT->isIndexSigned();
5845
5846 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
5847 // must be calculated beforehand.
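// For instance, a gather of i16 elements whose index operand carries a scale
// of 4 is rewritten below as (index << 2) with a scale of 1, since the
// hardware can only scale by sizeof(i16) here.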
5848 uint64_t ScaleVal = Scale->getAsZExtVal();
5849 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5850 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5851 EVT IndexVT = Index.getValueType();
5852 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5853 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5854 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5855
5856 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5857 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5858 MGT->getMemOperand(), IndexType, ExtType);
5859 }
5860
5861 // Lower fixed length gather to a scalable equivalent.
5862 if (VT.isFixedLengthVector()) {
5863 assert(Subtarget->useSVEForFixedLengthVectors() &&
5864 "Cannot lower when not using SVE for fixed vectors!");
5865
5866 // NOTE: Handle floating-point as if integer then bitcast the result.
5867 EVT DataVT = VT.changeVectorElementTypeToInteger();
5868 MemVT = MemVT.changeVectorElementTypeToInteger();
5869
5870 // Find the smallest integer fixed length vector we can use for the gather.
5871 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5872 if (DataVT.getVectorElementType() == MVT::i64 ||
5873 Index.getValueType().getVectorElementType() == MVT::i64 ||
5874 Mask.getValueType().getVectorElementType() == MVT::i64)
5875 PromotedVT = VT.changeVectorElementType(MVT::i64);
5876
5877 // Promote vector operands except for passthrough, which we know is either
5878 // undef or zero, and thus best constructed directly.
5879 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5880 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5881 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5882
5883 // A promoted result type forces the need for an extending load.
5884 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
5885 ExtType = ISD::EXTLOAD;
5886
5887 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5888
5889 // Convert fixed length vector operands to scalable.
5890 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5891 Index = convertToScalableVector(DAG, ContainerVT, Index);
5892 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5893 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
5894 : DAG.getConstant(0, DL, ContainerVT);
5895
5896 // Emit equivalent scalable vector gather.
5897 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5898 SDValue Load =
5899 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
5900 Ops, MGT->getMemOperand(), IndexType, ExtType);
5901
5902 // Extract fixed length data then convert to the required result type.
5903 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
5904 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
5905 if (VT.isFloatingPoint())
5906 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
5907
5908 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5909 }
5910
5911 // Everything else is legal.
5912 return Op;
5913}
5914
5915SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
5916 SelectionDAG &DAG) const {
5917 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
5918
5919 SDLoc DL(Op);
5920 SDValue Chain = MSC->getChain();
5921 SDValue StoreVal = MSC->getValue();
5922 SDValue Mask = MSC->getMask();
5923 SDValue BasePtr = MSC->getBasePtr();
5924 SDValue Index = MSC->getIndex();
5925 SDValue Scale = MSC->getScale();
5926 EVT VT = StoreVal.getValueType();
5927 EVT MemVT = MSC->getMemoryVT();
5928 ISD::MemIndexType IndexType = MSC->getIndexType();
5929 bool Truncating = MSC->isTruncatingStore();
5930
5931 bool IsScaled = MSC->isIndexScaled();
5932 bool IsSigned = MSC->isIndexSigned();
5933
5934 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
5935 // must be calculated beforehand.
5936 uint64_t ScaleVal = Scale->getAsZExtVal();
5937 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5938 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5939 EVT IndexVT = Index.getValueType();
5940 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5941 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5942 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5943
5944 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5945 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
5946 MSC->getMemOperand(), IndexType, Truncating);
5947 }
5948
5949 // Lower fixed length scatter to a scalable equivalent.
5950 if (VT.isFixedLengthVector()) {
5951 assert(Subtarget->useSVEForFixedLengthVectors() &&
5952 "Cannot lower when not using SVE for fixed vectors!");
5953
5954 // Once bitcast, we treat floating-point scatters as if they were integer.
5955 if (VT.isFloatingPoint()) {
5956 VT = VT.changeVectorElementTypeToInteger();
5957 MemVT = MemVT.changeVectorElementTypeToInteger();
5958 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
5959 }
5960
5961 // Find the smallest integer fixed length vector we can use for the scatter.
5962 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5963 if (VT.getVectorElementType() == MVT::i64 ||
5964 Index.getValueType().getVectorElementType() == MVT::i64 ||
5965 Mask.getValueType().getVectorElementType() == MVT::i64)
5966 PromotedVT = VT.changeVectorElementType(MVT::i64);
5967
5968 // Promote vector operands.
5969 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5970 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5971 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5972 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
5973
5974 // A promoted value type forces the need for a truncating store.
5975 if (PromotedVT != VT)
5976 Truncating = true;
5977
5978 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5979
5980 // Convert fixed length vector operands to scalable.
5981 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5982 Index = convertToScalableVector(DAG, ContainerVT, Index);
5983 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5984 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
5985
5986 // Emit equivalent scalable vector scatter.
5987 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5988 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
5989 MSC->getMemOperand(), IndexType, Truncating);
5990 }
5991
5992 // Everything else is legal.
5993 return Op;
5994}
5995
5996SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
5997 SDLoc DL(Op);
5998 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
5999 assert(LoadNode && "Expected custom lowering of a masked load node");
6000 EVT VT = Op->getValueType(0);
6001
6002 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6003 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6004
6005 SDValue PassThru = LoadNode->getPassThru();
6006 SDValue Mask = LoadNode->getMask();
6007
6008 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6009 return Op;
6010
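// Descriptive note (not from the original source): a non-trivial passthru is
// handled by re-issuing the masked load with an undef passthru and then
// selecting between the loaded lanes and the original passthru under the
// mask.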
6011 SDValue Load = DAG.getMaskedLoad(
6012 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6013 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6014 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6015 LoadNode->getExtensionType());
6016
6017 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6018
6019 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6020}
6021
6022 // Custom lower trunc stores for v4i8 vectors, since v4i8 is promoted to v4i16.
6024 EVT VT, EVT MemVT,
6025 SelectionDAG &DAG) {
6026 assert(VT.isVector() && "VT should be a vector type");
6027 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6028
6029 SDValue Value = ST->getValue();
6030
6031 // First extend the promoted v4i16 to v8i16, truncate it to v8i8, and extract
6032 // the word lane that represents the v4i8 subvector. This optimizes the store
6033 // to:
6034 //
6035 // xtn v0.8b, v0.8h
6036 // str s0, [x0]
6037
6038 SDValue Undef = DAG.getUNDEF(MVT::i16);
6039 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6040 {Undef, Undef, Undef, Undef});
6041
6042 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6043 Value, UndefVec);
6044 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6045
6046 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6047 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6048 Trunc, DAG.getConstant(0, DL, MVT::i64));
6049
6050 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6051 ST->getBasePtr(), ST->getMemOperand());
6052}
6053
6054 // Custom lowering for any store, vector or scalar, with or without a
6055 // truncate operation. Currently we only custom lower truncating stores
6056 // from vector v4i16 to v4i8 and volatile stores of i128.
6057SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6058 SelectionDAG &DAG) const {
6059 SDLoc Dl(Op);
6060 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6061 assert (StoreNode && "Can only custom lower store nodes");
6062
6063 SDValue Value = StoreNode->getValue();
6064
6065 EVT VT = Value.getValueType();
6066 EVT MemVT = StoreNode->getMemoryVT();
6067
6068 if (VT.isVector()) {
6069 if (useSVEForFixedLengthVectorVT(
6070 VT,
6071 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6072 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6073
6074 unsigned AS = StoreNode->getAddressSpace();
6075 Align Alignment = StoreNode->getAlign();
6076 if (Alignment < MemVT.getStoreSize() &&
6077 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6078 StoreNode->getMemOperand()->getFlags(),
6079 nullptr)) {
6080 return scalarizeVectorStore(StoreNode, DAG);
6081 }
6082
6083 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6084 MemVT == MVT::v4i8) {
6085 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6086 }
6087 // 256-bit non-temporal stores can be lowered to STNP. Do this as part of
6088 // the custom lowering, as there are no unpaired non-temporal stores and
6089 // legalization will break up 256-bit inputs.
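// Illustrative example (not from the original source): a non-temporal store
// of a v8f32 value is split below into two v4f32 halves and emitted as a
// single STNP of a q-register pair, roughly "stnp q0, q1, [x0]".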
6091 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6092 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6093 (MemVT.getScalarSizeInBits() == 8u ||
6094 MemVT.getScalarSizeInBits() == 16u ||
6095 MemVT.getScalarSizeInBits() == 32u ||
6096 MemVT.getScalarSizeInBits() == 64u)) {
6097 SDValue Lo =
6100 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6101 SDValue Hi =
6104 StoreNode->getValue(),
6105 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6107 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6108 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6109 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6110 return Result;
6111 }
6112 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6113 return LowerStore128(Op, DAG);
6114 } else if (MemVT == MVT::i64x8) {
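// Descriptive note (not from the original source): LS64 data (i64x8) has no
// single generic store, so it is split into eight i64 pieces via
// LS64_EXTRACT and stored at offsets 0, 8, ..., 56 from the base pointer.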
6115 SDValue Value = StoreNode->getValue();
6116 assert(Value->getValueType(0) == MVT::i64x8);
6117 SDValue Chain = StoreNode->getChain();
6118 SDValue Base = StoreNode->getBasePtr();
6119 EVT PtrVT = Base.getValueType();
6120 for (unsigned i = 0; i < 8; i++) {
6121 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6122 Value, DAG.getConstant(i, Dl, MVT::i32));
6123 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6124 DAG.getConstant(i * 8, Dl, PtrVT));
6125 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6126 StoreNode->getOriginalAlign());
6127 }
6128 return Chain;
6129 }
6130
6131 return SDValue();
6132}
6133
6134/// Lower atomic or volatile 128-bit stores to a single STP instruction.
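// Illustrative example (not from the original source): a volatile store of
// an i128 held in x0:x1 to the address in x2 typically becomes
// "stp x0, x1, [x2]" (halves swapped on big-endian); a store-release on an
// RCPC3 target uses STILP instead.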
6135SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6136 SelectionDAG &DAG) const {
6137 MemSDNode *StoreNode = cast<MemSDNode>(Op);
6138 assert(StoreNode->getMemoryVT() == MVT::i128);
6139 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6140
6141 bool IsStoreRelease =
6143 if (StoreNode->isAtomic())
6144 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6145 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6148
6149 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6150 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6151 ? StoreNode->getOperand(1)
6152 : StoreNode->getOperand(2);
6153 SDLoc DL(Op);
6154 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6155 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6156 if (DAG.getDataLayout().isBigEndian())
6157 std::swap(StoreValue.first, StoreValue.second);
6159 Opcode, DL, DAG.getVTList(MVT::Other),
6160 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6161 StoreNode->getBasePtr()},
6162 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6163 return Result;
6164}
6165
6166SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6167 SelectionDAG &DAG) const {
6168 SDLoc DL(Op);
6169 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
6170 assert(LoadNode && "Expected custom lowering of a load node");
6171
6172 if (LoadNode->getMemoryVT() == MVT::i64x8) {
6173 SmallVector<SDValue, 8> Ops;
6174 SDValue Base = LoadNode->getBasePtr();
6175 SDValue Chain = LoadNode->getChain();
6176 EVT PtrVT = Base.getValueType();
6177 for (unsigned i = 0; i < 8; i++) {
6178 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
6179 DAG.getConstant(i * 8, DL, PtrVT));
6180 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
6181 LoadNode->getPointerInfo(),
6182 LoadNode->getOriginalAlign());
6183 Ops.push_back(Part);
6184 Chain = SDValue(Part.getNode(), 1);
6185 }
6186 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
6187 return DAG.getMergeValues({Loaded, Chain}, DL);
6188 }
6189
6190 // Custom lowering for extending v4i8 vector loads.
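// Illustrative sketch of the sequence built below (not from the original
// source): load the four bytes as an f32 ("ldr s0, [x0]"), reinterpret as
// v8i8, widen with ushll/sshll to v8i16, and keep the low v4i16 half; a
// v4i32 result gets a second extend.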
6191 EVT VT = Op->getValueType(0);
6192 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6193
6194 if (LoadNode->getMemoryVT() != MVT::v4i8)
6195 return SDValue();
6196
6197 unsigned ExtType;
6198 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6199 ExtType = ISD::SIGN_EXTEND;
6200 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6201 LoadNode->getExtensionType() == ISD::EXTLOAD)
6202 ExtType = ISD::ZERO_EXTEND;
6203 else
6204 return SDValue();
6205
6206 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
6207 LoadNode->getBasePtr(), MachinePointerInfo());
6208 SDValue Chain = Load.getValue(1);
6209 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
6210 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
6211 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
6212 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
6213 DAG.getConstant(0, DL, MVT::i64));
6214 if (VT == MVT::v4i32)
6215 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
6216 return DAG.getMergeValues({Ext, Chain}, DL);
6217}
6218
6219// Generate SUBS and CSEL for integer abs.
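// Illustrative example (not from the original source): for a scalar i64 this
// typically becomes "cmp x0, #0" followed by "cneg x0, x0, mi", i.e. keep
// the value when PL holds and use the negation otherwise.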
6220SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6221 MVT VT = Op.getSimpleValueType();
6222
6223 if (VT.isVector())
6224 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
6225
6226 SDLoc DL(Op);
6227 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
6228 Op.getOperand(0));
6229 // Generate SUBS & CSEL.
6230 SDValue Cmp =
6231 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6232 Op.getOperand(0), DAG.getConstant(0, DL, VT));
6233 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
6234 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
6235 Cmp.getValue(1));
6236}
6237
6239 SDValue Chain = Op.getOperand(0);
6240 SDValue Cond = Op.getOperand(1);
6241 SDValue Dest = Op.getOperand(2);
6242
6244 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
6245 SDLoc dl(Op);
6246 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
6247 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6248 Cmp);
6249 }
6250
6251 return SDValue();
6252}
6253
6254 // Treat FSHR with constant shifts as a legal operation; otherwise it is
6255 // expanded. FSHL is converted to FSHR before deciding what to do with it.
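// Illustrative example (not from the original source): fshl(x, y, 3) on i32
// is rewritten below as fshr(x, y, 29), since an FSHL by s equals an FSHR by
// bitwidth - s; the constant FSHR form can then typically be matched to EXTR.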
6257 SDValue Shifts = Op.getOperand(2);
6258 // Check if the shift amount is a constant
6259 // If opcode is FSHL, convert it to FSHR
6260 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
6261 SDLoc DL(Op);
6262 MVT VT = Op.getSimpleValueType();
6263
6264 if (Op.getOpcode() == ISD::FSHL) {
6265 unsigned int NewShiftNo =
6266 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
6267 return DAG.getNode(
6268 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
6269 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
6270 } else if (Op.getOpcode() == ISD::FSHR) {
6271 return Op;
6272 }
6273 }
6274
6275 return SDValue();
6276}
6277
6279 SDValue X = Op.getOperand(0);
6280 EVT XScalarTy = X.getValueType();
6281 SDValue Exp = Op.getOperand(1);
6282
6283 SDLoc DL(Op);
6284 EVT XVT, ExpVT;
6285 switch (Op.getSimpleValueType().SimpleTy) {
6286 default:
6287 return SDValue();
6288 case MVT::bf16:
6289 case MVT::f16:
6290 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
6291 [[fallthrough]];
6292 case MVT::f32:
6293 XVT = MVT::nxv4f32;
6294 ExpVT = MVT::nxv4i32;
6295 break;
6296 case MVT::f64:
6297 XVT = MVT::nxv2f64;
6298 ExpVT = MVT::nxv2i64;
6299 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
6300 break;
6301 }
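// Illustrative summary (not from the original source): the scalar mantissa
// and exponent are inserted into lane 0 of the scalable vectors chosen above,
// the aarch64_sve_fscale intrinsic is applied under an all-true predicate,
// and lane 0 of the result is extracted; f16/bf16 inputs round-trip via f32.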
6302
6303 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
6304 SDValue VX =
6305 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
6306 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
6307 DAG.getUNDEF(ExpVT), Exp, Zero);
6308 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
6309 AArch64SVEPredPattern::all);
6310 SDValue FScale =
6312 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
6313 VPg, VX, VExp);
6314 SDValue Final =
6315 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
6316 if (X.getValueType() != XScalarTy)
6317 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
6318 DAG.getIntPtrConstant(1, SDLoc(Op)));
6319 return Final;
6320}
6321
6323 SelectionDAG &DAG) const {
6324 LLVM_DEBUG(dbgs() << "Custom lowering: ");
6325 LLVM_DEBUG(Op.dump());
6326
6327 switch (Op.getOpcode()) {
6328 default:
6329 llvm_unreachable("unimplemented operand");
6330 return SDValue();
6331 case ISD::BITCAST:
6332 return LowerBITCAST(Op, DAG);
6333 case ISD::GlobalAddress:
6334 return LowerGlobalAddress(Op, DAG);
6336 return LowerGlobalTLSAddress(Op, DAG);
6337 case ISD::SETCC:
6338 case ISD::STRICT_FSETCC:
6340 return LowerSETCC(Op, DAG);
6341 case ISD::SETCCCARRY:
6342 return LowerSETCCCARRY(Op, DAG);
6343 case ISD::BRCOND:
6344 return LowerBRCOND(Op, DAG);
6345 case ISD::BR_CC:
6346 return LowerBR_CC(Op, DAG);
6347 case ISD::SELECT:
6348 return LowerSELECT(Op, DAG);
6349 case ISD::SELECT_CC:
6350 return LowerSELECT_CC(Op, DAG);
6351 case ISD::JumpTable:
6352 return LowerJumpTable(Op, DAG);
6353 case ISD::BR_JT:
6354 return LowerBR_JT(Op, DAG);
6355 case ISD::ConstantPool:
6356 return LowerConstantPool(Op, DAG);
6357 case ISD::BlockAddress:
6358 return LowerBlockAddress(Op, DAG);
6359 case ISD::VASTART:
6360 return LowerVASTART(Op, DAG);
6361 case ISD::VACOPY:
6362 return LowerVACOPY(Op, DAG);
6363 case ISD::VAARG:
6364 return LowerVAARG(Op, DAG);
6365 case ISD::UADDO_CARRY:
6366 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
6367 case ISD::USUBO_CARRY:
6368 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
6369 case ISD::SADDO_CARRY:
6370 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
6371 case ISD::SSUBO_CARRY:
6372 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
6373 case ISD::SADDO:
6374 case ISD::UADDO:
6375 case ISD::SSUBO:
6376 case ISD::USUBO:
6377 case ISD::SMULO:
6378 case ISD::UMULO:
6379 return LowerXALUO(Op, DAG);
6380 case ISD::FADD:
6381 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
6382 case ISD::FSUB:
6383 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
6384 case ISD::FMUL:
6385 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
6386 case ISD::FMA:
6387 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
6388 case ISD::FDIV:
6389 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
6390 case ISD::FNEG:
6391 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
6392 case ISD::FCEIL:
6393 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
6394 case ISD::FFLOOR:
6395 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6396 case ISD::FNEARBYINT:
6397 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6398 case ISD::FRINT:
6399 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
6400 case ISD::FROUND:
6401 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
6402 case ISD::FROUNDEVEN:
6403 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6404 case ISD::FTRUNC:
6405 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6406 case ISD::FSQRT:
6407 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
6408 case ISD::FABS:
6409 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
6410 case ISD::FP_ROUND:
6412 return LowerFP_ROUND(Op, DAG);
6413 case ISD::FP_EXTEND:
6414 return LowerFP_EXTEND(Op, DAG);
6415 case ISD::FRAMEADDR:
6416 return LowerFRAMEADDR(Op, DAG);
6417 case ISD::SPONENTRY:
6418 return LowerSPONENTRY(Op, DAG);
6419 case ISD::RETURNADDR:
6420 return LowerRETURNADDR(Op, DAG);
6422 return LowerADDROFRETURNADDR(Op, DAG);
6424 return LowerCONCAT_VECTORS(Op, DAG);
6426 return LowerINSERT_VECTOR_ELT(Op, DAG);
6428 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6429 case ISD::BUILD_VECTOR:
6430 return LowerBUILD_VECTOR(Op, DAG);
6432 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6434 return LowerVECTOR_SHUFFLE(Op, DAG);
6435 case ISD::SPLAT_VECTOR:
6436 return LowerSPLAT_VECTOR(Op, DAG);
6438 return LowerEXTRACT_SUBVECTOR(Op, DAG);
6440 return LowerINSERT_SUBVECTOR(Op, DAG);
6441 case ISD::SDIV:
6442 case ISD::UDIV:
6443 return LowerDIV(Op, DAG);
6444 case ISD::SMIN:
6445 case ISD::UMIN:
6446 case ISD::SMAX:
6447 case ISD::UMAX:
6448 return LowerMinMax(Op, DAG);
6449 case ISD::SRA:
6450 case ISD::SRL:
6451 case ISD::SHL:
6452 return LowerVectorSRA_SRL_SHL(Op, DAG);
6453 case ISD::SHL_PARTS:
6454 case ISD::SRL_PARTS:
6455 case ISD::SRA_PARTS:
6456 return LowerShiftParts(Op, DAG);
6457 case ISD::CTPOP:
6458 case ISD::PARITY:
6459 return LowerCTPOP_PARITY(Op, DAG);
6460 case ISD::FCOPYSIGN:
6461 return LowerFCOPYSIGN(Op, DAG);
6462 case ISD::OR:
6463 return LowerVectorOR(Op, DAG);
6464 case ISD::XOR:
6465 return LowerXOR(Op, DAG);
6466 case ISD::PREFETCH:
6467 return LowerPREFETCH(Op, DAG);
6468 case ISD::SINT_TO_FP:
6469 case ISD::UINT_TO_FP:
6472 return LowerINT_TO_FP(Op, DAG);
6473 case ISD::FP_TO_SINT:
6474 case ISD::FP_TO_UINT:
6477 return LowerFP_TO_INT(Op, DAG);
6480 return LowerFP_TO_INT_SAT(Op, DAG);
6481 case ISD::FSINCOS:
6482 return LowerFSINCOS(Op, DAG);
6483 case ISD::GET_ROUNDING:
6484 return LowerGET_ROUNDING(Op, DAG);
6485 case ISD::SET_ROUNDING:
6486 return LowerSET_ROUNDING(Op, DAG);
6487 case ISD::MUL:
6488 return LowerMUL(Op, DAG);
6489 case ISD::MULHS:
6490 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
6491 case ISD::MULHU:
6492 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
6494 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6496 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6498 return LowerINTRINSIC_VOID(Op, DAG);
6499 case ISD::ATOMIC_STORE:
6500 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
6501 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6502 return LowerStore128(Op, DAG);
6503 }
6504 return SDValue();
6505 case ISD::STORE:
6506 return LowerSTORE(Op, DAG);
6507 case ISD::MSTORE:
6508 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6509 case ISD::MGATHER:
6510 return LowerMGATHER(Op, DAG);
6511 case ISD::MSCATTER:
6512 return LowerMSCATTER(Op, DAG);
6514 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
6515 case ISD::VECREDUCE_ADD:
6516 case ISD::VECREDUCE_AND:
6517 case ISD::VECREDUCE_OR:
6518 case ISD::VECREDUCE_XOR:
6528 return LowerVECREDUCE(Op, DAG);
6530 return LowerATOMIC_LOAD_AND(Op, DAG);
6532 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6533 case ISD::VSCALE:
6534 return LowerVSCALE(Op, DAG);
6535 case ISD::ANY_EXTEND:
6536 case ISD::SIGN_EXTEND:
6537 case ISD::ZERO_EXTEND:
6538 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6540 // Only custom lower when ExtraVT has a legal byte based element type.
6541 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
6542 EVT ExtraEltVT = ExtraVT.getVectorElementType();
6543 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6544 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6545 return SDValue();
6546
6547 return LowerToPredicatedOp(Op, DAG,
6549 }
6550 case ISD::TRUNCATE:
6551 return LowerTRUNCATE(Op, DAG);
6552 case ISD::MLOAD:
6553 return LowerMLOAD(Op, DAG);
6554 case ISD::LOAD:
6555 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
6556 !Subtarget->isNeonAvailable()))
6557 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6558 return LowerLOAD(Op, DAG);
6559 case ISD::ADD:
6560 case ISD::AND:
6561 case ISD::SUB:
6562 return LowerToScalableOp(Op, DAG);
6563 case ISD::FMAXIMUM:
6564 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
6565 case ISD::FMAXNUM:
6566 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
6567 case ISD::FMINIMUM:
6568 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
6569 case ISD::FMINNUM:
6570 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
6571 case ISD::VSELECT:
6572 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
6573 case ISD::ABS:
6574 return LowerABS(Op, DAG);
6575 case ISD::ABDS:
6576 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
6577 case ISD::ABDU:
6578 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
6579 case ISD::AVGFLOORS:
6580 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
6581 case ISD::AVGFLOORU:
6582 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
6583 case ISD::AVGCEILS:
6584 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
6585 case ISD::AVGCEILU:
6586 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
6587 case ISD::BITREVERSE:
6588 return LowerBitreverse(Op, DAG);
6589 case ISD::BSWAP:
6590 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
6591 case ISD::CTLZ:
6592 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
6593 case ISD::CTTZ:
6594 return LowerCTTZ(Op, DAG);
6595 case ISD::VECTOR_SPLICE:
6596 return LowerVECTOR_SPLICE(Op, DAG);
6598 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
6600 return LowerVECTOR_INTERLEAVE(Op, DAG);
6601 case ISD::LROUND:
6602 case ISD::LLROUND:
6603 case ISD::LRINT:
6604 case ISD::LLRINT: {
6605 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
6606 Op.getOperand(0).getValueType() == MVT::bf16) &&
6607 "Expected custom lowering of rounding operations only for f16");
6608 SDLoc DL(Op);
6609 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
6610 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
6611 }
6612 case ISD::STRICT_LROUND:
6614 case ISD::STRICT_LRINT:
6615 case ISD::STRICT_LLRINT: {
6616 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
6617 Op.getOperand(1).getValueType() == MVT::bf16) &&
6618 "Expected custom lowering of rounding operations only for f16");
6619 SDLoc DL(Op);
6620 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
6621 {Op.getOperand(0), Op.getOperand(1)});
6622 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
6623 {Ext.getValue(1), Ext.getValue(0)});
6624 }
6625 case ISD::WRITE_REGISTER: {
6626 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
6627 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
6628 SDLoc DL(Op);
6629
6630 SDValue Chain = Op.getOperand(0);
6631 SDValue SysRegName = Op.getOperand(1);
6632 std::pair<SDValue, SDValue> Pair =
6633 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
6634
6635 // chain = MSRR(chain, sysregname, lo, hi)
6636 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
6637 SysRegName, Pair.first, Pair.second);
6638
6639 return Result;
6640 }
6641 case ISD::FSHL:
6642 case ISD::FSHR:
6643 return LowerFunnelShift(Op, DAG);
6644 case ISD::FLDEXP:
6645 return LowerFLDEXP(Op, DAG);
6646 }
6647}
6648
6650 return !Subtarget->useSVEForFixedLengthVectors();
6651}
6652
6654 EVT VT, bool OverrideNEON) const {
6655 if (!VT.isFixedLengthVector() || !VT.isSimple())
6656 return false;
6657
6658 // Don't use SVE for vectors we cannot scalarize if required.
6659 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
6660 // Fixed length predicates should be promoted to i8.
6661 // NOTE: This is consistent with how NEON (and thus 64/128-bit vectors) works.
6662 case MVT::i1:
6663 default:
6664 return false;
6665 case MVT::i8:
6666 case MVT::i16:
6667 case MVT::i32:
6668 case MVT::i64:
6669 case MVT::f16:
6670 case MVT::f32:
6671 case MVT::f64:
6672 break;
6673 }
6674
6675 // NEON-sized vectors can be emulated using SVE instructions.
6676 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
6677 return Subtarget->hasSVEorSME();
6678
6679 // Ensure NEON MVTs only belong to a single register class.
6680 if (VT.getFixedSizeInBits() <= 128)
6681 return false;
6682
6683 // Ensure wider than NEON code generation is enabled.
6684 if (!Subtarget->useSVEForFixedLengthVectors())
6685 return false;
6686
6687 // Don't use SVE for types that don't fit.
6688 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
6689 return false;
6690
6691 // TODO: Perhaps an artificial restriction, but worth having whilst getting
6692 // the base fixed length SVE support in place.
6693 if (!VT.isPow2VectorType())
6694 return false;
6695
6696 return true;
6697}
6698
6699//===----------------------------------------------------------------------===//
6700// Calling Convention Implementation
6701//===----------------------------------------------------------------------===//
6702
6703static unsigned getIntrinsicID(const SDNode *N) {
6704 unsigned Opcode = N->getOpcode();
6705 switch (Opcode) {
6706 default:
6709 unsigned IID = N->getConstantOperandVal(0);
6710 if (IID < Intrinsic::num_intrinsics)
6711 return IID;
6713 }
6714 }
6715}
6716
6718 SDValue N1) const {
6719 if (!N0.hasOneUse())
6720 return false;
6721
6722 unsigned IID = getIntrinsicID(N1.getNode());
6723 // Avoid reassociating expressions that can be lowered to smlal/umlal.
6724 if (IID == Intrinsic::aarch64_neon_umull ||
6725 N1.getOpcode() == AArch64ISD::UMULL ||
6726 IID == Intrinsic::aarch64_neon_smull ||
6728 return N0.getOpcode() != ISD::ADD;
6729
6730 return true;
6731}
6732
6733/// Selects the correct CCAssignFn for a given CallingConvention value.
6735 bool IsVarArg) const {
6736 switch (CC) {
6737 default:
6738 report_fatal_error("Unsupported calling convention.");
6739 case CallingConv::GHC:
6740 return CC_AArch64_GHC;
6741 case CallingConv::C:
6742 case CallingConv::Fast:
6746 case CallingConv::Swift:
6748 case CallingConv::Tail:
6749 case CallingConv::GRAAL:
6750 if (Subtarget->isTargetWindows()) {
6751 if (IsVarArg) {
6752 if (Subtarget->isWindowsArm64EC())
6755 }
6756 return CC_AArch64_Win64PCS;
6757 }
6758 if (!Subtarget->isTargetDarwin())
6759 return CC_AArch64_AAPCS;
6760 if (!IsVarArg)
6761 return CC_AArch64_DarwinPCS;
6764 case CallingConv::Win64:
6765 if (IsVarArg) {
6766 if (Subtarget->isWindowsArm64EC())
6769 }
6770 return CC_AArch64_Win64PCS;
6772 if (Subtarget->isWindowsArm64EC())
6779 return CC_AArch64_AAPCS;
6784 }
6785}
6786
6787CCAssignFn *
6789 switch (CC) {
6790 default:
6791 return RetCC_AArch64_AAPCS;
6795 if (Subtarget->isWindowsArm64EC())
6797 return RetCC_AArch64_AAPCS;
6798 }
6799}
6800
6801
6802unsigned
6803AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
6804 SelectionDAG &DAG) const {
6806 MachineFrameInfo &MFI = MF.getFrameInfo();
6807
6808 // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
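// Illustrative example (not from the original source): with a streaming
// vector length of 512 bits, SVL.B is 64, so RDSVL #1 yields 64 and the
// buffer below is 64 * 64 = 4096 bytes, enough to spill the whole ZA array.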
6809 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6810 DAG.getConstant(1, DL, MVT::i32));
6811 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
6812 SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
6813 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
6814 SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
6815 Chain = Buffer.getValue(1);
6816 MFI.CreateVariableSizedObject(Align(1), nullptr);
6817
6818 // Allocate an additional TPIDR2 object on the stack (16 bytes)
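// Layout sketch, per my reading of the SME lazy-save ABI (treat as an
// assumption): bytes 0-7 of this 16-byte block hold the save-buffer pointer
// stored below, bytes 8-9 hold the number of save slices (filled in around
// calls, not here), and bytes 10-15 are reserved and zeroed further down.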
6819 unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
6820
6821 // Store the buffer pointer to the TPIDR2 stack object.
6824 TPIDR2Obj,
6826 Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
6827
6828 // Set the reserved bytes (10-15) to zero
6829 EVT PtrTy = Ptr.getValueType();
6830 SDValue ReservedPtr =
6831 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy));
6832 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
6833 MPI);
6834 ReservedPtr =
6835 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy));
6836 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
6837 MPI);
6838
6839 return TPIDR2Obj;
6840}
6841
6842static bool isPassedInFPR(EVT VT) {
6843 return VT.isFixedLengthVector() ||
6844 (VT.isFloatingPoint() && !VT.isScalableVector());
6845}
6846
6847SDValue AArch64TargetLowering::LowerFormalArguments(
6848 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
6849 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
6850 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
6852 const Function &F = MF.getFunction();
6853 MachineFrameInfo &MFI = MF.getFrameInfo();
6854 bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
6855 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
6856 (isVarArg && Subtarget->isWindowsArm64EC());
6858
6860 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
6862 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
6863 FuncInfo->setIsSVECC(true);
6864
6865 // Assign locations to all of the incoming arguments.
6867 DenseMap<unsigned, SDValue> CopiedRegs;
6868 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
6869
6870 // At this point, Ins[].VT may already be promoted to i32. To correctly
6871 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
6872 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
6873 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
6874 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
6875 // LocVT.
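// Illustrative example (not from the original source): an i8 parameter may
// arrive here with Ins[i].VT == i32; the loop below resets ValVT to i8 based
// on the original IR type so the calling-convention code can allocate a
// byte-sized stack slot rather than a word.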
6876 unsigned NumArgs = Ins.size();
6877 Function::const_arg_iterator CurOrigArg = F.arg_begin();
6878 unsigned CurArgIdx = 0;
6879 for (unsigned i = 0; i != NumArgs; ++i) {
6880 MVT ValVT = Ins[i].VT;
6881 if (Ins[i].isOrigArg()) {
6882 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
6883 CurArgIdx = Ins[i].getOrigArgIndex();
6884
6885 // Get type of the original argument.
6886 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
6887 /*AllowUnknown*/ true);
6888 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
6889 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6890 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6891 ValVT = MVT::i8;
6892 else if (ActualMVT == MVT::i16)
6893 ValVT = MVT::i16;
6894 }
6895 bool UseVarArgCC = false;
6896 if (IsWin64)
6897 UseVarArgCC = isVarArg;
6898 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
6899 bool Res =
6900 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
6901 assert(!Res && "Call operand has unhandled type");
6902 (void)Res;
6903 }
6904
6906 bool IsLocallyStreaming =
6907 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
6908 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
6909 SDValue Glue = Chain.getValue(1);
6910
6911 SmallVector<SDValue, 16> ArgValues;
6912 unsigned ExtraArgLocs = 0;
6913 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
6914 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
6915
6916 if (Ins[i].Flags.isByVal()) {
6917 // Byval is used for HFAs in the PCS, but the system should work in a
6918 // non-compliant manner for larger structs.
6919 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6920 int Size = Ins[i].Flags.getByValSize();
6921 unsigned NumRegs = (Size + 7) / 8;
6922
6923 // FIXME: This works on big-endian for composite byvals, which are the common
6924 // case. It should work for fundamental types too.
6925 unsigned FrameIdx =
6926 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
6927 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
6928 InVals.push_back(FrameIdxN);
6929
6930 continue;
6931 }
6932
6933 if (Ins[i].Flags.isSwiftAsync())
6935
6936 SDValue ArgValue;
6937 if (VA.isRegLoc()) {
6938 // Arguments stored in registers.
6939 EVT RegVT = VA.getLocVT();
6940 const TargetRegisterClass *RC;
6941
6942 if (RegVT == MVT::i32)
6943 RC = &AArch64::GPR32RegClass;
6944 else if (RegVT == MVT::i64)
6945 RC = &AArch64::GPR64RegClass;
6946 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
6947 RC = &AArch64::FPR16RegClass;
6948 else if (RegVT == MVT::f32)
6949 RC = &AArch64::FPR32RegClass;
6950 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
6951 RC = &AArch64::FPR64RegClass;
6952 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
6953 RC = &AArch64::FPR128RegClass;
6954 else if (RegVT.isScalableVector() &&
6955 RegVT.getVectorElementType() == MVT::i1) {
6956 FuncInfo->setIsSVECC(true);
6957 RC = &AArch64::PPRRegClass;
6958 } else if (RegVT == MVT::aarch64svcount) {
6959 FuncInfo->setIsSVECC(true);
6960 RC = &AArch64::PPRRegClass;
6961 } else if (RegVT.isScalableVector()) {
6962 FuncInfo->setIsSVECC(true);
6963 RC = &AArch64::ZPRRegClass;
6964 } else
6965 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
6966
6967 // Transform the arguments in physical registers into virtual ones.
6968 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
6969
6970 if (IsLocallyStreaming) {
6971 // LocallyStreamingFunctions must insert the SMSTART in the correct
6972 // position, so we use Glue to ensure no instructions can be scheduled
6973 // between the chain of:
6974 // t0: ch,glue = EntryNode
6975 // t1: res,ch,glue = CopyFromReg
6976 // ...
6977 // tn: res,ch,glue = CopyFromReg t(n-1), ..
6978 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
6979 // ^^^^^^
6980 // This will be the new Chain/Root node.
6981 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
6982 Glue = ArgValue.getValue(2);
6983 if (isPassedInFPR(ArgValue.getValueType())) {
6984 ArgValue =
6986 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
6987 {ArgValue, Glue});
6988 Glue = ArgValue.getValue(1);
6989 }
6990 } else
6991 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
6992
6993 // If this is an 8, 16 or 32-bit value, it is really passed promoted
6994 // to 64 bits. Insert an assert[sz]ext to capture this, then
6995 // truncate to the right size.
6996 switch (VA.getLocInfo()) {
6997 default:
6998 llvm_unreachable("Unknown loc info!");
6999 case CCValAssign::Full:
7000 break;
7002 assert(
7003 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7004 "Indirect arguments should be scalable on most subtargets");
7005 break;
7006 case CCValAssign::BCvt:
7007 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
7008 break;
7009 case CCValAssign::AExt:
7010 case CCValAssign::SExt:
7011 case CCValAssign::ZExt:
7012 break;
7014 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
7015 DAG.getConstant(32, DL, RegVT));
7016 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
7017 break;
7018 }
7019 } else { // VA.isRegLoc()
7020 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7021 unsigned ArgOffset = VA.getLocMemOffset();
7022 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7023 ? VA.getLocVT().getSizeInBits()
7024 : VA.getValVT().getSizeInBits()) / 8;
7025
7026 uint32_t BEAlign = 0;
7027 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7028 !Ins[i].Flags.isInConsecutiveRegs())
7029 BEAlign = 8 - ArgSize;
7030
7031 SDValue FIN;
7032 MachinePointerInfo PtrInfo;
7033 if (StackViaX4) {
7034 // In both the ARM64EC varargs convention and the thunk convention,
7035 // arguments on the stack are accessed relative to x4, not sp. In
7036 // the thunk convention, there's an additional offset of 32 bytes
7037 // to account for the shadow store.
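// Illustrative example (not from the original source): under the x64
// entry-thunk convention a stack argument at offset 0 is loaded from
// [x4 + 32], the first 32 bytes being the caller's shadow store.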
7038 unsigned ObjOffset = ArgOffset + BEAlign;
7039 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7040 ObjOffset += 32;
7041 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7042 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7043 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
7044 DAG.getConstant(ObjOffset, DL, MVT::i64));
7046 } else {
7047 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
7048
7049 // Create load nodes to retrieve arguments from the stack.
7050 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
7051 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7052 }
7053
7054 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
7055 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
7056 MVT MemVT = VA.getValVT();
7057
7058 switch (VA.getLocInfo()) {
7059 default:
7060 break;
7061 case CCValAssign::Trunc:
7062 case CCValAssign::BCvt:
7063 MemVT = VA.getLocVT();
7064 break;
7067 Subtarget->isWindowsArm64EC()) &&
7068 "Indirect arguments should be scalable on most subtargets");
7069 MemVT = VA.getLocVT();
7070 break;
7071 case CCValAssign::SExt:
7072 ExtType = ISD::SEXTLOAD;
7073 break;
7074 case CCValAssign::ZExt:
7075 ExtType = ISD::ZEXTLOAD;
7076 break;
7077 case CCValAssign::AExt:
7078 ExtType = ISD::EXTLOAD;
7079 break;
7080 }
7081
7082 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
7083 MemVT);
7084 }
7085
7086 if (VA.getLocInfo() == CCValAssign::Indirect) {
7087 assert((VA.getValVT().isScalableVT() ||
7088 Subtarget->isWindowsArm64EC()) &&
7089 "Indirect arguments should be scalable on most subtargets");
7090
7091 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7092 unsigned NumParts = 1;
7093 if (Ins[i].Flags.isInConsecutiveRegs()) {
7094 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
7095 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7096 ++NumParts;
7097 }
7098
7099 MVT PartLoad = VA.getValVT();
7100 SDValue Ptr = ArgValue;
7101
7102 // Ensure we generate all loads for each tuple part, whilst updating the
7103 // pointer after each load correctly using vscale.
7104 while (NumParts > 0) {
7105 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
7106 InVals.push_back(ArgValue);
7107 NumParts--;
7108 if (NumParts > 0) {
7109 SDValue BytesIncrement;
7110 if (PartLoad.isScalableVector()) {
7111 BytesIncrement = DAG.getVScale(
7112 DL, Ptr.getValueType(),
7113 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7114 } else {
7115 BytesIncrement = DAG.getConstant(
7116 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7117 Ptr.getValueType());
7118 }
7120 Flags.setNoUnsignedWrap(true);
7121 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7122 BytesIncrement, Flags);
7123 ExtraArgLocs++;
7124 i++;
7125 }
7126 }
7127 } else {
7128 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7129 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
7130 ArgValue, DAG.getValueType(MVT::i32));
7131
7132 // i1 arguments are zero-extended to i8 by the caller. Emit a
7133 // hint to reflect this.
7134 if (Ins[i].isOrigArg()) {
7135 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
7136 if (OrigArg->getType()->isIntegerTy(1)) {
7137 if (!Ins[i].Flags.isZExt()) {
7138 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
7139 ArgValue.getValueType(), ArgValue);
7140 }
7141 }
7142 }
7143
7144 InVals.push_back(ArgValue);
7145 }
7146 }
7147 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
7148
7149 // Insert the SMSTART if this is a locally streaming function and
7150 // make sure it is Glued to the last CopyFromReg value.
7151 if (IsLocallyStreaming) {
7152 SDValue PStateSM;
7153 if (Attrs.hasStreamingCompatibleInterface()) {
7154 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7157 FuncInfo->setPStateSMReg(Reg);
7158 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
7159 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7161 } else
7162 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7164
7165 // Ensure that the SMSTART happens after the CopyWithChain such that its
7166 // chain result is used.
7167 for (unsigned I=0; I<InVals.size(); ++I) {
7169 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
7170 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
7171 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
7172 InVals[I].getValueType());
7173 }
7174 }
7175
7176 // varargs
7177 if (isVarArg) {
7178 if (!Subtarget->isTargetDarwin() || IsWin64) {
7179 // The AAPCS variadic function ABI is identical to the non-variadic
7180 // one. As a result there may be more arguments in registers and we should
7181 // save them for future reference.
7182 // Win64 variadic functions also pass arguments in registers, but all float
7183 // arguments are passed in integer registers.
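// Illustrative example (not from the original source): for "void f(int x,
// ...)" only x0 is fixed, so x1-x7 (56 bytes) are saved below, and on a
// typical non-Windows AAPCS target q0-q7 (128 bytes) are saved as well.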
7184 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
7185 }
7186
7187 // This will point to the next argument passed via stack.
7188 unsigned VarArgsOffset = CCInfo.getStackSize();
7189 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
7190 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
7191 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
7192 FuncInfo->setVarArgsStackIndex(
7193 MFI.CreateFixedObject(4, VarArgsOffset, true));
7194
7195 if (MFI.hasMustTailInVarArgFunc()) {
7196 SmallVector<MVT, 2> RegParmTypes;
7197 RegParmTypes.push_back(MVT::i64);
7198 RegParmTypes.push_back(MVT::f128);
7199 // Compute the set of forwarded registers. The rest are scratch.
7201 FuncInfo->getForwardedMustTailRegParms();
7202 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
7204
7205 // Conservatively forward X8, since it might be used for aggregate return.
7206 if (!CCInfo.isAllocated(AArch64::X8)) {
7207 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
7208 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
7209 }
7210 }
7211 }
7212
7213 // On Windows, InReg pointers must be returned, so record the pointer in a
7214 // virtual register at the start of the function so it can be returned in the
7215 // epilogue.
7216 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
7217 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
7218 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
7219 Ins[I].Flags.isInReg()) &&
7220 Ins[I].Flags.isSRet()) {
7221 assert(!FuncInfo->getSRetReturnReg());
7222
7223 MVT PtrTy = getPointerTy(DAG.getDataLayout());
7224 Register Reg =
7226 FuncInfo->setSRetReturnReg(Reg);
7227
7228 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
7229 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
7230 break;
7231 }
7232 }
7233 }
7234
7235 unsigned StackArgSize = CCInfo.getStackSize();
7236 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7237 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
7238 // This is a non-standard ABI so by fiat I say we're allowed to make full
7239 // use of the stack area to be popped, which must be aligned to 16 bytes in
7240 // any case:
7241 StackArgSize = alignTo(StackArgSize, 16);
7242
7243 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
7244 // a multiple of 16.
7245 FuncInfo->setArgumentStackToRestore(StackArgSize);
7246
7247 // This realignment carries over to the available bytes below. Our own
7248 // callers will guarantee the space is free by giving an aligned value to
7249 // CALLSEQ_START.
7250 }
7251 // Even if we're not expected to free up the space, it's useful to know how
7252 // much is there while considering tail calls (because we can reuse it).
7253 FuncInfo->setBytesInStackArgArea(StackArgSize);
7254
7255 if (Subtarget->hasCustomCallingConv())
7257
7258 // Conservatively assume the function requires the lazy-save mechanism.
7259 if (SMEAttrs(MF.getFunction()).hasZAState()) {
7260 unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
7261 FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
7262 }
7263
7264 return Chain;
7265}
7266
7267void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
7268 SelectionDAG &DAG,
7269 const SDLoc &DL,
7270 SDValue &Chain) const {
7272 MachineFrameInfo &MFI = MF.getFrameInfo();
7274 auto PtrVT = getPointerTy(DAG.getDataLayout());
7275 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
7276
7278
7280 unsigned NumGPRArgRegs = GPRArgRegs.size();
7281 if (Subtarget->isWindowsArm64EC()) {
7282 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
7283 // functions.
7284 NumGPRArgRegs = 4;
7285 }
7286 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
7287
7288 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
7289 int GPRIdx = 0;
7290 if (GPRSaveSize != 0) {
7291 if (IsWin64) {
7292 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
7293 if (GPRSaveSize & 15)
7294 // The extra size here, if triggered, will always be 8.
7295 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
7296 } else
7297 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
7298
7299 SDValue FIN;
7300 if (Subtarget->isWindowsArm64EC()) {
7301 // With the Arm64EC ABI, we reserve the save area as usual, but we
7302 // compute its address relative to x4. For a normal AArch64->AArch64
7303 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
7304 // different address.
7305 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7306 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7307 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
7308 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
7309 } else {
7310 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
7311 }
7312
7313 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
7314 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
7315 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7316 SDValue Store =
7317 DAG.getStore(Val.getValue(1), DL, Val, FIN,
7319 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
7320 : MachinePointerInfo::getStack(MF, i * 8));
7321 MemOps.push_back(Store);
7322 FIN =
7323 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
7324 }
7325 }
7326 FuncInfo->setVarArgsGPRIndex(GPRIdx);
7327 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
7328
7329 if (Subtarget->hasFPARMv8() && !IsWin64) {
7331 const unsigned NumFPRArgRegs = FPRArgRegs.size();
7332 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
7333
7334 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
7335 int FPRIdx = 0;
7336 if (FPRSaveSize != 0) {
7337 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
7338
7339 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
7340
7341 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
7342 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
7343 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
7344
7345 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
7346 MachinePointerInfo::getStack(MF, i * 16));
7347 MemOps.push_back(Store);
7348 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
7349 DAG.getConstant(16, DL, PtrVT));
7350 }
7351 }
7352 FuncInfo->setVarArgsFPRIndex(FPRIdx);
7353 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
7354 }
7355
7356 if (!MemOps.empty()) {
7357 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7358 }
7359}
7360
7361/// LowerCallResult - Lower the result values of a call into the
7362/// appropriate copies out of appropriate physical registers.
7363SDValue AArch64TargetLowering::LowerCallResult(
7364 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
7365 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
7366 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
7367 SDValue ThisVal, bool RequiresSMChange) const {
7368 DenseMap<unsigned, SDValue> CopiedRegs;
7369 // Copy all of the result registers out of their specified physreg.
7370 for (unsigned i = 0; i != RVLocs.size(); ++i) {
7371 CCValAssign VA = RVLocs[i];
7372
7373 // Pass 'this' value directly from the argument to return value, to avoid
7374 // reg unit interference
7375 if (i == 0 && isThisReturn) {
7376 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
7377 "unexpected return calling convention register assignment");
7378 InVals.push_back(ThisVal);
7379 continue;
7380 }
7381
7382 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
7383 // allows one use of a physreg per block.
7384 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
7385 if (!Val) {
7386 Val =
7387 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
7388 Chain = Val.getValue(1);
7389 InGlue = Val.getValue(2);
7390 CopiedRegs[VA.getLocReg()] = Val;
7391 }
7392
7393 switch (VA.getLocInfo()) {
7394 default:
7395 llvm_unreachable("Unknown loc info!");
7396 case CCValAssign::Full:
7397 break;
7398 case CCValAssign::BCvt:
7399 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
7400 break;
7402 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
7403 DAG.getConstant(32, DL, VA.getLocVT()));
7404 [[fallthrough]];
7405 case CCValAssign::AExt:
7406 [[fallthrough]];
7407 case CCValAssign::ZExt:
7408 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
7409 break;
7410 }
7411
7412 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
7414 Val);
7415
7416 InVals.push_back(Val);
7417 }
7418
7419 return Chain;
7420}
7421
7422/// Return true if the calling convention is one that we can guarantee TCO for.
7423static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
7424 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
7426}
7427
7428/// Return true if we might ever do TCO for calls with this calling convention.
7430 switch (CC) {
7431 case CallingConv::C:
7435 case CallingConv::Swift:
7437 case CallingConv::Tail:
7438 case CallingConv::Fast:
7439 return true;
7440 default:
7441 return false;
7442 }
7443}
7444
7446 const AArch64Subtarget *Subtarget,
7448 CCState &CCInfo) {
7449 const SelectionDAG &DAG = CLI.DAG;
7450 CallingConv::ID CalleeCC = CLI.CallConv;
7451 bool IsVarArg = CLI.IsVarArg;
7452 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7453 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
7454
7455 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
7456 // for the shadow store.
7457 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
7458 CCInfo.AllocateStack(32, Align(16));
7459
7460 unsigned NumArgs = Outs.size();
7461 for (unsigned i = 0; i != NumArgs; ++i) {
7462 MVT ArgVT = Outs[i].VT;
7463 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
7464
7465 bool UseVarArgCC = false;
7466 if (IsVarArg) {
7467 // On Windows, the fixed arguments in a vararg call are passed in GPRs
7468 // too, so use the vararg CC to force them to integer registers.
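// Illustrative example (not from the original source): for a Win64 callee
// "void f(double d, ...)", the fixed argument d is passed in x0 rather than
// d0.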
7469 if (IsCalleeWin64) {
7470 UseVarArgCC = true;
7471 } else {
7472 UseVarArgCC = !Outs[i].IsFixed;
7473 }
7474 }
7475
7476 if (!UseVarArgCC) {
7477 // Get type of the original argument.
7478 EVT ActualVT =
7479 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
7480 /*AllowUnknown*/ true);
7481 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
7482 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7483 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7484 ArgVT = MVT::i8;
7485 else if (ActualMVT == MVT::i16)
7486 ArgVT = MVT::i16;
7487 }
7488
7489 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
7490 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
7491 assert(!Res && "Call operand has unhandled type");
7492 (void)Res;
7493 }
7494}
7495
7496bool AArch64TargetLowering::isEligibleForTailCallOptimization(
7497 const CallLoweringInfo &CLI) const {
7498 CallingConv::ID CalleeCC = CLI.CallConv;
7499 if (!mayTailCallThisCC(CalleeCC))
7500 return false;
7501
7502 SDValue Callee = CLI.Callee;
7503 bool IsVarArg = CLI.IsVarArg;
7504 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7505 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7506 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7507 const SelectionDAG &DAG = CLI.DAG;
7509 const Function &CallerF = MF.getFunction();
7510 CallingConv::ID CallerCC = CallerF.getCallingConv();
7511
7512 // SME Streaming functions are not eligible for TCO as they may require
7513 // the streaming mode or ZA to be restored after returning from the call.
7514 SMEAttrs CallerAttrs(MF.getFunction());
7515 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
7516 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
7517 CallerAttrs.requiresLazySave(CalleeAttrs) ||
7518 CallerAttrs.hasStreamingBody())
7519 return false;
7520
7521 // Functions using the C or Fast calling convention that have an SVE signature
7522 // preserve more registers and should assume the SVE_VectorCall CC.
7523 // The check for matching callee-saved regs will determine whether it is
7524 // eligible for TCO.
7525 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
7528
7529 bool CCMatch = CallerCC == CalleeCC;
7530
7531 // When using the Windows calling convention on a non-windows OS, we want
7532 // to back up and restore X18 in such functions; we can't do a tail call
7533 // from those functions.
7534 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
7535 CalleeCC != CallingConv::Win64)
7536 return false;
7537
7538 // Byval parameters hand the function a pointer directly into the stack area
7539 // we want to reuse during a tail call. Working around this *is* possible (see
7540 // X86) but less efficient and uglier in LowerCall.
7541 for (Function::const_arg_iterator i = CallerF.arg_begin(),
7542 e = CallerF.arg_end();
7543 i != e; ++i) {
7544 if (i->hasByValAttr())
7545 return false;
7546
7547 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
7548 // In this case, it is necessary to save/restore X0 in the callee. Tail
7549 // call opt interferes with this. So we disable tail call opt when the
7550 // caller has an argument with "inreg" attribute.
7551
7552 // FIXME: Check whether the callee also has an "inreg" argument.
7553 if (i->hasInRegAttr())
7554 return false;
7555 }
7556
7557 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
7558 return CCMatch;
7559
7560 // Externally-defined functions with weak linkage should not be
7561 // tail-called on AArch64 when the OS does not support dynamic
7562 // pre-emption of symbols, as the AAELF spec requires normal calls
7563 // to undefined weak functions to be replaced with a NOP or jump to the
7564 // next instruction. The behaviour of branch instructions in this
7565 // situation (as used for tail calls) is implementation-defined, so we
7566 // cannot rely on the linker replacing the tail call with a return.
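// Illustrative example (not from the original source): a normal "bl weak_fn"
// to an undefined weak symbol can be rewritten to a NOP by the linker, but
// there is no comparable defined rewrite for the plain "b weak_fn" a tail
// call would use.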
7567 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
7568 const GlobalValue *GV = G->getGlobal();
7570 if (GV->hasExternalWeakLinkage() &&
7571 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
7572 return false;
7573 }
7574
7575 // Now we search for cases where we can use a tail call without changing the
7576 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
7577 // concept.
7578
7579 // I want anyone implementing a new calling convention to think long and hard
7580 // about this assert.
7581 assert((!IsVarArg || CalleeCC == CallingConv::C) &&
7582 "Unexpected variadic calling convention");
7583
7584 LLVMContext &C = *DAG.getContext();
7585 // Check that the call results are passed in the same way.
7586 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
7587 CCAssignFnForCall(CalleeCC, IsVarArg),
7588 CCAssignFnForCall(CallerCC, IsVarArg)))
7589 return false;
7590 // The callee has to preserve all registers the caller needs to preserve.
7591 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7592 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
7593 if (!CCMatch) {
7594 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
7595 if (Subtarget->hasCustomCallingConv()) {
7596 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
7597 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
7598 }
7599 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
7600 return false;
7601 }
7602
7603 // Nothing more to check if the callee is taking no arguments
7604 if (Outs.empty())
7605 return true;
7606
7607 SmallVector<CCValAssign, 16> ArgLocs;
7608 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
7609
7610 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7611
7612 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
7613 // When the call is musttail, additional checks have already been done, so we can safely skip this check.
7614 // At least two cases here: if caller is fastcc then we can't have any
7615 // memory arguments (we'd be expected to clean up the stack afterwards). If
7616 // caller is C then we could potentially use its argument area.
7617
7618 // FIXME: for now we take the most conservative of these in both cases:
7619 // disallow all variadic memory operands.
7620 for (const CCValAssign &ArgLoc : ArgLocs)
7621 if (!ArgLoc.isRegLoc())
7622 return false;
7623 }
7624
7625 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7626
7627 // If any of the arguments is passed indirectly, it must be SVE, so the
7628 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
7629 // allocate space on the stack. That is why we check this explicitly here:
7630 // if any argument is passed indirectly, the call cannot be a tail call.
7631 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
7632 assert((A.getLocInfo() != CCValAssign::Indirect ||
7633 A.getValVT().isScalableVector() ||
7634 Subtarget->isWindowsArm64EC()) &&
7635 "Expected value to be scalable");
7636 return A.getLocInfo() == CCValAssign::Indirect;
7637 }))
7638 return false;
7639
7640 // If the stack arguments for this call do not fit into our own save area then
7641 // the call cannot be made tail.
7642 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
7643 return false;
7644
7645 const MachineRegisterInfo &MRI = MF.getRegInfo();
7646 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
7647 return false;
7648
7649 return true;
7650}
7651
7652SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
7653 SelectionDAG &DAG,
7654 MachineFrameInfo &MFI,
7655 int ClobberedFI) const {
7656 SmallVector<SDValue, 8> ArgChains;
7657 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
7658 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
7659
7660 // Include the original chain at the beginning of the list. When this is
7661 // used by target LowerCall hooks, this helps legalize find the
7662 // CALLSEQ_BEGIN node.
7663 ArgChains.push_back(Chain);
7664
7665 // Add a chain value for each stack-argument load that overlaps the clobbered slot.
7666 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
7667 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
7668 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
7669 if (FI->getIndex() < 0) {
7670 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
7671 int64_t InLastByte = InFirstByte;
7672 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
7673
7674 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
7675 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
7676 ArgChains.push_back(SDValue(L, 1));
7677 }
7678
7679 // Build a tokenfactor for all the chains.
7680 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
7681}
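// addTokenForArgument collects every stack-argument load (negative frame
// index) that overlaps the byte range [FirstByte, LastByte] of the slot about
// to be overwritten and token-factors those loads with the original chain.
// LowerCall uses this when storing tail-call arguments so any overlapping
// incoming values are read before the store clobbers them.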
7682
7683bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
7684 bool TailCallOpt) const {
7685 return (CallCC == CallingConv::Fast && TailCallOpt) ||
7686 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
7687}
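// DoesCalleeRestoreStack feeds the CalleePopBytes operand of CALLSEQ_END in
// LowerCall below: under GuaranteedTailCallOpt, fastcc callees pop their own
// stack arguments, as do tail / swifttail callees, which is why the popped
// size must be kept 16-byte aligned there.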
7688
7689// Check if the value is zero-extended from i1 to i8
7690static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
7691 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
7692 if (SizeInBits < 8)
7693 return false;
7694
7695 APInt RequiredZero(SizeInBits, 0xFE);
7696 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
7697 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
7698 return ZExtBool;
7699}
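// The mask 0xFE covers bits [1, 7], so the test above succeeds exactly when
// everything other than bit 0 is known to be zero, i.e. the argument already
// looks like a zero-extended i1 and LowerCall's AExt handling can skip the
// explicit trunc + zext pair.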
7700
7701void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
7702 SDNode *Node) const {
7703 // Live-in physreg copies that are glued to SMSTART are applied as
7704 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
7705 // register allocator to pass call args in callee saved regs, without extra
7706 // copies to avoid these fake clobbers of actually-preserved GPRs.
7707 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
7708 MI.getOpcode() == AArch64::MSRpstatePseudo) {
7709 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
7710 if (MachineOperand &MO = MI.getOperand(I);
7711 MO.isReg() && MO.isImplicit() && MO.isDef() &&
7712 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
7713 AArch64::GPR64RegClass.contains(MO.getReg())))
7714 MI.removeOperand(I);
7715
7716 // The SVE vector length can change when entering/leaving streaming mode.
7717 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
7718 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
7719 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7720 /*IsImplicit=*/true));
7721 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
7722 /*IsImplicit=*/true));
7723 }
7724 }
7725
7726 // Add an implicit use of 'VG' for ADDXri/SUBXri. These instructions have
7727 // nothing to do with VG by themselves, but they are used to materialise
7728 // frame addresses: if they contain a frame index into a scalable-vector
7729 // stack object, materialising the address will likely require an ADDVL
7730 // instruction, and thus a read of VG.
7731 const MachineFunction &MF = *MI.getMF();
7732 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
7733 (MI.getOpcode() == AArch64::ADDXri ||
7734 MI.getOpcode() == AArch64::SUBXri)) {
7735 const MachineOperand &MO = MI.getOperand(1);
7736 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
7737 TargetStackID::ScalableVector)
7738 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7739 /*IsImplicit=*/true));
7740 }
7741}
7742
7743 SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
7744 bool Enable, SDValue Chain,
7745 SDValue InGlue,
7746 unsigned Condition,
7747 SDValue PStateSM) const {
7748 MachineFunction &MF = DAG.getMachineFunction();
7749 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7750 FuncInfo->setHasStreamingModeChanges(true);
7751
7752 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7753 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
7754 SDValue MSROp =
7755 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
7756 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
7757 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
7758 if (Condition != AArch64SME::Always) {
7759 assert(PStateSM && "PStateSM should be defined");
7760 Ops.push_back(PStateSM);
7761 }
7762 Ops.push_back(RegMask);
7763
7764 if (InGlue)
7765 Ops.push_back(InGlue);
7766
7767 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
7768 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
7769}
7770
7771static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
7772 const SMEAttrs &CalleeAttrs) {
7773 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
7774 CallerAttrs.hasStreamingBody())
7775 return AArch64SME::Always;
7776 if (CalleeAttrs.hasNonStreamingInterface())
7777 return AArch64SME::IfCallerIsStreaming;
7778 if (CalleeAttrs.hasStreamingInterface())
7779 return AArch64SME::IfCallerIsNotStreaming;
7780
7781 llvm_unreachable("Unsupported attributes");
7782}
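// The returned condition becomes an operand of the SMSTART/SMSTOP node built
// in changeStreamingMode: AArch64SME::Always switches the mode
// unconditionally, while the other conditions make the switch depend on the
// caller's PSTATE.SM value, which is passed along as the extra PStateSM
// operand.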
7783
7784/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
7785/// and add input and output parameter nodes.
7786SDValue
7787AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
7788 SmallVectorImpl<SDValue> &InVals) const {
7789 SelectionDAG &DAG = CLI.DAG;
7790 SDLoc &DL = CLI.DL;
7791 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7792 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7793 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7794 SDValue Chain = CLI.Chain;
7795 SDValue Callee = CLI.Callee;
7796 bool &IsTailCall = CLI.IsTailCall;
7797 CallingConv::ID &CallConv = CLI.CallConv;
7798 bool IsVarArg = CLI.IsVarArg;
7799
7800 MachineFunction &MF = DAG.getMachineFunction();
7801 MachineFunction::CallSiteInfo CSInfo;
7802 bool IsThisReturn = false;
7803
7804 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7805 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7806 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
7807 bool IsSibCall = false;
7808 bool GuardWithBTI = false;
7809
7810 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
7811 !Subtarget->noBTIAtReturnTwice()) {
7812 GuardWithBTI = FuncInfo->branchTargetEnforcement();
7813 }
7814
7815 // Analyze operands of the call, assigning locations to each operand.
7816 SmallVector<CCValAssign, 16> ArgLocs;
7817 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
7818
7819 if (IsVarArg) {
7820 unsigned NumArgs = Outs.size();
7821
7822 for (unsigned i = 0; i != NumArgs; ++i) {
7823 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
7824 report_fatal_error("Passing SVE types to variadic functions is "
7825 "currently not supported");
7826 }
7827 }
7828
7829 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7830
7831 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7832 // Assign locations to each value returned by this call.
7833 SmallVector<CCValAssign, 16> RVLocs;
7834 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
7835 *DAG.getContext());
7836 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
7837
7838 // Check callee args/returns for SVE registers and set calling convention
7839 // accordingly.
7840 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
7841 auto HasSVERegLoc = [](CCValAssign &Loc) {
7842 if (!Loc.isRegLoc())
7843 return false;
7844 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
7845 AArch64::PPRRegClass.contains(Loc.getLocReg());
7846 };
7847 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
7848 CallConv = CallingConv::AArch64_SVE_VectorCall;
7849 }
7850
7851 if (IsTailCall) {
7852 // Check if it's really possible to do a tail call.
7853 IsTailCall = isEligibleForTailCallOptimization(CLI);
7854
7855 // A sibling call is one where we're under the usual C ABI and not planning
7856 // to change that but can still do a tail call:
7857 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
7858 CallConv != CallingConv::SwiftTail)
7859 IsSibCall = true;
7860
7861 if (IsTailCall)
7862 ++NumTailCalls;
7863 }
7864
7865 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
7866 report_fatal_error("failed to perform tail call elimination on a call "
7867 "site marked musttail");
7868
7869 // Get a count of how many bytes are to be pushed on the stack.
7870 unsigned NumBytes = CCInfo.getStackSize();
7871
7872 if (IsSibCall) {
7873 // Since we're not changing the ABI to make this a tail call, the memory
7874 // operands are already available in the caller's incoming argument space.
7875 NumBytes = 0;
7876 }
7877
7878 // FPDiff is the byte offset of the call's argument area from the callee's.
7879 // Stores to callee stack arguments will be placed in FixedStackSlots offset
7880 // by this amount for a tail call. In a sibling call it must be 0 because the
7881 // caller will deallocate the entire stack and the callee still expects its
7882 // arguments to begin at SP+0. Completely unused for non-tail calls.
7883 int FPDiff = 0;
7884
7885 if (IsTailCall && !IsSibCall) {
7886 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
7887
7888 // Since callee will pop argument stack as a tail call, we must keep the
7889 // popped size 16-byte aligned.
7890 NumBytes = alignTo(NumBytes, 16);
7891
7892 // FPDiff will be negative if this tail call requires more space than we
7893 // would automatically have in our incoming argument space. Positive if we
7894 // can actually shrink the stack.
7895 FPDiff = NumReusableBytes - NumBytes;
7896
7897 // Update the required reserved area if this is the tail call requiring the
7898 // most argument stack space.
7899 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
7900 FuncInfo->setTailCallReservedStack(-FPDiff);
7901
7902 // The stack pointer must be 16-byte aligned at all times it's used for a
7903 // memory operation, which in practice means at *all* times and in
7904 // particular across call boundaries. Therefore our own arguments started at
7905 // a 16-byte aligned SP and the delta applied for the tail call should
7906 // satisfy the same constraint.
7907 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
7908 }
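// Worked example: if the caller reserved 32 bytes of incoming argument space
// and this tail call needs 48 bytes of outgoing arguments (after rounding up
// to 16), FPDiff = 32 - 48 = -16, so callee stack slots are created 16 bytes
// below our own argument area and TailCallReservedStack grows to 16.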
7909
7910 // Determine whether we need any streaming mode changes.
7911 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
7912 if (CLI.CB)
7913 CalleeAttrs = SMEAttrs(*CLI.CB);
7914 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
7915 CalleeAttrs = SMEAttrs(ES->getSymbol());
7916
7917 auto DescribeCallsite =
7918 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
7919 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
7920 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
7921 R << ore::NV("Callee", ES->getSymbol());
7922 else if (CLI.CB && CLI.CB->getCalledFunction())
7923 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
7924 else
7925 R << "unknown callee";
7926 R << "'";
7927 return R;
7928 };
7929
7930 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
7931 if (RequiresLazySave) {
7932 unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
7933 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
7934 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
7935 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7936 SDValue NumZaSaveSlicesAddr =
7937 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
7938 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
7939 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
7940 DAG.getConstant(1, DL, MVT::i32));
7941 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
7942 MPI, MVT::i16);
7943 Chain = DAG.getNode(
7944 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
7945 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
7946 TPIDR2ObjAddr);
7947 OptimizationRemarkEmitter ORE(&MF.getFunction());
7948 ORE.emit([&]() {
7949 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
7950 CLI.CB)
7951 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
7952 &MF.getFunction());
7953 return DescribeCallsite(R) << " sets up a lazy save for ZA";
7954 });
7955 }
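// At this point the lazy-save buffer is armed: the TPIDR2 block on the stack
// holds the number of ZA save slices (RDSVL #1, stored as a 16-bit value at
// offset 8) and TPIDR2_EL0 points at the block, so a callee that needs ZA can
// commit the actual save itself, as described by the SME lazy-save scheme.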
7956
7957 SDValue PStateSM;
7958 bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
7959 if (RequiresSMChange) {
7960 if (CallerAttrs.hasStreamingInterfaceOrBody())
7961 PStateSM = DAG.getConstant(1, DL, MVT::i64);
7962 else if (CallerAttrs.hasNonStreamingInterface())
7963 PStateSM = DAG.getConstant(0, DL, MVT::i64);
7964 else
7965 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7966 OptimizationRemarkEmitter ORE(&MF.getFunction());
7967 ORE.emit([&]() {
7968 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
7969 CLI.CB)
7970 : OptimizationRemarkAnalysis("sme", "SMETransition",
7971 &MF.getFunction());
7972 DescribeCallsite(R) << " requires a streaming mode transition";
7973 return R;
7974 });
7975 }
7976
7977 SDValue ZTFrameIdx;
7978 MachineFrameInfo &MFI = MF.getFrameInfo();
7979 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
7980
7981 // If the caller has ZT0 state which will not be preserved by the callee,
7982 // spill ZT0 before the call.
7983 if (ShouldPreserveZT0) {
7984 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
7985 ZTFrameIdx = DAG.getFrameIndex(
7986 ZTObj,
7987 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7988
7989 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
7990 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
7991 }
7992
7993 // If the caller shares ZT0 but the callee does not share ZA, we need to stop
7994 // PSTATE.ZA before the call if there is no lazy-save active.
7995 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
7996 assert((!DisableZA || !RequiresLazySave) &&
7997 "Lazy-save should have PSTATE.SM=1 on entry to the function");
7998
7999 if (DisableZA)
8000 Chain = DAG.getNode(
8001 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
8002 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8003 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8004
8005 // Adjust the stack pointer for the new arguments...
8006 // These operations are automatically eliminated by the prolog/epilog pass
8007 if (!IsSibCall)
8008 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
8009
8010 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
8011 getPointerTy(DAG.getDataLayout()));
8012
8013 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
8014 SmallSet<unsigned, 8> RegsUsed;
8015 SmallVector<SDValue, 8> MemOpChains;
8016 auto PtrVT = getPointerTy(DAG.getDataLayout());
8017
8018 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
8019 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
8020 for (const auto &F : Forwards) {
8021 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
8022 RegsToPass.emplace_back(F.PReg, Val);
8023 }
8024 }
8025
8026 // Walk the register/memloc assignments, inserting copies/loads.
8027 unsigned ExtraArgLocs = 0;
8028 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
8029 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8030 SDValue Arg = OutVals[i];
8031 ISD::ArgFlagsTy Flags = Outs[i].Flags;
8032
8033 // Promote the value if needed.
8034 switch (VA.getLocInfo()) {
8035 default:
8036 llvm_unreachable("Unknown loc info!");
8037 case CCValAssign::Full:
8038 break;
8039 case CCValAssign::SExt:
8040 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
8041 break;
8042 case CCValAssign::ZExt:
8043 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8044 break;
8045 case CCValAssign::AExt:
8046 if (Outs[i].ArgVT == MVT::i1) {
8047 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
8048 //
8049 // Check if we actually have to do this, because the value may
8050 // already be zero-extended.
8051 //
8052 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
8053 // and rely on DAGCombiner to fold this, because the following
8054 // (anyext i32) is combined with (zext i8) in DAG.getNode:
8055 //
8056 // (ext (zext x)) -> (zext x)
8057 //
8058 // This will give us (zext i32), which we cannot remove, so
8059 // try to check this beforehand.
8060 if (!checkZExtBool(Arg, DAG)) {
8061 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8062 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
8063 }
8064 }
8065 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8066 break;
8067 case CCValAssign::AExtUpper:
8068 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8069 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8070 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8071 DAG.getConstant(32, DL, VA.getLocVT()));
8072 break;
8073 case CCValAssign::BCvt:
8074 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
8075 break;
8076 case CCValAssign::Trunc:
8077 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8078 break;
8079 case CCValAssign::FPExt:
8080 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
8081 break;
8082 case CCValAssign::Indirect: {
8083 bool isScalable = VA.getValVT().isScalableVT();
8084 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
8085 "Indirect arguments should be scalable on most subtargets");
8086
8087 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
8088 uint64_t PartSize = StoreSize;
8089 unsigned NumParts = 1;
8090 if (Outs[i].Flags.isInConsecutiveRegs()) {
8091 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
8092 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8093 ++NumParts;
8094 StoreSize *= NumParts;
8095 }
8096
8097 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
8098 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
8099 MachineFrameInfo &MFI = MF.getFrameInfo();
8100 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
8101 if (isScalable)
8102 MFI.setStackID(FI, TargetStackID::ScalableVector);
8103
8104 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
8105 SDValue Ptr = DAG.getFrameIndex(
8106 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8107 SDValue SpillSlot = Ptr;
8108
8109 // Ensure we generate all stores for each tuple part, whilst updating the
8110 // pointer after each store correctly using vscale.
8111 while (NumParts) {
8112 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
8113 MemOpChains.push_back(Store);
8114
8115 NumParts--;
8116 if (NumParts > 0) {
8117 SDValue BytesIncrement;
8118 if (isScalable) {
8119 BytesIncrement = DAG.getVScale(
8120 DL, Ptr.getValueType(),
8121 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8122 } else {
8123 BytesIncrement = DAG.getConstant(
8124 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8125 Ptr.getValueType());
8126 }
8127 SDNodeFlags Flags;
8128 Flags.setNoUnsignedWrap(true);
8129
8130 MPI = MachinePointerInfo(MPI.getAddrSpace());
8131 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8132 BytesIncrement, Flags);
8133 ExtraArgLocs++;
8134 i++;
8135 }
8136 }
8137
8138 Arg = SpillSlot;
8139 break;
8140 }
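// The Indirect case above spills the value (typically a scalable vector or a
// tuple of them) to a stack slot and passes the slot address instead; for
// multi-part tuples the store pointer is advanced between parts by a
// VSCALE-scaled byte increment so each part lands at the right offset.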
8141
8142 if (VA.isRegLoc()) {
8143 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
8144 Outs[0].VT == MVT::i64) {
8145 assert(VA.getLocVT() == MVT::i64 &&
8146 "unexpected calling convention register assignment");
8147 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
8148 "unexpected use of 'returned'");
8149 IsThisReturn = true;
8150 }
8151 if (RegsUsed.count(VA.getLocReg())) {
8152 // If this register has already been used then we're trying to pack
8153 // parts of an [N x i32] into an X-register. The extension type will
8154 // take care of putting the two halves in the right place but we have to
8155 // combine them.
8156 SDValue &Bits =
8157 llvm::find_if(RegsToPass,
8158 [=](const std::pair<unsigned, SDValue> &Elt) {
8159 return Elt.first == VA.getLocReg();
8160 })
8161 ->second;
8162 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8163 // Call site info is used for function's parameter entry value
8164 // tracking. For now we track only simple cases when parameter
8165 // is transferred through whole register.
8166 llvm::erase_if(CSInfo.ArgRegPairs,
8167 [&VA](MachineFunction::ArgRegPair ArgReg) {
8168 return ArgReg.Reg == VA.getLocReg();
8169 });
8170 } else {
8171 // Add an extra level of indirection for streaming mode changes by
8172 // using a pseudo copy node that cannot be rematerialised between a
8173 // smstart/smstop and the call by the simple register coalescer.
8174 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
8175 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8176 Arg.getValueType(), Arg);
8177 RegsToPass.emplace_back(VA.getLocReg(), Arg);
8178 RegsUsed.insert(VA.getLocReg());
8179 const TargetOptions &Options = DAG.getTarget().Options;
8180 if (Options.EmitCallSiteInfo)
8181 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
8182 }
8183 } else {
8184 assert(VA.isMemLoc());
8185
8186 SDValue DstAddr;
8187 MachinePointerInfo DstInfo;
8188
8189 // FIXME: This works on big-endian for composite byvals, which are the
8190 // common case. It should also work for fundamental types too.
8191 uint32_t BEAlign = 0;
8192 unsigned OpSize;
8193 if (VA.getLocInfo() == CCValAssign::Indirect ||
8195 OpSize = VA.getLocVT().getFixedSizeInBits();
8196 else
8197 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
8198 : VA.getValVT().getSizeInBits();
8199 OpSize = (OpSize + 7) / 8;
8200 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
8201 !Flags.isInConsecutiveRegs()) {
8202 if (OpSize < 8)
8203 BEAlign = 8 - OpSize;
8204 }
8205 unsigned LocMemOffset = VA.getLocMemOffset();
8206 int32_t Offset = LocMemOffset + BEAlign;
8207 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8208 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8209
8210 if (IsTailCall) {
8211 Offset = Offset + FPDiff;
8212 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
8213
8214 DstAddr = DAG.getFrameIndex(FI, PtrVT);
8215 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
8216
8217 // Make sure any stack arguments overlapping with where we're storing
8218 // are loaded before this eventual operation. Otherwise they'll be
8219 // clobbered.
8220 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
8221 } else {
8222 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8223
8224 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8225 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
8226 }
8227
8228 if (Outs[i].Flags.isByVal()) {
8229 SDValue SizeNode =
8230 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
8231 SDValue Cpy = DAG.getMemcpy(
8232 Chain, DL, DstAddr, Arg, SizeNode,
8233 Outs[i].Flags.getNonZeroByValAlign(),
8234 /*isVol = */ false, /*AlwaysInline = */ false,
8235 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
8236
8237 MemOpChains.push_back(Cpy);
8238 } else {
8239 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
8240 // promoted to a legal register type i32, we should truncate Arg back to
8241 // i1/i8/i16.
8242 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
8243 VA.getValVT() == MVT::i16)
8244 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
8245
8246 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
8247 MemOpChains.push_back(Store);
8248 }
8249 }
8250 }
8251
8252 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
8253 SDValue ParamPtr = StackPtr;
8254 if (IsTailCall) {
8255 // Create a dummy object at the top of the stack that can be used to get
8256 // the SP after the epilogue
8257 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
8258 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
8259 }
8260
8261 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
8262 // describing the argument list. x4 contains the address of the
8263 // first stack parameter. x5 contains the size in bytes of all parameters
8264 // passed on the stack.
8265 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
8266 RegsToPass.emplace_back(AArch64::X5,
8267 DAG.getConstant(NumBytes, DL, MVT::i64));
8268 }
8269
8270 if (!MemOpChains.empty())
8271 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
8272
8273 SDValue InGlue;
8274 if (RequiresSMChange) {
8275 SDValue NewChain = changeStreamingMode(
8276 DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
8277 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8278 Chain = NewChain.getValue(0);
8279 InGlue = NewChain.getValue(1);
8280 }
8281
8282 // Build a sequence of copy-to-reg nodes chained together with token chain
8283 // and flag operands which copy the outgoing args into the appropriate regs.
8284 for (auto &RegToPass : RegsToPass) {
8285 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
8286 RegToPass.second, InGlue);
8287 InGlue = Chain.getValue(1);
8288 }
8289
8290 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
8291 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
8292 // node so that legalize doesn't hack it.
8293 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8294 auto GV = G->getGlobal();
8295 unsigned OpFlags =
8296 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
8297 if (OpFlags & AArch64II::MO_GOT) {
8298 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8299 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8300 } else {
8301 const GlobalValue *GV = G->getGlobal();
8302 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8303 }
8304 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
8305 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
8306 Subtarget->isTargetMachO()) ||
8307 MF.getFunction().getParent()->getRtLibUseGOT();
8308 const char *Sym = S->getSymbol();
8309 if (UseGot) {
8310 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
8311 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8312 } else {
8313 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
8314 }
8315 }
8316
8317 // We don't usually want to end the call-sequence here because we would tidy
8318 // the frame up *after* the call, however in the ABI-changing tail-call case
8319 // we've carefully laid out the parameters so that when sp is reset they'll be
8320 // in the correct location.
8321 if (IsTailCall && !IsSibCall) {
8322 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
8323 InGlue = Chain.getValue(1);
8324 }
8325
8326 std::vector<SDValue> Ops;
8327 Ops.push_back(Chain);
8328 Ops.push_back(Callee);
8329
8330 if (IsTailCall) {
8331 // Each tail call may have to adjust the stack by a different amount, so
8332 // this information must travel along with the operation for eventual
8333 // consumption by emitEpilogue.
8334 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
8335 }
8336
8337 // Add argument registers to the end of the list so that they are known live
8338 // into the call.
8339 for (auto &RegToPass : RegsToPass)
8340 Ops.push_back(DAG.getRegister(RegToPass.first,
8341 RegToPass.second.getValueType()));
8342
8343 // Add a register mask operand representing the call-preserved registers.
8344 const uint32_t *Mask;
8345 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8346 if (IsThisReturn) {
8347 // For 'this' returns, use the X0-preserving mask if applicable
8348 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
8349 if (!Mask) {
8350 IsThisReturn = false;
8351 Mask = TRI->getCallPreservedMask(MF, CallConv);
8352 }
8353 } else
8354 Mask = TRI->getCallPreservedMask(MF, CallConv);
8355
8356 if (Subtarget->hasCustomCallingConv())
8357 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
8358
8359 if (TRI->isAnyArgRegReserved(MF))
8360 TRI->emitReservedArgRegCallError(MF);
8361
8362 assert(Mask && "Missing call preserved mask for calling convention");
8363 Ops.push_back(DAG.getRegisterMask(Mask));
8364
8365 if (InGlue.getNode())
8366 Ops.push_back(InGlue);
8367
8368 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8369
8370 // If we're doing a tail call, use a TC_RETURN here rather than an
8371 // actual call instruction.
8372 if (IsTailCall) {
8373 MF.getFrameInfo().setHasTailCall();
8374 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
8375
8376 if (IsCFICall)
8377 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8378
8379 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
8380 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
8381 return Ret;
8382 }
8383
8384 unsigned CallOpc = AArch64ISD::CALL;
8385 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
8386 // be expanded to the call, directly followed by a special marker sequence and
8387 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
8388 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
8389 assert(!IsTailCall &&
8390 "tail calls cannot be marked with clang.arc.attachedcall");
8391 CallOpc = AArch64ISD::CALL_RVMARKER;
8392
8393 // Add a target global address for the retainRV/claimRV runtime function
8394 // just before the call target.
8395 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
8396 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
8397 Ops.insert(Ops.begin() + 1, GA);
8398 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8399 CallOpc = AArch64ISD::CALL_ARM64EC_TO_X64;
8400 } else if (GuardWithBTI) {
8401 CallOpc = AArch64ISD::CALL_BTI;
8402 }
8403
8404 // Returns a chain and a flag for retval copy to use.
8405 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
8406
8407 if (IsCFICall)
8408 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8409
8410 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
8411 InGlue = Chain.getValue(1);
8412 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
8413
8414 uint64_t CalleePopBytes =
8415 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
8416
8417 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
8418 InGlue = Chain.getValue(1);
8419
8420 // Handle result values, copying them out of physregs into vregs that we
8421 // return.
8422 SDValue Result = LowerCallResult(
8423 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
8424 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
8425
8426 if (!Ins.empty())
8427 InGlue = Result.getValue(Result->getNumValues() - 1);
8428
8429 if (RequiresSMChange) {
8430 assert(PStateSM && "Expected a PStateSM to be set");
8431 Result = changeStreamingMode(
8432 DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
8433 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8434 }
8435
8436 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
8437 // Unconditionally resume ZA.
8438 Result = DAG.getNode(
8439 AArch64ISD::SMSTART, DL, MVT::Other, Result,
8440 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8441 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8442
8443 if (ShouldPreserveZT0)
8444 Result =
8445 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8446 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8447
8448 if (RequiresLazySave) {
8449 // Conditionally restore the lazy save using a pseudo node.
8450 unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
8451 SDValue RegMask = DAG.getRegisterMask(
8452 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8453 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8454 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
8455 SDValue TPIDR2_EL0 = DAG.getNode(
8456 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
8457 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8458
8459 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8460 // RESTORE_ZA pseudo.
8461 SDValue Glue;
8462 SDValue TPIDR2Block = DAG.getFrameIndex(
8463 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8464 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
8465 Result =
8466 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8467 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8468 RestoreRoutine, RegMask, Result.getValue(1)});
8469
8470 // Finally reset the TPIDR2_EL0 register to 0.
8471 Result = DAG.getNode(
8472 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
8473 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8474 DAG.getConstant(0, DL, MVT::i64));
8475 }
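// The RESTORE_ZA pseudo is expected (per the SME lazy-save convention) to use
// the TPIDR2_EL0 value read above to decide at run time whether the callee
// committed the lazy save, and only then call __arm_tpidr2_restore with X0
// pointing at the TPIDR2 block to refill ZA; TPIDR2_EL0 is then cleared
// unconditionally above.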
8476
8477 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
8478 for (unsigned I = 0; I < InVals.size(); ++I) {
8479 // The smstart/smstop is chained as part of the call, but when the
8480 // resulting chain is discarded (which happens when the call is not part
8481 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
8482 // smstart/smstop is chained to the result value. We can do that by doing
8483 // a vreg -> vreg copy.
8484 Register Reg = MF.getRegInfo().createVirtualRegister(
8485 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8486 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
8487 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
8488 InVals[I].getValueType());
8489 }
8490 }
8491
8492 return Result;
8493}
8494
8495bool AArch64TargetLowering::CanLowerReturn(
8496 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
8497 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
8498 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8499 SmallVector<CCValAssign, 16> RVLocs;
8500 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
8501 return CCInfo.CheckReturn(Outs, RetCC);
8502}
8503
8504SDValue
8505AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
8506 bool isVarArg,
8507 const SmallVectorImpl<ISD::OutputArg> &Outs,
8508 const SmallVectorImpl<SDValue> &OutVals,
8509 const SDLoc &DL, SelectionDAG &DAG) const {
8510 auto &MF = DAG.getMachineFunction();
8511 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8512
8513 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8514 SmallVector<CCValAssign, 16> RVLocs;
8515 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
8516 CCInfo.AnalyzeReturn(Outs, RetCC);
8517
8518 // Copy the result values into the output registers.
8519 SDValue Glue;
8520 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
8521 SmallSet<unsigned, 4> RegsUsed;
8522 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
8523 ++i, ++realRVLocIdx) {
8524 CCValAssign &VA = RVLocs[i];
8525 assert(VA.isRegLoc() && "Can only return in registers!");
8526 SDValue Arg = OutVals[realRVLocIdx];
8527
8528 switch (VA.getLocInfo()) {
8529 default:
8530 llvm_unreachable("Unknown loc info!");
8531 case CCValAssign::Full:
8532 if (Outs[i].ArgVT == MVT::i1) {
8533 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
8534 // value. This is strictly redundant on Darwin (which uses "zeroext
8535 // i1"), but will be optimised out before ISel.
8536 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8537 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8538 }
8539 break;
8540 case CCValAssign::BCvt:
8541 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
8542 break;
8543 case CCValAssign::AExt:
8544 case CCValAssign::ZExt:
8545 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8546 break;
8547 case CCValAssign::AExtUpper:
8548 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8549 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8550 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8551 DAG.getConstant(32, DL, VA.getLocVT()));
8552 break;
8553 }
8554
8555 if (RegsUsed.count(VA.getLocReg())) {
8556 SDValue &Bits =
8557 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
8558 return Elt.first == VA.getLocReg();
8559 })->second;
8560 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8561 } else {
8562 RetVals.emplace_back(VA.getLocReg(), Arg);
8563 RegsUsed.insert(VA.getLocReg());
8564 }
8565 }
8566
8567 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8568
8569 // Emit SMSTOP before returning from a locally streaming function
8570 SMEAttrs FuncAttrs(MF.getFunction());
8571 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
8572 if (FuncAttrs.hasStreamingCompatibleInterface()) {
8573 Register Reg = FuncInfo->getPStateSMReg();
8574 assert(Reg.isValid() && "PStateSM Register is invalid");
8575 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
8576 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8577 /*Glue*/ SDValue(),
8579 } else
8580 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8581 /*Glue*/ SDValue(), AArch64SME::Always);
8582 Glue = Chain.getValue(1);
8583 }
8584
8585 SmallVector<SDValue, 4> RetOps(1, Chain);
8586 for (auto &RetVal : RetVals) {
8587 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
8588 isPassedInFPR(RetVal.second.getValueType()))
8589 RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8590 RetVal.second.getValueType(), RetVal.second);
8591 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
8592 Glue = Chain.getValue(1);
8593 RetOps.push_back(
8594 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
8595 }
8596
8597 // Windows AArch64 ABIs require that for returning structs by value we copy
8598 // the sret argument into X0 for the return.
8599 // We saved the argument into a virtual register in the entry block,
8600 // so now we copy the value out and into X0.
8601 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
8602 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
8603 getPointerTy(MF.getDataLayout()));
8604
8605 unsigned RetValReg = AArch64::X0;
8606 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8607 RetValReg = AArch64::X8;
8608 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
8609 Glue = Chain.getValue(1);
8610
8611 RetOps.push_back(
8612 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
8613 }
8614
8615 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
8616 if (I) {
8617 for (; *I; ++I) {
8618 if (AArch64::GPR64RegClass.contains(*I))
8619 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
8620 else if (AArch64::FPR64RegClass.contains(*I))
8621 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
8622 else
8623 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
8624 }
8625 }
8626
8627 RetOps[0] = Chain; // Update chain.
8628
8629 // Add the glue if we have it.
8630 if (Glue.getNode())
8631 RetOps.push_back(Glue);
8632
8633 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8634 // ARM64EC entry thunks use a special return sequence: instead of a regular
8635 // "ret" instruction, they need to explicitly call the emulator.
8636 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8637 SDValue Arm64ECRetDest =
8638 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
8639 Arm64ECRetDest =
8640 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
8641 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
8642 MachinePointerInfo());
8643 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
8644 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
8645 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
8646 }
8647
8648 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
8649}
8650
8651//===----------------------------------------------------------------------===//
8652// Other Lowering Code
8653//===----------------------------------------------------------------------===//
8654
8655SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
8656 SelectionDAG &DAG,
8657 unsigned Flag) const {
8658 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
8659 N->getOffset(), Flag);
8660}
8661
8662SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
8663 SelectionDAG &DAG,
8664 unsigned Flag) const {
8665 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
8666}
8667
8668SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
8669 SelectionDAG &DAG,
8670 unsigned Flag) const {
8671 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
8672 N->getOffset(), Flag);
8673}
8674
8675SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
8676 SelectionDAG &DAG,
8677 unsigned Flag) const {
8678 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
8679}
8680
8681SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
8682 SelectionDAG &DAG,
8683 unsigned Flag) const {
8684 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
8685}
8686
8687// (loadGOT sym)
8688template <class NodeTy>
8689SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
8690 unsigned Flags) const {
8691 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
8692 SDLoc DL(N);
8693 EVT Ty = getPointerTy(DAG.getDataLayout());
8694 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
8695 // FIXME: Once remat is capable of dealing with instructions with register
8696 // operands, expand this into two nodes instead of using a wrapper node.
8697 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
8698}
8699
8700// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
8701template <class NodeTy>
8702SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
8703 unsigned Flags) const {
8704 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
8705 SDLoc DL(N);
8706 EVT Ty = getPointerTy(DAG.getDataLayout());
8707 const unsigned char MO_NC = AArch64II::MO_NC;
8708 return DAG.getNode(
8709 AArch64ISD::WrapperLarge, DL, Ty,
8710 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
8711 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
8712 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
8713 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
8714}
8715
8716// (addlow (adrp %hi(sym)) %lo(sym))
8717template <class NodeTy>
8718SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
8719 unsigned Flags) const {
8720 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
8721 SDLoc DL(N);
8722 EVT Ty = getPointerTy(DAG.getDataLayout());
8723 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
8724 SDValue Lo = getTargetNode(N, Ty, DAG,
8725 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
8726 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
8727 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
8728}
8729
8730// (adr sym)
8731template <class NodeTy>
8732SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
8733 unsigned Flags) const {
8734 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
8735 SDLoc DL(N);
8736 EVT Ty = getPointerTy(DAG.getDataLayout());
8737 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
8738 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
8739}
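// LowerGlobalAddress below chooses between these helpers: getGOT whenever the
// reference is classified with MO_GOT, getAddrLarge (the four 16-bit
// %highest/%higher/%hi/%lo fragments) for the large code model, getAddrTiny
// (a single ADR) for the tiny code model, and getAddr (ADRP + ADDlow)
// otherwise.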
8740
8741SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
8742 SelectionDAG &DAG) const {
8743 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
8744 const GlobalValue *GV = GN->getGlobal();
8745 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
8746
8747 if (OpFlags != AArch64II::MO_NO_FLAG)
8748 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
8749 "unexpected offset in global node");
8750
8751 // This also catches the large code model case for Darwin, and tiny code
8752 // model with got relocations.
8753 if ((OpFlags & AArch64II::MO_GOT) != 0) {
8754 return getGOT(GN, DAG, OpFlags);
8755 }
8756
8757 SDValue Result;
8758 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8759 !getTargetMachine().isPositionIndependent()) {
8760 Result = getAddrLarge(GN, DAG, OpFlags);
8761 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8762 Result = getAddrTiny(GN, DAG, OpFlags);
8763 } else {
8764 Result = getAddr(GN, DAG, OpFlags);
8765 }
8766 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8767 SDLoc DL(GN);
8768 if (OpFlags & AArch64II::MO_DLLIMPORT)
8769 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
8770 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
8771 return Result;
8772}
8773
8774/// Convert a TLS address reference into the correct sequence of loads
8775/// and calls to compute the variable's address (for Darwin, currently) and
8776/// return an SDValue containing the final node.
8777
8778/// Darwin only has one TLS scheme which must be capable of dealing with the
8779/// fully general situation, in the worst case. This means:
8780/// + "extern __thread" declaration.
8781/// + Defined in a possibly unknown dynamic library.
8782///
8783/// The general system is that each __thread variable has a [3 x i64] descriptor
8784/// which contains information used by the runtime to calculate the address. The
8785/// only part of this the compiler needs to know about is the first xword, which
8786/// contains a function pointer that must be called with the address of the
8787/// entire descriptor in "x0".
8788///
8789/// Since this descriptor may be in a different unit, in general even the
8790/// descriptor must be accessed via an indirect load. The "ideal" code sequence
8791/// is:
8792/// adrp x0, _var@TLVPPAGE
8793/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
8794/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
8795/// ; the function pointer
8796/// blr x1 ; Uses descriptor address in x0
8797/// ; Address of _var is now in x0.
8798///
8799/// If the address of _var's descriptor *is* known to the linker, then it can
8800/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
8801/// a slight efficiency gain.
8802SDValue
8803AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
8804 SelectionDAG &DAG) const {
8805 assert(Subtarget->isTargetDarwin() &&
8806 "This function expects a Darwin target");
8807
8808 SDLoc DL(Op);
8809 MVT PtrVT = getPointerTy(DAG.getDataLayout());
8810 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
8811 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
8812
8813 SDValue TLVPAddr =
8814 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8815 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
8816
8817 // The first entry in the descriptor is a function pointer that we must call
8818 // to obtain the address of the variable.
8819 SDValue Chain = DAG.getEntryNode();
8820 SDValue FuncTLVGet = DAG.getLoad(
8821 PtrMemVT, DL, Chain, DescAddr,
8822 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
8823 Align(PtrMemVT.getSizeInBits() / 8),
8824 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
8825 Chain = FuncTLVGet.getValue(1);
8826
8827 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
8828 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
8829
8830 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8831 MFI.setAdjustsStack(true);
8832
8833 // TLS calls preserve all registers except those that absolutely must be
8834 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
8835 // silly).
8836 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8837 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
8838 if (Subtarget->hasCustomCallingConv())
8839 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
8840
8841 // Finally, we can make the call. This is just a degenerate version of a
8842 // normal AArch64 call node: x0 takes the address of the descriptor, and
8843 // returns the address of the variable in this thread.
8844 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
8845 Chain =
8846 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
8847 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
8848 DAG.getRegisterMask(Mask), Chain.getValue(1));
8849 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
8850}
8851
8852/// Convert a thread-local variable reference into a sequence of instructions to
8853/// compute the variable's address for the local exec TLS model of ELF targets.
8854/// The sequence depends on the maximum TLS area size.
8855SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
8856 SDValue ThreadBase,
8857 const SDLoc &DL,
8858 SelectionDAG &DAG) const {
8859 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8860 SDValue TPOff, Addr;
8861
8862 switch (DAG.getTarget().Options.TLSSize) {
8863 default:
8864 llvm_unreachable("Unexpected TLS size");
8865
8866 case 12: {
8867 // mrs x0, TPIDR_EL0
8868 // add x0, x0, :tprel_lo12:a
8869 SDValue Var = DAG.getTargetGlobalAddress(
8870 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
8871 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8872 Var,
8873 DAG.getTargetConstant(0, DL, MVT::i32)),
8874 0);
8875 }
8876
8877 case 24: {
8878 // mrs x0, TPIDR_EL0
8879 // add x0, x0, :tprel_hi12:a
8880 // add x0, x0, :tprel_lo12_nc:a
8881 SDValue HiVar = DAG.getTargetGlobalAddress(
8882 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
8883 SDValue LoVar = DAG.getTargetGlobalAddress(
8884 GV, DL, PtrVT, 0,
8885 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8886 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8887 HiVar,
8888 DAG.getTargetConstant(0, DL, MVT::i32)),
8889 0);
8890 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
8891 LoVar,
8892 DAG.getTargetConstant(0, DL, MVT::i32)),
8893 0);
8894 }
8895
8896 case 32: {
8897 // mrs x1, TPIDR_EL0
8898 // movz x0, #:tprel_g1:a
8899 // movk x0, #:tprel_g0_nc:a
8900 // add x0, x1, x0
8901 SDValue HiVar = DAG.getTargetGlobalAddress(
8902 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
8903 SDValue LoVar = DAG.getTargetGlobalAddress(
8904 GV, DL, PtrVT, 0,
8905 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8906 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8907 DAG.getTargetConstant(16, DL, MVT::i32)),
8908 0);
8909 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8910 DAG.getTargetConstant(0, DL, MVT::i32)),
8911 0);
8912 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8913 }
8914
8915 case 48: {
8916 // mrs x1, TPIDR_EL0
8917 // movz x0, #:tprel_g2:a
8918 // movk x0, #:tprel_g1_nc:a
8919 // movk x0, #:tprel_g0_nc:a
8920 // add x0, x1, x0
8921 SDValue HiVar = DAG.getTargetGlobalAddress(
8922 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
8923 SDValue MiVar = DAG.getTargetGlobalAddress(
8924 GV, DL, PtrVT, 0,
8925 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
8926 SDValue LoVar = DAG.getTargetGlobalAddress(
8927 GV, DL, PtrVT, 0,
8928 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8929 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8930 DAG.getTargetConstant(32, DL, MVT::i32)),
8931 0);
8932 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
8933 DAG.getTargetConstant(16, DL, MVT::i32)),
8934 0);
8935 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8936 DAG.getTargetConstant(0, DL, MVT::i32)),
8937 0);
8938 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8939 }
8940 }
8941}
8942
8943/// When accessing thread-local variables under either the general-dynamic or
8944/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
8945/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
8946/// is a function pointer to carry out the resolution.
8947///
8948/// The sequence is:
8949/// adrp x0, :tlsdesc:var
8950/// ldr x1, [x0, #:tlsdesc_lo12:var]
8951/// add x0, x0, #:tlsdesc_lo12:var
8952/// .tlsdesccall var
8953/// blr x1
8954/// (TPIDR_EL0 offset now in x0)
8955///
8956/// The above sequence must be produced unscheduled, to enable the linker to
8957/// optimize/relax this sequence.
8958/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
8959/// above sequence, and expanded really late in the compilation flow, to ensure
8960/// the sequence is produced as per above.
8961SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
8962 const SDLoc &DL,
8963 SelectionDAG &DAG) const {
8964 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8965
8966 SDValue Chain = DAG.getEntryNode();
8967 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8968
8969 Chain =
8970 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
8971 SDValue Glue = Chain.getValue(1);
8972
8973 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
8974}
8975
8976SDValue
8977AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
8978 SelectionDAG &DAG) const {
8979 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
8980
8981 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
8982
8983 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
8984
8985 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
8986 if (Model == TLSModel::LocalDynamic)
8987 Model = TLSModel::GeneralDynamic;
8988 }
8989
8990 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8991 Model != TLSModel::LocalExec)
8992 report_fatal_error("ELF TLS only supported in small memory model or "
8993 "in local exec TLS model");
8994 // Different choices can be made for the maximum size of the TLS area for a
8995 // module. For the small address model, the default TLS size is 16MiB and the
8996 // maximum TLS size is 4GiB.
8997 // FIXME: add tiny and large code model support for TLS access models other
8998 // than local exec. We currently generate the same code as small for tiny,
8999 // which may be larger than needed.
9000
9001 SDValue TPOff;
9002 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9003 SDLoc DL(Op);
9004 const GlobalValue *GV = GA->getGlobal();
9005
9006 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
9007
9008 if (Model == TLSModel::LocalExec) {
9009 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
9010 } else if (Model == TLSModel::InitialExec) {
9011 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9012 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
9013 } else if (Model == TLSModel::LocalDynamic) {
9014 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
9015 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
9016 // the beginning of the module's TLS region, followed by a DTPREL offset
9017 // calculation.
9018
9019 // These accesses will need deduplicating if there's more than one.
9020 AArch64FunctionInfo *MFI =
9021 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9022 MFI->incNumLocalDynamicTLSAccesses();
9023
9024 // The call needs a relocation too for linker relaxation. It doesn't make
9025 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9026 // the address.
9027 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
9028 AArch64II::MO_TLS);
9029
9030 // Now we can calculate the offset from TPIDR_EL0 to this module's
9031 // thread-local area.
9032 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9033
9034 // Now use :dtprel_whatever: operations to calculate this variable's offset
9035 // in its thread-storage area.
9036 SDValue HiVar = DAG.getTargetGlobalAddress(
9037 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9038 SDValue LoVar = DAG.getTargetGlobalAddress(
9039 GV, DL, MVT::i64, 0,
9040 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9041
9042 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
9043 DAG.getTargetConstant(0, DL, MVT::i32)),
9044 0);
9045 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
9046 DAG.getTargetConstant(0, DL, MVT::i32)),
9047 0);
9048 } else if (Model == TLSModel::GeneralDynamic) {
9049 // The call needs a relocation too for linker relaxation. It doesn't make
9050 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9051 // the address.
9052 SDValue SymAddr =
9053 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9054
9055 // Finally we can make a call to calculate the offset from tpidr_el0.
9056 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9057 } else
9058 llvm_unreachable("Unsupported ELF TLS access model");
9059
9060 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9061}
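// A standalone sketch (illustrative only, not from the original sources) of
// how the two ADDXri nodes with MO_HI12 and MO_PAGEOFF | MO_NC operands
// compose a variable's offset: the hi12 field is applied with a 12-bit shift
// and the lo12 field fills the low bits, giving the 24-bit (16MiB) reach that
// matches the default TLS size noted above. The constants are made up.
static constexpr unsigned long long composeHi12Lo12(unsigned long long Hi12,
                                                    unsigned long long Lo12) {
  return (Hi12 << 12) + Lo12;
}
static_assert(composeHi12Lo12(0x5, 0x678) == 0x5678, "hi12:lo12 composition");
static_assert(composeHi12Lo12(0xfff, 0xfff) == 0xffffff,
              "maximum reach is 2^24 - 1 bytes");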
9062
9063SDValue
9064AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
9065 SelectionDAG &DAG) const {
9066 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
9067
9068 SDValue Chain = DAG.getEntryNode();
9069 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9070 SDLoc DL(Op);
9071
9072 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
9073
9074 // Load the ThreadLocalStoragePointer from the TEB
9075 // A pointer to the TLS array is located at offset 0x58 from the TEB.
9076 SDValue TLSArray =
9077 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
9078 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
9079 Chain = TLSArray.getValue(1);
9080
9081 // Load the TLS index from the C runtime;
9082 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
9083 // This also does the same as LOADgot, but using a generic i32 load,
9084 // while LOADgot only loads i64.
9085 SDValue TLSIndexHi =
9086 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
9087 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
9088 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9089 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
9090 SDValue TLSIndex =
9091 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
9092 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
9093 Chain = TLSIndex.getValue(1);
9094
9095 // The pointer to the thread's TLS data area lives at the slot indexed by
9096 // the TLS index (scaled by 8, the pointer size) within the TLSArray.
9097 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
9098 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
9099 DAG.getConstant(3, DL, PtrVT));
9100 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
9101 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
9102 MachinePointerInfo());
9103 Chain = TLS.getValue(1);
9104
9105 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9106 const GlobalValue *GV = GA->getGlobal();
9107 SDValue TGAHi = DAG.getTargetGlobalAddress(
9108 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9109 SDValue TGALo = DAG.getTargetGlobalAddress(
9110 GV, DL, PtrVT, 0,
9111 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9112
9113 // Add the offset from the start of the .tls section (section base).
9114 SDValue Addr =
9115 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
9116 DAG.getTargetConstant(0, DL, MVT::i32)),
9117 0);
9118 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
9119 return Addr;
9120}
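// A standalone sketch (illustrative only, assuming 64-bit Windows pointers) of
// the address computation built above; Teb, TlsIndex and SecRelOffset are
// hypothetical stand-ins for x18, the loaded _tls_index value and the
// variable's :secrel_hi12:/:secrel_lo12: offset.
static char *windowsTlsAddressSketch(char *Teb, unsigned TlsIndex,
                                     unsigned long long SecRelOffset) {
  // ThreadLocalStoragePointer sits at offset 0x58 in the TEB.
  char **TlsArray = *reinterpret_cast<char ***>(Teb + 0x58);
  // Index the array with _tls_index; the scale by 8 is the SHL-by-3 above.
  char *ModuleTlsBase = TlsArray[TlsIndex];
  // Finally add the offset from the start of the .tls section.
  return ModuleTlsBase + SecRelOffset;
}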
9121
9122SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
9123 SelectionDAG &DAG) const {
9124 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9125 if (DAG.getTarget().useEmulatedTLS())
9126 return LowerToTLSEmulatedModel(GA, DAG);
9127
9128 if (Subtarget->isTargetDarwin())
9129 return LowerDarwinGlobalTLSAddress(Op, DAG);
9130 if (Subtarget->isTargetELF())
9131 return LowerELFGlobalTLSAddress(Op, DAG);
9132 if (Subtarget->isTargetWindows())
9133 return LowerWindowsGlobalTLSAddress(Op, DAG);
9134
9135 llvm_unreachable("Unexpected platform trying to use TLS");
9136}
9137
9138// Looks through \param Val to determine the bit that can be used to
9139// check the sign of the value. It returns the unextended value and
9140// the sign bit position.
9141std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
9142 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
9143 return {Val.getOperand(0),
9144 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
9145 1};
9146
9147 if (Val.getOpcode() == ISD::SIGN_EXTEND)
9148 return {Val.getOperand(0),
9149 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
9150
9151 return {Val, Val.getValueSizeInBits() - 1};
9152}
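// A small standalone illustration (not from the original sources) of the bit
// positions returned above: for a value sign-extended from iN, "value < 0"
// can be decided by testing bit N-1 of the unextended operand, which is what
// the TBNZ/TBZ lowering in LowerBR_CC below relies on.
static constexpr bool signBitSet(unsigned long long V, unsigned BitPos) {
  return ((V >> BitPos) & 1) != 0;
}
static_assert(signBitSet(0x80u, 7), "i8 -128: sign bit is bit 7");
static_assert(!signBitSet(0x7fu, 7), "i8 127: sign bit clear");
static_assert(signBitSet(0x80000000u, 31), "i32 INT_MIN: sign bit is bit 31");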
9153
9154SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
9155 SDValue Chain = Op.getOperand(0);
9156 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
9157 SDValue LHS = Op.getOperand(2);
9158 SDValue RHS = Op.getOperand(3);
9159 SDValue Dest = Op.getOperand(4);
9160 SDLoc dl(Op);
9161
9162 MachineFunction &MF = DAG.getMachineFunction();
9163 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
9164 // will not be produced, as they are conditional branch instructions that do
9165 // not set flags.
9166 bool ProduceNonFlagSettingCondBr =
9167 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
9168
9169 // Handle f128 first, since lowering it will result in comparing the return
9170 // value of a libcall against zero, which is just what the rest of LowerBR_CC
9171 // is expecting to deal with.
9172 if (LHS.getValueType() == MVT::f128) {
9173 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9174
9175 // If softenSetCCOperands returned a scalar, we need to compare the result
9176 // against zero to select between true and false values.
9177 if (!RHS.getNode()) {
9178 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9179 CC = ISD::SETNE;
9180 }
9181 }
9182
9183 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
9184 // instruction.
9185 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
9186 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9187 // Only lower legal XALUO ops.
9188 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
9189 return SDValue();
9190
9191 // The actual operation with overflow check.
9192 AArch64CC::CondCode OFCC;
9193 SDValue Value, Overflow;
9194 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
9195
9196 if (CC == ISD::SETNE)
9197 OFCC = getInvertedCondCode(OFCC);
9198 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
9199
9200 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9201 Overflow);
9202 }
9203
9204 if (LHS.getValueType().isInteger()) {
9205 assert((LHS.getValueType() == RHS.getValueType()) &&
9206 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9207
9208 // If the RHS of the comparison is zero, we can potentially fold this
9209 // to a specialized branch.
9210 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9211 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
9212 if (CC == ISD::SETEQ) {
9213 // See if we can use a TBZ to fold in an AND as well.
9214 // TBZ has a smaller branch displacement than CBZ. If the offset is
9215 // out of bounds, a late MI-layer pass rewrites branches.
9216 // 403.gcc is an example that hits this case.
9217 if (LHS.getOpcode() == ISD::AND &&
9218 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9219 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9220 SDValue Test = LHS.getOperand(0);
9221 uint64_t Mask = LHS.getConstantOperandVal(1);
9222 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
9223 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9224 Dest);
9225 }
9226
9227 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
9228 } else if (CC == ISD::SETNE) {
9229 // See if we can use a TBZ to fold in an AND as well.
9230 // TBZ has a smaller branch displacement than CBZ. If the offset is
9231 // out of bounds, a late MI-layer pass rewrites branches.
9232 // 403.gcc is an example that hits this case.
9233 if (LHS.getOpcode() == ISD::AND &&
9234 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9235 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9236 SDValue Test = LHS.getOperand(0);
9237 uint64_t Mask = LHS.getConstantOperandVal(1);
9238 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
9239 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9240 Dest);
9241 }
9242
9243 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
9244 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
9245 // Don't combine AND since emitComparison converts the AND to an ANDS
9246 // (a.k.a. TST) and the test in the test bit and branch instruction
9247 // becomes redundant. This would also increase register pressure.
9248 uint64_t SignBitPos;
9249 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9250 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
9251 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9252 }
9253 }
9254 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
9255 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
9256 // Don't combine AND since emitComparison converts the AND to an ANDS
9257 // (a.k.a. TST) and the test in the test bit and branch instruction
9258 // becomes redundant. This would also increase register pressure.
9259 uint64_t SignBitPos;
9260 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9261 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
9262 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9263 }
9264
9265 SDValue CCVal;
9266 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9267 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9268 Cmp);
9269 }
9270
9271 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
9272 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9273
9274 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9275 // clean. Some of them require two branches to implement.
9276 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9277 AArch64CC::CondCode CC1, CC2;
9278 changeFPCCToAArch64CC(CC, CC1, CC2);
9279 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9280 SDValue BR1 =
9281 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
9282 if (CC2 != AArch64CC::AL) {
9283 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9284 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
9285 Cmp);
9286 }
9287
9288 return BR1;
9289}
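// A standalone sketch (illustrative only) of the TBZ/TBNZ fold above: a
// branch on (x & Mask) == 0 with Mask a power of two only needs to inspect
// bit log2(Mask), so "if ((x & 8) == 0)" becomes TBZ x, #3, label.
static constexpr unsigned log2OfPowerOfTwo(unsigned long long Mask) {
  unsigned Bit = 0; // Mask is assumed to be a non-zero power of two.
  while ((Mask >> Bit) != 1)
    ++Bit;
  return Bit;
}
static_assert(log2OfPowerOfTwo(8) == 3, "(x & 8) == 0 becomes TBZ x, #3");
static_assert(log2OfPowerOfTwo(1ull << 42) == 42, "works for 64-bit masks too");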
9290
9291SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
9292 SelectionDAG &DAG) const {
9293 if (!Subtarget->hasNEON())
9294 return SDValue();
9295
9296 EVT VT = Op.getValueType();
9297 EVT IntVT = VT.changeTypeToInteger();
9298 SDLoc DL(Op);
9299
9300 SDValue In1 = Op.getOperand(0);
9301 SDValue In2 = Op.getOperand(1);
9302 EVT SrcVT = In2.getValueType();
9303
9304 if (!SrcVT.bitsEq(VT))
9305 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
9306
9307 if (VT.isScalableVector())
9308 IntVT =
9309 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
9310
9311 if (VT.isFixedLengthVector() &&
9312 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
9313 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
9314
9315 In1 = convertToScalableVector(DAG, ContainerVT, In1);
9316 In2 = convertToScalableVector(DAG, ContainerVT, In2);
9317
9318 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
9319 return convertFromScalableVector(DAG, VT, Res);
9320 }
9321
9322 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
9323 if (VT.isScalableVector())
9324 return getSVESafeBitCast(VT, Op, DAG);
9325
9326 return DAG.getBitcast(VT, Op);
9327 };
9328
9329 SDValue VecVal1, VecVal2;
9330 EVT VecVT;
9331 auto SetVecVal = [&](int Idx = -1) {
9332 if (!VT.isVector()) {
9333 VecVal1 =
9334 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
9335 VecVal2 =
9336 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
9337 } else {
9338 VecVal1 = BitCast(VecVT, In1, DAG);
9339 VecVal2 = BitCast(VecVT, In2, DAG);
9340 }
9341 };
9342 if (VT.isVector()) {
9343 VecVT = IntVT;
9344 SetVecVal();
9345 } else if (VT == MVT::f64) {
9346 VecVT = MVT::v2i64;
9347 SetVecVal(AArch64::dsub);
9348 } else if (VT == MVT::f32) {
9349 VecVT = MVT::v4i32;
9350 SetVecVal(AArch64::ssub);
9351 } else if (VT == MVT::f16 || VT == MVT::bf16) {
9352 VecVT = MVT::v8i16;
9353 SetVecVal(AArch64::hsub);
9354 } else {
9355 llvm_unreachable("Invalid type for copysign!");
9356 }
9357
9358 unsigned BitWidth = In1.getScalarValueSizeInBits();
9359 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
9360
9361 // We want to materialize a mask with every bit but the high bit set, but the
9362 // AdvSIMD immediate moves cannot materialize that in a single instruction for
9363 // 64-bit elements. Instead, materialize all bits set and then negate that.
9364 if (VT == MVT::f64 || VT == MVT::v2f64) {
9365 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
9366 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
9367 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
9368 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
9369 }
9370
9371 SDValue BSP =
9372 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
9373 if (VT == MVT::f16 || VT == MVT::bf16)
9374 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
9375 if (VT == MVT::f32)
9376 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
9377 if (VT == MVT::f64)
9378 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
9379
9380 return BitCast(VT, BSP, DAG);
9381}
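// A standalone sketch (illustrative only) of the BSP bit-select emitted above:
// with a mask covering every bit except the sign bit, the result keeps the
// magnitude bits of the first operand and the sign bit of the second, which
// is FCOPYSIGN at the bit level (f64 encodings shown).
static constexpr unsigned long long bitSelect(unsigned long long Mask,
                                              unsigned long long A,
                                              unsigned long long B) {
  return (A & Mask) | (B & ~Mask);
}
static_assert(bitSelect(0x7fffffffffffffffULL, 0x3ff0000000000000ULL,
                        0x8000000000000000ULL) == 0xbff0000000000000ULL,
              "copysign(1.0, -0.0) is -1.0 bit-for-bit");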
9382
9383SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
9384 SelectionDAG &DAG) const {
9385 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
9386 Attribute::NoImplicitFloat))
9387 return SDValue();
9388
9389 if (!Subtarget->hasNEON())
9390 return SDValue();
9391
9392 bool IsParity = Op.getOpcode() == ISD::PARITY;
9393 SDValue Val = Op.getOperand(0);
9394 SDLoc DL(Op);
9395 EVT VT = Op.getValueType();
9396
9397 // For i32, a general parity computation using EORs is more efficient than
9398 // one using floating point.
9399 if (VT == MVT::i32 && IsParity)
9400 return SDValue();
9401
9402 // If there is no CNT instruction available, GPR popcount can
9403 // be more efficiently lowered to the following sequence that uses
9404 // AdvSIMD registers/instructions as long as the copies to/from
9405 // the AdvSIMD registers are cheap.
9406 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
9407 // CNT V0.8B, V0.8B // 8xbyte pop-counts
9408 // ADDV B0, V0.8B // sum 8xbyte pop-counts
9409 // UMOV X0, V0.B[0] // copy byte result back to integer reg
9410 if (VT == MVT::i32 || VT == MVT::i64) {
9411 if (VT == MVT::i32)
9412 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
9413 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
9414
9415 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
9416 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9417 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9418 DAG.getConstant(0, DL, MVT::i64));
9419
9420 if (IsParity)
9421 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9422 DAG.getConstant(1, DL, MVT::i32));
9423
9424 if (VT == MVT::i64)
9425 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
9426 return UaddLV;
9427 } else if (VT == MVT::i128) {
9428 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
9429
9430 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
9431 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9432 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9433 DAG.getConstant(0, DL, MVT::i64));
9434
9435 if (IsParity)
9436 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9437 DAG.getConstant(1, DL, MVT::i32));
9438
9439 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
9440 }
9441
9442 assert(!IsParity && "ISD::PARITY of vector types not supported");
9443
9444 if (VT.isScalableVector() ||
9445 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
9446 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
9447
9448 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
9449 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
9450 "Unexpected type for custom ctpop lowering");
9451
9452 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
9453 Val = DAG.getBitcast(VT8Bit, Val);
9454 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
9455
9456 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
9457 unsigned EltSize = 8;
9458 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
9459 while (EltSize != VT.getScalarSizeInBits()) {
9460 EltSize *= 2;
9461 NumElts /= 2;
9462 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
9463 Val = DAG.getNode(
9464 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
9465 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
9466 }
9467
9468 return Val;
9469}
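// A standalone sketch (illustrative only) of the scalar expansion above:
// population count taken per byte (the CNT step) and then summed across the
// lanes (the UADDLV/ADDV step); parity is just the low bit of that sum.
static constexpr unsigned popcountViaBytes(unsigned long long X) {
  unsigned Sum = 0;
  for (int Byte = 0; Byte < 8; ++Byte) {
    unsigned char B = static_cast<unsigned char>(X >> (8 * Byte));
    while (B) { // per-byte pop count, as CNT does on each 8-bit lane
      Sum += B & 1u;
      B = static_cast<unsigned char>(B >> 1);
    }
  }
  return Sum; // horizontal reduction, as UADDLV does
}
static_assert(popcountViaBytes(0xF00F0000000000FFULL) == 16, "4 + 4 + 8 bits");
static_assert(popcountViaBytes(0xF00F0000000000FFULL) % 2 == 0, "even parity");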
9470
9471SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
9472 EVT VT = Op.getValueType();
9473 assert(VT.isScalableVector() ||
9474 useSVEForFixedLengthVectorVT(
9475 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
9476
9477 SDLoc DL(Op);
9478 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
9479 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
9480}
9481
9482SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
9483 SelectionDAG &DAG) const {
9484
9485 EVT VT = Op.getValueType();
9486 SDLoc DL(Op);
9487 unsigned Opcode = Op.getOpcode();
9488 ISD::CondCode CC;
9489 switch (Opcode) {
9490 default:
9491 llvm_unreachable("Wrong instruction");
9492 case ISD::SMAX:
9493 CC = ISD::SETGT;
9494 break;
9495 case ISD::SMIN:
9496 CC = ISD::SETLT;
9497 break;
9498 case ISD::UMAX:
9499 CC = ISD::SETUGT;
9500 break;
9501 case ISD::UMIN:
9502 CC = ISD::SETULT;
9503 break;
9504 }
9505
9506 if (VT.isScalableVector() ||
9507 useSVEForFixedLengthVectorVT(
9508 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
9509 switch (Opcode) {
9510 default:
9511 llvm_unreachable("Wrong instruction");
9512 case ISD::SMAX:
9513 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
9514 case ISD::SMIN:
9515 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
9516 case ISD::UMAX:
9517 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
9518 case ISD::UMIN:
9519 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
9520 }
9521 }
9522
9523 SDValue Op0 = Op.getOperand(0);
9524 SDValue Op1 = Op.getOperand(1);
9525 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
9526 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
9527}
9528
9529SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
9530 SelectionDAG &DAG) const {
9531 EVT VT = Op.getValueType();
9532
9533 if (VT.isScalableVector() ||
9534 useSVEForFixedLengthVectorVT(
9535 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
9536 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
9537
9538 SDLoc DL(Op);
9539 SDValue REVB;
9540 MVT VST;
9541
9542 switch (VT.getSimpleVT().SimpleTy) {
9543 default:
9544 llvm_unreachable("Invalid type for bitreverse!");
9545
9546 case MVT::v2i32: {
9547 VST = MVT::v8i8;
9548 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9549
9550 break;
9551 }
9552
9553 case MVT::v4i32: {
9554 VST = MVT::v16i8;
9555 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9556
9557 break;
9558 }
9559
9560 case MVT::v1i64: {
9561 VST = MVT::v8i8;
9562 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9563
9564 break;
9565 }
9566
9567 case MVT::v2i64: {
9568 VST = MVT::v16i8;
9569 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9570
9571 break;
9572 }
9573 }
9574
9575 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
9576 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
9577}
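// A standalone sketch (illustrative only) of the decomposition used above for
// the 32-bit element cases: reverse the bytes of each 32-bit lane (REV32) and
// then reverse the bits inside every byte (the v8i8/v16i8 ISD::BITREVERSE).
static constexpr unsigned char bitReverseByte(unsigned char B) {
  unsigned char R = 0;
  for (int I = 0; I < 8; ++I)
    R = static_cast<unsigned char>((R << 1) | ((B >> I) & 1));
  return R;
}
static constexpr unsigned bitReverse32(unsigned X) {
  unsigned R = 0;
  for (int Byte = 0; Byte < 4; ++Byte)
    R |= static_cast<unsigned>(bitReverseByte(
             static_cast<unsigned char>(X >> (8 * Byte))))
         << (8 * (3 - Byte)); // byte swap (REV32) + per-byte bit reverse
  return R;
}
static_assert(bitReverse32(0x00000001u) == 0x80000000u, "LSB moves to MSB");
static_assert(bitReverse32(0x12345678u) == 0x1e6a2c48u, "full 32-bit reverse");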
9578
9579 // Check whether N forms a continuous comparison sequence: a chain of ORs whose leaves are XOR comparisons.
9580static bool
9581isOrXorChain(SDValue N, unsigned &Num,
9582 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
9583 if (Num == MaxXors)
9584 return false;
9585
9586 // Skip the one-use zext
9587 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
9588 N = N->getOperand(0);
9589
9590 // The leaf node must be XOR
9591 if (N->getOpcode() == ISD::XOR) {
9592 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
9593 Num++;
9594 return true;
9595 }
9596
9597 // All the non-leaf nodes must be OR.
9598 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
9599 return false;
9600
9601 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
9602 isOrXorChain(N->getOperand(1), Num, WorkList))
9603 return true;
9604 return false;
9605}
9606
9607 // Transform chains of ORs and XORs, which are usually outlined by memcmp/bcmp.
9608 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
9609 SDValue LHS = N->getOperand(0);
9610 SDValue RHS = N->getOperand(1);
9611 SDLoc DL(N);
9612 EVT VT = N->getValueType(0);
9613 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
9614
9615 // Only handle integer compares.
9616 if (N->getOpcode() != ISD::SETCC)
9617 return SDValue();
9618
9619 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
9620 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
9621 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
9622 unsigned NumXors = 0;
9623 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
9624 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
9625 isOrXorChain(LHS, NumXors, WorkList)) {
9626 SDValue XOR0, XOR1;
9627 std::tie(XOR0, XOR1) = WorkList[0];
9628 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
9629 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9630 for (unsigned I = 1; I < WorkList.size(); I++) {
9631 std::tie(XOR0, XOR1) = WorkList[I];
9632 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9633 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
9634 }
9635
9636 // Exit early by inverting the condition, which helps reduce indentation.
9637 return Cmp;
9638 }
9639
9640 return SDValue();
9641}
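// A standalone sketch (illustrative only) of the pattern this combine
// targets, typically produced when memcmp/bcmp is expanded inline: comparing
// the OR of XORs against zero is the same as requiring every operand pair to
// be equal, which the code above then maps onto a SUBS followed by CCMPs.
static constexpr bool orXorChainIsZero(unsigned long long A0,
                                       unsigned long long A1,
                                       unsigned long long B0,
                                       unsigned long long B1) {
  return ((A0 ^ A1) | (B0 ^ B1)) == 0;
}
static_assert(orXorChainIsZero(1, 1, 2, 2), "both pairs equal");
static_assert(!orXorChainIsZero(1, 1, 2, 3), "one unequal pair is enough");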
9642
9643SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
9644
9645 if (Op.getValueType().isVector())
9646 return LowerVSETCC(Op, DAG);
9647
9648 bool IsStrict = Op->isStrictFPOpcode();
9649 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
9650 unsigned OpNo = IsStrict ? 1 : 0;
9651 SDValue Chain;
9652 if (IsStrict)
9653 Chain = Op.getOperand(0);
9654 SDValue LHS = Op.getOperand(OpNo + 0);
9655 SDValue RHS = Op.getOperand(OpNo + 1);
9656 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
9657 SDLoc dl(Op);
9658
9659 // We chose ZeroOrOneBooleanContents, so use zero and one.
9660 EVT VT = Op.getValueType();
9661 SDValue TVal = DAG.getConstant(1, dl, VT);
9662 SDValue FVal = DAG.getConstant(0, dl, VT);
9663
9664 // Handle f128 first, since one possible outcome is a normal integer
9665 // comparison which gets picked up by the next if statement.
9666 if (LHS.getValueType() == MVT::f128) {
9667 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
9668 IsSignaling);
9669
9670 // If softenSetCCOperands returned a scalar, use it.
9671 if (!RHS.getNode()) {
9672 assert(LHS.getValueType() == Op.getValueType() &&
9673 "Unexpected setcc expansion!");
9674 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
9675 }
9676 }
9677
9678 if (LHS.getValueType().isInteger()) {
9679 SDValue CCVal;
9680 SDValue Cmp = getAArch64Cmp(
9681 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
9682
9683 // Note that we inverted the condition above, so we reverse the order of
9684 // the true and false operands here. This will allow the setcc to be
9685 // matched to a single CSINC instruction.
9686 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
9687 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
9688 }
9689
9690 // Now we know we're dealing with FP values.
9691 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
9692 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9693
9694 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
9695 // and do the comparison.
9696 SDValue Cmp;
9697 if (IsStrict)
9698 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
9699 else
9700 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9701
9702 AArch64CC::CondCode CC1, CC2;
9703 changeFPCCToAArch64CC(CC, CC1, CC2);
9704 SDValue Res;
9705 if (CC2 == AArch64CC::AL) {
9706 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
9707 CC2);
9708 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9709
9710 // Note that we inverted the condition above, so we reverse the order of
9711 // the true and false operands here. This will allow the setcc to be
9712 // matched to a single CSINC instruction.
9713 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
9714 } else {
9715 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
9716 // totally clean. Some of them require two CSELs to implement. As is in
9717 // this case, we emit the first CSEL and then emit a second using the output
9718 // of the first as the RHS. We're effectively OR'ing the two CC's together.
9719
9720 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
9721 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9722 SDValue CS1 =
9723 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
9724
9725 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9726 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
9727 }
9728 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
9729}
9730
9731SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
9732 SelectionDAG &DAG) const {
9733
9734 SDValue LHS = Op.getOperand(0);
9735 SDValue RHS = Op.getOperand(1);
9736 EVT VT = LHS.getValueType();
9737 if (VT != MVT::i32 && VT != MVT::i64)
9738 return SDValue();
9739
9740 SDLoc DL(Op);
9741 SDValue Carry = Op.getOperand(2);
9742 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
9743 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
9744 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
9745 LHS, RHS, InvCarry);
9746
9747 EVT OpVT = Op.getValueType();
9748 SDValue TVal = DAG.getConstant(1, DL, OpVT);
9749 SDValue FVal = DAG.getConstant(0, DL, OpVT);
9750
9751 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
9752 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
9753 SDValue CCVal =
9754 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
9755 // Inputs are swapped because the condition is inverted. This will allow
9756 // matching with a single CSINC instruction.
9757 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
9758 Cmp.getValue(1));
9759}
9760
9761SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
9762 SDValue RHS, SDValue TVal,
9763 SDValue FVal, const SDLoc &dl,
9764 SelectionDAG &DAG) const {
9765 // Handle f128 first, because it will result in a comparison of some RTLIB
9766 // call result against zero.
9767 if (LHS.getValueType() == MVT::f128) {
9768 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9769
9770 // If softenSetCCOperands returned a scalar, we need to compare the result
9771 // against zero to select between true and false values.
9772 if (!RHS.getNode()) {
9773 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9774 CC = ISD::SETNE;
9775 }
9776 }
9777
9778 // Also handle f16, for which we need to do a f32 comparison.
9779 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
9780 LHS.getValueType() == MVT::bf16) {
9781 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
9782 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
9783 }
9784
9785 // Next, handle integers.
9786 if (LHS.getValueType().isInteger()) {
9787 assert((LHS.getValueType() == RHS.getValueType()) &&
9788 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9789
9790 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
9791 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
9792 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9793 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
9794 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
9795 // supported types.
9796 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
9797 CTVal->isOne() && CFVal->isAllOnes() &&
9798 LHS.getValueType() == TVal.getValueType()) {
9799 EVT VT = LHS.getValueType();
9800 SDValue Shift =
9801 DAG.getNode(ISD::SRA, dl, VT, LHS,
9802 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9803 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
9804 }
9805
9806 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
9807 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
9808 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
9809 // Both require fewer instructions than a compare and conditional select.
9810 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
9811 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
9812 LHS.getValueType() == RHS.getValueType()) {
9813 EVT VT = LHS.getValueType();
9814 SDValue Shift =
9815 DAG.getNode(ISD::SRA, dl, VT, LHS,
9816 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9817
9818 if (CC == ISD::SETGT)
9819 Shift = DAG.getNOT(dl, Shift, VT);
9820
9821 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
9822 }
9823
9824 unsigned Opcode = AArch64ISD::CSEL;
9825
9826 // If both the TVal and the FVal are constants, see if we can swap them in
9827 // order to form a CSINV or CSINC out of them.
9828 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
9829 std::swap(TVal, FVal);
9830 std::swap(CTVal, CFVal);
9831 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9832 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
9833 std::swap(TVal, FVal);
9834 std::swap(CTVal, CFVal);
9835 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9836 } else if (TVal.getOpcode() == ISD::XOR) {
9837 // If TVal is a NOT we want to swap TVal and FVal so that we can match
9838 // with a CSINV rather than a CSEL.
9839 if (isAllOnesConstant(TVal.getOperand(1))) {
9840 std::swap(TVal, FVal);
9841 std::swap(CTVal, CFVal);
9842 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9843 }
9844 } else if (TVal.getOpcode() == ISD::SUB) {
9845 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
9846 // that we can match with a CSNEG rather than a CSEL.
9847 if (isNullConstant(TVal.getOperand(0))) {
9848 std::swap(TVal, FVal);
9849 std::swap(CTVal, CFVal);
9850 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9851 }
9852 } else if (CTVal && CFVal) {
9853 const int64_t TrueVal = CTVal->getSExtValue();
9854 const int64_t FalseVal = CFVal->getSExtValue();
9855 bool Swap = false;
9856
9857 // If both TVal and FVal are constants, see if FVal is the
9858 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
9859 // instead of a CSEL in that case.
9860 if (TrueVal == ~FalseVal) {
9861 Opcode = AArch64ISD::CSINV;
9862 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
9863 TrueVal == -FalseVal) {
9864 Opcode = AArch64ISD::CSNEG;
9865 } else if (TVal.getValueType() == MVT::i32) {
9866 // If our operands are only 32-bit wide, make sure we use 32-bit
9867 // arithmetic for the check whether we can use CSINC. This ensures that
9868 // the addition in the check will wrap around properly in case there is
9869 // an overflow (which would not be the case if we do the check with
9870 // 64-bit arithmetic).
9871 const uint32_t TrueVal32 = CTVal->getZExtValue();
9872 const uint32_t FalseVal32 = CFVal->getZExtValue();
9873
9874 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
9875 Opcode = AArch64ISD::CSINC;
9876
9877 if (TrueVal32 > FalseVal32) {
9878 Swap = true;
9879 }
9880 }
9881 } else {
9882 // 64-bit check whether we can use CSINC.
9883 const uint64_t TrueVal64 = TrueVal;
9884 const uint64_t FalseVal64 = FalseVal;
9885
9886 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
9887 Opcode = AArch64ISD::CSINC;
9888
9889 if (TrueVal > FalseVal) {
9890 Swap = true;
9891 }
9892 }
9893 }
9894
9895 // Swap TVal and FVal if necessary.
9896 if (Swap) {
9897 std::swap(TVal, FVal);
9898 std::swap(CTVal, CFVal);
9899 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9900 }
9901
9902 if (Opcode != AArch64ISD::CSEL) {
9903 // Drop FVal since we can get its value by simply inverting/negating
9904 // TVal.
9905 FVal = TVal;
9906 }
9907 }
9908
9909 // Avoid materializing a constant when possible by reusing a known value in
9910 // a register. However, don't perform this optimization if the known value
9911 // is one, zero or negative one in the case of a CSEL. We can always
9912 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
9913 // FVal, respectively.
9914 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
9915 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
9916 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
9917 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9918 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
9919 // "a != C ? x : a" to avoid materializing C.
9920 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
9921 TVal = LHS;
9922 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
9923 FVal = LHS;
9924 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
9925 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
9926 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
9927 // avoid materializing C.
9928 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9929 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
9930 Opcode = AArch64ISD::CSINV;
9931 TVal = LHS;
9932 FVal = DAG.getConstant(0, dl, FVal.getValueType());
9933 }
9934 }
9935
9936 SDValue CCVal;
9937 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9938 EVT VT = TVal.getValueType();
9939 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
9940 }
9941
9942 // Now we know we're dealing with FP values.
9943 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
9944 LHS.getValueType() == MVT::f64);
9945 assert(LHS.getValueType() == RHS.getValueType());
9946 EVT VT = TVal.getValueType();
9947 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9948
9949 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9950 // clean. Some of them require two CSELs to implement.
9951 AArch64CC::CondCode CC1, CC2;
9952 changeFPCCToAArch64CC(CC, CC1, CC2);
9953
9954 if (DAG.getTarget().Options.UnsafeFPMath) {
9955 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
9956 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
9957 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
9958 if (RHSVal && RHSVal->isZero()) {
9959 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
9960 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
9961
9962 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
9963 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
9964 TVal = LHS;
9965 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
9966 CFVal && CFVal->isZero() &&
9967 FVal.getValueType() == LHS.getValueType())
9968 FVal = LHS;
9969 }
9970 }
9971
9972 // Emit first, and possibly only, CSEL.
9973 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9974 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
9975
9976 // If we need a second CSEL, emit it, using the output of the first as the
9977 // RHS. We're effectively OR'ing the two CC's together.
9978 if (CC2 != AArch64CC::AL) {
9979 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9980 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
9981 }
9982
9983 // Otherwise, return the output of the first CSEL.
9984 return CS1;
9985}
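// Two standalone checks (illustrative only) for the integer SELECT_CC folds
// above. First, why the CSINC test is done in 32-bit arithmetic: for
// TVal = 0 and FVal = -1 the u32 increment wraps around to equality, which a
// 64-bit check would miss. Second, the (SELECT_CC setgt, lhs, -1, 1, -1)
// pattern really is (ASR lhs, 31) | 1 for i32.
static_assert(0xffffffffu + 1u == 0u,
              "u32 wrap-around makes 0 and -1 CSINC-compatible");
static_assert(0xffffffffull + 1ull != 0ull, "no such wrap in 64-bit arithmetic");
static constexpr int signOrOne(int X) {
  // Assumes arithmetic right shift of negative values (guaranteed from C++20,
  // and what AArch64 compilers do in practice).
  return (X >> 31) | 1;
}
static_assert(signOrOne(7) == 1 && signOrOne(-7) == -1 && signOrOne(0) == 1,
              "matches the OR(ASR(lhs, 31), 1) rewrite");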
9986
9987SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
9988 SelectionDAG &DAG) const {
9989 EVT Ty = Op.getValueType();
9990 auto Idx = Op.getConstantOperandAPInt(2);
9991 int64_t IdxVal = Idx.getSExtValue();
9992 assert(Ty.isScalableVector() &&
9993 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
9994
9995 // We can use the splice instruction for certain index values where we are
9996 // able to efficiently generate the correct predicate. The index will be
9997 // inverted and used directly as the input to the ptrue instruction, i.e.
9998 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
9999 // splice predicate. However, we can only do this if we can guarantee that
10000 // there are enough elements in the vector, hence we check the index <= min
10001 // number of elements.
10002 std::optional<unsigned> PredPattern;
10003 if (Ty.isScalableVector() && IdxVal < 0 &&
10004 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
10005 std::nullopt) {
10006 SDLoc DL(Op);
10007
10008 // Create a predicate where all but the last -IdxVal elements are false.
10009 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
10010 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
10011 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
10012
10013 // Now splice the two inputs together using the predicate.
10014 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
10015 Op.getOperand(1));
10016 }
10017
10018 // This will select to an EXT instruction, which has a maximum immediate
10019 // value of 255, hence 2048-bits is the maximum value we can lower.
10020 if (IdxVal >= 0 &&
10021 IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
10022 return Op;
10023
10024 return SDValue();
10025}
10026
10027SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
10028 SelectionDAG &DAG) const {
10029 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
10030 SDValue LHS = Op.getOperand(0);
10031 SDValue RHS = Op.getOperand(1);
10032 SDValue TVal = Op.getOperand(2);
10033 SDValue FVal = Op.getOperand(3);
10034 SDLoc DL(Op);
10035 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10036}
10037
10038SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
10039 SelectionDAG &DAG) const {
10040 SDValue CCVal = Op->getOperand(0);
10041 SDValue TVal = Op->getOperand(1);
10042 SDValue FVal = Op->getOperand(2);
10043 SDLoc DL(Op);
10044
10045 EVT Ty = Op.getValueType();
10046 if (Ty == MVT::aarch64svcount) {
10047 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
10048 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
10049 SDValue Sel =
10050 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
10051 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
10052 }
10053
10054 if (Ty.isScalableVector()) {
10055 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
10056 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
10057 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10058 }
10059
10060 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
10061 // FIXME: Ideally this would be the same as above using i1 types, however
10062 // for the moment we can't deal with fixed i1 vector types properly, so
10063 // instead extend the predicate to a result type sized integer vector.
10064 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
10065 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
10066 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
10067 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
10068 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10069 }
10070
10071 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
10072 // instruction.
10073 if (ISD::isOverflowIntrOpRes(CCVal)) {
10074 // Only lower legal XALUO ops.
10075 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
10076 return SDValue();
10077
10078 AArch64CC::CondCode OFCC;
10079 SDValue Value, Overflow;
10080 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
10081 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
10082
10083 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
10084 CCVal, Overflow);
10085 }
10086
10087 // Lower it the same way as we would lower a SELECT_CC node.
10088 ISD::CondCode CC;
10089 SDValue LHS, RHS;
10090 if (CCVal.getOpcode() == ISD::SETCC) {
10091 LHS = CCVal.getOperand(0);
10092 RHS = CCVal.getOperand(1);
10093 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
10094 } else {
10095 LHS = CCVal;
10096 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
10097 CC = ISD::SETNE;
10098 }
10099
10100 // If we are lowering a f16 and we do not have fullf16, convert to a f32 in
10101 // order to use FCSELSrrr
10102 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10103 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10104 DAG.getUNDEF(MVT::f32), TVal);
10105 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10106 DAG.getUNDEF(MVT::f32), FVal);
10107 }
10108
10109 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10110
10111 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10112 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
10113 }
10114
10115 return Res;
10116}
10117
10118SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
10119 SelectionDAG &DAG) const {
10120 // Jump table entries are emitted as PC-relative offsets. No additional tweaking
10121 // is necessary here. Just get the address of the jump table.
10122 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
10123
10124 CodeModel::Model CM = getTargetMachine().getCodeModel();
10125 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
10126 !Subtarget->isTargetMachO())
10127 return getAddrLarge(JT, DAG);
10128 if (CM == CodeModel::Tiny)
10129 return getAddrTiny(JT, DAG);
10130 return getAddr(JT, DAG);
10131}
10132
10133SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
10134 SelectionDAG &DAG) const {
10135 // Jump table entries are emitted as PC-relative offsets. No additional tweaking
10136 // is necessary here. Just get the address of the jump table.
10137 SDLoc DL(Op);
10138 SDValue JT = Op.getOperand(1);
10139 SDValue Entry = Op.getOperand(2);
10140 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
10141
10142 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10143 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
10144
10145 SDNode *Dest =
10146 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
10147 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
10148 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
10149 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
10150}
10151
10152SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
10153 SelectionDAG &DAG) const {
10154 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
10155 CodeModel::Model CM = getTargetMachine().getCodeModel();
10156 if (CM == CodeModel::Large) {
10157 // Use the GOT for the large code model on iOS.
10158 if (Subtarget->isTargetMachO()) {
10159 return getGOT(CP, DAG);
10160 }
10162 return getAddrLarge(CP, DAG);
10163 } else if (CM == CodeModel::Tiny) {
10164 return getAddrTiny(CP, DAG);
10165 }
10166 return getAddr(CP, DAG);
10167}
10168
10169SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
10170 SelectionDAG &DAG) const {
10171 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
10172 CodeModel::Model CM = getTargetMachine().getCodeModel();
10173 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
10175 return getAddrLarge(BA, DAG);
10176 } else if (CM == CodeModel::Tiny) {
10177 return getAddrTiny(BA, DAG);
10178 }
10179 return getAddr(BA, DAG);
10180}
10181
10182SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
10183 SelectionDAG &DAG) const {
10184 AArch64FunctionInfo *FuncInfo =
10185 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10186
10187 SDLoc DL(Op);
10188 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
10189 getPointerTy(DAG.getDataLayout()));
10190 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
10191 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10192 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10193 MachinePointerInfo(SV));
10194}
10195
10196SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
10197 SelectionDAG &DAG) const {
10198 MachineFunction &MF = DAG.getMachineFunction();
10199 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10200
10201 SDLoc DL(Op);
10202 SDValue FR;
10203 if (Subtarget->isWindowsArm64EC()) {
10204 // With the Arm64EC ABI, we compute the address of the varargs save area
10205 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
10206 // but calls from an entry thunk can pass in a different address.
10207 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
10208 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
10209 uint64_t StackOffset;
10210 if (FuncInfo->getVarArgsGPRSize() > 0)
10211 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
10212 else
10213 StackOffset = FuncInfo->getVarArgsStackOffset();
10214 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
10215 DAG.getConstant(StackOffset, DL, MVT::i64));
10216 } else {
10217 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
10218 ? FuncInfo->getVarArgsGPRIndex()
10219 : FuncInfo->getVarArgsStackIndex(),
10220 getPointerTy(DAG.getDataLayout()));
10221 }
10222 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10223 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10224 MachinePointerInfo(SV));
10225}
10226
10227SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
10228 SelectionDAG &DAG) const {
10229 // The layout of the va_list struct is specified in the AArch64 Procedure Call
10230 // Standard, section B.3.
10231 MachineFunction &MF = DAG.getMachineFunction();
10232 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10233 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10234 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10235 auto PtrVT = getPointerTy(DAG.getDataLayout());
10236 SDLoc DL(Op);
10237
10238 SDValue Chain = Op.getOperand(0);
10239 SDValue VAList = Op.getOperand(1);
10240 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10241 SmallVector<SDValue, 4> MemOps;
10242
10243 // void *__stack at offset 0
10244 unsigned Offset = 0;
10245 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
10246 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
10247 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
10248 MachinePointerInfo(SV), Align(PtrSize)));
10249
10250 // void *__gr_top at offset 8 (4 on ILP32)
10251 Offset += PtrSize;
10252 int GPRSize = FuncInfo->getVarArgsGPRSize();
10253 if (GPRSize > 0) {
10254 SDValue GRTop, GRTopAddr;
10255
10256 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10257 DAG.getConstant(Offset, DL, PtrVT));
10258
10259 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
10260 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
10261 DAG.getConstant(GPRSize, DL, PtrVT));
10262 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
10263
10264 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
10265 MachinePointerInfo(SV, Offset),
10266 Align(PtrSize)));
10267 }
10268
10269 // void *__vr_top at offset 16 (8 on ILP32)
10270 Offset += PtrSize;
10271 int FPRSize = FuncInfo->getVarArgsFPRSize();
10272 if (FPRSize > 0) {
10273 SDValue VRTop, VRTopAddr;
10274 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10275 DAG.getConstant(Offset, DL, PtrVT));
10276
10277 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
10278 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
10279 DAG.getConstant(FPRSize, DL, PtrVT));
10280 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
10281
10282 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
10283 MachinePointerInfo(SV, Offset),
10284 Align(PtrSize)));
10285 }
10286
10287 // int __gr_offs at offset 24 (12 on ILP32)
10288 Offset += PtrSize;
10289 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10290 DAG.getConstant(Offset, DL, PtrVT));
10291 MemOps.push_back(
10292 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
10293 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10294
10295 // int __vr_offs at offset 28 (16 on ILP32)
10296 Offset += 4;
10297 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10298 DAG.getConstant(Offset, DL, PtrVT));
10299 MemOps.push_back(
10300 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
10301 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10302
10303 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
10304}
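// A standalone sketch (illustrative only) of the AAPCS64 va_list record that
// the five stores above populate; the ABI field names are __stack, __gr_top,
// __vr_top, __gr_offs and __vr_offs (AAPCS64, section B.3). The layout shown
// assumes LP64; on ILP32 the pointer fields shrink to 4 bytes.
struct AAPCS64VaListSketch {
  void *Stack; // __stack:   next stacked argument slot        (offset 0)
  void *GrTop; // __gr_top:  end of the saved GP register area (offset 8)
  void *VrTop; // __vr_top:  end of the saved FP/SIMD area     (offset 16)
  int GrOffs;  // __gr_offs: negative bytes of GP area left    (offset 24)
  int VrOffs;  // __vr_offs: negative bytes of FP area left    (offset 28)
};
static_assert(sizeof(AAPCS64VaListSketch) == 32,
              "matches the 32-byte size used by LowerVACOPY below");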
10305
10306SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
10307 SelectionDAG &DAG) const {
10308 MachineFunction &MF = DAG.getMachineFunction();
10309
10310 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
10311 return LowerWin64_VASTART(Op, DAG);
10312 else if (Subtarget->isTargetDarwin())
10313 return LowerDarwin_VASTART(Op, DAG);
10314 else
10315 return LowerAAPCS_VASTART(Op, DAG);
10316}
10317
10318SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
10319 SelectionDAG &DAG) const {
10320 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
10321 // pointer.
10322 SDLoc DL(Op);
10323 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10324 unsigned VaListSize =
10325 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
10326 ? PtrSize
10327 : Subtarget->isTargetILP32() ? 20 : 32;
10328 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10329 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10330
10331 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
10332 DAG.getConstant(VaListSize, DL, MVT::i32),
10333 Align(PtrSize), false, false, false,
10334 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
10335}
10336
10337SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10338 assert(Subtarget->isTargetDarwin() &&
10339 "automatic va_arg instruction only works on Darwin");
10340
10341 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10342 EVT VT = Op.getValueType();
10343 SDLoc DL(Op);
10344 SDValue Chain = Op.getOperand(0);
10345 SDValue Addr = Op.getOperand(1);
10346 MaybeAlign Align(Op.getConstantOperandVal(3));
10347 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
10348 auto PtrVT = getPointerTy(DAG.getDataLayout());
10349 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10350 SDValue VAList =
10351 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
10352 Chain = VAList.getValue(1);
10353 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
10354
10355 if (VT.isScalableVector())
10356 report_fatal_error("Passing SVE types to variadic functions is "
10357 "currently not supported");
10358
10359 if (Align && *Align > MinSlotSize) {
10360 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10361 DAG.getConstant(Align->value() - 1, DL, PtrVT));
10362 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
10363 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
10364 }
10365
10366 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
10367 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
10368
10369 // Scalar integer and FP values smaller than 64 bits are implicitly extended
10370 // up to 64 bits. At the very least, we have to increase the striding of the
10371 // vaargs list to match this, and for FP values we need to introduce
10372 // FP_ROUND nodes as well.
10373 if (VT.isInteger() && !VT.isVector())
10374 ArgSize = std::max(ArgSize, MinSlotSize);
10375 bool NeedFPTrunc = false;
10376 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
10377 ArgSize = 8;
10378 NeedFPTrunc = true;
10379 }
10380
10381 // Increment the pointer, VAList, to the next vaarg
10382 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10383 DAG.getConstant(ArgSize, DL, PtrVT));
10384 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
10385
10386 // Store the incremented VAList to the legalized pointer
10387 SDValue APStore =
10388 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
10389
10390 // Load the actual argument out of the pointer VAList
10391 if (NeedFPTrunc) {
10392 // Load the value as an f64.
10393 SDValue WideFP =
10394 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
10395 // Round the value down to an f32.
10396 SDValue NarrowFP =
10397 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
10398 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
10399 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
10400 // Merge the rounded value with the chain output of the load.
10401 return DAG.getMergeValues(Ops, DL);
10402 }
10403
10404 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
10405}
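// A standalone sketch (illustrative only) of the over-alignment handling
// above: the va_list cursor is rounded up with (p + align - 1) & -align,
// which for power-of-two alignments is the same as clearing the low bits.
static constexpr unsigned long long alignUp(unsigned long long P,
                                            unsigned long long Alignment) {
  return (P + Alignment - 1) & ~(Alignment - 1); // power-of-two Alignment
}
static_assert(alignUp(0x1004, 16) == 0x1010, "rounds up to the next slot");
static_assert(alignUp(0x1010, 16) == 0x1010, "already aligned is unchanged");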
10406
10407SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
10408 SelectionDAG &DAG) const {
10409 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10410 MFI.setFrameAddressIsTaken(true);
10411
10412 EVT VT = Op.getValueType();
10413 SDLoc DL(Op);
10414 unsigned Depth = Op.getConstantOperandVal(0);
10415 SDValue FrameAddr =
10416 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
10417 while (Depth--)
10418 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
10419 MachinePointerInfo());
10420
10421 if (Subtarget->isTargetILP32())
10422 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
10423 DAG.getValueType(VT));
10424
10425 return FrameAddr;
10426}
10427
10428SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
10429 SelectionDAG &DAG) const {
10430 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10431
10432 EVT VT = getPointerTy(DAG.getDataLayout());
10433 SDLoc DL(Op);
10434 int FI = MFI.CreateFixedObject(4, 0, false);
10435 return DAG.getFrameIndex(FI, VT);
10436}
10437
10438#define GET_REGISTER_MATCHER
10439#include "AArch64GenAsmMatcher.inc"
10440
10441// FIXME? Maybe this could be a TableGen attribute on some registers and
10442// this table could be generated automatically from RegInfo.
10443Register AArch64TargetLowering::
10444getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
10445 Register Reg = MatchRegisterName(RegName);
10446 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
10447 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
10448 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
10449 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
10450 !MRI->isReservedReg(MF, Reg))
10451 Reg = 0;
10452 }
10453 if (Reg)
10454 return Reg;
10455 report_fatal_error(Twine("Invalid register name \""
10456 + StringRef(RegName) + "\"."));
10457}
10458
10459SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
10460 SelectionDAG &DAG) const {
10461 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
10462
10463 EVT VT = Op.getValueType();
10464 SDLoc DL(Op);
10465
10466 SDValue FrameAddr =
10467 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
10468 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10469
10470 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
10471}
10472
10473SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
10474 SelectionDAG &DAG) const {
10475 MachineFunction &MF = DAG.getMachineFunction();
10476 MachineFrameInfo &MFI = MF.getFrameInfo();
10477 MFI.setReturnAddressIsTaken(true);
10478
10479 EVT VT = Op.getValueType();
10480 SDLoc DL(Op);
10481 unsigned Depth = Op.getConstantOperandVal(0);
10482 SDValue ReturnAddress;
10483 if (Depth) {
10484 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10485 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10486 ReturnAddress = DAG.getLoad(
10487 VT, DL, DAG.getEntryNode(),
10488 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
10489 } else {
10490 // Return LR, which contains the return address. Mark it an implicit
10491 // live-in.
10492 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
10493 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
10494 }
10495
10496 // The XPACLRI instruction assembles to a hint-space instruction before
10497 // Armv8.3-A, therefore it can be safely used for any pre-Armv8.3-A
10498 // architecture. On Armv8.3-A and onwards XPACI is available, so use
10499 // that instead.
10500 SDNode *St;
10501 if (Subtarget->hasPAuth()) {
10502 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
10503 } else {
10504 // XPACLRI operates on LR therefore we must move the operand accordingly.
10505 SDValue Chain =
10506 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
10507 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
10508 }
10509 return SDValue(St, 0);
10510}
10511
10512/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which returns two
10513/// i32 values and take a 2 x i32 value to shift plus a shift amount.
10514SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
10515 SelectionDAG &DAG) const {
10516 SDValue Lo, Hi;
10517 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
10518 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
10519}
10520
10521 bool AArch64TargetLowering::isOffsetFoldingLegal(
10522 const GlobalAddressSDNode *GA) const {
10523 // Offsets are folded in the DAG combine rather than here so that we can
10524 // intelligently choose an offset based on the uses.
10525 return false;
10526}
10527
10528 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
10529 bool OptForSize) const {
10530 bool IsLegal = false;
10531 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
10532 // 16-bit case when target has full fp16 support.
10533 // We encode bf16 bit patterns as if they were fp16. This results in very
10534 // strange looking assembly but should populate the register with appropriate
10535 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
10536 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
10537 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
10538 // FIXME: We should be able to handle f128 as well with a clever lowering.
10539 const APInt ImmInt = Imm.bitcastToAPInt();
10540 if (VT == MVT::f64)
10541 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
10542 else if (VT == MVT::f32)
10543 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
10544 else if (VT == MVT::f16 || VT == MVT::bf16)
10545 IsLegal =
10546 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
10547 Imm.isPosZero();
10548
10549 // If we cannot materialize the value in the fmov immediate field, check if
10550 // the value can be encoded as the immediate operand of a logical instruction.
10551 // The immediate value will be created with either MOVZ, MOVN, or ORR.
10552 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
10553 // generate that fmov.
10554 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
10555 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
10556 // however the mov+fmov sequence is always better because of the reduced
10557 // cache pressure. The timings are still the same if you consider
10558 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
10559 // movw+movk is fused). So we limit the expansion to at most 2 instructions.
10560 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
10561 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
10562 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
10563 IsLegal = Insn.size() <= Limit;
10564 }
10565
10566 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
10567 << " imm value: "; Imm.dump(););
10568 return IsLegal;
10569}
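// For a rough sense of the check above: values such as 1.0, 0.5, -1.9375 and
// 31.0 fit the 8-bit FMOV immediate encoding (+/-(16..31)/16 * 2^[-3..4]),
// while values like 0.1 do not and, for f32/f64, are instead tested against a
// short MOVZ/MOVK+FMOV expansion.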
10570
10571//===----------------------------------------------------------------------===//
10572// AArch64 Optimization Hooks
10573//===----------------------------------------------------------------------===//
10574
10575static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
10576 SDValue Operand, SelectionDAG &DAG,
10577 int &ExtraSteps) {
10578 EVT VT = Operand.getValueType();
10579 if ((ST->hasNEON() &&
10580 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
10581 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
10582 VT == MVT::v4f32)) ||
10583 (ST->hasSVE() &&
10584 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
10585 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
10586 // For the reciprocal estimates, convergence is quadratic, so the number
10587 // of digits is doubled after each iteration. In ARMv8, the accuracy of
10588 // the initial estimate is 2^-8. Thus the number of extra steps to refine
10589 // the result for float (23 mantissa bits) is 2 and for double (52
10590 // mantissa bits) is 3.
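// As a worked example of that arithmetic: ceil(log2(23)) - ceil(log2(8)) =
// 5 - 3 = 2 refinement steps for float, and ceil(log2(52)) - ceil(log2(8)) =
// 6 - 3 = 3 steps for double.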
10591 constexpr unsigned AccurateBits = 8;
10592 unsigned DesiredBits =
10594 ExtraSteps = DesiredBits <= AccurateBits
10595 ? 0
10596 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
10597 }
10598
10599 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
10600 }
10601
10602 return SDValue();
10603}
10604
10605SDValue
10606AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
10607 const DenormalMode &Mode) const {
10608 SDLoc DL(Op);
10609 EVT VT = Op.getValueType();
10610 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
10611 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
10612 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
10613}
10614
10615SDValue
10616AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
10617 SelectionDAG &DAG) const {
10618 return Op;
10619}
10620
10621SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
10622 SelectionDAG &DAG, int Enabled,
10623 int &ExtraSteps,
10624 bool &UseOneConst,
10625 bool Reciprocal) const {
10626 if (Enabled == ReciprocalEstimate::Enabled ||
10627 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
10628 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
10629 DAG, ExtraSteps)) {
10630 SDLoc DL(Operand);
10631 EVT VT = Operand.getValueType();
10632
10633 SDNodeFlags Flags;
10634 Flags.setAllowReassociation(true);
10635
10636 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
10637 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
10638 for (int i = ExtraSteps; i > 0; --i) {
10639 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
10640 Flags);
10641 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
10642 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10643 }
10644 if (!Reciprocal)
10645 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
10646
10647 ExtraSteps = 0;
10648 return Estimate;
10649 }
10650
10651 return SDValue();
10652}
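// Roughly, for a scalar f32 square root this emits FRSQRTE followed by two
// refinement steps of the form (FMUL, FRSQRTS, FMUL), plus a final FMUL by the
// operand when the non-reciprocal square root was requested; the exact
// register assignment is left to the rest of the backend.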
10653
10654SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
10655 SelectionDAG &DAG, int Enabled,
10656 int &ExtraSteps) const {
10657 if (Enabled == ReciprocalEstimate::Enabled)
10658 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
10659 DAG, ExtraSteps)) {
10660 SDLoc DL(Operand);
10661 EVT VT = Operand.getValueType();
10662
10663 SDNodeFlags Flags;
10664 Flags.setAllowReassociation(true);
10665
10666 // Newton reciprocal iteration: E * (2 - X * E)
10667 // AArch64 reciprocal iteration instruction: (2 - M * N)
10668 for (int i = ExtraSteps; i > 0; --i) {
10669 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
10670 Estimate, Flags);
10671 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10672 }
10673
10674 ExtraSteps = 0;
10675 return Estimate;
10676 }
10677
10678 return SDValue();
10679}
10680
10681//===----------------------------------------------------------------------===//
10682// AArch64 Inline Assembly Support
10683//===----------------------------------------------------------------------===//
10684
10685// Table of Constraints
10686// TODO: This is the current set of constraints supported by ARM for the
10687 // compiler; not all of them may make sense.
10688//
10689// r - A general register
10690// w - An FP/SIMD register of some size in the range v0-v31
10691// x - An FP/SIMD register of some size in the range v0-v15
10692// I - Constant that can be used with an ADD instruction
10693// J - Constant that can be used with a SUB instruction
10694// K - Constant that can be used with a 32-bit logical instruction
10695// L - Constant that can be used with a 64-bit logical instruction
10696// M - Constant that can be used as a 32-bit MOV immediate
10697// N - Constant that can be used as a 64-bit MOV immediate
10698// Q - A memory reference with base register and no offset
10699// S - A symbolic address
10700// Y - Floating point constant zero
10701// Z - Integer constant zero
10702//
10703// Note that general register operands will be output using their 64-bit x
10704// register name, whatever the size of the variable, unless the asm operand
10705// is prefixed by the %w modifier. Floating-point and SIMD register operands
10706// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
10707// %q modifier.
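// As an illustration only (variable and register choices are arbitrary),
// user code exercising a few of these constraints might look like:
//   int res;
//   asm("add %w0, %w1, %2" : "=r"(res) : "r"(lhs), "I"(4095));
//   float prod;
//   asm("fmul %s0, %s1, %s2" : "=w"(prod) : "w"(a), "w"(b));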
10708const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
10709 // At this point, we have to lower this constraint to something else, so we
10710 // lower it to an "r" or "w". However, by doing this we will force the result
10711 // to be in a register, while the X constraint is much more permissive.
10712 //
10713 // Although we are correct (we are free to emit anything, without
10714 // constraints), we might break use cases that would expect us to be more
10715 // efficient and emit something else.
10716 if (!Subtarget->hasFPARMv8())
10717 return "r";
10718
10719 if (ConstraintVT.isFloatingPoint())
10720 return "w";
10721
10722 if (ConstraintVT.isVector() &&
10723 (ConstraintVT.getSizeInBits() == 64 ||
10724 ConstraintVT.getSizeInBits() == 128))
10725 return "w";
10726
10727 return "r";
10728}
10729
10731
10732static std::optional<PredicateConstraint>
10735 .Case("Uph", PredicateConstraint::Uph)
10736 .Case("Upl", PredicateConstraint::Upl)
10737 .Case("Upa", PredicateConstraint::Upa)
10738 .Default(std::nullopt);
10739}
10740
10741static const TargetRegisterClass *
10743 if (VT != MVT::aarch64svcount &&
10744 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
10745 return nullptr;
10746
10747 switch (Constraint) {
10748 case PredicateConstraint::Uph:
10749 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
10750 : &AArch64::PPR_p8to15RegClass;
10751 case PredicateConstraint::Upl:
10752 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
10753 : &AArch64::PPR_3bRegClass;
10754 case PredicateConstraint::Upa:
10755 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
10756 : &AArch64::PPRRegClass;
10757 }
10758
10759 llvm_unreachable("Missing PredicateConstraint!");
10760}
10761
10763
10764static std::optional<ReducedGprConstraint>
10767 .Case("Uci", ReducedGprConstraint::Uci)
10768 .Case("Ucj", ReducedGprConstraint::Ucj)
10769 .Default(std::nullopt);
10770}
10771
10772static const TargetRegisterClass *
10774 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
10775 return nullptr;
10776
10777 switch (Constraint) {
10778 case ReducedGprConstraint::Uci:
10779 return &AArch64::MatrixIndexGPR32_8_11RegClass;
10780 case ReducedGprConstraint::Ucj:
10781 return &AArch64::MatrixIndexGPR32_12_15RegClass;
10782 }
10783
10784 llvm_unreachable("Missing ReducedGprConstraint!");
10785}
10786
10787 // The set of cc codes supported is from
10788// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
10791 .Case("{@cchi}", AArch64CC::HI)
10792 .Case("{@cccs}", AArch64CC::HS)
10793 .Case("{@cclo}", AArch64CC::LO)
10794 .Case("{@ccls}", AArch64CC::LS)
10795 .Case("{@cccc}", AArch64CC::LO)
10796 .Case("{@cceq}", AArch64CC::EQ)
10797 .Case("{@ccgt}", AArch64CC::GT)
10798 .Case("{@ccge}", AArch64CC::GE)
10799 .Case("{@cclt}", AArch64CC::LT)
10800 .Case("{@ccle}", AArch64CC::LE)
10801 .Case("{@cchs}", AArch64CC::HS)
10802 .Case("{@ccne}", AArch64CC::NE)
10803 .Case("{@ccvc}", AArch64CC::VC)
10804 .Case("{@ccpl}", AArch64CC::PL)
10805 .Case("{@ccvs}", AArch64CC::VS)
10806 .Case("{@ccmi}", AArch64CC::MI)
10807 .Default(AArch64CC::Invalid);
10808 return Cond;
10809}
10810
10811/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
10812/// WZR, invert(<cond>)'.
10814 SelectionDAG &DAG) {
10815 return DAG.getNode(
10816 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
10817 DAG.getConstant(0, DL, MVT::i32),
10818 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
10819}
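// For instance, materializing an 'eq' flag this way produces something like
// 'csinc w0, wzr, wzr, ne', i.e. the 'cset w0, eq' alias.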
10820
10821// Lower @cc flag output via getSETCC.
10822SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
10823 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
10824 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
10825 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
10826 if (Cond == AArch64CC::Invalid)
10827 return SDValue();
10828 // The output variable should be a scalar integer.
10829 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
10830 OpInfo.ConstraintVT.getSizeInBits() < 8)
10831 report_fatal_error("Flag output operand is of invalid type");
10832
10833 // Get NZCV register. Only update chain when copyfrom is glued.
10834 if (Glue.getNode()) {
10835 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
10836 Chain = Glue.getValue(1);
10837 } else
10838 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
10839 // Extract CC code.
10840 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
10841
10842 SDValue Result;
10843
10844 // Truncate or ZERO_EXTEND based on value types.
10845 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
10846 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
10847 else
10848 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
10849
10850 return Result;
10851}
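// Illustrative use of a flag output operand (names are arbitrary):
//   int is_eq;
//   asm("cmp %w1, %w2" : "=@cceq"(is_eq) : "r"(a), "r"(b));
// leaves is_eq equal to 1 when a == b and 0 otherwise.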
10852
10853/// getConstraintType - Given a constraint letter, return the type of
10854/// constraint it is for this target.
10856AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
10857 if (Constraint.size() == 1) {
10858 switch (Constraint[0]) {
10859 default:
10860 break;
10861 case 'x':
10862 case 'w':
10863 case 'y':
10864 return C_RegisterClass;
10865 // An address with a single base register. Due to the way we
10866 // currently handle addresses it is the same as 'r'.
10867 case 'Q':
10868 return C_Memory;
10869 case 'I':
10870 case 'J':
10871 case 'K':
10872 case 'L':
10873 case 'M':
10874 case 'N':
10875 case 'Y':
10876 case 'Z':
10877 return C_Immediate;
10878 case 'z':
10879 case 'S': // A symbol or label reference with a constant offset
10880 return C_Other;
10881 }
10882 } else if (parsePredicateConstraint(Constraint))
10883 return C_RegisterClass;
10884 else if (parseReducedGprConstraint(Constraint))
10885 return C_RegisterClass;
10886 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
10887 return C_Other;
10888 return TargetLowering::getConstraintType(Constraint);
10889}
10890
10891/// Examine constraint type and operand type and determine a weight value.
10892/// This object must already have been set up with the operand type
10893/// and the current alternative constraint selected.
10895AArch64TargetLowering::getSingleConstraintMatchWeight(
10896 AsmOperandInfo &info, const char *constraint) const {
10898 Value *CallOperandVal = info.CallOperandVal;
10899 // If we don't have a value, we can't do a match,
10900 // but allow it at the lowest weight.
10901 if (!CallOperandVal)
10902 return CW_Default;
10903 Type *type = CallOperandVal->getType();
10904 // Look at the constraint type.
10905 switch (*constraint) {
10906 default:
10908 break;
10909 case 'x':
10910 case 'w':
10911 case 'y':
10912 if (type->isFloatingPointTy() || type->isVectorTy())
10913 weight = CW_Register;
10914 break;
10915 case 'z':
10916 weight = CW_Constant;
10917 break;
10918 case 'U':
10919 if (parsePredicateConstraint(constraint) ||
10920 parseReducedGprConstraint(constraint))
10921 weight = CW_Register;
10922 break;
10923 }
10924 return weight;
10925}
10926
10927std::pair<unsigned, const TargetRegisterClass *>
10928AArch64TargetLowering::getRegForInlineAsmConstraint(
10929 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
10930 if (Constraint.size() == 1) {
10931 switch (Constraint[0]) {
10932 case 'r':
10933 if (VT.isScalableVector())
10934 return std::make_pair(0U, nullptr);
10935 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
10936 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
10937 if (VT.getFixedSizeInBits() == 64)
10938 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
10939 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
10940 case 'w': {
10941 if (!Subtarget->hasFPARMv8())
10942 break;
10943 if (VT.isScalableVector()) {
10944 if (VT.getVectorElementType() != MVT::i1)
10945 return std::make_pair(0U, &AArch64::ZPRRegClass);
10946 return std::make_pair(0U, nullptr);
10947 }
10948 uint64_t VTSize = VT.getFixedSizeInBits();
10949 if (VTSize == 16)
10950 return std::make_pair(0U, &AArch64::FPR16RegClass);
10951 if (VTSize == 32)
10952 return std::make_pair(0U, &AArch64::FPR32RegClass);
10953 if (VTSize == 64)
10954 return std::make_pair(0U, &AArch64::FPR64RegClass);
10955 if (VTSize == 128)
10956 return std::make_pair(0U, &AArch64::FPR128RegClass);
10957 break;
10958 }
10959 // The instructions that this constraint is designed for can
10960 // only take 128-bit registers so just use that regclass.
10961 case 'x':
10962 if (!Subtarget->hasFPARMv8())
10963 break;
10964 if (VT.isScalableVector())
10965 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
10966 if (VT.getSizeInBits() == 128)
10967 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
10968 break;
10969 case 'y':
10970 if (!Subtarget->hasFPARMv8())
10971 break;
10972 if (VT.isScalableVector())
10973 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
10974 break;
10975 }
10976 } else {
10977 if (const auto PC = parsePredicateConstraint(Constraint))
10978 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
10979 return std::make_pair(0U, RegClass);
10980
10981 if (const auto RGC = parseReducedGprConstraint(Constraint))
10982 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
10983 return std::make_pair(0U, RegClass);
10984 }
10985 if (StringRef("{cc}").equals_insensitive(Constraint) ||
10987 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
10988
10989 if (Constraint == "{za}") {
10990 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
10991 }
10992
10993 if (Constraint == "{zt0}") {
10994 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
10995 }
10996
10997 // Use the default implementation in TargetLowering to convert the register
10998 // constraint into a member of a register class.
10999 std::pair<unsigned, const TargetRegisterClass *> Res;
11000 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11001
11002 // Not found as a standard register?
11003 if (!Res.second) {
11004 unsigned Size = Constraint.size();
11005 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
11006 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
11007 int RegNo;
11008 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
11009 if (!Failed && RegNo >= 0 && RegNo <= 31) {
11010 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
11011 // By default we'll emit v0-v31 for this unless there's a modifier, in which
11012 // case we'll emit the correct register as well.
11013 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
11014 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
11015 Res.second = &AArch64::FPR64RegClass;
11016 } else {
11017 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
11018 Res.second = &AArch64::FPR128RegClass;
11019 }
11020 }
11021 }
11022 }
11023
11024 if (Res.second && !Subtarget->hasFPARMv8() &&
11025 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
11026 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
11027 return std::make_pair(0U, nullptr);
11028
11029 return Res;
11030}
11031
11033 llvm::Type *Ty,
11034 bool AllowUnknown) const {
11035 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
11036 return EVT(MVT::i64x8);
11037
11038 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
11039}
11040
11041/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
11042/// vector. If it is invalid, don't add anything to Ops.
11043void AArch64TargetLowering::LowerAsmOperandForConstraint(
11044 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
11045 SelectionDAG &DAG) const {
11046 SDValue Result;
11047
11048 // Currently only support length 1 constraints.
11049 if (Constraint.size() != 1)
11050 return;
11051
11052 char ConstraintLetter = Constraint[0];
11053 switch (ConstraintLetter) {
11054 default:
11055 break;
11056
11057 // This set of constraints deals with valid constants for various instructions.
11058 // Validate and return a target constant for them if we can.
11059 case 'z': {
11060 // 'z' maps to xzr or wzr so it needs an input of 0.
11061 if (!isNullConstant(Op))
11062 return;
11063
11064 if (Op.getValueType() == MVT::i64)
11065 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
11066 else
11067 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
11068 break;
11069 }
11070 case 'S':
11071 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
11072 // supported for PIC while "s" isn't, making "s" less useful. We implement
11073 // "S" but not "s".
11075 break;
11076
11077 case 'I':
11078 case 'J':
11079 case 'K':
11080 case 'L':
11081 case 'M':
11082 case 'N':
11083 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
11084 if (!C)
11085 return;
11086
11087 // Grab the value and do some validation.
11088 uint64_t CVal = C->getZExtValue();
11089 switch (ConstraintLetter) {
11090 // The I constraint applies only to simple ADD or SUB immediate operands:
11091 // i.e. 0 to 4095 with optional shift by 12
11092 // The J constraint applies only to ADD or SUB immediates that would be
11093 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
11094 // instruction [or vice versa], in other words -1 to -4095 with optional
11095 // left shift by 12.
11096 case 'I':
11097 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
11098 break;
11099 return;
11100 case 'J': {
11101 uint64_t NVal = -C->getSExtValue();
11102 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
11103 CVal = C->getSExtValue();
11104 break;
11105 }
11106 return;
11107 }
11108 // The K and L constraints apply *only* to logical immediates, including
11109 // what used to be the MOVI alias for ORR (though the MOVI alias has now
11110 // been removed and MOV should be used). So these constraints have to
11111 // distinguish between bit patterns that are valid 32-bit or 64-bit
11112 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
11113 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
11114 // versa.
11115 case 'K':
11116 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11117 break;
11118 return;
11119 case 'L':
11120 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11121 break;
11122 return;
11123 // The M and N constraints are a superset of K and L respectively, for use
11124 // with the MOV (immediate) alias. As well as the logical immediates they
11125 // also match 32 or 64-bit immediates that can be loaded either using a
11126 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
11127 // (M) or 64-bit 0x1234000000000000 (N) etc.
11128 // As a note some of this code is liberally stolen from the asm parser.
11129 case 'M': {
11130 if (!isUInt<32>(CVal))
11131 return;
11132 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11133 break;
11134 if ((CVal & 0xFFFF) == CVal)
11135 break;
11136 if ((CVal & 0xFFFF0000ULL) == CVal)
11137 break;
11138 uint64_t NCVal = ~(uint32_t)CVal;
11139 if ((NCVal & 0xFFFFULL) == NCVal)
11140 break;
11141 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11142 break;
11143 return;
11144 }
11145 case 'N': {
11146 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11147 break;
11148 if ((CVal & 0xFFFFULL) == CVal)
11149 break;
11150 if ((CVal & 0xFFFF0000ULL) == CVal)
11151 break;
11152 if ((CVal & 0xFFFF00000000ULL) == CVal)
11153 break;
11154 if ((CVal & 0xFFFF000000000000ULL) == CVal)
11155 break;
11156 uint64_t NCVal = ~CVal;
11157 if ((NCVal & 0xFFFFULL) == NCVal)
11158 break;
11159 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11160 break;
11161 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
11162 break;
11163 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
11164 break;
11165 return;
11166 }
11167 default:
11168 return;
11169 }
11170
11171 // All assembler immediates are 64-bit integers.
11172 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
11173 break;
11174 }
11175
11176 if (Result.getNode()) {
11177 Ops.push_back(Result);
11178 return;
11179 }
11180
11181 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11182}
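// A sketch of how the immediate constraints above behave (illustrative only):
//   asm("and %w0, %w1, %2" : "=r"(r) : "r"(x), "K"(0xff00));
// succeeds because 0xff00 is a valid 32-bit bitmask immediate, whereas an
// operand like "K"(0x12345) is rejected here rather than silently
// materialized into a register.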
11183
11184//===----------------------------------------------------------------------===//
11185// AArch64 Advanced SIMD Support
11186//===----------------------------------------------------------------------===//
11187
11188/// WidenVector - Given a value in the V64 register class, produce the
11189/// equivalent value in the V128 register class.
11191 EVT VT = V64Reg.getValueType();
11192 unsigned NarrowSize = VT.getVectorNumElements();
11193 MVT EltTy = VT.getVectorElementType().getSimpleVT();
11194 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
11195 SDLoc DL(V64Reg);
11196
11197 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
11198 V64Reg, DAG.getConstant(0, DL, MVT::i64));
11199}
11200
11201/// getExtFactor - Determine the adjustment factor for the position when
11202/// generating an "extract from vector registers" instruction.
11203static unsigned getExtFactor(SDValue &V) {
11204 EVT EltType = V.getValueType().getVectorElementType();
11205 return EltType.getSizeInBits() / 8;
11206}
11207
11208// Check if a vector is built from one vector via extracted elements of
11209// another together with an AND mask, ensuring that all elements fit
11210// within range. This can be reconstructed using AND and NEON's TBL1.
11212 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11213 SDLoc dl(Op);
11214 EVT VT = Op.getValueType();
11215 assert(!VT.isScalableVector() &&
11216 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11217
11218 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
11219 // directly to TBL1.
11220 if (VT != MVT::v16i8 && VT != MVT::v8i8)
11221 return SDValue();
11222
11223 unsigned NumElts = VT.getVectorNumElements();
11224 assert((NumElts == 8 || NumElts == 16) &&
11225 "Need to have exactly 8 or 16 elements in vector.");
11226
11227 SDValue SourceVec;
11228 SDValue MaskSourceVec;
11229 SmallVector<SDValue, 16> AndMaskConstants;
11230
11231 for (unsigned i = 0; i < NumElts; ++i) {
11232 SDValue V = Op.getOperand(i);
11233 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11234 return SDValue();
11235
11236 SDValue OperandSourceVec = V.getOperand(0);
11237 if (!SourceVec)
11238 SourceVec = OperandSourceVec;
11239 else if (SourceVec != OperandSourceVec)
11240 return SDValue();
11241
11242 // This only looks at shuffles with elements that are
11243 // a) truncated by a constant AND mask extracted from a mask vector, or
11244 // b) extracted directly from a mask vector.
11245 SDValue MaskSource = V.getOperand(1);
11246 if (MaskSource.getOpcode() == ISD::AND) {
11247 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
11248 return SDValue();
11249
11250 AndMaskConstants.push_back(MaskSource.getOperand(1));
11251 MaskSource = MaskSource->getOperand(0);
11252 } else if (!AndMaskConstants.empty()) {
11253 // Either all or no operands should have an AND mask.
11254 return SDValue();
11255 }
11256
11257 // An ANY_EXTEND may be inserted between the AND and the source vector
11258 // extraction. We don't care about that, so we can just skip it.
11259 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
11260 MaskSource = MaskSource.getOperand(0);
11261
11262 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11263 return SDValue();
11264
11265 SDValue MaskIdx = MaskSource.getOperand(1);
11266 if (!isa<ConstantSDNode>(MaskIdx) ||
11267 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
11268 return SDValue();
11269
11270 // We only apply this if all elements come from the same vector with the
11271 // same vector type.
11272 if (!MaskSourceVec) {
11273 MaskSourceVec = MaskSource->getOperand(0);
11274 if (MaskSourceVec.getValueType() != VT)
11275 return SDValue();
11276 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
11277 return SDValue();
11278 }
11279 }
11280
11281 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
11282 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
11283 // insert, we know that the index in the mask must be smaller than the number
11284 // of elements in the source, or we would have an out-of-bounds access.
11285 if (NumElts == 8)
11286 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
11287 DAG.getUNDEF(VT));
11288
11289 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
11290 if (!AndMaskConstants.empty())
11291 MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
11292 DAG.getBuildVector(VT, dl, AndMaskConstants));
11293
11294 return DAG.getNode(
11295 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11296 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
11297 MaskSourceVec);
11298}
11299
11300// Gather data to see if the operation can be modelled as a
11301// shuffle in combination with VEXTs.
11303 SelectionDAG &DAG) const {
11304 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11305 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
11306 SDLoc dl(Op);
11307 EVT VT = Op.getValueType();
11308 assert(!VT.isScalableVector() &&
11309 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11310 unsigned NumElts = VT.getVectorNumElements();
11311
11312 struct ShuffleSourceInfo {
11313 SDValue Vec;
11314 unsigned MinElt;
11315 unsigned MaxElt;
11316
11317 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
11318 // be compatible with the shuffle we intend to construct. As a result
11319 // ShuffleVec will be some sliding window into the original Vec.
11320 SDValue ShuffleVec;
11321
11322 // Code should guarantee that element i in Vec starts at element "WindowBase
11323 // + i * WindowScale in ShuffleVec".
11324 int WindowBase;
11325 int WindowScale;
11326
11327 ShuffleSourceInfo(SDValue Vec)
11328 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
11329 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
11330
11331 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
11332 };
11333
11334 // First gather all vectors used as an immediate source for this BUILD_VECTOR
11335 // node.
11336 SmallVector<ShuffleSourceInfo, 2> Sources;
11337 for (unsigned i = 0; i < NumElts; ++i) {
11338 SDValue V = Op.getOperand(i);
11339 if (V.isUndef())
11340 continue;
11341 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11342 !isa<ConstantSDNode>(V.getOperand(1)) ||
11343 V.getOperand(0).getValueType().isScalableVector()) {
11344 LLVM_DEBUG(
11345 dbgs() << "Reshuffle failed: "
11346 "a shuffle can only come from building a vector from "
11347 "various elements of other fixed-width vectors, provided "
11348 "their indices are constant\n");
11349 return SDValue();
11350 }
11351
11352 // Add this element source to the list if it's not already there.
11353 SDValue SourceVec = V.getOperand(0);
11354 auto Source = find(Sources, SourceVec);
11355 if (Source == Sources.end())
11356 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
11357
11358 // Update the minimum and maximum lane number seen.
11359 unsigned EltNo = V.getConstantOperandVal(1);
11360 Source->MinElt = std::min(Source->MinElt, EltNo);
11361 Source->MaxElt = std::max(Source->MaxElt, EltNo);
11362 }
11363
11364 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
11365 // better than moving to/from gpr registers for larger vectors.
11366 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
11367 // Construct a mask for the tbl. We may need to adjust the index for types
11368 // larger than i8.
11370 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
11371 for (unsigned I = 0; I < NumElts; ++I) {
11372 SDValue V = Op.getOperand(I);
11373 if (V.isUndef()) {
11374 for (unsigned OF = 0; OF < OutputFactor; OF++)
11375 Mask.push_back(-1);
11376 continue;
11377 }
11378 // Set the Mask lanes adjusted for the size of the input and output
11379 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
11380 // output element, adjusted in their positions per input and output types.
11381 unsigned Lane = V.getConstantOperandVal(1);
11382 for (unsigned S = 0; S < Sources.size(); S++) {
11383 if (V.getOperand(0) == Sources[S].Vec) {
11384 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
11385 unsigned InputBase = 16 * S + Lane * InputSize / 8;
11386 for (unsigned OF = 0; OF < OutputFactor; OF++)
11387 Mask.push_back(InputBase + OF);
11388 break;
11389 }
11390 }
11391 }
11392
11393 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
11394 // v16i8, and the TBLMask
11395 SmallVector<SDValue, 16> TBLOperands;
11396 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
11397 ? Intrinsic::aarch64_neon_tbl3
11398 : Intrinsic::aarch64_neon_tbl4,
11399 dl, MVT::i32));
11400 for (unsigned i = 0; i < Sources.size(); i++) {
11401 SDValue Src = Sources[i].Vec;
11402 EVT SrcVT = Src.getValueType();
11403 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
11404 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
11405 "Expected a legally typed vector");
11406 if (SrcVT.is64BitVector())
11407 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
11408 DAG.getUNDEF(MVT::v8i8));
11409 TBLOperands.push_back(Src);
11410 }
11411
11412 SmallVector<SDValue, 16> TBLMask;
11413 for (unsigned i = 0; i < Mask.size(); i++)
11414 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
11415 assert((Mask.size() == 8 || Mask.size() == 16) &&
11416 "Expected a v8i8 or v16i8 Mask");
11417 TBLOperands.push_back(
11418 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
11419
11420 SDValue Shuffle =
11421 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
11422 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
11423 return DAG.getBitcast(VT, Shuffle);
11424 }
11425
11426 if (Sources.size() > 2) {
11427 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
11428 << "sensible when at most two source vectors are "
11429 << "involved\n");
11430 return SDValue();
11431 }
11432
11433 // Find out the smallest element size among result and two sources, and use
11434 // it as element size to build the shuffle_vector.
11435 EVT SmallestEltTy = VT.getVectorElementType();
11436 for (auto &Source : Sources) {
11437 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
11438 if (SrcEltTy.bitsLT(SmallestEltTy)) {
11439 SmallestEltTy = SrcEltTy;
11440 }
11441 }
11442 unsigned ResMultiplier =
11443 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11444 uint64_t VTSize = VT.getFixedSizeInBits();
11445 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
11446 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
11447
11448 // If the source vector is too wide or too narrow, we may nevertheless be able
11449 // to construct a compatible shuffle either by concatenating it with UNDEF or
11450 // extracting a suitable range of elements.
11451 for (auto &Src : Sources) {
11452 EVT SrcVT = Src.ShuffleVec.getValueType();
11453
11454 TypeSize SrcVTSize = SrcVT.getSizeInBits();
11455 if (SrcVTSize == TypeSize::getFixed(VTSize))
11456 continue;
11457
11458 // This stage of the search produces a source with the same element type as
11459 // the original, but with a total width matching the BUILD_VECTOR output.
11460 EVT EltVT = SrcVT.getVectorElementType();
11461 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
11462 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
11463
11464 if (SrcVTSize.getFixedValue() < VTSize) {
11465 assert(2 * SrcVTSize == VTSize);
11466 // We can pad out the smaller vector for free, so if it's part of a
11467 // shuffle...
11468 Src.ShuffleVec =
11469 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
11470 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
11471 continue;
11472 }
11473
11474 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
11475 LLVM_DEBUG(
11476 dbgs() << "Reshuffle failed: result vector too small to extract\n");
11477 return SDValue();
11478 }
11479
11480 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
11481 LLVM_DEBUG(
11482 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
11483 return SDValue();
11484 }
11485
11486 if (Src.MinElt >= NumSrcElts) {
11487 // The extraction can just take the second half
11488 Src.ShuffleVec =
11489 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11490 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11491 Src.WindowBase = -NumSrcElts;
11492 } else if (Src.MaxElt < NumSrcElts) {
11493 // The extraction can just take the first half
11494 Src.ShuffleVec =
11495 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11496 DAG.getConstant(0, dl, MVT::i64));
11497 } else {
11498 // An actual VEXT is needed
11499 SDValue VEXTSrc1 =
11500 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11501 DAG.getConstant(0, dl, MVT::i64));
11502 SDValue VEXTSrc2 =
11503 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11504 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11505 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
11506
11507 if (!SrcVT.is64BitVector()) {
11508 LLVM_DEBUG(
11509 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
11510 "for SVE vectors.");
11511 return SDValue();
11512 }
11513
11514 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
11515 VEXTSrc2,
11516 DAG.getConstant(Imm, dl, MVT::i32));
11517 Src.WindowBase = -Src.MinElt;
11518 }
11519 }
11520
11521 // Another possible incompatibility occurs from the vector element types. We
11522 // can fix this by bitcasting the source vectors to the same type we intend
11523 // for the shuffle.
11524 for (auto &Src : Sources) {
11525 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
11526 if (SrcEltTy == SmallestEltTy)
11527 continue;
11528 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
11529 if (DAG.getDataLayout().isBigEndian()) {
11530 Src.ShuffleVec =
11531 DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
11532 } else {
11533 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
11534 }
11535 Src.WindowScale =
11536 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11537 Src.WindowBase *= Src.WindowScale;
11538 }
11539
11540 // Final check before we try to actually produce a shuffle.
11541 LLVM_DEBUG(for (auto Src
11542 : Sources)
11543 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
11544
11545 // The stars all align, our next step is to produce the mask for the shuffle.
11546 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
11547 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
11548 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
11549 SDValue Entry = Op.getOperand(i);
11550 if (Entry.isUndef())
11551 continue;
11552
11553 auto Src = find(Sources, Entry.getOperand(0));
11554 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
11555
11556 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
11557 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
11558 // segment.
11559 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
11560 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
11561 VT.getScalarSizeInBits());
11562 int LanesDefined = BitsDefined / BitsPerShuffleLane;
11563
11564 // This source is expected to fill ResMultiplier lanes of the final shuffle,
11565 // starting at the appropriate offset.
11566 int *LaneMask = &Mask[i * ResMultiplier];
11567
11568 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
11569 ExtractBase += NumElts * (Src - Sources.begin());
11570 for (int j = 0; j < LanesDefined; ++j)
11571 LaneMask[j] = ExtractBase + j;
11572 }
11573
11574 // Final check before we try to produce nonsense...
11575 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
11576 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
11577 return SDValue();
11578 }
11579
11580 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
11581 for (unsigned i = 0; i < Sources.size(); ++i)
11582 ShuffleOps[i] = Sources[i].ShuffleVec;
11583
11584 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
11585 ShuffleOps[1], Mask);
11586 SDValue V;
11587 if (DAG.getDataLayout().isBigEndian()) {
11588 V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
11589 } else {
11590 V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
11591 }
11592
11593 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
11594 dbgs() << "Reshuffle, creating node: "; V.dump(););
11595
11596 return V;
11597}
11598
11599// check if an EXT instruction can handle the shuffle mask when the
11600// vector sources of the shuffle are the same.
11601static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
11602 unsigned NumElts = VT.getVectorNumElements();
11603
11604 // Assume that the first shuffle index is not UNDEF. Fail if it is.
11605 if (M[0] < 0)
11606 return false;
11607
11608 Imm = M[0];
11609
11610 // If this is a VEXT shuffle, the immediate value is the index of the first
11611 // element. The other shuffle indices must be the successive elements after
11612 // the first one.
11613 unsigned ExpectedElt = Imm;
11614 for (unsigned i = 1; i < NumElts; ++i) {
11615 // Increment the expected index. If it wraps around, just follow it
11616 // back to index zero and keep going.
11617 ++ExpectedElt;
11618 if (ExpectedElt == NumElts)
11619 ExpectedElt = 0;
11620
11621 if (M[i] < 0)
11622 continue; // ignore UNDEF indices
11623 if (ExpectedElt != static_cast<unsigned>(M[i]))
11624 return false;
11625 }
11626
11627 return true;
11628}
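// e.g. for a single-source v8i8 shuffle, the mask <3,4,5,6,7,0,1,2> walks
// consecutive lanes (wrapping at the end), so this returns true with Imm == 3
// and the shuffle becomes an EXT of the vector with itself.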
11629
11630// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
11631// v4i32s. This is really a truncate, which we can construct out of (legal)
11632// concats and truncate nodes.
11634 if (V.getValueType() != MVT::v16i8)
11635 return SDValue();
11636 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
11637
11638 for (unsigned X = 0; X < 4; X++) {
11639 // Check the first item in each group is an extract from lane 0 of a v4i32
11640 // or v4i16.
11641 SDValue BaseExt = V.getOperand(X * 4);
11642 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11643 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
11644 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
11645 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
11646 BaseExt.getConstantOperandVal(1) != 0)
11647 return SDValue();
11648 SDValue Base = BaseExt.getOperand(0);
11649 // And check the other items are extracts from the same vector.
11650 for (unsigned Y = 1; Y < 4; Y++) {
11651 SDValue Ext = V.getOperand(X * 4 + Y);
11652 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11653 Ext.getOperand(0) != Base ||
11654 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
11655 Ext.getConstantOperandVal(1) != Y)
11656 return SDValue();
11657 }
11658 }
11659
11660 // Turn the buildvector into a series of truncates and concats, which will
11661 // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
11662 // concatenated together to produce 2 v8i16. These are both truncated and
11663 // concatenated together.
11664 SDLoc DL(V);
11665 SDValue Trunc[4] = {
11666 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
11667 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
11668 for (SDValue &V : Trunc)
11669 if (V.getValueType() == MVT::v4i32)
11670 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
11671 SDValue Concat0 =
11672 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
11673 SDValue Concat1 =
11674 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
11675 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
11676 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
11677 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
11678}
11679
11680 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
11681 /// element width than the vector lane type. If that is the case, the function
11682 /// returns true and writes the value of the DUP instruction lane operand into
11683 /// DupLaneOp.
11684static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
11685 unsigned &DupLaneOp) {
11686 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
11687 "Only possible block sizes for wide DUP are: 16, 32, 64");
11688
11689 if (BlockSize <= VT.getScalarSizeInBits())
11690 return false;
11691 if (BlockSize % VT.getScalarSizeInBits() != 0)
11692 return false;
11693 if (VT.getSizeInBits() % BlockSize != 0)
11694 return false;
11695
11696 size_t SingleVecNumElements = VT.getVectorNumElements();
11697 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
11698 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
11699
11700 // We are looking for masks like
11701 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
11702 // might be replaced by 'undefined'. BlockIndices will eventually contain
11703 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
11704 // for the above examples)
11705 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
11706 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
11707 for (size_t I = 0; I < NumEltsPerBlock; I++) {
11708 int Elt = M[BlockIndex * NumEltsPerBlock + I];
11709 if (Elt < 0)
11710 continue;
11711 // For now we don't support shuffles that use the second operand
11712 if ((unsigned)Elt >= SingleVecNumElements)
11713 return false;
11714 if (BlockElts[I] < 0)
11715 BlockElts[I] = Elt;
11716 else if (BlockElts[I] != Elt)
11717 return false;
11718 }
11719
11720 // We found a candidate block (possibly with some undefs). It must be a
11721 // sequence of consecutive integers starting with a value divisible by
11722 // NumEltsPerBlock with some values possibly replaced by undef-s.
11723
11724 // Find first non-undef element
11725 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
11726 assert(FirstRealEltIter != BlockElts.end() &&
11727 "Shuffle with all-undefs must have been caught by previous cases, "
11728 "e.g. isSplat()");
11729 if (FirstRealEltIter == BlockElts.end()) {
11730 DupLaneOp = 0;
11731 return true;
11732 }
11733
11734 // Index of FirstRealElt in BlockElts
11735 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
11736
11737 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
11738 return false;
11739 // BlockElts[0] must have the following value if it isn't undef:
11740 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
11741
11742 // Check the first element
11743 if (Elt0 % NumEltsPerBlock != 0)
11744 return false;
11745 // Check that the sequence indeed consists of consecutive integers (modulo
11746 // undefs)
11747 for (size_t I = 0; I < NumEltsPerBlock; I++)
11748 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
11749 return false;
11750
11751 DupLaneOp = Elt0 / NumEltsPerBlock;
11752 return true;
11753}
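// e.g. a v4i32 mask <2,3,2,3> checked with BlockSize == 64 has two elements
// per block and block pattern {2,3}, so DupLaneOp == 1: duplicate the second
// 64-bit lane of the source.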
11754
11755// check if an EXT instruction can handle the shuffle mask when the
11756// vector sources of the shuffle are different.
11757static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
11758 unsigned &Imm) {
11759 // Look for the first non-undef element.
11760 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
11761
11762 // Benefit from APInt to handle overflow when calculating the expected element.
11763 unsigned NumElts = VT.getVectorNumElements();
11764 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
11765 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
11766 // The following shuffle indices must be the successive elements after the
11767 // first real element.
11768 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
11769 return Elt != ExpectedElt++ && Elt != -1;
11770 });
11771 if (FoundWrongElt)
11772 return false;
11773
11774 // The index of an EXT is the first element if it is not UNDEF.
11775 // Watch out for the beginning UNDEFs. The EXT index should be the expected
11776 // value of the first element. E.g.
11777 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
11778 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
11779 // ExpectedElt is the last mask index plus 1.
11780 Imm = ExpectedElt.getZExtValue();
11781
11782 // There are two different cases that require reversing the input vectors.
11783 // For example, for vector <4 x i32> we have the following cases,
11784 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
11785 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
11786 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
11787 // to reverse two input vectors.
11788 if (Imm < NumElts)
11789 ReverseEXT = true;
11790 else
11791 Imm -= NumElts;
11792
11793 return true;
11794}
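// e.g. for v4i32, the mask <1,2,3,4> yields Imm == 1 with the operands in
// their original order (EXT of V1:V2 starting at element 1), while the mask
// <-1,-1,7,0> from case 2 above also yields Imm == 1 but with ReverseEXT set.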
11795
11796/// isREVMask - Check if a vector shuffle corresponds to a REV
11797/// instruction with the specified blocksize. (The order of the elements
11798/// within each block of the vector is reversed.)
11799static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
11800 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 ||
11801 BlockSize == 128) &&
11802 "Only possible block sizes for REV are: 16, 32, 64, 128");
11803
11804 unsigned EltSz = VT.getScalarSizeInBits();
11805 unsigned NumElts = VT.getVectorNumElements();
11806 unsigned BlockElts = M[0] + 1;
11807 // If the first shuffle index is UNDEF, be optimistic.
11808 if (M[0] < 0)
11809 BlockElts = BlockSize / EltSz;
11810
11811 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
11812 return false;
11813
11814 for (unsigned i = 0; i < NumElts; ++i) {
11815 if (M[i] < 0)
11816 continue; // ignore UNDEF indices
11817 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
11818 return false;
11819 }
11820
11821 return true;
11822}
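// e.g. a REV64 of v8i16 corresponds to the mask <3,2,1,0,7,6,5,4>
// (BlockSize == 64, element size 16, so four elements per reversed block).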
11823
11824static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11825 unsigned NumElts = VT.getVectorNumElements();
11826 if (NumElts % 2 != 0)
11827 return false;
11828 WhichResult = (M[0] == 0 ? 0 : 1);
11829 for (unsigned i = 0; i < NumElts; i += 2) {
11830 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11831 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
11832 return false;
11833 }
11834 return true;
11835}
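// e.g. for v4i32, <0,4,2,6> matches with WhichResult == 0 (TRN1) and
// <1,5,3,7> matches with WhichResult == 1 (TRN2).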
11836
11837/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
11838/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11839/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
11840static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11841 unsigned NumElts = VT.getVectorNumElements();
11842 if (NumElts % 2 != 0)
11843 return false;
11844 WhichResult = (M[0] == 0 ? 0 : 1);
11845 unsigned Idx = WhichResult * NumElts / 2;
11846 for (unsigned i = 0; i != NumElts; i += 2) {
11847 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11848 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
11849 return false;
11850 Idx += 1;
11851 }
11852
11853 return true;
11854}
11855
11856/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
11857/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11858 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
11859static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11860 unsigned Half = VT.getVectorNumElements() / 2;
11861 WhichResult = (M[0] == 0 ? 0 : 1);
11862 for (unsigned j = 0; j != 2; ++j) {
11863 unsigned Idx = WhichResult;
11864 for (unsigned i = 0; i != Half; ++i) {
11865 int MIdx = M[i + j * Half];
11866 if (MIdx >= 0 && (unsigned)MIdx != Idx)
11867 return false;
11868 Idx += 2;
11869 }
11870 }
11871
11872 return true;
11873}
11874
11875/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
11876/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11877/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
11878static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11879 unsigned NumElts = VT.getVectorNumElements();
11880 if (NumElts % 2 != 0)
11881 return false;
11882 WhichResult = (M[0] == 0 ? 0 : 1);
11883 for (unsigned i = 0; i < NumElts; i += 2) {
11884 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11885 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
11886 return false;
11887 }
11888 return true;
11889}
11890
11891static bool isINSMask(ArrayRef<int> M, int NumInputElements,
11892 bool &DstIsLeft, int &Anomaly) {
11893 if (M.size() != static_cast<size_t>(NumInputElements))
11894 return false;
11895
11896 int NumLHSMatch = 0, NumRHSMatch = 0;
11897 int LastLHSMismatch = -1, LastRHSMismatch = -1;
11898
11899 for (int i = 0; i < NumInputElements; ++i) {
11900 if (M[i] == -1) {
11901 ++NumLHSMatch;
11902 ++NumRHSMatch;
11903 continue;
11904 }
11905
11906 if (M[i] == i)
11907 ++NumLHSMatch;
11908 else
11909 LastLHSMismatch = i;
11910
11911 if (M[i] == i + NumInputElements)
11912 ++NumRHSMatch;
11913 else
11914 LastRHSMismatch = i;
11915 }
11916
11917 if (NumLHSMatch == NumInputElements - 1) {
11918 DstIsLeft = true;
11919 Anomaly = LastLHSMismatch;
11920 return true;
11921 } else if (NumRHSMatch == NumInputElements - 1) {
11922 DstIsLeft = false;
11923 Anomaly = LastRHSMismatch;
11924 return true;
11925 }
11926
11927 return false;
11928}
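// e.g. with two v4i32 inputs, the mask <0,1,6,3> matches the LHS everywhere
// except element 2, so DstIsLeft == true and Anomaly == 2: the shuffle is a
// single INS of RHS lane 2 into lane 2 of the LHS.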
11929
11930static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
11931 if (VT.getSizeInBits() != 128)
11932 return false;
11933
11934 unsigned NumElts = VT.getVectorNumElements();
11935
11936 for (int I = 0, E = NumElts / 2; I != E; I++) {
11937 if (Mask[I] != I)
11938 return false;
11939 }
11940
11941 int Offset = NumElts / 2;
11942 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
11943 if (Mask[I] != I + SplitLHS * Offset)
11944 return false;
11945 }
11946
11947 return true;
11948}
11949
11951 SDLoc DL(Op);
11952 EVT VT = Op.getValueType();
11953 SDValue V0 = Op.getOperand(0);
11954 SDValue V1 = Op.getOperand(1);
11955 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
11956
11959 return SDValue();
11960
11961 bool SplitV0 = V0.getValueSizeInBits() == 128;
11962
11963 if (!isConcatMask(Mask, VT, SplitV0))
11964 return SDValue();
11965
11966 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
11967 if (SplitV0) {
11968 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
11969 DAG.getConstant(0, DL, MVT::i64));
11970 }
11971 if (V1.getValueSizeInBits() == 128) {
11972 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
11973 DAG.getConstant(0, DL, MVT::i64));
11974 }
11975 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
11976}
11977
11978/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
11979/// the specified operations to build the shuffle. ID is the perfect-shuffle
11980 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
11981 /// table entry and LHS/RHS are the immediate inputs for this stage of the
11982 /// shuffle.
11984 SDValue V2, unsigned PFEntry, SDValue LHS,
11985 SDValue RHS, SelectionDAG &DAG,
11986 const SDLoc &dl) {
11987 unsigned OpNum = (PFEntry >> 26) & 0x0F;
11988 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
11989 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
11990
11991 enum {
11992 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
11993 OP_VREV,
11994 OP_VDUP0,
11995 OP_VDUP1,
11996 OP_VDUP2,
11997 OP_VDUP3,
11998 OP_VEXT1,
11999 OP_VEXT2,
12000 OP_VEXT3,
12001 OP_VUZPL, // VUZP, left result
12002 OP_VUZPR, // VUZP, right result
12003 OP_VZIPL, // VZIP, left result
12004 OP_VZIPR, // VZIP, right result
12005 OP_VTRNL, // VTRN, left result
12006 OP_VTRNR, // VTRN, right result
12007 OP_MOVLANE // Move lane. RHSID is the lane to move into
12008 };
12009
12010 if (OpNum == OP_COPY) {
12011 if (LHSID == (1 * 9 + 2) * 9 + 3)
12012 return LHS;
12013 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
12014 return RHS;
12015 }
12016
12017 if (OpNum == OP_MOVLANE) {
12018 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
12019 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
12020 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
12021 Elt = 3 - Elt;
12022 while (Elt > 0) {
12023 ID /= 9;
12024 Elt--;
12025 }
12026 return (ID % 9 == 8) ? -1 : ID % 9;
12027 };
12028
12029 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
12030 // get the lane to move from the PFID, which is always from the
12031 // original vectors (V1 or V2).
12032 SDValue OpLHS = GeneratePerfectShuffle(
12033 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
12034 EVT VT = OpLHS.getValueType();
12035 assert(RHSID < 8 && "Expected a lane index for RHSID!");
12036 unsigned ExtLane = 0;
12037 SDValue Input;
12038
12039 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
12040 // convert into a higher type.
12041 if (RHSID & 0x4) {
12042 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
12043 if (MaskElt == -1)
12044 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
12045 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12046 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
12047 Input = MaskElt < 2 ? V1 : V2;
12048 if (VT.getScalarSizeInBits() == 16) {
12049 Input = DAG.getBitcast(MVT::v2f32, Input);
12050 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
12051 } else {
12052 assert(VT.getScalarSizeInBits() == 32 &&
12053 "Expected 16 or 32 bit shuffle elemements");
12054 Input = DAG.getBitcast(MVT::v2f64, Input);
12055 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
12056 }
12057 } else {
12058 int MaskElt = getPFIDLane(ID, RHSID);
12059 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12060 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
12061 Input = MaskElt < 4 ? V1 : V2;
12062 // Be careful about creating illegal types. Use f16 instead of i16.
12063 if (VT == MVT::v4i16) {
12064 Input = DAG.getBitcast(MVT::v4f16, Input);
12065 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
12066 }
12067 }
12068 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12069 Input.getValueType().getVectorElementType(),
12070 Input, DAG.getVectorIdxConstant(ExtLane, dl));
12071 SDValue Ins =
12072 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
12073 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
12074 return DAG.getBitcast(VT, Ins);
12075 }
12076
12077 SDValue OpLHS, OpRHS;
12078 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
12079 RHS, DAG, dl);
12080 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
12081 RHS, DAG, dl);
12082 EVT VT = OpLHS.getValueType();
12083
12084 switch (OpNum) {
12085 default:
12086 llvm_unreachable("Unknown shuffle opcode!");
12087 case OP_VREV:
12088 // VREV divides the vector in half and swaps within the half.
12089 if (VT.getVectorElementType() == MVT::i32 ||
12090 VT.getVectorElementType() == MVT::f32)
12091 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
12092 // vrev <4 x i16> -> REV32
12093 if (VT.getVectorElementType() == MVT::i16 ||
12094 VT.getVectorElementType() == MVT::f16 ||
12095 VT.getVectorElementType() == MVT::bf16)
12096 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
12097 // vrev <4 x i8> -> REV16
12098 assert(VT.getVectorElementType() == MVT::i8);
12099 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
12100 case OP_VDUP0:
12101 case OP_VDUP1:
12102 case OP_VDUP2:
12103 case OP_VDUP3: {
12104 EVT EltTy = VT.getVectorElementType();
12105 unsigned Opcode;
12106 if (EltTy == MVT::i8)
12107 Opcode = AArch64ISD::DUPLANE8;
12108 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
12109 Opcode = AArch64ISD::DUPLANE16;
12110 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
12111 Opcode = AArch64ISD::DUPLANE32;
12112 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
12113 Opcode = AArch64ISD::DUPLANE64;
12114 else
12115 llvm_unreachable("Invalid vector element type?");
12116
12117 if (VT.getSizeInBits() == 64)
12118 OpLHS = WidenVector(OpLHS, DAG);
12119 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
12120 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
12121 }
12122 case OP_VEXT1:
12123 case OP_VEXT2:
12124 case OP_VEXT3: {
12125 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
12126 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
12127 DAG.getConstant(Imm, dl, MVT::i32));
12128 }
12129 case OP_VUZPL:
12130 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
12131 case OP_VUZPR:
12132 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
12133 case OP_VZIPL:
12134 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
12135 case OP_VZIPR:
12136 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
12137 case OP_VTRNL:
12138 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
12139 case OP_VTRNR:
12140 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
12141 }
12142}
12143
12145 SelectionDAG &DAG) {
12146 // Check to see if we can use the TBL instruction.
12147 SDValue V1 = Op.getOperand(0);
12148 SDValue V2 = Op.getOperand(1);
12149 SDLoc DL(Op);
12150
12151 EVT EltVT = Op.getValueType().getVectorElementType();
12152 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
12153
12154 bool Swap = false;
12155 if (V1.isUndef() || isZerosVector(V1.getNode())) {
12156 std::swap(V1, V2);
12157 Swap = true;
12158 }
12159
12160 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
12161 // out of range values with 0s. We do need to make sure that any out-of-range
12162 // values are really out-of-range for a v16i8 vector.
12163 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
12164 MVT IndexVT = MVT::v8i8;
12165 unsigned IndexLen = 8;
12166 if (Op.getValueSizeInBits() == 128) {
12167 IndexVT = MVT::v16i8;
12168 IndexLen = 16;
12169 }
12170
12171 SmallVector<SDValue, 8> TBLMask;
12172 for (int Val : ShuffleMask) {
12173 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
12174 unsigned Offset = Byte + Val * BytesPerElt;
12175 if (Swap)
12176 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
12177 if (IsUndefOrZero && Offset >= IndexLen)
12178 Offset = 255;
12179 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
12180 }
12181 }
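// Example of the byte mask built above: for a v4i16 shuffle with mask
// <0, 5, 2, 7> (BytesPerElt == 2, IndexLen == 8, no swap, V2 in use), the
// TBL indices come out as <0,1, 10,11, 4,5, 14,15>; indices of 8 or more
// address V2, which is concatenated after V1 into the lookup table below.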
12182
12183 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
12184 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
12185
12186 SDValue Shuffle;
12187 if (IsUndefOrZero) {
12188 if (IndexLen == 8)
12189 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
12190 Shuffle = DAG.getNode(
12191 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12192 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12193 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12194 } else {
12195 if (IndexLen == 8) {
12196 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
12197 Shuffle = DAG.getNode(
12198 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12199 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12200 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12201 } else {
12202 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
12203 // cannot currently represent the register constraints on the input
12204 // table registers.
12205 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
12206 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
12207 // IndexLen));
12208 Shuffle = DAG.getNode(
12209 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12210 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
12211 V2Cst,
12212 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12213 }
12214 }
12215 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
12216}
12217
12218static unsigned getDUPLANEOp(EVT EltType) {
12219 if (EltType == MVT::i8)
12220 return AArch64ISD::DUPLANE8;
12221 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
12222 return AArch64ISD::DUPLANE16;
12223 if (EltType == MVT::i32 || EltType == MVT::f32)
12224 return AArch64ISD::DUPLANE32;
12225 if (EltType == MVT::i64 || EltType == MVT::f64)
12226 return AArch64ISD::DUPLANE64;
12227
12228 llvm_unreachable("Invalid vector element type?");
12229}
12230
12231static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
12232 unsigned Opcode, SelectionDAG &DAG) {
12233 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
12234 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
12235 // Match: dup (bitcast (extract_subv X, C)), LaneC
12236 if (BitCast.getOpcode() != ISD::BITCAST ||
12237 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
12238 return false;
12239
12240 // The extract index must align in the destination type. That may not
12241 // happen if the bitcast is from narrow to wide type.
12242 SDValue Extract = BitCast.getOperand(0);
12243 unsigned ExtIdx = Extract.getConstantOperandVal(1);
12244 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
12245 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
12246 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
12247 if (ExtIdxInBits % CastedEltBitWidth != 0)
12248 return false;
12249
12250 // Can't handle cases where vector size is not 128-bit
12251 if (!Extract.getOperand(0).getValueType().is128BitVector())
12252 return false;
12253
12254 // Update the lane value by offsetting with the scaled extract index.
12255 LaneC += ExtIdxInBits / CastedEltBitWidth;
12256
12257 // Determine the casted vector type of the wide vector input.
12258 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12259 // Examples:
12260 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12261 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12262 unsigned SrcVecNumElts =
12263 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
12264 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
12265 SrcVecNumElts);
12266 return true;
12267 };
12268 MVT CastVT;
12269 if (getScaledOffsetDup(V, Lane, CastVT)) {
12270 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
12271 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12272 V.getOperand(0).getValueType().is128BitVector()) {
12273 // The lane is incremented by the index of the extract.
12274 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12275 Lane += V.getConstantOperandVal(1);
12276 V = V.getOperand(0);
12277 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12278 // The lane is decremented if we are splatting from the 2nd operand.
12279 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12280 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12281 Lane -= Idx * VT.getVectorNumElements() / 2;
12282 V = WidenVector(V.getOperand(Idx), DAG);
12283 } else if (VT.getSizeInBits() == 64) {
12284 // Widen the operand to 128-bit register with undef.
12285 V = WidenVector(V, DAG);
12286 }
12287 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
12288}
12289
12290 // Return true if we can derive a halved shuffle mask from the parameter mask:
12291 // every two adjacent mask values must be consecutive, and each pair must
12292 // start at an even element (undefs are allowed).
12293 static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
12294 SmallVectorImpl<int> &NewMask) {
12295 unsigned NumElts = VT.getVectorNumElements();
12296 if (NumElts % 2 != 0)
12297 return false;
12298
12299 NewMask.clear();
12300 for (unsigned i = 0; i < NumElts; i += 2) {
12301 int M0 = M[i];
12302 int M1 = M[i + 1];
12303
12304 // If both elements are undef, new mask is undef too.
12305 if (M0 == -1 && M1 == -1) {
12306 NewMask.push_back(-1);
12307 continue;
12308 }
12309
12310 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
12311 NewMask.push_back(M1 / 2);
12312 continue;
12313 }
12314
12315 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
12316 NewMask.push_back(M0 / 2);
12317 continue;
12318 }
12319
12320 NewMask.clear();
12321 return false;
12322 }
12323
12324 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
12325 return true;
12326}
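// For example, on a v4i32 shuffle the mask <2,3,0,1> widens to <1,0> (each
// pair is consecutive and starts on an even element), while <1,2,3,0> is
// rejected because its first pair starts on an odd element.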
12327
12328// Try to widen element type to get a new mask value for a better permutation
12329// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
12330// UZP1/2, TRN1/2, REV, INS, etc.
12331// For example:
12332// shufflevector <4 x i32> %a, <4 x i32> %b,
12333// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
12334// is equivalent to:
12335// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
12336// Finally, we can get:
12337// mov v0.d[0], v1.d[1]
12338 static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
12339 SDLoc DL(Op);
12340 EVT VT = Op.getValueType();
12341 EVT ScalarVT = VT.getVectorElementType();
12342 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
12343 SDValue V0 = Op.getOperand(0);
12344 SDValue V1 = Op.getOperand(1);
12345 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12346
12347 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
12348 // We need to make sure the wider element type is legal. Thus, ElementSize
12349 // should be not larger than 32 bits, and i1 type should also be excluded.
12350 if (ElementSize > 32 || ElementSize == 1)
12351 return SDValue();
12352
12353 SmallVector<int, 8> NewMask;
12354 if (isWideTypeMask(Mask, VT, NewMask)) {
12355 MVT NewEltVT = VT.isFloatingPoint()
12356 ? MVT::getFloatingPointVT(ElementSize * 2)
12357 : MVT::getIntegerVT(ElementSize * 2);
12358 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12359 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12360 V0 = DAG.getBitcast(NewVT, V0);
12361 V1 = DAG.getBitcast(NewVT, V1);
12362 return DAG.getBitcast(VT,
12363 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
12364 }
12365 }
12366
12367 return SDValue();
12368}
12369
12370// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
12372 ArrayRef<int> ShuffleMask,
12373 SelectionDAG &DAG) {
12374 SDValue Tbl1 = Op->getOperand(0);
12375 SDValue Tbl2 = Op->getOperand(1);
12376 SDLoc dl(Op);
12377 SDValue Tbl2ID =
12378 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
12379
12380 EVT VT = Op.getValueType();
12381 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12382 Tbl1->getOperand(0) != Tbl2ID ||
12384 Tbl2->getOperand(0) != Tbl2ID)
12385 return SDValue();
12386
12387 if (Tbl1->getValueType(0) != MVT::v16i8 ||
12388 Tbl2->getValueType(0) != MVT::v16i8)
12389 return SDValue();
12390
12391 SDValue Mask1 = Tbl1->getOperand(3);
12392 SDValue Mask2 = Tbl2->getOperand(3);
12393 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
12394 for (unsigned I = 0; I < 16; I++) {
12395 if (ShuffleMask[I] < 16)
12396 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
12397 else {
12398 auto *C =
12399 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
12400 if (!C)
12401 return SDValue();
12402 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
12403 }
12404 }
12405
12406 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
12407 SDValue ID =
12408 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
12409
12410 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
12411 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
12412 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
12413}
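// Note on the rebasing above: a tbl2 mask addresses 32 bytes of table. In
// the combined tbl4 the second tbl2's registers become table operands 3 and
// 4, i.e. they start 32 bytes further into the lookup table, so its mask
// entries are offset by adding 32.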
12414
12415// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
12416// but we don't have an appropriate instruction,
12417// so custom-lower it as ZIP1-with-zeros.
12418SDValue
12419AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
12420 SelectionDAG &DAG) const {
12421 SDLoc dl(Op);
12422 EVT VT = Op.getValueType();
12423 SDValue SrcOp = Op.getOperand(0);
12424 EVT SrcVT = SrcOp.getValueType();
12425 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
12426 "Unexpected extension factor.");
12427 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
12428 // FIXME: support multi-step zipping?
12429 if (Scale != 2)
12430 return SDValue();
12431 SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
12432 return DAG.getBitcast(VT,
12433 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
12434}
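// For example, a v8i16 zero-extend-in-reg of a v16i8 source becomes
// ZIP1(v16i8 Src, zeros), giving bytes <s0,0, s1,0, ..., s7,0>; with the
// usual little-endian lane layout the bitcast back to v8i16 then yields the
// zero-extended low eight lanes.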
12435
12436SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
12437 SelectionDAG &DAG) const {
12438 SDLoc dl(Op);
12439 EVT VT = Op.getValueType();
12440
12441 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
12442
12443 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12444 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
12445
12446 // Convert shuffles that are directly supported on NEON to target-specific
12447 // DAG nodes, instead of keeping them as shuffles and matching them again
12448 // during code selection. This is more efficient and avoids the possibility
12449 // of inconsistencies between legalization and selection.
12450 ArrayRef<int> ShuffleMask = SVN->getMask();
12451
12452 SDValue V1 = Op.getOperand(0);
12453 SDValue V2 = Op.getOperand(1);
12454
12455 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
12456 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
12457 "Unexpected VECTOR_SHUFFLE mask size!");
12458
12459 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
12460 return Res;
12461
12462 if (SVN->isSplat()) {
12463 int Lane = SVN->getSplatIndex();
12464 // If this is undef splat, generate it via "just" vdup, if possible.
12465 if (Lane == -1)
12466 Lane = 0;
12467
12468 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
12469 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
12470 V1.getOperand(0));
12471 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
12472 // constant. If so, we can just reference the lane's definition directly.
12473 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
12474 !isa<ConstantSDNode>(V1.getOperand(Lane)))
12475 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
12476
12477 // Otherwise, duplicate from the lane of the input vector.
12478 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
12479 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
12480 }
12481
12482 // Check if the mask matches a DUP for a wider element
12483 for (unsigned LaneSize : {64U, 32U, 16U}) {
12484 unsigned Lane = 0;
12485 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
12486 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
12487 : LaneSize == 32 ? AArch64ISD::DUPLANE32
12488 : AArch64ISD::DUPLANE16;
12489 // Cast V1 to an integer vector with required lane size
12490 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
12491 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
12492 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
12493 V1 = DAG.getBitcast(NewVecTy, V1);
12494 // Construct the DUP instruction
12495 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
12496 // Cast back to the original type
12497 return DAG.getBitcast(VT, V1);
12498 }
12499 }
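// For example, a v8i16 shuffle mask <2,3,2,3,2,3,2,3> is a splat of the
// 32-bit lane 1, so it is handled here as DUPLANE32 of a v4i32 bitcast of
// V1, with the result cast back to v8i16.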
12500
12501 if (isREVMask(ShuffleMask, VT, 64))
12502 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
12503 if (isREVMask(ShuffleMask, VT, 32))
12504 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
12505 if (isREVMask(ShuffleMask, VT, 16))
12506 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
12507
12508 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
12509 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
12510 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
12511 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
12512 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
12513 DAG.getConstant(8, dl, MVT::i32));
12514 }
12515
12516 bool ReverseEXT = false;
12517 unsigned Imm;
12518 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
12519 if (ReverseEXT)
12520 std::swap(V1, V2);
12521 Imm *= getExtFactor(V1);
12522 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
12523 DAG.getConstant(Imm, dl, MVT::i32));
12524 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
12525 Imm *= getExtFactor(V1);
12526 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
12527 DAG.getConstant(Imm, dl, MVT::i32));
12528 }
12529
12530 unsigned WhichResult;
12531 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
12532 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12533 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12534 }
12535 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
12536 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12537 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12538 }
12539 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
12540 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12541 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12542 }
12543
12544 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12545 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12546 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12547 }
12548 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12549 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12550 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12551 }
12552 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12553 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12554 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12555 }
12556
12557 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
12558 return Concat;
12559
12560 bool DstIsLeft;
12561 int Anomaly;
12562 int NumInputElements = V1.getValueType().getVectorNumElements();
12563 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
12564 SDValue DstVec = DstIsLeft ? V1 : V2;
12565 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
12566
12567 SDValue SrcVec = V1;
12568 int SrcLane = ShuffleMask[Anomaly];
12569 if (SrcLane >= NumInputElements) {
12570 SrcVec = V2;
12571 SrcLane -= VT.getVectorNumElements();
12572 }
12573 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
12574
12575 EVT ScalarVT = VT.getVectorElementType();
12576
12577 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
12578 ScalarVT = MVT::i32;
12579
12580 return DAG.getNode(
12581 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
12582 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
12583 DstLaneV);
12584 }
12585
12586 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
12587 return NewSD;
12588
12589 // If the shuffle is not directly supported and it has 4 elements, use
12590 // the PerfectShuffle-generated table to synthesize it from other shuffles.
12591 unsigned NumElts = VT.getVectorNumElements();
12592 if (NumElts == 4) {
12593 unsigned PFIndexes[4];
12594 for (unsigned i = 0; i != 4; ++i) {
12595 if (ShuffleMask[i] < 0)
12596 PFIndexes[i] = 8;
12597 else
12598 PFIndexes[i] = ShuffleMask[i];
12599 }
12600
12601 // Compute the index in the perfect shuffle table.
12602 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
12603 PFIndexes[2] * 9 + PFIndexes[3];
12604 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
12605 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
12606 dl);
12607 }
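// For example, the 4-element mask <0,1,u,u> maps to PFIndexes {0,1,8,8}
// (8 encodes undef), giving PFTableIndex = 0*729 + 1*81 + 8*9 + 8 == 161.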
12608
12609 return GenerateTBL(Op, ShuffleMask, DAG);
12610}
12611
12612SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
12613 SelectionDAG &DAG) const {
12614 EVT VT = Op.getValueType();
12615
12616 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12617 return LowerToScalableOp(Op, DAG);
12618
12619 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
12620 "Unexpected vector type!");
12621
12622 // We can handle the constant cases during isel.
12623 if (isa<ConstantSDNode>(Op.getOperand(0)))
12624 return Op;
12625
12626 // There isn't a natural way to handle the general i1 case, so we use some
12627 // trickery with whilelo.
12628 SDLoc DL(Op);
12629 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
12630 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
12631 DAG.getValueType(MVT::i1));
12632 SDValue ID =
12633 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
12634 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12635 if (VT == MVT::nxv1i1)
12636 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
12637 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
12638 Zero, SplatVal),
12639 Zero);
12640 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
12641}
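// The whilelo trick above works because the i1 splat value is sign-extended
// to i64: whilelo(0, 0) produces an all-false predicate, while
// whilelo(0, 0xFFFFFFFFFFFFFFFF) leaves every lane active.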
12642
12643SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
12644 SelectionDAG &DAG) const {
12645 SDLoc DL(Op);
12646
12647 EVT VT = Op.getValueType();
12648 if (!isTypeLegal(VT) || !VT.isScalableVector())
12649 return SDValue();
12650
12651 // Current lowering only supports the SVE-ACLE types.
12652 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
12653 return SDValue();
12654
12655 // The DUPQ operation is independent of element type, so normalise to i64s.
12656 SDValue Idx128 = Op.getOperand(2);
12657
12658 // DUPQ can be used when idx is in range.
12659 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
12660 if (CIdx && (CIdx->getZExtValue() <= 3)) {
12661 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
12662 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
12663 }
12664
12665 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
12666
12667 // The ACLE says this must produce the same result as:
12668 // svtbl(data, svadd_x(svptrue_b64(),
12669 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
12670 // index * 2))
12671 SDValue One = DAG.getConstant(1, DL, MVT::i64);
12672 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
12673
12674 // create the vector 0,1,0,1,...
12675 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
12676 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
12677
12678 // create the vector idx64,idx64+1,idx64,idx64+1,...
12679 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
12680 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
12681 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
12682
12683 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
12684 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
12685 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
12686}
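// For an out-of-range or non-constant index, the TBL expansion above builds
// the index vector <2*idx, 2*idx+1, 2*idx, 2*idx+1, ...> over nxv2i64, so
// each 128-bit quadword of the result receives the two 64-bit halves of
// quadword 'idx' of the source, matching the ACLE definition quoted above.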
12687
12688
12689static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
12690 APInt &UndefBits) {
12691 EVT VT = BVN->getValueType(0);
12692 APInt SplatBits, SplatUndef;
12693 unsigned SplatBitSize;
12694 bool HasAnyUndefs;
12695 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
12696 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
12697
12698 for (unsigned i = 0; i < NumSplats; ++i) {
12699 CnstBits <<= SplatBitSize;
12700 UndefBits <<= SplatBitSize;
12701 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
12702 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
12703 }
12704
12705 return true;
12706 }
12707
12708 return false;
12709}
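// For example, a v2i32 BUILD_VECTOR splatting the constant 0xFF resolves to
// CnstBits == 0x000000FF000000FF: the 32-bit splat is shifted in once per
// repetition across the 64-bit vector width.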
12710
12711// Try 64-bit splatted SIMD immediate.
12712static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12713 const APInt &Bits) {
12714 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12715 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12716 EVT VT = Op.getValueType();
12717 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
12718
12719 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
12720 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
12721
12722 SDLoc dl(Op);
12723 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12724 DAG.getConstant(Value, dl, MVT::i32));
12725 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12726 }
12727 }
12728
12729 return SDValue();
12730}
12731
12732// Try 32-bit splatted SIMD immediate.
12733static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12734 const APInt &Bits,
12735 const SDValue *LHS = nullptr) {
12736 EVT VT = Op.getValueType();
12737 if (VT.isFixedLengthVector() &&
12739 return SDValue();
12740
12741 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12742 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12743 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12744 bool isAdvSIMDModImm = false;
12745 uint64_t Shift;
12746
12747 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
12748 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
12749 Shift = 0;
12750 }
12751 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
12752 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
12753 Shift = 8;
12754 }
12755 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
12756 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
12757 Shift = 16;
12758 }
12759 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
12760 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
12761 Shift = 24;
12762 }
12763
12764 if (isAdvSIMDModImm) {
12765 SDLoc dl(Op);
12766 SDValue Mov;
12767
12768 if (LHS)
12769 Mov = DAG.getNode(NewOp, dl, MovTy,
12770 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12771 DAG.getConstant(Value, dl, MVT::i32),
12772 DAG.getConstant(Shift, dl, MVT::i32));
12773 else
12774 Mov = DAG.getNode(NewOp, dl, MovTy,
12775 DAG.getConstant(Value, dl, MVT::i32),
12776 DAG.getConstant(Shift, dl, MVT::i32));
12777
12778 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12779 }
12780 }
12781
12782 return SDValue();
12783}
12784
12785// Try 16-bit splatted SIMD immediate.
12786static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12787 const APInt &Bits,
12788 const SDValue *LHS = nullptr) {
12789 EVT VT = Op.getValueType();
12790 if (VT.isFixedLengthVector() &&
12792 return SDValue();
12793
12794 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12795 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12796 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
12797 bool isAdvSIMDModImm = false;
12798 uint64_t Shift;
12799
12800 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
12801 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
12802 Shift = 0;
12803 }
12804 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
12805 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
12806 Shift = 8;
12807 }
12808
12809 if (isAdvSIMDModImm) {
12810 SDLoc dl(Op);
12811 SDValue Mov;
12812
12813 if (LHS)
12814 Mov = DAG.getNode(NewOp, dl, MovTy,
12815 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12816 DAG.getConstant(Value, dl, MVT::i32),
12817 DAG.getConstant(Shift, dl, MVT::i32));
12818 else
12819 Mov = DAG.getNode(NewOp, dl, MovTy,
12820 DAG.getConstant(Value, dl, MVT::i32),
12821 DAG.getConstant(Shift, dl, MVT::i32));
12822
12823 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12824 }
12825 }
12826
12827 return SDValue();
12828}
12829
12830// Try 32-bit splatted SIMD immediate with shifted ones.
12832 SelectionDAG &DAG, const APInt &Bits) {
12833 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12834 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12835 EVT VT = Op.getValueType();
12836 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12837 bool isAdvSIMDModImm = false;
12838 uint64_t Shift;
12839
12840 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
12841 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
12842 Shift = 264;
12843 }
12844 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
12845 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
12846 Shift = 272;
12847 }
12848
12849 if (isAdvSIMDModImm) {
12850 SDLoc dl(Op);
12851 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12852 DAG.getConstant(Value, dl, MVT::i32),
12853 DAG.getConstant(Shift, dl, MVT::i32));
12854 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12855 }
12856 }
12857
12858 return SDValue();
12859}
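// The shift values 264 (0x108) and 272 (0x110) used above appear to select
// the "MSL" (shift-ones) forms #8 and #16 consumed by the MOVImsl /
// MVNImsl patterns, as opposed to the plain LSL amounts used by MOVIshift.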
12860
12861// Try 8-bit splatted SIMD immediate.
12862static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12863 const APInt &Bits) {
12864 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12865 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12866 EVT VT = Op.getValueType();
12867 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
12868
12869 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
12870 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
12871
12872 SDLoc dl(Op);
12873 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12874 DAG.getConstant(Value, dl, MVT::i32));
12875 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12876 }
12877 }
12878
12879 return SDValue();
12880}
12881
12882// Try FP splatted SIMD immediate.
12883static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12884 const APInt &Bits) {
12885 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12886 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12887 EVT VT = Op.getValueType();
12888 bool isWide = (VT.getSizeInBits() == 128);
12889 MVT MovTy;
12890 bool isAdvSIMDModImm = false;
12891
12892 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
12893 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
12894 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
12895 }
12896 else if (isWide &&
12897 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
12898 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
12899 MovTy = MVT::v2f64;
12900 }
12901
12902 if (isAdvSIMDModImm) {
12903 SDLoc dl(Op);
12904 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12905 DAG.getConstant(Value, dl, MVT::i32));
12906 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12907 }
12908 }
12909
12910 return SDValue();
12911}
12912
12913// Specialized code to quickly find if PotentialBVec is a BuildVector that
12914// consists of only the same constant int value, returned in reference arg
12915// ConstVal
12916static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
12917 uint64_t &ConstVal) {
12918 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
12919 if (!Bvec)
12920 return false;
12921 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
12922 if (!FirstElt)
12923 return false;
12924 EVT VT = Bvec->getValueType(0);
12925 unsigned NumElts = VT.getVectorNumElements();
12926 for (unsigned i = 1; i < NumElts; ++i)
12927 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
12928 return false;
12929 ConstVal = FirstElt->getZExtValue();
12930 return true;
12931}
12932
12933 static bool isAllInactivePredicate(SDValue N) {
12934 // Look through cast.
12935 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
12936 N = N.getOperand(0);
12937
12938 return ISD::isConstantSplatVectorAllZeros(N.getNode());
12939}
12940
12941 static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
12942 unsigned NumElts = N.getValueType().getVectorMinNumElements();
12943
12944 // Look through cast.
12945 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
12946 N = N.getOperand(0);
12947 // When reinterpreting from a type with fewer elements the "new" elements
12948 // are not active, so bail if they're likely to be used.
12949 if (N.getValueType().getVectorMinNumElements() < NumElts)
12950 return false;
12951 }
12952
12953 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
12954 return true;
12955
12956 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
12957 // or smaller than the implicit element type represented by N.
12958 // NOTE: A larger element count implies a smaller element type.
12959 if (N.getOpcode() == AArch64ISD::PTRUE &&
12960 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
12961 return N.getValueType().getVectorMinNumElements() >= NumElts;
12962
12963 // If we're compiling for a specific vector-length, we can check if the
12964 // pattern's VL equals that of the scalable vector at runtime.
12965 if (N.getOpcode() == AArch64ISD::PTRUE) {
12966 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
12967 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
12968 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
12969 if (MaxSVESize && MinSVESize == MaxSVESize) {
12970 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
12971 unsigned PatNumElts =
12972 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
12973 return PatNumElts == (NumElts * VScale);
12974 }
12975 }
12976
12977 return false;
12978}
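// For example, when compiling with a fixed 256-bit SVE vector length
// (MinSVESize == MaxSVESize == 256, so VScale == 2), a PTRUE with pattern
// VL8 counts as all-active for an nxv4i1 predicate, since 8 == 4 * VScale.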
12979
12980// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
12981// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
12982// BUILD_VECTORs with constant element C1, C2 is a constant, and:
12983// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
12984// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
12985// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
12987 EVT VT = N->getValueType(0);
12988
12989 if (!VT.isVector())
12990 return SDValue();
12991
12992 SDLoc DL(N);
12993
12994 SDValue And;
12995 SDValue Shift;
12996
12997 SDValue FirstOp = N->getOperand(0);
12998 unsigned FirstOpc = FirstOp.getOpcode();
12999 SDValue SecondOp = N->getOperand(1);
13000 unsigned SecondOpc = SecondOp.getOpcode();
13001
13002 // Is one of the operands an AND or a BICi? The AND may have been optimised to
13003 // a BICi in order to use an immediate instead of a register.
13004 // Is the other operand a shl or lshr? This will have been turned into:
13005 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
13006 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
13007 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
13008 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
13009 SecondOpc == AArch64ISD::SHL_PRED ||
13010 SecondOpc == AArch64ISD::SRL_PRED)) {
13011 And = FirstOp;
13012 Shift = SecondOp;
13013
13014 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
13015 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
13016 FirstOpc == AArch64ISD::SHL_PRED ||
13017 FirstOpc == AArch64ISD::SRL_PRED)) {
13018 And = SecondOp;
13019 Shift = FirstOp;
13020 } else
13021 return SDValue();
13022
13023 bool IsAnd = And.getOpcode() == ISD::AND;
13024 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
13025 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13026 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
13027 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13028
13029 // Is the shift amount constant and are all lanes active?
13030 uint64_t C2;
13031 if (ShiftHasPredOp) {
13032 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
13033 return SDValue();
13034 APInt C;
13035 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
13036 return SDValue();
13037 C2 = C.getZExtValue();
13038 } else if (ConstantSDNode *C2node =
13039 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
13040 C2 = C2node->getZExtValue();
13041 else
13042 return SDValue();
13043
13044 APInt C1AsAPInt;
13045 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13046 if (IsAnd) {
13047 // Is the and mask vector all constant?
13048 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
13049 return SDValue();
13050 } else {
13051 // Reconstruct the corresponding AND immediate from the two BICi immediates.
13052 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
13053 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
13054 assert(C1nodeImm && C1nodeShift);
13055 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
13056 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
13057 }
13058
13059 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
13060 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
13061 // how much one can shift elements of a particular size?
13062 if (C2 > ElemSizeInBits)
13063 return SDValue();
13064
13065 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
13066 : APInt::getLowBitsSet(ElemSizeInBits, C2);
13067 if (C1AsAPInt != RequiredC1)
13068 return SDValue();
13069
13070 SDValue X = And.getOperand(0);
13071 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
13072 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
13073 : Shift.getOperand(1);
13074
13075 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
13076 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
13077
13078 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
13079 LLVM_DEBUG(N->dump(&DAG));
13080 LLVM_DEBUG(dbgs() << "into: \n");
13081 LLVM_DEBUG(ResultSLI->dump(&DAG));
13082
13083 ++NumShiftInserts;
13084 return ResultSLI;
13085}
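// Worked example for the SLI case: with v4i32 lanes,
//   (or (and X, 0x00FFFFFF), (VSHL Y, 24))
// satisfies C1 == ~(Ones(32) << 24) and becomes (VSLI X, Y, 24): Y shifted
// left by 24 is inserted into the top byte of each lane, while the low 24
// bits of X, the ones the AND preserved, are kept.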
13086
13087SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
13088 SelectionDAG &DAG) const {
13089 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13090 !Subtarget->isNeonAvailable()))
13091 return LowerToScalableOp(Op, DAG);
13092
13093 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
13094 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
13095 return Res;
13096
13097 EVT VT = Op.getValueType();
13098 if (VT.isScalableVector())
13099 return Op;
13100
13101 SDValue LHS = Op.getOperand(0);
13102 BuildVectorSDNode *BVN =
13103 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
13104 if (!BVN) {
13105 // OR commutes, so try swapping the operands.
13106 LHS = Op.getOperand(1);
13107 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
13108 }
13109 if (!BVN)
13110 return Op;
13111
13112 APInt DefBits(VT.getSizeInBits(), 0);
13113 APInt UndefBits(VT.getSizeInBits(), 0);
13114 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13115 SDValue NewOp;
13116
13117 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13118 DefBits, &LHS)) ||
13119 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13120 DefBits, &LHS)))
13121 return NewOp;
13122
13123 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13124 UndefBits, &LHS)) ||
13125 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13126 UndefBits, &LHS)))
13127 return NewOp;
13128 }
13129
13130 // We can always fall back to a non-immediate OR.
13131 return Op;
13132}
13133
13134// Normalize the operands of BUILD_VECTOR. The value of constant operands will
13135// be truncated to fit element width.
13137 SelectionDAG &DAG) {
13138 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13139 SDLoc dl(Op);
13140 EVT VT = Op.getValueType();
13141 EVT EltTy = VT.getVectorElementType();
13142
13143 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
13144 return Op;
13145
13146 SmallVector<SDValue, 16> Ops;
13147 for (SDValue Lane : Op->ops()) {
13148 // For integer vectors, type legalization would have promoted the
13149 // operands already. Otherwise, if Op is a floating-point splat
13150 // (with operands cast to integers), then the only possibilities
13151 // are constants and UNDEFs.
13152 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
13153 APInt LowBits(EltTy.getSizeInBits(),
13154 CstLane->getZExtValue());
13155 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
13156 } else if (Lane.getNode()->isUndef()) {
13157 Lane = DAG.getUNDEF(MVT::i32);
13158 } else {
13159 assert(Lane.getValueType() == MVT::i32 &&
13160 "Unexpected BUILD_VECTOR operand type");
13161 }
13162 Ops.push_back(Lane);
13163 }
13164 return DAG.getBuildVector(VT, dl, Ops);
13165}
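// For example, in a v8i8 BUILD_VECTOR an i32 constant operand of 0x1FF is
// truncated to the 8-bit element width and rebuilt as the i32 constant 0xFF,
// while undef operands simply stay undef.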
13166
13167 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
13168 const AArch64Subtarget *ST) {
13169 EVT VT = Op.getValueType();
13170 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
13171 "Expected a legal NEON vector");
13172
13173 APInt DefBits(VT.getSizeInBits(), 0);
13174 APInt UndefBits(VT.getSizeInBits(), 0);
13175 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13176 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13177 auto TryMOVIWithBits = [&](APInt DefBits) {
13178 SDValue NewOp;
13179 if ((NewOp =
13180 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
13181 (NewOp =
13182 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13183 (NewOp =
13184 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
13185 (NewOp =
13186 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13187 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
13188 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
13189 return NewOp;
13190
13191 APInt NotDefBits = ~DefBits;
13192 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
13193 NotDefBits)) ||
13194 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
13195 NotDefBits)) ||
13196 (NewOp =
13197 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
13198 return NewOp;
13199 return SDValue();
13200 };
13201 if (SDValue R = TryMOVIWithBits(DefBits))
13202 return R;
13203 if (SDValue R = TryMOVIWithBits(UndefBits))
13204 return R;
13205
13206 // See if a fneg of the constant can be materialized with a MOVI, etc
13207 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
13208 // FNegate each sub-element of the constant
13209 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
13210 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
13211 .zext(VT.getSizeInBits());
13212 APInt NegBits(VT.getSizeInBits(), 0);
13213 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
13214 for (unsigned i = 0; i < NumElts; i++)
13215 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
13216 NegBits = DefBits ^ NegBits;
13217
13218 // Try to create the new constants with MOVI, and if so generate a fneg
13219 // for it.
13220 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
13221 SDLoc DL(Op);
13222 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
13223 return DAG.getNode(
13224 AArch64ISD::NVCAST, DL, VT,
13225 DAG.getNode(ISD::FNEG, DL, VFVT,
13226 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
13227 }
13228 return SDValue();
13229 };
13230 SDValue R;
13231 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
13232 (R = TryWithFNeg(DefBits, MVT::f64)) ||
13233 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
13234 return R;
13235 }
13236
13237 return SDValue();
13238}
13239
13240SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
13241 SelectionDAG &DAG) const {
13242 EVT VT = Op.getValueType();
13243
13244 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
13245 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
13246 SDLoc DL(Op);
13247 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
13248 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
13249 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
13250 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
13251 return convertFromScalableVector(DAG, Op.getValueType(), Seq);
13252 }
13253
13254 // Revert to common legalisation for all other variants.
13255 return SDValue();
13256 }
13257
13258 // Try to build a simple constant vector.
13259 Op = NormalizeBuildVector(Op, DAG);
13260 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
13261 // abort.
13262 if (Op.getOpcode() != ISD::BUILD_VECTOR)
13263 return SDValue();
13264
13265 // Certain vector constants, used to express things like logical NOT and
13266 // arithmetic NEG, are passed through unmodified. This allows special
13267 // patterns for these operations to match, which will lower these constants
13268 // to whatever is proven necessary.
13269 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13270 if (BVN->isConstant()) {
13271 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
13272 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
13273 APInt Val(BitSize,
13274 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
13275 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
13276 return Op;
13277 }
13278 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
13279 if (Const->isZero() && !Const->isNegative())
13280 return Op;
13281 }
13282
13283 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
13284 return V;
13285
13286 // Scan through the operands to find some interesting properties we can
13287 // exploit:
13288 // 1) If only one value is used, we can use a DUP, or
13289 // 2) if only the low element is not undef, we can just insert that, or
13290 // 3) if only one constant value is used (w/ some non-constant lanes),
13291 // we can splat the constant value into the whole vector then fill
13292 // in the non-constant lanes.
13293 // 4) FIXME: If different constant values are used, but we can intelligently
13294 // select the values we'll be overwriting for the non-constant
13295 // lanes such that we can directly materialize the vector
13296 // some other way (MOVI, e.g.), we can be sneaky.
13297 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
13298 SDLoc dl(Op);
13299 unsigned NumElts = VT.getVectorNumElements();
13300 bool isOnlyLowElement = true;
13301 bool usesOnlyOneValue = true;
13302 bool usesOnlyOneConstantValue = true;
13303 bool isConstant = true;
13304 bool AllLanesExtractElt = true;
13305 unsigned NumConstantLanes = 0;
13306 unsigned NumDifferentLanes = 0;
13307 unsigned NumUndefLanes = 0;
13308 SDValue Value;
13309 SDValue ConstantValue;
13310 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
13311 unsigned ConsecutiveValCount = 0;
13312 SDValue PrevVal;
13313 for (unsigned i = 0; i < NumElts; ++i) {
13314 SDValue V = Op.getOperand(i);
13315 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13316 AllLanesExtractElt = false;
13317 if (V.isUndef()) {
13318 ++NumUndefLanes;
13319 continue;
13320 }
13321 if (i > 0)
13322 isOnlyLowElement = false;
13323 if (!isIntOrFPConstant(V))
13324 isConstant = false;
13325
13326 if (isIntOrFPConstant(V)) {
13327 ++NumConstantLanes;
13328 if (!ConstantValue.getNode())
13329 ConstantValue = V;
13330 else if (ConstantValue != V)
13331 usesOnlyOneConstantValue = false;
13332 }
13333
13334 if (!Value.getNode())
13335 Value = V;
13336 else if (V != Value) {
13337 usesOnlyOneValue = false;
13338 ++NumDifferentLanes;
13339 }
13340
13341 if (PrevVal != V) {
13342 ConsecutiveValCount = 0;
13343 PrevVal = V;
13344 }
13345
13346 // Keep the different values and their last consecutive counts. For example,
13347 //
13348 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13349 // t24, t24, t24, t24, t24, t24, t24, t24
13350 // t23 = consecutive count 8
13351 // t24 = consecutive count 8
13352 // ------------------------------------------------------------------
13353 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
13354 // t24, t24, t24, t24, t24, t24, t24, t24
13355 // t23 = consecutive count 5
13356 // t24 = consecutive count 9
13357 DifferentValueMap[V] = ++ConsecutiveValCount;
13358 }
13359
13360 if (!Value.getNode()) {
13361 LLVM_DEBUG(
13362 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
13363 return DAG.getUNDEF(VT);
13364 }
13365
13366 // Convert BUILD_VECTOR where all elements but the lowest are undef into
13367 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
13368 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
13369 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
13370 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
13371 "SCALAR_TO_VECTOR node\n");
13372 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
13373 }
13374
13375 if (AllLanesExtractElt) {
13376 SDNode *Vector = nullptr;
13377 bool Even = false;
13378 bool Odd = false;
13379 // Check whether the extract elements match the Even pattern <0,2,4,...> or
13380 // the Odd pattern <1,3,5,...>.
13381 for (unsigned i = 0; i < NumElts; ++i) {
13382 SDValue V = Op.getOperand(i);
13383 const SDNode *N = V.getNode();
13384 if (!isa<ConstantSDNode>(N->getOperand(1))) {
13385 Even = false;
13386 Odd = false;
13387 break;
13388 }
13389 SDValue N0 = N->getOperand(0);
13390
13391 // All elements are extracted from the same vector.
13392 if (!Vector) {
13393 Vector = N0.getNode();
13394 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
13395 // BUILD_VECTOR.
13396 if (VT.getVectorElementType() !=
13397 N0.getValueType().getVectorElementType())
13398 break;
13399 } else if (Vector != N0.getNode()) {
13400 Odd = false;
13401 Even = false;
13402 break;
13403 }
13404
13405 // Extracted values are either at Even indices <0,2,4,...> or at Odd
13406 // indices <1,3,5,...>.
13407 uint64_t Val = N->getConstantOperandVal(1);
13408 if (Val == 2 * i) {
13409 Even = true;
13410 continue;
13411 }
13412 if (Val - 1 == 2 * i) {
13413 Odd = true;
13414 continue;
13415 }
13416
13417 // Something does not match: abort.
13418 Odd = false;
13419 Even = false;
13420 break;
13421 }
13422 if (Even || Odd) {
13423 SDValue LHS =
13424 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13425 DAG.getConstant(0, dl, MVT::i64));
13426 SDValue RHS =
13427 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13428 DAG.getConstant(NumElts, dl, MVT::i64));
13429
13430 if (Even && !Odd)
13431 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
13432 RHS);
13433 if (Odd && !Even)
13434 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
13435 RHS);
13436 }
13437 }
13438
13439 // Use DUP for non-constant splats. For f32 constant splats, reduce to
13440 // i32 and try again.
13441 if (usesOnlyOneValue) {
13442 if (!isConstant) {
13443 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13444 Value.getValueType() != VT) {
13445 LLVM_DEBUG(
13446 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
13447 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
13448 }
13449
13450 // This is actually a DUPLANExx operation, which keeps everything vectory.
13451
13452 SDValue Lane = Value.getOperand(1);
13453 Value = Value.getOperand(0);
13454 if (Value.getValueSizeInBits() == 64) {
13455 LLVM_DEBUG(
13456 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
13457 "widening it\n");
13458 Value = WidenVector(Value, DAG);
13459 }
13460
13461 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
13462 return DAG.getNode(Opcode, dl, VT, Value, Lane);
13463 }
13464
13465 if (VT.getVectorElementType().isFloatingPoint()) {
13466 SmallVector<SDValue, 8> Ops;
13467 EVT EltTy = VT.getVectorElementType();
13468 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
13469 EltTy == MVT::f64) && "Unsupported floating-point vector type");
13470 LLVM_DEBUG(
13471 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
13472 "BITCASTS, and try again\n");
13473 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
13474 for (unsigned i = 0; i < NumElts; ++i)
13475 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
13476 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
13477 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
13478 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
13479 Val.dump(););
13480 Val = LowerBUILD_VECTOR(Val, DAG);
13481 if (Val.getNode())
13482 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
13483 }
13484 }
13485
13486 // If we need to insert a small number of different non-constant elements and
13487 // the vector width is sufficiently large, prefer using DUP with the common
13488 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
13489 // skip the constant lane handling below.
13490 bool PreferDUPAndInsert =
13491 !isConstant && NumDifferentLanes >= 1 &&
13492 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
13493 NumDifferentLanes >= NumConstantLanes;
13494
13495 // If there was only one constant value used and for more than one lane,
13496 // start by splatting that value, then replace the non-constant lanes. This
13497 // is better than the default, which will perform a separate initialization
13498 // for each lane.
13499 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
13500 // Firstly, try to materialize the splat constant.
13501 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
13502 unsigned BitSize = VT.getScalarSizeInBits();
13503 APInt ConstantValueAPInt(1, 0);
13504 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
13505 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
13506 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
13507 !ConstantValueAPInt.isAllOnes()) {
13508 Val = ConstantBuildVector(Val, DAG, Subtarget);
13509 if (!Val)
13510 // Otherwise, materialize the constant and splat it.
13511 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
13512 }
13513
13514 // Now insert the non-constant lanes.
13515 for (unsigned i = 0; i < NumElts; ++i) {
13516 SDValue V = Op.getOperand(i);
13517 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13518 if (!isIntOrFPConstant(V))
13519 // Note that type legalization likely mucked about with the VT of the
13520 // source operand, so we may have to convert it here before inserting.
13521 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
13522 }
13523 return Val;
13524 }
13525
13526 // This will generate a load from the constant pool.
13527 if (isConstant) {
13528 LLVM_DEBUG(
13529 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
13530 "expansion\n");
13531 return SDValue();
13532 }
13533
13534 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13535 // v4i32s. This is really a truncate, which we can construct out of (legal)
13536 // concats and truncate nodes.
13537 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
13538 return M;
13539
13540 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
13541 if (NumElts >= 4) {
13542 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
13543 return Shuffle;
13544
13545 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
13546 return Shuffle;
13547 }
13548
13549 if (PreferDUPAndInsert) {
13550 // First, build a constant vector with the common element.
13551 SmallVector<SDValue, 8> Ops(NumElts, Value);
13552 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
13553 // Next, insert the elements that do not match the common value.
13554 for (unsigned I = 0; I < NumElts; ++I)
13555 if (Op.getOperand(I) != Value)
13556 NewVector =
13557 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
13558 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
13559
13560 return NewVector;
13561 }
13562
13563 // If vector consists of two different values, try to generate two DUPs and
13564 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
13565 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
13566 SmallVector<SDValue, 2> Vals;
13567 // Check that the consecutive count of each value is half the number of
13568 // vector elements. In this case, we can use CONCAT_VECTORS. For example,
13569 //
13570 // canUseVECTOR_CONCAT = true;
13571 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13572 // t24, t24, t24, t24, t24, t24, t24, t24
13573 //
13574 // canUseVECTOR_CONCAT = false;
13575 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
13576 // t24, t24, t24, t24, t24, t24, t24, t24
13577 bool canUseVECTOR_CONCAT = true;
13578 for (auto Pair : DifferentValueMap) {
13579 // Check that the different values each have a consecutive run of NumElts / 2.
13580 if (Pair.second != NumElts / 2)
13581 canUseVECTOR_CONCAT = false;
13582 Vals.push_back(Pair.first);
13583 }
13584
13585 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
13586 // CONCAT_VECTORs. For example,
13587 //
13588 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
13589 // t24, t24, t24, t24, t24, t24, t24, t24
13590 // ==>
13591 // t26: v8i8 = AArch64ISD::DUP t23
13592 // t28: v8i8 = AArch64ISD::DUP t24
13593 // t29: v16i8 = concat_vectors t26, t28
13594 if (canUseVECTOR_CONCAT) {
13595 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13596 if (isTypeLegal(SubVT) && SubVT.isVector() &&
13597 SubVT.getVectorNumElements() >= 2) {
13598 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
13599 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
13600 SDValue DUP1 =
13601 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
13602 SDValue DUP2 =
13603 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
13604 SDValue CONCAT_VECTORS =
13605 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
13606 return CONCAT_VECTORS;
13607 }
13608 }
13609
13610 // Let's try to generate VECTOR_SHUFFLE. For example,
13611 //
13612 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
13613 // ==>
13614 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
13615 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
13616 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
13617 if (NumElts >= 8) {
13618 SmallVector<int, 16> MaskVec;
13619 // Build the mask for the VECTOR_SHUFFLE.
13620 SDValue FirstLaneVal = Op.getOperand(0);
13621 for (unsigned i = 0; i < NumElts; ++i) {
13622 SDValue Val = Op.getOperand(i);
13623 if (FirstLaneVal == Val)
13624 MaskVec.push_back(i);
13625 else
13626 MaskVec.push_back(i + NumElts);
13627 }
13628
13629 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
13630 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
13631 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
13632 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
13633 SDValue VECTOR_SHUFFLE =
13634 DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
13635 return VECTOR_SHUFFLE;
13636 }
13637 }
13638
13639 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
13640 // know the default expansion would otherwise fall back on something even
13641 // worse. For a vector with one or two non-undef values, that's
13642 // scalar_to_vector for the elements followed by a shuffle (provided the
13643 // shuffle is valid for the target) and materialization element by element
13644 // on the stack followed by a load for everything else.
13645 if (!isConstant && !usesOnlyOneValue) {
13646 LLVM_DEBUG(
13647 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
13648 "of INSERT_VECTOR_ELT\n");
13649
13650 SDValue Vec = DAG.getUNDEF(VT);
13651 SDValue Op0 = Op.getOperand(0);
13652 unsigned i = 0;
13653
13654 // Use SCALAR_TO_VECTOR for lane zero to
13655 // a) Avoid a RMW dependency on the full vector register, and
13656 // b) Allow the register coalescer to fold away the copy if the
13657 // value is already in an S or D register, and we're forced to emit an
13658 // INSERT_SUBREG that we can't fold anywhere.
13659 //
13660 // We also allow types like i8 and i16 which are illegal scalar but legal
13661 // vector element types. After type-legalization the inserted value is
13662 // extended (i32) and it is safe to cast them to the vector type by ignoring
13663 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
13664 if (!Op0.isUndef()) {
13665 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
13666 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
13667 ++i;
13668 }
13669 LLVM_DEBUG(if (i < NumElts) dbgs()
13670 << "Creating nodes for the other vector elements:\n";);
13671 for (; i < NumElts; ++i) {
13672 SDValue V = Op.getOperand(i);
13673 if (V.isUndef())
13674 continue;
13675 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13676 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
13677 }
13678 return Vec;
13679 }
13680
13681 LLVM_DEBUG(
13682 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
13683 "better alternative\n");
13684 return SDValue();
13685}
13686
13687SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
13688 SelectionDAG &DAG) const {
13689 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13690 !Subtarget->isNeonAvailable()))
13691 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
13692
13693 assert(Op.getValueType().isScalableVector() &&
13694 isTypeLegal(Op.getValueType()) &&
13695 "Expected legal scalable vector type!");
13696
13697 if (isTypeLegal(Op.getOperand(0).getValueType())) {
13698 unsigned NumOperands = Op->getNumOperands();
13699 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
13700 "Unexpected number of operands in CONCAT_VECTORS");
13701
13702 if (NumOperands == 2)
13703 return Op;
13704
13705 // Concat each pair of subvectors and pack into the lower half of the array.
13706 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
13707 while (ConcatOps.size() > 1) {
13708 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
13709 SDValue V1 = ConcatOps[I];
13710 SDValue V2 = ConcatOps[I + 1];
13711 EVT SubVT = V1.getValueType();
13712 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
13713 ConcatOps[I / 2] =
13714 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
13715 }
13716 ConcatOps.resize(ConcatOps.size() / 2);
13717 }
13718 return ConcatOps[0];
13719 }
13720
13721 return SDValue();
13722}
13723
13724SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13725 SelectionDAG &DAG) const {
13726 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
13727
13728 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13729 !Subtarget->isNeonAvailable()))
13730 return LowerFixedLengthInsertVectorElt(Op, DAG);
13731
13732 EVT VT = Op.getOperand(0).getValueType();
13733
13734 if (VT.getScalarType() == MVT::i1) {
13735 EVT VectorVT = getPromotedVTForPredicate(VT);
13736 SDLoc DL(Op);
13737 SDValue ExtendedVector =
13738 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
13739 SDValue ExtendedValue =
13740 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
13741 VectorVT.getScalarType().getSizeInBits() < 32
13742 ? MVT::i32
13743 : VectorVT.getScalarType());
13744 ExtendedVector =
13745 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
13746 ExtendedValue, Op.getOperand(2));
13747 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
13748 }
13749
13750 // Check for non-constant or out of range lane.
13751 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
13752 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13753 return SDValue();
13754
13755 return Op;
13756}
13757
13758SDValue
13759AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13760 SelectionDAG &DAG) const {
13761 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
13762 EVT VT = Op.getOperand(0).getValueType();
13763
13764 if (VT.getScalarType() == MVT::i1) {
13765 // We can't directly extract from an SVE predicate; extend it first.
13766 // (This isn't the only possible lowering, but it's straightforward.)
13767 EVT VectorVT = getPromotedVTForPredicate(VT);
13768 SDLoc DL(Op);
13769 SDValue Extend =
13770 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
13771 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
13772 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
13773 Extend, Op.getOperand(1));
13774 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
13775 }
13776
13777 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13778 return LowerFixedLengthExtractVectorElt(Op, DAG);
13779
13780 // Check for non-constant or out of range lane.
13781 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13782 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13783 return SDValue();
13784
13785 // Insertion/extraction are legal for V128 types.
13786 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
13787 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
13788 VT == MVT::v8f16 || VT == MVT::v8bf16)
13789 return Op;
13790
13791 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
13792 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
13793 VT != MVT::v4bf16)
13794 return SDValue();
13795
13796 // For V64 types, we perform extraction by expanding the value
13797 // to a V128 type and performing the extraction on that.
13798 SDLoc DL(Op);
13799 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
13800 EVT WideTy = WideVec.getValueType();
13801
13802 EVT ExtrTy = WideTy.getVectorElementType();
13803 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
13804 ExtrTy = MVT::i32;
13805
13806 // For extractions, we just return the result directly.
13807 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
13808 Op.getOperand(1));
13809}
13810
13811SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
13812 SelectionDAG &DAG) const {
13813 assert(Op.getValueType().isFixedLengthVector() &&
13814 "Only cases that extract a fixed length vector are supported!");
13815
13816 EVT InVT = Op.getOperand(0).getValueType();
13817 unsigned Idx = Op.getConstantOperandVal(1);
13818 unsigned Size = Op.getValueSizeInBits();
13819
13820 // If we don't have legal types yet, do nothing
13821 if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
13822 return SDValue();
13823
13824 if (InVT.isScalableVector()) {
13825 // This will be matched by custom code during ISelDAGToDAG.
13826 if (Idx == 0 && isPackedVectorType(InVT, DAG))
13827 return Op;
13828
13829 return SDValue();
13830 }
13831
13832 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
13833 if (Idx == 0 && InVT.getSizeInBits() <= 128)
13834 return Op;
13835
13836 // If this is extracting the upper 64-bits of a 128-bit vector, we match
13837 // that directly.
13838 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
13839 InVT.getSizeInBits() == 128 && Subtarget->isNeonAvailable())
13840 return Op;
13841
13842 if (useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
13843 SDLoc DL(Op);
13844
13845 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
13846 SDValue NewInVec =
13847 convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
13848
13849 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
13850 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
13851 return convertFromScalableVector(DAG, Op.getValueType(), Splice);
13852 }
13853
13854 return SDValue();
13855}
13856
13857SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
13858 SelectionDAG &DAG) const {
13859 assert(Op.getValueType().isScalableVector() &&
13860 "Only expect to lower inserts into scalable vectors!");
13861
13862 EVT InVT = Op.getOperand(1).getValueType();
13863 unsigned Idx = Op.getConstantOperandVal(2);
13864
13865 SDValue Vec0 = Op.getOperand(0);
13866 SDValue Vec1 = Op.getOperand(1);
13867 SDLoc DL(Op);
13868 EVT VT = Op.getValueType();
13869
13870 if (InVT.isScalableVector()) {
13871 if (!isTypeLegal(VT))
13872 return SDValue();
13873
13874 // Break down insert_subvector into simpler parts.
13875 if (VT.getVectorElementType() == MVT::i1) {
13876 unsigned NumElts = VT.getVectorMinNumElements();
13877 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13878
13879 SDValue Lo, Hi;
13880 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13881 DAG.getVectorIdxConstant(0, DL));
13882 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13883 DAG.getVectorIdxConstant(NumElts / 2, DL));
13884 if (Idx < (NumElts / 2))
13885 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
13886 DAG.getVectorIdxConstant(Idx, DL));
13887 else
13888 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
13889 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
13890
13891 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
13892 }
13893
13894 // Ensure the subvector is half the size of the main vector.
13895 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
13896 return SDValue();
13897
13898 // Here narrow and wide refer to the vector element types. After "casting",
13899 // both vectors must have the same bit length, and so because the subvector
13900 // has fewer elements, those elements need to be bigger.
13901 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
13902 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
13903 
13904 // NOP cast operands to the largest legal vector of the same element count.
13905 if (VT.isFloatingPoint()) {
13906 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
13907 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
13908 } else {
13909 // Legal integer vectors are already their largest so Vec0 is fine as is.
13910 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
13911 }
13912
13913 // To replace the top/bottom half of vector V with vector SubV we widen the
13914 // preserved half of V, concatenate this to SubV (the order depending on the
13915 // half being replaced) and then narrow the result.
13916 SDValue Narrow;
13917 if (Idx == 0) {
13918 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
13919 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
13920 } else {
13922 "Invalid subvector index!");
13923 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
13924 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
13925 }
13926
13927 return getSVESafeBitCast(VT, Narrow, DAG);
13928 }
13929
13930 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
13931 // This will be matched by custom code during ISelDAGToDAG.
13932 if (Vec0.isUndef())
13933 return Op;
13934
13935 std::optional<unsigned> PredPattern =
13936 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
13937 auto PredTy = VT.changeVectorElementType(MVT::i1);
13938 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
13939 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
13940 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
13941 }
13942
13943 return SDValue();
13944}
13945
13946static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
13947 if (Op.getOpcode() != AArch64ISD::DUP &&
13948 Op.getOpcode() != ISD::SPLAT_VECTOR &&
13949 Op.getOpcode() != ISD::BUILD_VECTOR)
13950 return false;
13951
13952 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
13953 !isAllConstantBuildVector(Op, SplatVal))
13954 return false;
13955
13956 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
13957 !isa<ConstantSDNode>(Op->getOperand(0)))
13958 return false;
13959
13960 SplatVal = Op->getConstantOperandVal(0);
13961 if (Op.getValueType().getVectorElementType() != MVT::i64)
13962 SplatVal = (int32_t)SplatVal;
13963
13964 Negated = false;
13965 if (isPowerOf2_64(SplatVal))
13966 return true;
13967
13968 Negated = true;
13969 if (isPowerOf2_64(-SplatVal)) {
13970 SplatVal = -SplatVal;
13971 return true;
13972 }
13973
13974 return false;
13975}
13976
13977SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
13978 EVT VT = Op.getValueType();
13979 SDLoc dl(Op);
13980
13981 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
13982 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
13983
13984 assert(VT.isScalableVector() && "Expected a scalable vector.");
13985
13986 bool Signed = Op.getOpcode() == ISD::SDIV;
13987 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
13988
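// A signed divide by a (possibly negated) power of two maps onto the SVE
// ASRD instruction (arithmetic shift right for divide, rounding towards
// zero); a negative divisor then only needs an extra negation of the result.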
13989 bool Negated;
13990 uint64_t SplatVal;
13991 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
13992 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
13993 SDValue Res =
13994 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
13995 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
13996 if (Negated)
13997 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
13998
13999 return Res;
14000 }
14001
14002 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
14003 return LowerToPredicatedOp(Op, DAG, PredOpcode);
14004
14005 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
14006 // operations, and truncate the result.
14007 EVT WidenedVT;
14008 if (VT == MVT::nxv16i8)
14009 WidenedVT = MVT::nxv8i16;
14010 else if (VT == MVT::nxv8i16)
14011 WidenedVT = MVT::nxv4i32;
14012 else
14013 llvm_unreachable("Unexpected Custom DIV operation");
14014
14015 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14016 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
14017 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
14018 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
14019 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
14020 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
14021 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
14022 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
14023 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
14024}
14025
14026 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
14027 // Currently no fixed length shuffles that require SVE are legal.
14028 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14029 return false;
14030
14031 if (VT.getVectorNumElements() == 4 &&
14032 (VT.is128BitVector() || VT.is64BitVector())) {
14033 unsigned Cost = getPerfectShuffleCost(M);
14034 if (Cost <= 1)
14035 return true;
14036 }
14037
14038 bool DummyBool;
14039 int DummyInt;
14040 unsigned DummyUnsigned;
14041
14042 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
14043 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
14044 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
14045 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
14046 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
14047 isZIPMask(M, VT, DummyUnsigned) ||
14048 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
14049 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
14050 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
14051 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
14052 isConcatMask(M, VT, VT.getSizeInBits() == 128));
14053}
14054
14055 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
14056 EVT VT) const {
14057 // Just delegate to the generic legality, clear masks aren't special.
14058 return isShuffleMaskLegal(M, VT);
14059}
14060
14061/// getVShiftImm - Check if this is a valid build_vector for the immediate
14062/// operand of a vector shift operation, where all the elements of the
14063/// build_vector must have the same constant integer value.
14064static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
14065 // Ignore bit_converts.
14066 while (Op.getOpcode() == ISD::BITCAST)
14067 Op = Op.getOperand(0);
14068 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
14069 APInt SplatBits, SplatUndef;
14070 unsigned SplatBitSize;
14071 bool HasAnyUndefs;
14072 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
14073 HasAnyUndefs, ElementBits) ||
14074 SplatBitSize > ElementBits)
14075 return false;
14076 Cnt = SplatBits.getSExtValue();
14077 return true;
14078}
14079
14080/// isVShiftLImm - Check if this is a valid build_vector for the immediate
14081/// operand of a vector shift left operation. That value must be in the range:
14082/// 0 <= Value < ElementBits for a left shift; or
14083/// 0 <= Value <= ElementBits for a long left shift.
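// For example, for v4i32 a splatted shift amount of 31 is a valid immediate
// shift left, while 32 is only accepted for the long (widening) form.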
14084static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
14085 assert(VT.isVector() && "vector shift count is not a vector type");
14086 int64_t ElementBits = VT.getScalarSizeInBits();
14087 if (!getVShiftImm(Op, ElementBits, Cnt))
14088 return false;
14089 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
14090}
14091
14092/// isVShiftRImm - Check if this is a valid build_vector for the immediate
14093/// operand of a vector shift right operation. The value must be in the range:
14094 /// 1 <= Value <= ElementBits for a right shift; or 1 <= Value <= ElementBits/2 for a narrowing right shift.
14095static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
14096 assert(VT.isVector() && "vector shift count is not a vector type");
14097 int64_t ElementBits = VT.getScalarSizeInBits();
14098 if (!getVShiftImm(Op, ElementBits, Cnt))
14099 return false;
14100 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
14101}
14102
14103SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
14104 SelectionDAG &DAG) const {
14105 EVT VT = Op.getValueType();
14106
14107 if (VT.getScalarType() == MVT::i1) {
14108 // Lower i1 truncate to `(x & 1) != 0`.
14109 SDLoc dl(Op);
14110 EVT OpVT = Op.getOperand(0).getValueType();
14111 SDValue Zero = DAG.getConstant(0, dl, OpVT);
14112 SDValue One = DAG.getConstant(1, dl, OpVT);
14113 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
14114 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
14115 }
14116
14117 if (!VT.isVector() || VT.isScalableVector())
14118 return SDValue();
14119
14120 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14121 !Subtarget->isNeonAvailable()))
14122 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
14123
14124 return SDValue();
14125}
14126
14127 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
14128 // possibly a truncated type; it tells how many bits of the value are to be
14129 // used.
14130 static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
14131 SelectionDAG &DAG,
14132 unsigned &ShiftValue,
14133 SDValue &RShOperand) {
14134 if (Shift->getOpcode() != ISD::SRL)
14135 return false;
14136
14137 EVT VT = Shift.getValueType();
14138 assert(VT.isScalableVT());
14139
14140 auto ShiftOp1 =
14141 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
14142 if (!ShiftOp1)
14143 return false;
14144
14145 ShiftValue = ShiftOp1->getZExtValue();
14146 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
14147 return false;
14148
14149 SDValue Add = Shift->getOperand(0);
14150 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
14151 return false;
14152
14154 "ResVT must be truncated or same type as the shift.");
14155 // Check if an overflow can lead to incorrect results.
14156 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
14157 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
14158 return false;
14159
14160 auto AddOp1 =
14161 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
14162 if (!AddOp1)
14163 return false;
14164 uint64_t AddValue = AddOp1->getZExtValue();
14165 if (AddValue != 1ULL << (ShiftValue - 1))
14166 return false;
14167
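// At this point the node matches (srl (add X, 1 << (ShiftValue - 1)), ShiftValue),
// i.e. X biased by half the shift step, which is exactly what a rounding
// shift right (URSHR) computes.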
14168 RShOperand = Add->getOperand(0);
14169 return true;
14170}
14171
14172SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
14173 SelectionDAG &DAG) const {
14174 EVT VT = Op.getValueType();
14175 SDLoc DL(Op);
14176 int64_t Cnt;
14177
14178 if (!Op.getOperand(1).getValueType().isVector())
14179 return Op;
14180 unsigned EltSize = VT.getScalarSizeInBits();
14181
14182 switch (Op.getOpcode()) {
14183 case ISD::SHL:
14184 if (VT.isScalableVector() ||
14185 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14186 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
14187
14188 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
14189 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
14190 DAG.getConstant(Cnt, DL, MVT::i32));
14191 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14192 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
14193 MVT::i32),
14194 Op.getOperand(0), Op.getOperand(1));
14195 case ISD::SRA:
14196 case ISD::SRL:
14197 if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
14198 SDValue RShOperand;
14199 unsigned ShiftValue;
14200 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
14201 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
14202 getPredicateForVector(DAG, DL, VT), RShOperand,
14203 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
14204 }
14205
14206 if (VT.isScalableVector() ||
14207 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
14208 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
14209 : AArch64ISD::SRL_PRED;
14210 return LowerToPredicatedOp(Op, DAG, Opc);
14211 }
14212
14213 // Right shift immediate
14214 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
14215 unsigned Opc =
14216 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
14217 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
14218 DAG.getConstant(Cnt, DL, MVT::i32));
14219 }
14220
14221 // Right shift register. Note, there is not a shift right register
14222 // instruction, but the shift left register instruction takes a signed
14223 // value, where negative numbers specify a right shift.
14224 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
14225 : Intrinsic::aarch64_neon_ushl;
14226 // negate the shift amount
14227 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
14228 Op.getOperand(1));
14229 SDValue NegShiftLeft =
14230 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14231 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
14232 NegShift);
14233 return NegShiftLeft;
14234 }
14235
14236 llvm_unreachable("unexpected shift opcode");
14237}
14238
14239 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
14240 AArch64CC::CondCode CC, bool NoNans, EVT VT,
14241 const SDLoc &dl, SelectionDAG &DAG) {
14242 EVT SrcVT = LHS.getValueType();
14243 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
14244 "function only supposed to emit natural comparisons");
14245
14246 APInt SplatValue;
14247 APInt SplatUndef;
14248 unsigned SplatBitSize = 0;
14249 bool HasAnyUndefs;
14250
14251 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
14252 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
14253 SplatBitSize, HasAnyUndefs);
14254
14255 bool IsZero = IsCnst && SplatValue == 0;
14256 bool IsOne =
14257 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
14258 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
14259
14260 if (SrcVT.getVectorElementType().isFloatingPoint()) {
14261 switch (CC) {
14262 default:
14263 return SDValue();
14264 case AArch64CC::NE: {
14265 SDValue Fcmeq;
14266 if (IsZero)
14267 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14268 else
14269 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14270 return DAG.getNOT(dl, Fcmeq, VT);
14271 }
14272 case AArch64CC::EQ:
14273 if (IsZero)
14274 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14275 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14276 case AArch64CC::GE:
14277 if (IsZero)
14278 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
14279 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
14280 case AArch64CC::GT:
14281 if (IsZero)
14282 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
14283 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
14284 case AArch64CC::LE:
14285 if (!NoNans)
14286 return SDValue();
14287 // If we ignore NaNs then we can use the LS implementation.
14288 [[fallthrough]];
14289 case AArch64CC::LS:
14290 if (IsZero)
14291 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
14292 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
14293 case AArch64CC::LT:
14294 if (!NoNans)
14295 return SDValue();
14296 // If we ignore NaNs then we can use the MI implementation.
14297 [[fallthrough]];
14298 case AArch64CC::MI:
14299 if (IsZero)
14300 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
14301 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
14302 }
14303 }
14304
14305 switch (CC) {
14306 default:
14307 return SDValue();
14308 case AArch64CC::NE: {
14309 SDValue Cmeq;
14310 if (IsZero)
14311 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14312 else
14313 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14314 return DAG.getNOT(dl, Cmeq, VT);
14315 }
14316 case AArch64CC::EQ:
14317 if (IsZero)
14318 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14319 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14320 case AArch64CC::GE:
14321 if (IsZero)
14322 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
14323 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
14324 case AArch64CC::GT:
14325 if (IsZero)
14326 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
14327 if (IsMinusOne)
14328 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS, RHS);
14329 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
14330 case AArch64CC::LE:
14331 if (IsZero)
14332 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14333 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
14334 case AArch64CC::LS:
14335 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
14336 case AArch64CC::LO:
14337 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
14338 case AArch64CC::LT:
14339 if (IsZero)
14340 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
14341 if (IsOne)
14342 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14343 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
14344 case AArch64CC::HI:
14345 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
14346 case AArch64CC::HS:
14347 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
14348 }
14349}
14350
14351SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
14352 SelectionDAG &DAG) const {
14353 if (Op.getValueType().isScalableVector())
14354 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
14355
14356 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14357 !Subtarget->isNeonAvailable()))
14358 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
14359
14360 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
14361 SDValue LHS = Op.getOperand(0);
14362 SDValue RHS = Op.getOperand(1);
14363 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
14364 SDLoc dl(Op);
14365
14366 if (LHS.getValueType().getVectorElementType().isInteger()) {
14367 assert(LHS.getValueType() == RHS.getValueType());
14368 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
14369 SDValue Cmp =
14370 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
14371 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14372 }
14373
14374 // Lower isnan(x) | isnan(never-nan) to x != x.
14375 // Lower !isnan(x) & !isnan(never-nan) to x == x.
14376 if (CC == ISD::SETUO || CC == ISD::SETO) {
14377 bool OneNaN = false;
14378 if (LHS == RHS) {
14379 OneNaN = true;
14380 } else if (DAG.isKnownNeverNaN(RHS)) {
14381 OneNaN = true;
14382 RHS = LHS;
14383 } else if (DAG.isKnownNeverNaN(LHS)) {
14384 OneNaN = true;
14385 LHS = RHS;
14386 }
14387 if (OneNaN) {
14388 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
14389 }
14390 }
14391
14392 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
14393
14394 // Make v4f16 (only) fcmp operations utilise vector instructions
14395 // v8f16 support will be a little more complicated.
14396 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
14397 LHS.getValueType().getVectorElementType() == MVT::bf16) {
14398 if (LHS.getValueType().getVectorNumElements() == 4) {
14399 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
14400 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
14401 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
14402 DAG.ReplaceAllUsesWith(Op, NewSetcc);
14403 CmpVT = MVT::v4i32;
14404 } else
14405 return SDValue();
14406 }
14407
14408 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
14409 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
14410 LHS.getValueType().getVectorElementType() != MVT::f128);
14411
14412 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
14413 // clean. Some of them require two branches to implement.
14414 AArch64CC::CondCode CC1, CC2;
14415 bool ShouldInvert;
14416 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
14417
14418 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
14419 SDValue Cmp =
14420 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
14421 if (!Cmp.getNode())
14422 return SDValue();
14423
14424 if (CC2 != AArch64CC::AL) {
14425 SDValue Cmp2 =
14426 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
14427 if (!Cmp2.getNode())
14428 return SDValue();
14429
14430 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
14431 }
14432
14433 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14434
14435 if (ShouldInvert)
14436 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
14437
14438 return Cmp;
14439}
14440
14441static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
14442 SelectionDAG &DAG) {
14443 SDValue VecOp = ScalarOp.getOperand(0);
14444 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
14445 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
14446 DAG.getConstant(0, DL, MVT::i64));
14447}
14448
14449static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
14450 SDLoc DL, SelectionDAG &DAG) {
14451 unsigned ScalarOpcode;
14452 switch (Opcode) {
14453 case ISD::VECREDUCE_AND:
14454 ScalarOpcode = ISD::AND;
14455 break;
14456 case ISD::VECREDUCE_OR:
14457 ScalarOpcode = ISD::OR;
14458 break;
14459 case ISD::VECREDUCE_XOR:
14460 ScalarOpcode = ISD::XOR;
14461 break;
14462 default:
14463 llvm_unreachable("Expected bitwise vector reduction");
14464 return SDValue();
14465 }
14466
14467 EVT VecVT = Vec.getValueType();
14468 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
14469 "Expected power-of-2 length vector");
14470
14471 EVT ElemVT = VecVT.getVectorElementType();
14472
14473 SDValue Result;
14474 unsigned NumElems = VecVT.getVectorNumElements();
14475
14476 // Special case for boolean reductions
14477 if (ElemVT == MVT::i1) {
14478 // Split large vectors into smaller ones
14479 if (NumElems > 16) {
14480 SDValue Lo, Hi;
14481 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14482 EVT HalfVT = Lo.getValueType();
14483 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
14484 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
14485 }
14486
14487 // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
14488 // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
14489 // this element size leads to the best codegen, since e.g. setcc results
14490 // might need to be truncated otherwise.
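// For example, an AND reduction of v8i1 becomes a sign-extended v8i8 whose
// UMINV is all-ones only if every input lane was true; truncating that back
// to i1 gives the reduced value.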
14491 EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
14492
14493 // any_ext doesn't work with umin/umax, so only use it for uadd.
14494 unsigned ExtendOp =
14495 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
14496 SDValue Extended = DAG.getNode(
14497 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
14498 switch (ScalarOpcode) {
14499 case ISD::AND:
14500 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
14501 break;
14502 case ISD::OR:
14503 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
14504 break;
14505 case ISD::XOR:
14506 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
14507 break;
14508 default:
14509 llvm_unreachable("Unexpected Opcode");
14510 }
14511
14512 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
14513 } else {
14514 // Iteratively split the vector in half and combine using the bitwise
14515 // operation until it fits in a 64 bit register.
14516 while (VecVT.getSizeInBits() > 64) {
14517 SDValue Lo, Hi;
14518 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14519 VecVT = Lo.getValueType();
14520 NumElems = VecVT.getVectorNumElements();
14521 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
14522 }
14523
14524 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
14525
14526 // Do the remaining work on a scalar since it allows the code generator to
14527 // combine the shift and bitwise operation into one instruction and since
14528 // integer instructions can have higher throughput than vector instructions.
14529 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
14530
14531 // Iteratively combine the lower and upper halves of the scalar using the
14532 // bitwise operation, halving the relevant region of the scalar in each
14533 // iteration, until the relevant region is just one element of the original
14534 // vector.
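// For example, an XOR reduction of v8i8 bitcasts to an i64 scalar and XORs it
// with itself shifted right by 32, 16 and finally 8 bits, leaving the reduced
// byte in the lowest lane.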
14535 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
14536 SDValue ShiftAmount =
14537 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
14538 SDValue Shifted =
14539 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
14540 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
14541 }
14542
14543 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
14544 }
14545
14546 return DAG.getAnyExtOrTrunc(Result, DL, VT);
14547}
14548
14549SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
14550 SelectionDAG &DAG) const {
14551 SDValue Src = Op.getOperand(0);
14552
14553 // Try to lower fixed length reductions to SVE.
14554 EVT SrcVT = Src.getValueType();
14555 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14556 Op.getOpcode() == ISD::VECREDUCE_AND ||
14557 Op.getOpcode() == ISD::VECREDUCE_OR ||
14558 Op.getOpcode() == ISD::VECREDUCE_XOR ||
14559 Op.getOpcode() == ISD::VECREDUCE_FADD ||
14560 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
14561 SrcVT.getVectorElementType() == MVT::i64);
14562 if (SrcVT.isScalableVector() ||
14563 useSVEForFixedLengthVectorVT(
14564 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
14565
14566 if (SrcVT.getVectorElementType() == MVT::i1)
14567 return LowerPredReductionToSVE(Op, DAG);
14568
14569 switch (Op.getOpcode()) {
14570 case ISD::VECREDUCE_ADD:
14571 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
14572 case ISD::VECREDUCE_AND:
14573 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
14574 case ISD::VECREDUCE_OR:
14575 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
14576 case ISD::VECREDUCE_SMAX:
14577 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
14578 case ISD::VECREDUCE_SMIN:
14579 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
14580 case ISD::VECREDUCE_UMAX:
14581 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
14582 case ISD::VECREDUCE_UMIN:
14583 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
14584 case ISD::VECREDUCE_XOR:
14585 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
14586 case ISD::VECREDUCE_FADD:
14587 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
14588 case ISD::VECREDUCE_FMAX:
14589 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
14590 case ISD::VECREDUCE_FMIN:
14591 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
14592 case ISD::VECREDUCE_FMAXIMUM:
14593 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
14594 case ISD::VECREDUCE_FMINIMUM:
14595 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
14596 default:
14597 llvm_unreachable("Unhandled fixed length reduction");
14598 }
14599 }
14600
14601 // Lower NEON reductions.
14602 SDLoc dl(Op);
14603 switch (Op.getOpcode()) {
14604 case ISD::VECREDUCE_AND:
14605 case ISD::VECREDUCE_OR:
14606 case ISD::VECREDUCE_XOR:
14607 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
14608 Op.getValueType(), dl, DAG);
14609 case ISD::VECREDUCE_ADD:
14610 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
14611 case ISD::VECREDUCE_SMAX:
14612 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
14613 case ISD::VECREDUCE_SMIN:
14614 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
14615 case ISD::VECREDUCE_UMAX:
14616 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
14617 case ISD::VECREDUCE_UMIN:
14618 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
14619 default:
14620 llvm_unreachable("Unhandled reduction");
14621 }
14622}
14623
14624SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
14625 SelectionDAG &DAG) const {
14626 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14627 // No point replacing if we don't have the relevant instruction/libcall anyway
14628 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
14629 return SDValue();
14630
14631 // LSE has an atomic load-clear instruction, but not a load-and.
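// Since x & y == x & ~(~y), an ATOMIC_LOAD_AND can be rewritten as an
// ATOMIC_LOAD_CLR on the complemented operand, which is what happens below.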
14632 SDLoc dl(Op);
14633 MVT VT = Op.getSimpleValueType();
14634 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
14635 SDValue RHS = Op.getOperand(2);
14636 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
14637 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
14638 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
14639 Op.getOperand(0), Op.getOperand(1), RHS,
14640 AN->getMemOperand());
14641}
14642
14643SDValue
14644AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
14645 SelectionDAG &DAG) const {
14646
14647 SDLoc dl(Op);
14648 // Get the inputs.
14649 SDNode *Node = Op.getNode();
14650 SDValue Chain = Op.getOperand(0);
14651 SDValue Size = Op.getOperand(1);
14652 MaybeAlign Align =
14653 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14654 EVT VT = Node->getValueType(0);
14655
14657 "no-stack-arg-probe")) {
14658 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14659 Chain = SP.getValue(1);
14660 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14661 if (Align)
14662 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14663 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14664 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14665 SDValue Ops[2] = {SP, Chain};
14666 return DAG.getMergeValues(Ops, dl);
14667 }
14668
14669 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
14670
14671 EVT PtrVT = getPointerTy(DAG.getDataLayout());
14672 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
14673 PtrVT, 0);
14674
14675 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
14676 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
14677 if (Subtarget->hasCustomCallingConv())
14678 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
14679
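// The Windows __chkstk helper expects the allocation size in X15, expressed
// in units of 16 bytes, so the byte count is scaled down before the call and
// back up afterwards.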
14680 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
14681 DAG.getConstant(4, dl, MVT::i64));
14682 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
14683 Chain =
14684 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
14685 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
14686 DAG.getRegisterMask(Mask), Chain.getValue(1));
14687 // To match the actual intent better, we should read the output from X15 here
14688 // again (instead of potentially spilling it to the stack), but rereading Size
14689 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
14690 // here.
14691
14692 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
14693 DAG.getConstant(4, dl, MVT::i64));
14694
14695 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14696 Chain = SP.getValue(1);
14697 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14698 if (Align)
14699 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14700 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14701 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14702
14703 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
14704
14705 SDValue Ops[2] = {SP, Chain};
14706 return DAG.getMergeValues(Ops, dl);
14707}
14708
14709SDValue
14710AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
14711 SelectionDAG &DAG) const {
14712 // Get the inputs.
14713 SDNode *Node = Op.getNode();
14714 SDValue Chain = Op.getOperand(0);
14715 SDValue Size = Op.getOperand(1);
14716
14717 MaybeAlign Align =
14718 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14719 SDLoc dl(Op);
14720 EVT VT = Node->getValueType(0);
14721
14722 // Construct the new SP value in a GPR.
14723 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14724 Chain = SP.getValue(1);
14725 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14726 if (Align)
14727 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14728 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14729
14730 // Set the real SP to the new value with a probing loop.
14731 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
14732 SDValue Ops[2] = {SP, Chain};
14733 return DAG.getMergeValues(Ops, dl);
14734}
14735
14736SDValue
14737AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
14738 SelectionDAG &DAG) const {
14739 MachineFunction &MF = DAG.getMachineFunction();
14740 
14741 if (Subtarget->isTargetWindows())
14742 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
14743 else if (hasInlineStackProbe(MF))
14744 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
14745 else
14746 return SDValue();
14747}
14748
14749// When x and y are extended, lower:
14750// avgfloor(x, y) -> (x + y) >> 1
14751// avgceil(x, y) -> (x + y + 1) >> 1
14752
14753// Otherwise, lower to:
14754// avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1)
14755 // avgceil(x, y) -> (x >> 1) + (y >> 1) + ((x | y) & 1)
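// For example, with x = 7 and y = 4: the extended form gives
// avgfloor = (7 + 4) >> 1 = 5 and avgceil = (7 + 4 + 1) >> 1 = 6, while the
// overflow-safe form gives (7 >> 1) + (4 >> 1) + (7 & 4 & 1) = 5 and
// (7 >> 1) + (4 >> 1) + ((7 | 4) & 1) = 6.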
14756SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
14757 unsigned NewOp) const {
14758 if (Subtarget->hasSVE2())
14759 return LowerToPredicatedOp(Op, DAG, NewOp);
14760
14761 SDLoc dl(Op);
14762 SDValue OpA = Op->getOperand(0);
14763 SDValue OpB = Op->getOperand(1);
14764 EVT VT = Op.getValueType();
14765 bool IsCeil =
14766 (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU);
14767 bool IsSigned =
14768 (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS);
14769 unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL;
14770
14771 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
14772
14773 auto IsZeroExtended = [&DAG](SDValue &Node) {
14774 KnownBits Known = DAG.computeKnownBits(Node, 0);
14775 return Known.Zero.isSignBitSet();
14776 };
14777
14778 auto IsSignExtended = [&DAG](SDValue &Node) {
14779 return (DAG.ComputeNumSignBits(Node, 0) > 1);
14780 };
14781
14782 SDValue ConstantOne = DAG.getConstant(1, dl, VT);
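// If both operands are known zero-extended (or sign-extended for the signed
// forms), the top bit is spare, so x + y cannot wrap and the simple
// shifted-sum form can be used.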
14783 if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) ||
14784 (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) {
14785 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB);
14786 if (IsCeil)
14787 Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne);
14788 return DAG.getNode(ShiftOpc, dl, VT, Add, ConstantOne);
14789 }
14790
14791 SDValue ShiftOpA = DAG.getNode(ShiftOpc, dl, VT, OpA, ConstantOne);
14792 SDValue ShiftOpB = DAG.getNode(ShiftOpc, dl, VT, OpB, ConstantOne);
14793
14794 SDValue tmp = DAG.getNode(IsCeil ? ISD::OR : ISD::AND, dl, VT, OpA, OpB);
14795 tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne);
14796 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, ShiftOpA, ShiftOpB);
14797 return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
14798}
14799
14800SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
14801 SelectionDAG &DAG) const {
14802 EVT VT = Op.getValueType();
14803 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
14804
14805 SDLoc DL(Op);
14806 APInt MulImm = Op.getConstantOperandAPInt(0);
14807 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
14808 VT);
14809}
14810
14811/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
14812template <unsigned NumVecs>
14813static bool
14814 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
14815 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
14816 Info.opc = ISD::INTRINSIC_VOID;
14817 // Retrieve EC from first vector argument.
14818 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
14819 ElementCount EC = VT.getVectorElementCount();
14820 #ifndef NDEBUG
14821 // Check the assumption that all input vectors are the same type.
14822 for (unsigned I = 0; I < NumVecs; ++I)
14823 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
14824 "Invalid type.");
14825#endif
14826 // memVT is `NumVecs * VT`.
14827 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
14828 EC * NumVecs);
14829 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
14830 Info.offset = 0;
14831 Info.align.reset();
14832 Info.flags = MachineMemOperand::MOStore;
14833 return true;
14834}
14835
14836/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
14837/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
14838/// specified in the intrinsic calls.
14839 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14840 const CallInst &I,
14841 MachineFunction &MF,
14842 unsigned Intrinsic) const {
14843 auto &DL = I.getModule()->getDataLayout();
14844 switch (Intrinsic) {
14845 case Intrinsic::aarch64_sve_st2:
14846 return setInfoSVEStN<2>(*this, DL, Info, I);
14847 case Intrinsic::aarch64_sve_st3:
14848 return setInfoSVEStN<3>(*this, DL, Info, I);
14849 case Intrinsic::aarch64_sve_st4:
14850 return setInfoSVEStN<4>(*this, DL, Info, I);
14851 case Intrinsic::aarch64_neon_ld2:
14852 case Intrinsic::aarch64_neon_ld3:
14853 case Intrinsic::aarch64_neon_ld4:
14854 case Intrinsic::aarch64_neon_ld1x2:
14855 case Intrinsic::aarch64_neon_ld1x3:
14856 case Intrinsic::aarch64_neon_ld1x4: {
14857 Info.opc = ISD::INTRINSIC_W_CHAIN;
14858 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
14859 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14860 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14861 Info.offset = 0;
14862 Info.align.reset();
14863 // volatile loads with NEON intrinsics not supported
14864 Info.flags = MachineMemOperand::MOLoad;
14865 return true;
14866 }
14867 case Intrinsic::aarch64_neon_ld2lane:
14868 case Intrinsic::aarch64_neon_ld3lane:
14869 case Intrinsic::aarch64_neon_ld4lane:
14870 case Intrinsic::aarch64_neon_ld2r:
14871 case Intrinsic::aarch64_neon_ld3r:
14872 case Intrinsic::aarch64_neon_ld4r: {
14873 Info.opc = ISD::INTRINSIC_W_CHAIN;
14874 // ldx returns a struct with the same vector type
14875 Type *RetTy = I.getType();
14876 auto *StructTy = cast<StructType>(RetTy);
14877 unsigned NumElts = StructTy->getNumElements();
14878 Type *VecTy = StructTy->getElementType(0);
14879 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14880 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
14881 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14882 Info.offset = 0;
14883 Info.align.reset();
14884 // volatile loads with NEON intrinsics not supported
14885 Info.flags = MachineMemOperand::MOLoad;
14886 return true;
14887 }
14888 case Intrinsic::aarch64_neon_st2:
14889 case Intrinsic::aarch64_neon_st3:
14890 case Intrinsic::aarch64_neon_st4:
14891 case Intrinsic::aarch64_neon_st1x2:
14892 case Intrinsic::aarch64_neon_st1x3:
14893 case Intrinsic::aarch64_neon_st1x4: {
14894 Info.opc = ISD::INTRINSIC_VOID;
14895 unsigned NumElts = 0;
14896 for (const Value *Arg : I.args()) {
14897 Type *ArgTy = Arg->getType();
14898 if (!ArgTy->isVectorTy())
14899 break;
14900 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
14901 }
14902 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14903 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14904 Info.offset = 0;
14905 Info.align.reset();
14906 // volatile stores with NEON intrinsics not supported
14907 Info.flags = MachineMemOperand::MOStore;
14908 return true;
14909 }
14910 case Intrinsic::aarch64_neon_st2lane:
14911 case Intrinsic::aarch64_neon_st3lane:
14912 case Intrinsic::aarch64_neon_st4lane: {
14913 Info.opc = ISD::INTRINSIC_VOID;
14914 unsigned NumElts = 0;
14915 // All the vector arguments have the same type.
14916 Type *VecTy = I.getArgOperand(0)->getType();
14917 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14918
14919 for (const Value *Arg : I.args()) {
14920 Type *ArgTy = Arg->getType();
14921 if (!ArgTy->isVectorTy())
14922 break;
14923 NumElts += 1;
14924 }
14925
14926 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
14927 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14928 Info.offset = 0;
14929 Info.align.reset();
14930 // volatile stores with NEON intrinsics not supported
14931 Info.flags = MachineMemOperand::MOStore;
14932 return true;
14933 }
14934 case Intrinsic::aarch64_ldaxr:
14935 case Intrinsic::aarch64_ldxr: {
14936 Type *ValTy = I.getParamElementType(0);
14937 Info.opc = ISD::INTRINSIC_W_CHAIN;
14938 Info.memVT = MVT::getVT(ValTy);
14939 Info.ptrVal = I.getArgOperand(0);
14940 Info.offset = 0;
14941 Info.align = DL.getABITypeAlign(ValTy);
14942 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
14943 return true;
14944 }
14945 case Intrinsic::aarch64_stlxr:
14946 case Intrinsic::aarch64_stxr: {
14947 Type *ValTy = I.getParamElementType(1);
14948 Info.opc = ISD::INTRINSIC_W_CHAIN;
14949 Info.memVT = MVT::getVT(ValTy);
14950 Info.ptrVal = I.getArgOperand(1);
14951 Info.offset = 0;
14952 Info.align = DL.getABITypeAlign(ValTy);
14953 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
14954 return true;
14955 }
14956 case Intrinsic::aarch64_ldaxp:
14957 case Intrinsic::aarch64_ldxp:
14958 Info.opc = ISD::INTRINSIC_W_CHAIN;
14959 Info.memVT = MVT::i128;
14960 Info.ptrVal = I.getArgOperand(0);
14961 Info.offset = 0;
14962 Info.align = Align(16);
14963 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
14964 return true;
14965 case Intrinsic::aarch64_stlxp:
14966 case Intrinsic::aarch64_stxp:
14967 Info.opc = ISD::INTRINSIC_W_CHAIN;
14968 Info.memVT = MVT::i128;
14969 Info.ptrVal = I.getArgOperand(2);
14970 Info.offset = 0;
14971 Info.align = Align(16);
14972 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
14973 return true;
14974 case Intrinsic::aarch64_sve_ldnt1: {
14975 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
14976 Info.opc = ISD::INTRINSIC_W_CHAIN;
14977 Info.memVT = MVT::getVT(I.getType());
14978 Info.ptrVal = I.getArgOperand(1);
14979 Info.offset = 0;
14980 Info.align = DL.getABITypeAlign(ElTy);
14981 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
14982 return true;
14983 }
14984 case Intrinsic::aarch64_sve_stnt1: {
14985 Type *ElTy =
14986 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
14987 Info.opc = ISD::INTRINSIC_W_CHAIN;
14988 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
14989 Info.ptrVal = I.getArgOperand(2);
14990 Info.offset = 0;
14991 Info.align = DL.getABITypeAlign(ElTy);
14992 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
14993 return true;
14994 }
14995 case Intrinsic::aarch64_mops_memset_tag: {
14996 Value *Dst = I.getArgOperand(0);
14997 Value *Val = I.getArgOperand(1);
14998 Info.opc = ISD::INTRINSIC_W_CHAIN;
14999 Info.memVT = MVT::getVT(Val->getType());
15000 Info.ptrVal = Dst;
15001 Info.offset = 0;
15002 Info.align = I.getParamAlign(0).valueOrOne();
15003 Info.flags = MachineMemOperand::MOStore;
15004 // The size of the memory being operated on is unknown at this point.
15005 Info.size = MemoryLocation::UnknownSize;
15006 return true;
15007 }
15008 default:
15009 break;
15010 }
15011
15012 return false;
15013}
15014
15015 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
15016 ISD::LoadExtType ExtTy,
15017 EVT NewVT) const {
15018 // TODO: This may be worth removing. Check regression tests for diffs.
15019 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
15020 return false;
15021
15022 // If we're reducing the load width in order to avoid having to use an extra
15023 // instruction to do extension then it's probably a good idea.
15024 if (ExtTy != ISD::NON_EXTLOAD)
15025 return true;
15026 // Don't reduce load width if it would prevent us from combining a shift into
15027 // the offset.
15028 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
15029 assert(Mem);
15030 const SDValue &Base = Mem->getBasePtr();
15031 if (Base.getOpcode() == ISD::ADD &&
15032 Base.getOperand(1).getOpcode() == ISD::SHL &&
15033 Base.getOperand(1).hasOneUse() &&
15034 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
15035 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
15036 if (Mem->getMemoryVT().isScalableVector())
15037 return false;
15038 // The shift can be combined if it matches the size of the value being
15039 // loaded (and so reducing the width would make it not match).
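// For example, an i64 load from (add base, (shl idx, 3)) can use the
// register-offset form [base, idx, lsl #3]; narrowing it to i32 would need
// lsl #2 and lose that fold.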
15040 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
15041 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
15042 if (ShiftAmount == Log2_32(LoadBytes))
15043 return false;
15044 }
15045 // We have no reason to disallow reducing the load width, so allow it.
15046 return true;
15047}
15048
15049// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
15050 bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
15051 EVT VT = Extend.getValueType();
15052 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
15053 SDValue Extract = Extend.getOperand(0);
15054 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
15055 Extract = Extract.getOperand(0);
15056 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
15057 EVT VecVT = Extract.getOperand(0).getValueType();
15058 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
15059 return false;
15060 }
15061 }
15062 return true;
15063}
15064
15065 // Truncations from 64-bit GPR to 32-bit GPR are free.
15066 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
15067 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15068 return false;
15069 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
15070 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
15071 return NumBits1 > NumBits2;
15072}
15073 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
15074 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15075 return false;
15076 uint64_t NumBits1 = VT1.getFixedSizeInBits();
15077 uint64_t NumBits2 = VT2.getFixedSizeInBits();
15078 return NumBits1 > NumBits2;
15079}
15080
15081/// Check if it is profitable to hoist instruction in then/else to if.
15082 /// Not profitable if I and its user can form an FMA instruction
15083/// because we prefer FMSUB/FMADD.
15084 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
15085 if (I->getOpcode() != Instruction::FMul)
15086 return true;
15087
15088 if (!I->hasOneUse())
15089 return true;
15090
15091 Instruction *User = I->user_back();
15092
15093 if (!(User->getOpcode() == Instruction::FSub ||
15094 User->getOpcode() == Instruction::FAdd))
15095 return true;
15096
15097 const TargetOptions &Options = getTargetMachine().Options;
15098 const Function *F = I->getFunction();
15099 const DataLayout &DL = F->getParent()->getDataLayout();
15100 Type *Ty = User->getOperand(0)->getType();
15101
15102 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
15103 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
15104 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15105 Options.UnsafeFPMath));
15106}
15107
15108// All 32-bit GPR operations implicitly zero the high-half of the corresponding
15109// 64-bit GPR.
15110 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
15111 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15112 return false;
15113 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
15114 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
15115 return NumBits1 == 32 && NumBits2 == 64;
15116}
15117 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
15118 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15119 return false;
15120 unsigned NumBits1 = VT1.getSizeInBits();
15121 unsigned NumBits2 = VT2.getSizeInBits();
15122 return NumBits1 == 32 && NumBits2 == 64;
15123}
15124
15125 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
15126 EVT VT1 = Val.getValueType();
15127 if (isZExtFree(VT1, VT2)) {
15128 return true;
15129 }
15130
15131 if (Val.getOpcode() != ISD::LOAD)
15132 return false;
15133
15134 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
15135 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
15136 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
15137 VT1.getSizeInBits() <= 32);
15138}
15139
15140bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
15141 if (isa<FPExtInst>(Ext))
15142 return false;
15143
15144 // Vector types are not free.
15145 if (Ext->getType()->isVectorTy())
15146 return false;
15147
15148 for (const Use &U : Ext->uses()) {
15149 // The extension is free if we can fold it with a left shift in an
15150 // addressing mode or an arithmetic operation: add, sub, and cmp.
15151
15152 // Is there a shift?
15153 const Instruction *Instr = cast<Instruction>(U.getUser());
15154
15155 // Is this a constant shift?
15156 switch (Instr->getOpcode()) {
15157 case Instruction::Shl:
15158 if (!isa<ConstantInt>(Instr->getOperand(1)))
15159 return false;
15160 break;
15161 case Instruction::GetElementPtr: {
15162 gep_type_iterator GTI = gep_type_begin(Instr);
15163 auto &DL = Ext->getModule()->getDataLayout();
15164 std::advance(GTI, U.getOperandNo()-1);
15165 Type *IdxTy = GTI.getIndexedType();
15166 // This extension will end up with a shift because of the scaling factor.
15167 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
15168 // Get the shift amount based on the scaling factor:
15169 // log2(sizeof(IdxTy)) - log2(8).
15170 if (IdxTy->isScalableTy())
15171 return false;
15172 uint64_t ShiftAmt =
15173 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
15174 3;
15175 // Is the constant foldable in the shift of the addressing mode?
15176 // I.e., shift amount is between 1 and 4 inclusive.
15177 if (ShiftAmt == 0 || ShiftAmt > 4)
15178 return false;
15179 break;
15180 }
15181 case Instruction::Trunc:
15182 // Check if this is a noop.
15183 // trunc(sext ty1 to ty2) to ty1.
15184 if (Instr->getType() == Ext->getOperand(0)->getType())
15185 continue;
15186 [[fallthrough]];
15187 default:
15188 return false;
15189 }
15190
15191 // At this point we can use the bfm family, so this extension is free
15192 // for that use.
15193 }
15194 return true;
15195}
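// A small sketch of the case isExtFreeImpl accepts (hypothetical IR names):
//   %idx.ext = zext i32 %idx to i64
//   %p = getelementptr i32, ptr %base, i64 %idx.ext
//   %v = load i32, ptr %p
// The GEP scales by 4 (shift amount 2, within [1,4]), so the zext can be
// folded into the addressing mode, e.g. "ldr w0, [x0, w1, uxtw #2]".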
15196
15197static bool isSplatShuffle(Value *V) {
15198 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
15199 return all_equal(Shuf->getShuffleMask());
15200 return false;
15201}
15202
15203/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
15204/// or upper half of the vector elements.
15205static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
15206 bool AllowSplat = false) {
15207 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
15208 auto *FullTy = FullV->getType();
15209 auto *HalfTy = HalfV->getType();
15210 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
15211 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
15212 };
15213
15214 auto extractHalf = [](Value *FullV, Value *HalfV) {
15215 auto *FullVT = cast<FixedVectorType>(FullV->getType());
15216 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
15217 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
15218 };
15219
15220 ArrayRef<int> M1, M2;
15221 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
15222 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
15223 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
15224 return false;
15225
15226 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
15227 // it is not checked as an extract below.
15228 if (AllowSplat && isSplatShuffle(Op1))
15229 S1Op1 = nullptr;
15230 if (AllowSplat && isSplatShuffle(Op2))
15231 S2Op1 = nullptr;
15232
15233 // Check that the operands are half as wide as the result and we extract
15234 // half of the elements of the input vectors.
15235 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
15236 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
15237 return false;
15238
15239 // Check the mask extracts either the lower or upper half of vector
15240 // elements.
15241 int M1Start = 0;
15242 int M2Start = 0;
15243 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
15244 if ((S1Op1 &&
15245 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
15246 (S2Op1 &&
15247 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
15248 return false;
15249
15250 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
15251 (M2Start != 0 && M2Start != (NumElements / 2)))
15252 return false;
15253 if (S1Op1 && S2Op1 && M1Start != M2Start)
15254 return false;
15255
15256 return true;
15257}
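// Shape of IR this matches, for illustration (both shuffles must take the
// same half, here the upper one):
//   %hi1 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
//   %hi2 = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// Widening operations fed by %hi1/%hi2 can then use the high-half forms such
// as smull2/umull2.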
15258
15259/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
15260/// of the vector elements.
15261static bool areExtractExts(Value *Ext1, Value *Ext2) {
15262 auto areExtDoubled = [](Instruction *Ext) {
15263 return Ext->getType()->getScalarSizeInBits() ==
15264 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
15265 };
15266
15267 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
15268 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
15269 !areExtDoubled(cast<Instruction>(Ext1)) ||
15270 !areExtDoubled(cast<Instruction>(Ext2)))
15271 return false;
15272
15273 return true;
15274}
15275
15276/// Check if Op could be used with vmull_high_p64 intrinsic.
15277static bool isOperandOfVmullHighP64(Value *Op) {
15278 Value *VectorOperand = nullptr;
15279 ConstantInt *ElementIndex = nullptr;
15280 return match(Op, m_ExtractElt(m_Value(VectorOperand),
15281 m_ConstantInt(ElementIndex))) &&
15282 ElementIndex->getValue() == 1 &&
15283 isa<FixedVectorType>(VectorOperand->getType()) &&
15284 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
15285}
15286
15287/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
15288static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
15289 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
15290}
15291
15292static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
15293 // Restrict ourselves to the form CodeGenPrepare typically constructs.
15294 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
15295 if (!GEP || GEP->getNumOperands() != 2)
15296 return false;
15297
15298 Value *Base = GEP->getOperand(0);
15299 Value *Offsets = GEP->getOperand(1);
15300
15301 // We only care about scalar_base+vector_offsets.
15302 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
15303 return false;
15304
15305 // Sink extends that would allow us to use 32-bit offset vectors.
15306 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
15307 auto *OffsetsInst = cast<Instruction>(Offsets);
15308 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
15309 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
15310 Ops.push_back(&GEP->getOperandUse(1));
15311 }
15312
15313 // Sink the GEP.
15314 return true;
15315}
15316
15317/// We want to sink the following cases:
15318/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
15319static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
15320 if (match(Op, m_VScale()))
15321 return true;
15322 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
15323 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
15324 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
15325 return true;
15326 }
15327 return false;
15328}
15329
15330/// Check if sinking \p I's operands to I's basic block is profitable, because
15331/// the operands can be folded into a target instruction, e.g.
15332/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
15333bool AArch64TargetLowering::shouldSinkOperands(
15334 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
15335 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
15336 switch (II->getIntrinsicID()) {
15337 case Intrinsic::aarch64_neon_smull:
15338 case Intrinsic::aarch64_neon_umull:
15339 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
15340 /*AllowSplat=*/true)) {
15341 Ops.push_back(&II->getOperandUse(0));
15342 Ops.push_back(&II->getOperandUse(1));
15343 return true;
15344 }
15345 [[fallthrough]];
15346
15347 case Intrinsic::fma:
15348 if (isa<VectorType>(I->getType()) &&
15349 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
15350 !Subtarget->hasFullFP16())
15351 return false;
15352 [[fallthrough]];
15353 case Intrinsic::aarch64_neon_sqdmull:
15354 case Intrinsic::aarch64_neon_sqdmulh:
15355 case Intrinsic::aarch64_neon_sqrdmulh:
15356 // Sink splats for index lane variants
15357 if (isSplatShuffle(II->getOperand(0)))
15358 Ops.push_back(&II->getOperandUse(0));
15359 if (isSplatShuffle(II->getOperand(1)))
15360 Ops.push_back(&II->getOperandUse(1));
15361 return !Ops.empty();
15362 case Intrinsic::aarch64_neon_fmlal:
15363 case Intrinsic::aarch64_neon_fmlal2:
15364 case Intrinsic::aarch64_neon_fmlsl:
15365 case Intrinsic::aarch64_neon_fmlsl2:
15366 // Sink splats for index lane variants
15367 if (isSplatShuffle(II->getOperand(1)))
15368 Ops.push_back(&II->getOperandUse(1));
15369 if (isSplatShuffle(II->getOperand(2)))
15370 Ops.push_back(&II->getOperandUse(2));
15371 return !Ops.empty();
15372 case Intrinsic::aarch64_sve_ptest_first:
15373 case Intrinsic::aarch64_sve_ptest_last:
15374 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
15375 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
15376 Ops.push_back(&II->getOperandUse(0));
15377 return !Ops.empty();
15378 case Intrinsic::aarch64_sme_write_horiz:
15379 case Intrinsic::aarch64_sme_write_vert:
15380 case Intrinsic::aarch64_sme_writeq_horiz:
15381 case Intrinsic::aarch64_sme_writeq_vert: {
15382 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
15383 if (!Idx || Idx->getOpcode() != Instruction::Add)
15384 return false;
15385 Ops.push_back(&II->getOperandUse(1));
15386 return true;
15387 }
15388 case Intrinsic::aarch64_sme_read_horiz:
15389 case Intrinsic::aarch64_sme_read_vert:
15390 case Intrinsic::aarch64_sme_readq_horiz:
15391 case Intrinsic::aarch64_sme_readq_vert:
15392 case Intrinsic::aarch64_sme_ld1b_vert:
15393 case Intrinsic::aarch64_sme_ld1h_vert:
15394 case Intrinsic::aarch64_sme_ld1w_vert:
15395 case Intrinsic::aarch64_sme_ld1d_vert:
15396 case Intrinsic::aarch64_sme_ld1q_vert:
15397 case Intrinsic::aarch64_sme_st1b_vert:
15398 case Intrinsic::aarch64_sme_st1h_vert:
15399 case Intrinsic::aarch64_sme_st1w_vert:
15400 case Intrinsic::aarch64_sme_st1d_vert:
15401 case Intrinsic::aarch64_sme_st1q_vert:
15402 case Intrinsic::aarch64_sme_ld1b_horiz:
15403 case Intrinsic::aarch64_sme_ld1h_horiz:
15404 case Intrinsic::aarch64_sme_ld1w_horiz:
15405 case Intrinsic::aarch64_sme_ld1d_horiz:
15406 case Intrinsic::aarch64_sme_ld1q_horiz:
15407 case Intrinsic::aarch64_sme_st1b_horiz:
15408 case Intrinsic::aarch64_sme_st1h_horiz:
15409 case Intrinsic::aarch64_sme_st1w_horiz:
15410 case Intrinsic::aarch64_sme_st1d_horiz:
15411 case Intrinsic::aarch64_sme_st1q_horiz: {
15412 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
15413 if (!Idx || Idx->getOpcode() != Instruction::Add)
15414 return false;
15415 Ops.push_back(&II->getOperandUse(3));
15416 return true;
15417 }
15418 case Intrinsic::aarch64_neon_pmull:
15419 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
15420 return false;
15421 Ops.push_back(&II->getOperandUse(0));
15422 Ops.push_back(&II->getOperandUse(1));
15423 return true;
15424 case Intrinsic::aarch64_neon_pmull64:
15425 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
15426 II->getArgOperand(1)))
15427 return false;
15428 Ops.push_back(&II->getArgOperandUse(0));
15429 Ops.push_back(&II->getArgOperandUse(1));
15430 return true;
15431 case Intrinsic::masked_gather:
15432 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
15433 return false;
15434 Ops.push_back(&II->getArgOperandUse(0));
15435 return true;
15436 case Intrinsic::masked_scatter:
15437 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
15438 return false;
15439 Ops.push_back(&II->getArgOperandUse(1));
15440 return true;
15441 default:
15442 return false;
15443 }
15444 }
15445
15446 // Sink vscales closer to uses for better isel
15447 switch (I->getOpcode()) {
15448 case Instruction::GetElementPtr:
15449 case Instruction::Add:
15450 case Instruction::Sub:
15451 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
15452 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
15453 Ops.push_back(&I->getOperandUse(Op));
15454 return true;
15455 }
15456 }
15457 break;
15458 default:
15459 break;
15460 }
15461
15462 if (!I->getType()->isVectorTy())
15463 return false;
15464
15465 switch (I->getOpcode()) {
15466 case Instruction::Sub:
15467 case Instruction::Add: {
15468 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
15469 return false;
15470
15471 // If the exts' operands extract either the lower or upper elements, we
15472 // can sink them too.
15473 auto Ext1 = cast<Instruction>(I->getOperand(0));
15474 auto Ext2 = cast<Instruction>(I->getOperand(1));
15475 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
15476 Ops.push_back(&Ext1->getOperandUse(0));
15477 Ops.push_back(&Ext2->getOperandUse(0));
15478 }
15479
15480 Ops.push_back(&I->getOperandUse(0));
15481 Ops.push_back(&I->getOperandUse(1));
15482
15483 return true;
15484 }
15485 case Instruction::Or: {
15486 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
15487 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
15488 if (Subtarget->hasNEON()) {
15489 Instruction *OtherAnd, *IA, *IB;
15490 Value *MaskValue;
15491 // MainAnd refers to the And instruction that has 'Not' as one of its operands
15492 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
15493 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
15494 m_Instruction(IA)))))) {
15495 if (match(OtherAnd,
15496 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
15497 Instruction *MainAnd = I->getOperand(0) == OtherAnd
15498 ? cast<Instruction>(I->getOperand(1))
15499 : cast<Instruction>(I->getOperand(0));
15500
15501 // Both Ands should be in the same basic block as the Or
15502 if (I->getParent() != MainAnd->getParent() ||
15503 I->getParent() != OtherAnd->getParent())
15504 return false;
15505
15506 // Non-mask operands of both Ands should also be in the same basic block
15507 if (I->getParent() != IA->getParent() ||
15508 I->getParent() != IB->getParent())
15509 return false;
15510
15511 Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
15512 Ops.push_back(&I->getOperandUse(0));
15513 Ops.push_back(&I->getOperandUse(1));
15514
15515 return true;
15516 }
15517 }
15518 }
15519
15520 return false;
15521 }
15522 case Instruction::Mul: {
15523 int NumZExts = 0, NumSExts = 0;
15524 for (auto &Op : I->operands()) {
15525 // Make sure we are not already sinking this operand
15526 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
15527 continue;
15528
15529 if (match(&Op, m_SExt(m_Value()))) {
15530 NumSExts++;
15531 continue;
15532 } else if (match(&Op, m_ZExt(m_Value()))) {
15533 NumZExts++;
15534 continue;
15535 }
15536
15537 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
15538
15539 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
15540 // operand and the s/zext can help create indexed s/umull. This is
15541 // especially useful to prevent i64 mul being scalarized.
15542 if (Shuffle && isSplatShuffle(Shuffle) &&
15543 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
15544 Ops.push_back(&Shuffle->getOperandUse(0));
15545 Ops.push_back(&Op);
15546 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
15547 NumSExts++;
15548 else
15549 NumZExts++;
15550 continue;
15551 }
15552
15553 if (!Shuffle)
15554 continue;
15555
15556 Value *ShuffleOperand = Shuffle->getOperand(0);
15557 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
15558 if (!Insert)
15559 continue;
15560
15561 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
15562 if (!OperandInstr)
15563 continue;
15564
15565 ConstantInt *ElementConstant =
15566 dyn_cast<ConstantInt>(Insert->getOperand(2));
15567 // Check that the insertelement is inserting into element 0
15568 if (!ElementConstant || !ElementConstant->isZero())
15569 continue;
15570
15571 unsigned Opcode = OperandInstr->getOpcode();
15572 if (Opcode == Instruction::SExt)
15573 NumSExts++;
15574 else if (Opcode == Instruction::ZExt)
15575 NumZExts++;
15576 else {
15577 // If we find that the top bits are known 0, then we can sink and allow
15578 // the backend to generate a umull.
15579 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
15580 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
15581 const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
15582 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
15583 continue;
15584 NumZExts++;
15585 }
15586
15587 Ops.push_back(&Shuffle->getOperandUse(0));
15588 Ops.push_back(&Op);
15589 }
15590
15591 // It is profitable to sink if we found two of the same type of extend.
15592 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
15593 }
15594 default:
15595 return false;
15596 }
15597 return false;
15598}
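// Sketch of the Mul case above (hypothetical IR): both operands are sunk so
// ISel can select a widening multiply instead of scalarizing:
//   %ea = sext <4 x i16> %a to <4 x i32>
//   %eb = sext <4 x i16> %b to <4 x i32>
//   %sp = shufflevector <4 x i32> %eb, <4 x i32> poison, <4 x i32> zeroinitializer
//   %m  = mul <4 x i32> %ea, %sp   ; two sexts found -> profitable, maps to smull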
15599
15600static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy,
15601 bool IsLittleEndian) {
15602 Value *Op = ZExt->getOperand(0);
15603 auto *SrcTy = cast<FixedVectorType>(Op->getType());
15604 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15605 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15606 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
15607 return false;
15608
15609 assert(DstWidth % SrcWidth == 0 &&
15610 "TBL lowering is not supported for a ZExt instruction with this "
15611 "source & destination element type.");
15612 unsigned ZExtFactor = DstWidth / SrcWidth;
15613 unsigned NumElts = SrcTy->getNumElements();
15614 IRBuilder<> Builder(ZExt);
15615 SmallVector<int> Mask;
15616 // Create a mask that selects <0,...,Op[i]> for each lane of the destination
15617 // vector to replace the original ZExt. This can later be lowered to a set of
15618 // tbl instructions.
15619 for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
15620 if (IsLittleEndian) {
15621 if (i % ZExtFactor == 0)
15622 Mask.push_back(i / ZExtFactor);
15623 else
15624 Mask.push_back(NumElts);
15625 } else {
15626 if ((i + 1) % ZExtFactor == 0)
15627 Mask.push_back((i - ZExtFactor + 1) / ZExtFactor);
15628 else
15629 Mask.push_back(NumElts);
15630 }
15631 }
15632
15633 auto *FirstEltZero = Builder.CreateInsertElement(
15634 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
15635 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
15636 Result = Builder.CreateBitCast(Result, DstTy);
15637 if (DstTy != ZExt->getType())
15638 Result = Builder.CreateZExt(Result, ZExt->getType());
15639 ZExt->replaceAllUsesWith(Result);
15640 ZExt->eraseFromParent();
15641 return true;
15642}
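// Worked example (little-endian, for illustration): "zext <8 x i8> %x to
// <8 x i32>" is rewritten as a byte shuffle of %x against a vector whose
// lane 0 is zero, using the mask <0,8,8,8, 1,8,8,8, ..., 7,8,8,8>, followed
// by a bitcast to <8 x i32>; the shuffle is later selected as tbl.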
15643
15644static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
15645 IRBuilder<> Builder(TI);
15646 SmallVector<Value *> Parts;
15647 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
15648 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
15649 auto *DstTy = cast<FixedVectorType>(TI->getType());
15650 assert(SrcTy->getElementType()->isIntegerTy() &&
15651 "Non-integer type source vector element is not supported");
15652 assert(DstTy->getElementType()->isIntegerTy(8) &&
15653 "Unsupported destination vector element type");
15654 unsigned SrcElemTySz =
15655 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15656 unsigned DstElemTySz =
15657 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15658 assert((SrcElemTySz % DstElemTySz == 0) &&
15659 "Cannot lower truncate to tbl instructions for a source element size "
15660 "that is not divisible by the destination element size");
15661 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
15662 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
15663 "Unsupported source vector element type size");
15664 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
15665
15666 // Create a mask to choose every nth byte from the source vector table of
15667 // bytes to create the truncated destination vector, where 'n' is the truncate
15668 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
15669 // 0,8,16,..Y*8th bytes for the little-endian format
15670 SmallVector<Constant *, 16> MaskConst;
15671 for (int Itr = 0; Itr < 16; Itr++) {
15672 if (Itr < NumElements)
15673 MaskConst.push_back(Builder.getInt8(
15674 IsLittleEndian ? Itr * TruncFactor
15675 : Itr * TruncFactor + (TruncFactor - 1)));
15676 else
15677 MaskConst.push_back(Builder.getInt8(255));
15678 }
15679
15680 int MaxTblSz = 128 * 4;
15681 int MaxSrcSz = SrcElemTySz * NumElements;
15682 int ElemsPerTbl =
15683 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
15684 assert(ElemsPerTbl <= 16 &&
15685 "Maximum elements selected using TBL instruction cannot exceed 16!");
15686
15687 int ShuffleCount = 128 / SrcElemTySz;
15688 SmallVector<int> ShuffleLanes;
15689 for (int i = 0; i < ShuffleCount; ++i)
15690 ShuffleLanes.push_back(i);
15691
15692 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
15693 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
15694 // call TBL & save the result in a vector of TBL results for combining later.
15695 SmallVector<Value *> Results;
15696 while (ShuffleLanes.back() < NumElements) {
15697 Parts.push_back(Builder.CreateBitCast(
15698 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
15699
15700 if (Parts.size() == 4) {
15701 auto *F = Intrinsic::getDeclaration(TI->getModule(),
15702 Intrinsic::aarch64_neon_tbl4, VecTy);
15703 Parts.push_back(ConstantVector::get(MaskConst));
15704 Results.push_back(Builder.CreateCall(F, Parts));
15705 Parts.clear();
15706 }
15707
15708 for (int i = 0; i < ShuffleCount; ++i)
15709 ShuffleLanes[i] += ShuffleCount;
15710 }
15711
15712 assert((Parts.empty() || Results.empty()) &&
15713 "Lowering trunc for vectors requiring different TBL instructions is "
15714 "not supported!");
15715 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
15716 // registers
15717 if (!Parts.empty()) {
15718 Intrinsic::ID TblID;
15719 switch (Parts.size()) {
15720 case 1:
15721 TblID = Intrinsic::aarch64_neon_tbl1;
15722 break;
15723 case 2:
15724 TblID = Intrinsic::aarch64_neon_tbl2;
15725 break;
15726 case 3:
15727 TblID = Intrinsic::aarch64_neon_tbl3;
15728 break;
15729 }
15730
15731 auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
15732 Parts.push_back(ConstantVector::get(MaskConst));
15733 Results.push_back(Builder.CreateCall(F, Parts));
15734 }
15735
15736 // Extract the destination vector from TBL result(s) after combining them
15737 // where applicable. Currently, at most two TBLs are supported.
15738 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
15739 "more than 2 tbl instructions!");
15740 Value *FinalResult = Results[0];
15741 if (Results.size() == 1) {
15742 if (ElemsPerTbl < 16) {
15743 SmallVector<int> FinalMask(ElemsPerTbl);
15744 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15745 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
15746 }
15747 } else {
15748 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
15749 if (ElemsPerTbl < 16) {
15750 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
15751 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
15752 } else {
15753 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15754 }
15755 FinalResult =
15756 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
15757 }
15758
15759 TI->replaceAllUsesWith(FinalResult);
15760 TI->eraseFromParent();
15761}
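// Worked example (little-endian, for illustration): for
//   %t = trunc <8 x i32> %x to <8 x i8>
// TruncFactor is 4, so the mask keeps bytes 0,4,8,...,28 (255 elsewhere); the
// 256-bit source is shuffled into two 128-bit table registers and one tbl2
// call yields the eight result bytes.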
15762
15763bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
15764 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
15765 // shuffle_vector instructions are serialized when targeting SVE,
15766 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
15767 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
15768 return false;
15769
15770 // Try to optimize conversions using tbl. This requires materializing constant
15771 // index vectors, which can increase code size and add loads. Skip the
15772 // transform unless the conversion is in a loop block guaranteed to execute
15773 // and we are not optimizing for size.
15774 Function *F = I->getParent()->getParent();
15775 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
15776 F->hasOptSize())
15777 return false;
15778
15779 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
15780 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
15781 if (!SrcTy || !DstTy)
15782 return false;
15783
15784 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
15785 // lowered to tbl instructions to insert the original i8 elements
15786 // into i8x lanes. This is enabled for cases where it is beneficial.
15787 auto *ZExt = dyn_cast<ZExtInst>(I);
15788 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
15789 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
15790 if (DstWidth % 8 != 0)
15791 return false;
15792
15793 auto *TruncDstType =
15794 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
15795 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
15796 // the remaining ZExt folded into the user, don't use tbl lowering.
15797 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
15798 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
15799 TargetTransformInfo::getCastContextHint(I),
15800 TTI::TCK_SizeAndLatency, I) == TTI::TCC_Basic) {
15801 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
15802 return false;
15803
15804 DstTy = TruncDstType;
15805 }
15806
15807 return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian());
15808 }
15809
15810 auto *UIToFP = dyn_cast<UIToFPInst>(I);
15811 if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
15812 DstTy->getElementType()->isFloatTy()) {
15813 IRBuilder<> Builder(I);
15814 auto *ZExt = cast<ZExtInst>(
15815 Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
15816 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
15817 I->replaceAllUsesWith(UI);
15818 I->eraseFromParent();
15819 return createTblShuffleForZExt(ZExt, cast<FixedVectorType>(ZExt->getType()),
15820 Subtarget->isLittleEndian());
15821 }
15822
15823 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
15824 // followed by a truncate lowered to using tbl.4.
15825 auto *FPToUI = dyn_cast<FPToUIInst>(I);
15826 if (FPToUI &&
15827 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
15828 SrcTy->getElementType()->isFloatTy() &&
15829 DstTy->getElementType()->isIntegerTy(8)) {
15830 IRBuilder<> Builder(I);
15831 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
15832 VectorType::getInteger(SrcTy));
15833 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
15834 I->replaceAllUsesWith(TruncI);
15835 I->eraseFromParent();
15836 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
15837 return true;
15838 }
15839
15840 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
15841 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
15842 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
15843 // registers
15844 auto *TI = dyn_cast<TruncInst>(I);
15845 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
15846 ((SrcTy->getElementType()->isIntegerTy(32) ||
15847 SrcTy->getElementType()->isIntegerTy(64)) &&
15848 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
15849 createTblForTrunc(TI, Subtarget->isLittleEndian());
15850 return true;
15851 }
15852
15853 return false;
15854}
15855
15856bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
15857 Align &RequiredAligment) const {
15858 if (!LoadedType.isSimple() ||
15859 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
15860 return false;
15861 // Cyclone supports unaligned accesses.
15862 RequiredAligment = Align(1);
15863 unsigned NumBits = LoadedType.getSizeInBits();
15864 return NumBits == 32 || NumBits == 64;
15865}
15866
15867/// A helper function for determining the number of interleaved accesses we
15868/// will generate when lowering accesses of the given type.
15869unsigned AArch64TargetLowering::getNumInterleavedAccesses(
15870 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
15871 unsigned VecSize = 128;
15872 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15873 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
15874 if (UseScalable && isa<FixedVectorType>(VecTy))
15875 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
15876 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
15877}
15878
15879MachineMemOperand::Flags
15880AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
15881 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
15882 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
15883 return MOStridedAccess;
15884 return MachineMemOperand::MONone;
15885}
15886
15887bool AArch64TargetLowering::isLegalInterleavedAccessType(
15888 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
15889 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15890 auto EC = VecTy->getElementCount();
15891 unsigned MinElts = EC.getKnownMinValue();
15892
15893 UseScalable = false;
15894
15895 if (!VecTy->isScalableTy() && !Subtarget->hasNEON())
15896 return false;
15897
15898 if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
15899 return false;
15900
15901 // Ensure that the predicate for this number of elements is available.
15902 if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
15903 return false;
15904
15905 // Ensure the number of vector elements is greater than 1.
15906 if (MinElts < 2)
15907 return false;
15908
15909 // Ensure the element type is legal.
15910 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
15911 return false;
15912
15913 if (EC.isScalable()) {
15914 UseScalable = true;
15915 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
15916 }
15917
15918 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
15919 if (!Subtarget->isNeonAvailable() ||
15920 (Subtarget->useSVEForFixedLengthVectors() &&
15921 (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
15922 (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
15923 isPowerOf2_32(MinElts) && VecSize > 128)))) {
15924 UseScalable = true;
15925 return true;
15926 }
15927
15928 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
15929 // 128 will be split into multiple interleaved accesses.
15930 return VecSize == 64 || VecSize % 128 == 0;
15931}
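// A few concrete cases, assuming plain NEON (no forced SVE): <8 x i8>
// (64 bits) and <4 x i32> (128 bits) are legal as-is, while <16 x i32>
// (512 bits) is accepted here and later split into four accesses by
// getNumInterleavedAccesses.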
15932
15933static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
15934 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
15935 return ScalableVectorType::get(VTy->getElementType(), 2);
15936
15937 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
15938 return ScalableVectorType::get(VTy->getElementType(), 4);
15939
15940 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
15941 return ScalableVectorType::get(VTy->getElementType(), 8);
15942
15943 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
15944 return ScalableVectorType::get(VTy->getElementType(), 8);
15945
15946 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
15947 return ScalableVectorType::get(VTy->getElementType(), 2);
15948
15949 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
15950 return ScalableVectorType::get(VTy->getElementType(), 4);
15951
15952 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
15953 return ScalableVectorType::get(VTy->getElementType(), 8);
15954
15955 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
15956 return ScalableVectorType::get(VTy->getElementType(), 16);
15957
15958 llvm_unreachable("Cannot handle input vector type");
15959}
15960
15961static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
15962 bool Scalable, Type *LDVTy,
15963 Type *PtrTy) {
15964 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
15965 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
15966 Intrinsic::aarch64_sve_ld3_sret,
15967 Intrinsic::aarch64_sve_ld4_sret};
15968 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
15969 Intrinsic::aarch64_neon_ld3,
15970 Intrinsic::aarch64_neon_ld4};
15971 if (Scalable)
15972 return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
15973
15974 return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
15975}
15976
15977static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
15978 bool Scalable, Type *STVTy,
15979 Type *PtrTy) {
15980 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
15981 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
15982 Intrinsic::aarch64_sve_st3,
15983 Intrinsic::aarch64_sve_st4};
15984 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
15985 Intrinsic::aarch64_neon_st3,
15986 Intrinsic::aarch64_neon_st4};
15987 if (Scalable)
15988 return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
15989
15990 return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
15991}
15992
15993/// Lower an interleaved load into a ldN intrinsic.
15994///
15995/// E.g. Lower an interleaved load (Factor = 2):
15996/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
15997/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
15998/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
15999///
16000/// Into:
16001/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
16002/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
16003/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
16004bool AArch64TargetLowering::lowerInterleavedLoad(
16005 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
16006 ArrayRef<unsigned> Indices, unsigned Factor) const {
16007 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16008 "Invalid interleave factor");
16009 assert(!Shuffles.empty() && "Empty shufflevector input");
16010 assert(Shuffles.size() == Indices.size() &&
16011 "Unmatched number of shufflevectors and indices");
16012
16013 const DataLayout &DL = LI->getModule()->getDataLayout();
16014
16015 VectorType *VTy = Shuffles[0]->getType();
16016
16017 // Skip if we do not have NEON and skip illegal vector types. We can
16018 // "legalize" wide vector types into multiple interleaved accesses as long as
16019 // the vector types are divisible by 128.
16020 bool UseScalable;
16021 if (!Subtarget->hasNEON() ||
16022 !isLegalInterleavedAccessType(VTy, DL, UseScalable))
16023 return false;
16024
16025 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16026
16027 auto *FVTy = cast<FixedVectorType>(VTy);
16028
16029 // A pointer vector can not be the return type of the ldN intrinsics. Need to
16030 // load integer vectors first and then convert to pointer vectors.
16031 Type *EltTy = FVTy->getElementType();
16032 if (EltTy->isPointerTy())
16033 FVTy =
16034 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
16035
16036 // If we're going to generate more than one load, reset the sub-vector type
16037 // to something legal.
16038 FVTy = FixedVectorType::get(FVTy->getElementType(),
16039 FVTy->getNumElements() / NumLoads);
16040
16041 auto *LDVTy =
16042 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
16043
16044 IRBuilder<> Builder(LI);
16045
16046 // The base address of the load.
16047 Value *BaseAddr = LI->getPointerOperand();
16048
16049 Type *PtrTy = LI->getPointerOperandType();
16050 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
16051 LDVTy->getElementCount());
16052
16053 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
16054 UseScalable, LDVTy, PtrTy);
16055
16056 // Holds sub-vectors extracted from the load intrinsic return values. The
16057 // sub-vectors are associated with the shufflevector instructions they will
16058 // replace.
16059 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
16060
16061 Value *PTrue = nullptr;
16062 if (UseScalable) {
16063 std::optional<unsigned> PgPattern =
16064 getSVEPredPatternFromNumElements(FVTy->getNumElements());
16065 if (Subtarget->getMinSVEVectorSizeInBits() ==
16066 Subtarget->getMaxSVEVectorSizeInBits() &&
16067 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
16068 PgPattern = AArch64SVEPredPattern::all;
16069
16070 auto *PTruePat =
16071 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
16072 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16073 {PTruePat});
16074 }
16075
16076 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16077
16078 // If we're generating more than one load, compute the base address of
16079 // subsequent loads as an offset from the previous.
16080 if (LoadCount > 0)
16081 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
16082 FVTy->getNumElements() * Factor);
16083
16084 CallInst *LdN;
16085 if (UseScalable)
16086 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
16087 else
16088 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16089
16090 // Extract and store the sub-vectors returned by the load intrinsic.
16091 for (unsigned i = 0; i < Shuffles.size(); i++) {
16092 ShuffleVectorInst *SVI = Shuffles[i];
16093 unsigned Index = Indices[i];
16094
16095 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
16096
16097 if (UseScalable)
16098 SubVec = Builder.CreateExtractVector(
16099 FVTy, SubVec,
16100 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
16101
16102 // Convert the integer vector to pointer vector if the element is pointer.
16103 if (EltTy->isPointerTy())
16104 SubVec = Builder.CreateIntToPtr(
16105 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
16106 FVTy->getNumElements()));
16107
16108 SubVecs[SVI].push_back(SubVec);
16109 }
16110 }
16111
16112 // Replace uses of the shufflevector instructions with the sub-vectors
16113 // returned by the load intrinsic. If a shufflevector instruction is
16114 // associated with more than one sub-vector, those sub-vectors will be
16115 // concatenated into a single wide vector.
16116 for (ShuffleVectorInst *SVI : Shuffles) {
16117 auto &SubVec = SubVecs[SVI];
16118 auto *WideVec =
16119 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
16120 SVI->replaceAllUsesWith(WideVec);
16121 }
16122
16123 return true;
16124}
16125
16126template <typename Iter>
16127bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
16128 int MaxLookupDist = 20;
16129 unsigned IdxWidth = DL.getIndexSizeInBits(0);
16130 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
16131 const Value *PtrA1 =
16132 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
16133
16134 while (++It != End) {
16135 if (It->isDebugOrPseudoInst())
16136 continue;
16137 if (MaxLookupDist-- == 0)
16138 break;
16139 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
16140 const Value *PtrB1 =
16141 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
16142 DL, OffsetB);
16143 if (PtrA1 == PtrB1 &&
16144 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
16145 .abs() == 16)
16146 return true;
16147 }
16148 }
16149
16150 return false;
16151}
16152
16153/// Lower an interleaved store into a stN intrinsic.
16154///
16155/// E.g. Lower an interleaved store (Factor = 3):
16156/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16157/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16158/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16159///
16160/// Into:
16161/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16162/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16163/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16164/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16165///
16166/// Note that the new shufflevectors will be removed and we'll only generate one
16167/// st3 instruction in CodeGen.
16168///
16169/// Example for a more general valid mask (Factor 3). Lower:
16170/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16171/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16172/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16173///
16174/// Into:
16175/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16176/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16177/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16178/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16179bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
16180 ShuffleVectorInst *SVI,
16181 unsigned Factor) const {
16182
16183 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16184 "Invalid interleave factor");
16185
16186 auto *VecTy = cast<FixedVectorType>(SVI->getType());
16187 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
16188
16189 unsigned LaneLen = VecTy->getNumElements() / Factor;
16190 Type *EltTy = VecTy->getElementType();
16191 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
16192
16193 const DataLayout &DL = SI->getModule()->getDataLayout();
16194 bool UseScalable;
16195
16196 // Skip if we do not have NEON and skip illegal vector types. We can
16197 // "legalize" wide vector types into multiple interleaved accesses as long as
16198 // the vector types are divisible by 128.
16199 if (!Subtarget->hasNEON() ||
16200 !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
16201 return false;
16202
16203 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
16204
16205 Value *Op0 = SVI->getOperand(0);
16206 Value *Op1 = SVI->getOperand(1);
16207 IRBuilder<> Builder(SI);
16208
16209 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16210 // vectors to integer vectors.
16211 if (EltTy->isPointerTy()) {
16212 Type *IntTy = DL.getIntPtrType(EltTy);
16213 unsigned NumOpElts =
16214 cast<FixedVectorType>(Op0->getType())->getNumElements();
16215
16216 // Convert to the corresponding integer vector.
16217 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
16218 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
16219 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
16220
16221 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
16222 }
16223
16224 // If we're going to generate more than one store, reset the lane length
16225 // and sub-vector type to something legal.
16226 LaneLen /= NumStores;
16227 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
16228
16229 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
16230 : SubVecTy;
16231
16232 // The base address of the store.
16233 Value *BaseAddr = SI->getPointerOperand();
16234
16235 auto Mask = SVI->getShuffleMask();
16236
16237 // Sanity check if all the indices are NOT in range.
16238 // If mask is `poison`, `Mask` may be a vector of -1s.
16239 // If all of them are `poison`, OOB read will happen later.
16240 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16241 return false;
16242 }
16243 // A 64-bit st2 which does not start at element 0 will involve adding extra
16244 // ext elements making the st2 unprofitable, and if there is a nearby store
16245 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
16246 // zip;ldp pair which has higher throughput.
16247 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
16248 (Mask[0] != 0 ||
16249 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
16250 DL) ||
16251 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
16252 BaseAddr, DL)))
16253 return false;
16254
16255 Type *PtrTy = SI->getPointerOperandType();
16256 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
16257 STVTy->getElementCount());
16258
16259 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16260 UseScalable, STVTy, PtrTy);
16261
16262 Value *PTrue = nullptr;
16263 if (UseScalable) {
16264 std::optional<unsigned> PgPattern =
16265 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
16266 if (Subtarget->getMinSVEVectorSizeInBits() ==
16267 Subtarget->getMaxSVEVectorSizeInBits() &&
16268 Subtarget->getMinSVEVectorSizeInBits() ==
16269 DL.getTypeSizeInBits(SubVecTy))
16270 PgPattern = AArch64SVEPredPattern::all;
16271
16272 auto *PTruePat =
16273 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
16274 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16275 {PTruePat});
16276 }
16277
16278 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
16279
16280 SmallVector<Value *, 5> Ops;
16281
16282 // Split the shufflevector operands into sub vectors for the new stN call.
16283 for (unsigned i = 0; i < Factor; i++) {
16284 Value *Shuffle;
16285 unsigned IdxI = StoreCount * LaneLen * Factor + i;
16286 if (Mask[IdxI] >= 0) {
16287 Shuffle = Builder.CreateShuffleVector(
16288 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
16289 } else {
16290 unsigned StartMask = 0;
16291 for (unsigned j = 1; j < LaneLen; j++) {
16292 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
16293 if (Mask[IdxJ] >= 0) {
16294 StartMask = Mask[IdxJ] - j;
16295 break;
16296 }
16297 }
16298 // Note: Filling undef gaps with random elements is ok, since
16299 // those elements were being written anyway (with undefs).
16300 // In the case of all undefs we're defaulting to using elems from 0
16301 // Note: StartMask cannot be negative, it's checked in
16302 // isReInterleaveMask
16303 Shuffle = Builder.CreateShuffleVector(
16304 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
16305 }
16306
16307 if (UseScalable)
16308 Shuffle = Builder.CreateInsertVector(
16309 STVTy, UndefValue::get(STVTy), Shuffle,
16310 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
16311
16312 Ops.push_back(Shuffle);
16313 }
16314
16315 if (UseScalable)
16316 Ops.push_back(PTrue);
16317
16318 // If we're generating more than one store, compute the base address of
16319 // subsequent stores as an offset from the previous.
16320 if (StoreCount > 0)
16321 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
16322 BaseAddr, LaneLen * Factor);
16323
16324 Ops.push_back(BaseAddr);
16325 Builder.CreateCall(StNFunc, Ops);
16326 }
16327 return true;
16328}
16329
16330bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
16331 IntrinsicInst *DI, LoadInst *LI) const {
16332 // Only deinterleave2 supported at present.
16333 if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
16334 return false;
16335
16336 // Only a factor of 2 supported at present.
16337 const unsigned Factor = 2;
16338
16339 VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
16340 const DataLayout &DL = DI->getModule()->getDataLayout();
16341 bool UseScalable;
16342 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16343 return false;
16344
16345 // TODO: Add support for using SVE instructions with fixed types later, using
16346 // the code from lowerInterleavedLoad to obtain the correct container type.
16347 if (UseScalable && !VTy->isScalableTy())
16348 return false;
16349
16350 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16351
16352 VectorType *LdTy =
16353 VectorType::get(VTy->getElementType(),
16354 VTy->getElementCount().divideCoefficientBy(NumLoads));
16355
16356 Type *PtrTy = LI->getPointerOperandType();
16357 Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
16358 UseScalable, LdTy, PtrTy);
16359
16360 IRBuilder<> Builder(LI);
16361
16362 Value *Pred = nullptr;
16363 if (UseScalable)
16364 Pred =
16365 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
16366
16367 Value *BaseAddr = LI->getPointerOperand();
16368 Value *Result;
16369 if (NumLoads > 1) {
16370 Value *Left = PoisonValue::get(VTy);
16371 Value *Right = PoisonValue::get(VTy);
16372
16373 for (unsigned I = 0; I < NumLoads; ++I) {
16374 Value *Offset = Builder.getInt64(I * Factor);
16375
16376 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
16377 Value *LdN = nullptr;
16378 if (UseScalable)
16379 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
16380 else
16381 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
16382
16383 Value *Idx =
16384 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
16385 Left = Builder.CreateInsertVector(
16386 VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
16387 Right = Builder.CreateInsertVector(
16388 VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
16389 }
16390
16391 Result = PoisonValue::get(DI->getType());
16392 Result = Builder.CreateInsertValue(Result, Left, 0);
16393 Result = Builder.CreateInsertValue(Result, Right, 1);
16394 } else {
16395 if (UseScalable)
16396 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
16397 else
16398 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16399 }
16400
16401 DI->replaceAllUsesWith(Result);
16402 return true;
16403}
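// E.g. (illustrative IR), for a single legal scalable part:
//   %wide = load <vscale x 8 x i32>, ptr %p
//   %dei  = call {<vscale x 4 x i32>, <vscale x 4 x i32>}
//           @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide)
// becomes one llvm.aarch64.sve.ld2.sret call with an all-true predicate whose
// two results replace the deinterleave.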
16404
16405bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
16406 IntrinsicInst *II, StoreInst *SI) const {
16407 // Only interleave2 supported at present.
16408 if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
16409 return false;
16410
16411 // Only a factor of 2 supported at present.
16412 const unsigned Factor = 2;
16413
16414 VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
16415 const DataLayout &DL = II->getModule()->getDataLayout();
16416 bool UseScalable;
16417 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16418 return false;
16419
16420 // TODO: Add support for using SVE instructions with fixed types later, using
16421 // the code from lowerInterleavedStore to obtain the correct container type.
16422 if (UseScalable && !VTy->isScalableTy())
16423 return false;
16424
16425 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
16426
16427 VectorType *StTy =
16428 VectorType::get(VTy->getElementType(),
16429 VTy->getElementCount().divideCoefficientBy(NumStores));
16430
16431 Type *PtrTy = SI->getPointerOperandType();
16432 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16433 UseScalable, StTy, PtrTy);
16434
16435 IRBuilder<> Builder(SI);
16436
16437 Value *BaseAddr = SI->getPointerOperand();
16438 Value *Pred = nullptr;
16439
16440 if (UseScalable)
16441 Pred =
16442 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
16443
16444 Value *L = II->getOperand(0);
16445 Value *R = II->getOperand(1);
16446
16447 for (unsigned I = 0; I < NumStores; ++I) {
16448 Value *Address = BaseAddr;
16449 if (NumStores > 1) {
16450 Value *Offset = Builder.getInt64(I * Factor);
16451 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
16452
16453 Value *Idx =
16454 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
16455 L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
16456 R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
16457 }
16458
16459 if (UseScalable)
16460 Builder.CreateCall(StNFunc, {L, R, Pred, Address});
16461 else
16462 Builder.CreateCall(StNFunc, {L, R, Address});
16463 }
16464
16465 return true;
16466}
16467
16468EVT AArch64TargetLowering::getOptimalMemOpType(
16469 const MemOp &Op, const AttributeList &FuncAttributes) const {
16470 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16471 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16472 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16473 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
16474 // taken one instruction to materialize the v2i64 zero and one store (with
16475 // restrictive addressing mode). Just do i64 stores.
16476 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16477 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16478 if (Op.isAligned(AlignCheck))
16479 return true;
16480 unsigned Fast;
16481 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16482 MachineMemOperand::MONone, &Fast) &&
16483 Fast;
16484 };
16485
16486 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16487 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
16488 return MVT::v16i8;
16489 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16490 return MVT::f128;
16491 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16492 return MVT::i64;
16493 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16494 return MVT::i32;
16495 return MVT::Other;
16496}
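// For instance (illustrative): a 64-byte memset with 16-byte alignment gets
// MVT::v16i8 when NEON is usable, a 16-byte-aligned memcpy of 16+ bytes gets
// MVT::f128, and an 8-byte-aligned 8-byte copy falls back to MVT::i64.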
16497
16498LLT AArch64TargetLowering::getOptimalMemOpLLT(
16499 const MemOp &Op, const AttributeList &FuncAttributes) const {
16500 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16501 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16502 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16503 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
16504 // taken one instruction to materialize the v2i64 zero and one store (with
16505 // restrictive addressing mode). Just do i64 stores.
16506 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16507 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16508 if (Op.isAligned(AlignCheck))
16509 return true;
16510 unsigned Fast;
16511 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16512 MachineMemOperand::MONone, &Fast) &&
16513 Fast;
16514 };
16515
16516 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16517 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
16518 return LLT::fixed_vector(2, 64);
16519 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16520 return LLT::scalar(128);
16521 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16522 return LLT::scalar(64);
16523 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16524 return LLT::scalar(32);
16525 return LLT();
16526}
16527
16528// 12-bit optionally shifted immediates are legal for adds.
16529bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
16530 if (Immed == std::numeric_limits<int64_t>::min()) {
16531 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
16532 << ": avoid UB for INT64_MIN\n");
16533 return false;
16534 }
16535 // Same encoding for add/sub, just flip the sign.
16536 Immed = std::abs(Immed);
16537 bool IsLegal = ((Immed >> 12) == 0 ||
16538 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
16539 LLVM_DEBUG(dbgs() << "Is " << Immed
16540 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
16541 return IsLegal;
16542}
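// Quick examples: 0xfff (a plain uimm12) and 0xabc000 (uimm12 shifted left by
// 12) are legal add immediates, while 0x1001 is not because it has bits set
// in both the low and the shifted-by-12 ranges.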
16543
16544bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
16545 // We will only emit addvl/inc* instructions for SVE2
16546 if (!Subtarget->hasSVE2())
16547 return false;
16548
16549 // addvl's immediates are in terms of the number of bytes in a register.
16550 // Since there are 16 in the base supported size (128 bits), we need to
16551 // divide the immediate by that much to give us a useful immediate to
16552 // multiply by vscale. We can't have a remainder as a result of this.
16553 if (Imm % 16 == 0)
16554 return isInt<6>(Imm / 16);
16555
16556 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
16557 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
16558 // of addvl as a result, so only take h|w|d into account.
16559 // Dec[h|w|d] will cover subtractions.
16560 // Immediates are in the range [1,16], so we can't do a 2's complement check.
16561 // FIXME: Can we make use of other patterns to cover other immediates?
16562
16563 // inch|dech
16564 if (Imm % 8 == 0)
16565 return std::labs(Imm / 8) <= 16;
16566 // incw|decw
16567 if (Imm % 4 == 0)
16568 return std::labs(Imm / 4) <= 16;
16569 // incd|decd
16570 if (Imm % 2 == 0)
16571 return std::labs(Imm / 2) <= 16;
16572
16573 return false;
16574}
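// Examples, assuming SVE2: Imm == 32 is two full base registers' worth of
// bytes (2 * 16), so "addvl #2" covers it; Imm == 8 maps onto inch with an
// 'all' pattern; Imm == 17 matches no multiple of 2, 4, 8 or 16 and is
// rejected.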
16575
16576// Return false to prevent folding
16577// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
16578// if the folding leads to worse code.
16579bool AArch64TargetLowering::isMulAddWithConstProfitable(
16580 SDValue AddNode, SDValue ConstNode) const {
16581 // Let the DAGCombiner decide for vector types and large types.
16582 const EVT VT = AddNode.getValueType();
16583 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
16584 return true;
16585
16586 // It is worse if c1 is a legal add immediate while c1*c2 is not,
16587 // and has to be composed of at least two instructions.
16588 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
16589 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
16590 const int64_t C1 = C1Node->getSExtValue();
16591 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
16592 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
16593 return true;
16594 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
16595 // Adapt to the width of a register.
16596 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
16597 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
16598 if (Insn.size() > 1)
16599 return false;
16600
16601 // Default to true and let the DAGCombiner decide.
16602 return true;
16603}
16604
16605// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
16606// immediates is the same as for an add or a sub.
16607bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
16608 return isLegalAddImmediate(Immed);
16609}
16610
16611/// isLegalAddressingMode - Return true if the addressing mode represented
16612/// by AM is legal for this target, for a load/store of the specified type.
16613bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
16614 const AddrMode &AMode, Type *Ty,
16615 unsigned AS, Instruction *I) const {
16616 // AArch64 has five basic addressing modes:
16617 // reg
16618 // reg + 9-bit signed offset
16619 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
16620 // reg1 + reg2
16621 // reg + SIZE_IN_BYTES * reg
16622
16623 // No global is ever allowed as a base.
16624 if (AMode.BaseGV)
16625 return false;
16626
16627 // No reg+reg+imm addressing.
16628 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
16629 return false;
16630
16631 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
16632 // `2*ScaledReg` into `BaseReg + ScaledReg`
16633 AddrMode AM = AMode;
16634 if (AM.Scale && !AM.HasBaseReg) {
16635 if (AM.Scale == 1) {
16636 AM.HasBaseReg = true;
16637 AM.Scale = 0;
16638 } else if (AM.Scale == 2) {
16639 AM.HasBaseReg = true;
16640 AM.Scale = 1;
16641 } else {
16642 return false;
16643 }
16644 }
16645
16646 // A base register is required in all addressing modes.
16647 if (!AM.HasBaseReg)
16648 return false;
16649
16650 if (Ty->isScalableTy()) {
16651 if (isa<ScalableVectorType>(Ty)) {
16652 // See if we have a foldable vscale-based offset, for vector types which
16653 // are either legal or smaller than the minimum; more work will be
16654 // required if we need to consider addressing for types which need
16655 // legalization by splitting.
16656 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
16657 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
16658 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
16659 isPowerOf2_64(VecNumBytes))
16660 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
16661
16662 uint64_t VecElemNumBytes =
16663 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
16664 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
16665 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
16666 }
16667
16668 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
16669 }
16670
16671 // No scalable offsets allowed for non-scalable types.
16672 if (AM.ScalableOffset)
16673 return false;
16674
16675 // Check the reg + imm case:
16676 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
16677 uint64_t NumBytes = 0;
16678 if (Ty->isSized()) {
16679 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
16680 NumBytes = NumBits / 8;
16681 if (!isPowerOf2_64(NumBits))
16682 NumBytes = 0;
16683 }
16684
16685 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
16686 AM.Scale);
16687}
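// Illustrative examples (not from the original source): for an i64 access,
// [x0, #4088] (reg + 8 * uimm12), [x0, x1] and [x0, x1, lsl #3] are all
// accepted, while combining a base register, a scaled register and an
// immediate offset in a single access is rejected above.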
16688
16689// Check whether the two offsets belong to the same imm24 range and share the
16690// same high 12 bits; if so, the high part can be encoded in the add's offset.
16691int64_t
16693 int64_t MaxOffset) const {
16694 int64_t HighPart = MinOffset & ~0xfffULL;
16695 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
16696 // Rebase the value to an integer multiple of imm12.
16697 return HighPart;
16698 }
16699
16700 return 0;
16701}
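// Illustrative example (not from the original source): with MinOffset = 0x1234
// and MaxOffset = 0x1678, both offsets share the high part 0x1000, which is a
// legal add immediate, so 0x1000 is returned and the remaining low 12 bits can
// be folded into the load/store addressing itself.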
16702
16704 // Consider splitting large offset of struct or array.
16705 return true;
16706}
16707
16709 const MachineFunction &MF, EVT VT) const {
16710 VT = VT.getScalarType();
16711
16712 if (!VT.isSimple())
16713 return false;
16714
16715 switch (VT.getSimpleVT().SimpleTy) {
16716 case MVT::f16:
16717 return Subtarget->hasFullFP16();
16718 case MVT::f32:
16719 case MVT::f64:
16720 return true;
16721 default:
16722 break;
16723 }
16724
16725 return false;
16726}
16727
16729 Type *Ty) const {
16730 switch (Ty->getScalarType()->getTypeID()) {
16731 case Type::FloatTyID:
16732 case Type::DoubleTyID:
16733 return true;
16734 default:
16735 return false;
16736 }
16737}
16738
16740 EVT VT, CodeGenOptLevel OptLevel) const {
16741 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
16743}
16744
16745const MCPhysReg *
16747 // LR is a callee-save register, but we must treat it as clobbered by any call
16748 // site. Hence we include LR in the scratch registers, which are in turn added
16749 // as implicit-defs for stackmaps and patchpoints.
16750 static const MCPhysReg ScratchRegs[] = {
16751 AArch64::X16, AArch64::X17, AArch64::LR, 0
16752 };
16753 return ScratchRegs;
16754}
16755
16757 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
16758 return RCRegs;
16759}
16760
16761bool
16763 CombineLevel Level) const {
16764 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
16765 N->getOpcode() == ISD::SRL) &&
16766 "Expected shift op");
16767
16768 SDValue ShiftLHS = N->getOperand(0);
16769 EVT VT = N->getValueType(0);
16770
16771 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
16772 // combine it with shift 'N' to let it be lowered to UBFX except:
16773 // ((x >> C) & mask) << C.
16774 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
16775 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
16776 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
16777 if (isMask_64(TruncMask)) {
16778 SDValue AndLHS = ShiftLHS.getOperand(0);
16779 if (AndLHS.getOpcode() == ISD::SRL) {
16780 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
16781 if (N->getOpcode() == ISD::SHL)
16782 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
16783 return SRLC->getZExtValue() == SHLC->getZExtValue();
16784 return false;
16785 }
16786 }
16787 }
16788 }
16789 return true;
16790}
16791
16793 const SDNode *N) const {
16794 assert(N->getOpcode() == ISD::XOR &&
16795 (N->getOperand(0).getOpcode() == ISD::SHL ||
16796 N->getOperand(0).getOpcode() == ISD::SRL) &&
16797 "Expected XOR(SHIFT) pattern");
16798
16799 // Only commute if the entire NOT mask is a hidden shifted mask.
16800 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
16801 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16802 if (XorC && ShiftC) {
16803 unsigned MaskIdx, MaskLen;
16804 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
16805 unsigned ShiftAmt = ShiftC->getZExtValue();
16806 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
16807 if (N->getOperand(0).getOpcode() == ISD::SHL)
16808 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
16809 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
16810 }
16811 }
16812
16813 return false;
16814}
16815
16817 const SDNode *N, CombineLevel Level) const {
16818 assert(((N->getOpcode() == ISD::SHL &&
16819 N->getOperand(0).getOpcode() == ISD::SRL) ||
16820 (N->getOpcode() == ISD::SRL &&
16821 N->getOperand(0).getOpcode() == ISD::SHL)) &&
16822 "Expected shift-shift mask");
16823 // Don't allow multiuse shift folding with the same shift amount.
16824 if (!N->getOperand(0)->hasOneUse())
16825 return false;
16826
16827 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
16828 EVT VT = N->getValueType(0);
16829 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
16830 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16831 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
16832 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
16833 }
16834
16835 return true;
16836}
16837
16839 unsigned BinOpcode, EVT VT) const {
16840 return VT.isScalableVector() && isTypeLegal(VT);
16841}
16842
16844 Type *Ty) const {
16845 assert(Ty->isIntegerTy());
16846
16847 unsigned BitSize = Ty->getPrimitiveSizeInBits();
16848 if (BitSize == 0)
16849 return false;
16850
16851 int64_t Val = Imm.getSExtValue();
16852 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
16853 return true;
16854
16855 if ((int64_t)Val < 0)
16856 Val = ~Val;
16857 if (BitSize == 32)
16858 Val &= (1LL << 32) - 1;
16859
16860 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
16861 // MOVZ is free so return true for one or fewer MOVK.
16862 return Shift < 3;
16863}
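// Illustrative examples (not from the original source): Imm = 0x12345678 has
// its top set bit at position 28, giving Shift = 28/16 = 1 (at most one MOVK
// after the MOVZ), so it is considered cheap; Imm = 0x1234567812345678 has its
// top bit at position 60, giving Shift = 3, and is not.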
16864
16866 unsigned Index) const {
16868 return false;
16869
16870 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
16871}
16872
16873/// Turn vector tests of the signbit in the form of:
16874/// xor (sra X, elt_size(X)-1), -1
16875/// into:
16876/// cmge X, X, #0
16878 const AArch64Subtarget *Subtarget) {
16879 EVT VT = N->getValueType(0);
16880 if (!Subtarget->hasNEON() || !VT.isVector())
16881 return SDValue();
16882
16883 // There must be a shift right algebraic before the xor, and the xor must be a
16884 // 'not' operation.
16885 SDValue Shift = N->getOperand(0);
16886 SDValue Ones = N->getOperand(1);
16887 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
16889 return SDValue();
16890
16891 // The shift should be smearing the sign bit across each vector element.
16892 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
16893 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
16894 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
16895 return SDValue();
16896
16897 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
16898}
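// Illustrative example (not from the original source): for v4i32,
// xor(AArch64ISD::VASHR(x, 31), splat(-1)) computes an "x >= 0" mask per lane,
// which is exactly the single compare CMGEz x emitted above.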
16899
16900// Given a vecreduce_add node, detect the below pattern and convert it to the
16901// node sequence with UABDL, [S|U]ABD and UADDLP.
16902//
16903// i32 vecreduce_add(
16904// v16i32 abs(
16905// v16i32 sub(
16906// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
16907// =================>
16908// i32 vecreduce_add(
16909// v4i32 UADDLP(
16910// v8i16 add(
16911// v8i16 zext(
16912// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
16913// v8i16 zext(
16914// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
16916 SelectionDAG &DAG) {
16917 // Assumed i32 vecreduce_add
16918 if (N->getValueType(0) != MVT::i32)
16919 return SDValue();
16920
16921 SDValue VecReduceOp0 = N->getOperand(0);
16922 unsigned Opcode = VecReduceOp0.getOpcode();
16923 // Assumed v16i32 abs
16924 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
16925 return SDValue();
16926
16927 SDValue ABS = VecReduceOp0;
16928 // Assumed v16i32 sub
16929 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
16930 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
16931 return SDValue();
16932
16933 SDValue SUB = ABS->getOperand(0);
16934 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
16935 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
16936 // Assumed v16i32 type
16937 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
16938 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
16939 return SDValue();
16940
16941 // Assumed zext or sext
16942 bool IsZExt = false;
16943 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
16944 IsZExt = true;
16945 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
16946 IsZExt = false;
16947 } else
16948 return SDValue();
16949
16950 SDValue EXT0 = SUB->getOperand(0);
16951 SDValue EXT1 = SUB->getOperand(1);
16952 // Assumed zext's operand has v16i8 type
16953 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
16954 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
16955 return SDValue();
16956
16957 // Pattern is detected. Let's convert it to a sequence of nodes.
16958 SDLoc DL(N);
16959
16960 // First, create the node pattern of UABD/SABD.
16961 SDValue UABDHigh8Op0 =
16962 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
16963 DAG.getConstant(8, DL, MVT::i64));
16964 SDValue UABDHigh8Op1 =
16965 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
16966 DAG.getConstant(8, DL, MVT::i64));
16967 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
16968 UABDHigh8Op0, UABDHigh8Op1);
16969 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
16970
16971 // Second, create the node pattern of UABAL.
16972 SDValue UABDLo8Op0 =
16973 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
16974 DAG.getConstant(0, DL, MVT::i64));
16975 SDValue UABDLo8Op1 =
16976 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
16977 DAG.getConstant(0, DL, MVT::i64));
16978 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
16979 UABDLo8Op0, UABDLo8Op1);
16980 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
16981 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
16982
16983 // Third, create the node of UADDLP.
16984 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
16985
16986 // Fourth, create the node of VECREDUCE_ADD.
16987 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
16988}
16989
16990// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
16991// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
16992// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
16993// If we have vectors larger than v16i8 we extract v16i8 vectors,
16994// follow the same steps above to get DOT instructions, concatenate them,
16995// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
16997 const AArch64Subtarget *ST) {
16998 if (!ST->hasDotProd())
17000
17001 SDValue Op0 = N->getOperand(0);
17002 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
17003 Op0.getValueType().getVectorElementType() != MVT::i32)
17004 return SDValue();
17005
17006 unsigned ExtOpcode = Op0.getOpcode();
17007 SDValue A = Op0;
17008 SDValue B;
17009 if (ExtOpcode == ISD::MUL) {
17010 A = Op0.getOperand(0);
17011 B = Op0.getOperand(1);
17012 if (A.getOpcode() != B.getOpcode() ||
17013 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
17014 return SDValue();
17015 ExtOpcode = A.getOpcode();
17016 }
17017 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
17018 return SDValue();
17019
17020 EVT Op0VT = A.getOperand(0).getValueType();
17021 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
17022 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
17023 if (!IsValidElementCount || !IsValidSize)
17024 return SDValue();
17025
17026 SDLoc DL(Op0);
17027 // For non-mla reductions B can be set to 1. For MLA we take the operand of
17028 // the extend B.
17029 if (!B)
17030 B = DAG.getConstant(1, DL, Op0VT);
17031 else
17032 B = B.getOperand(0);
17033
17034 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
17035 unsigned NumOfVecReduce;
17036 EVT TargetType;
17037 if (IsMultipleOf16) {
17038 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
17039 TargetType = MVT::v4i32;
17040 } else {
17041 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
17042 TargetType = MVT::v2i32;
17043 }
17044 auto DotOpcode =
17046 // Handle the case where we need to generate only one Dot operation.
17047 if (NumOfVecReduce == 1) {
17048 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
17049 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
17050 A.getOperand(0), B);
17051 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17052 }
17053 // Generate Dot instructions that are multiple of 16.
17054 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
17055 SmallVector<SDValue, 4> SDotVec16;
17056 unsigned I = 0;
17057 for (; I < VecReduce16Num; I += 1) {
17058 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
17059 SDValue Op0 =
17060 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
17061 DAG.getConstant(I * 16, DL, MVT::i64));
17062 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
17063 DAG.getConstant(I * 16, DL, MVT::i64));
17064 SDValue Dot =
17065 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
17066 SDotVec16.push_back(Dot);
17067 }
17068 // Concatenate dot operations.
17069 EVT SDot16EVT =
17070 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
17071 SDValue ConcatSDot16 =
17072 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
17073 SDValue VecReduceAdd16 =
17074 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
17075 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
17076 if (VecReduce8Num == 0)
17077 return VecReduceAdd16;
17078
17079 // Generate the remainder Dot operation that is multiple of 8.
17080 SmallVector<SDValue, 4> SDotVec8;
17081 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
17082 SDValue Vec8Op0 =
17083 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
17084 DAG.getConstant(I * 16, DL, MVT::i64));
17085 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
17086 DAG.getConstant(I * 16, DL, MVT::i64));
17087 SDValue Dot =
17088 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
17089 SDValue VecReduceAdd8 =
17090 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17091 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
17092 VecReduceAdd8);
17093}
17094
17095// Given an (integer) vecreduce, we know the order of the inputs does not
17096// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
17097// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
17098// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
17100 auto DetectAddExtract = [&](SDValue A) {
17101 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
17102 // UADDLP(x) if found.
17103 assert(A.getOpcode() == ISD::ADD);
17104 EVT VT = A.getValueType();
17105 SDValue Op0 = A.getOperand(0);
17106 SDValue Op1 = A.getOperand(1);
17107 if (Op0.getOpcode() != Op1.getOpcode() ||
17108 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
17109 Op0.getOpcode() != ISD::SIGN_EXTEND))
17110 return SDValue();
17111 SDValue Ext0 = Op0.getOperand(0);
17112 SDValue Ext1 = Op1.getOperand(0);
17113 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17115 Ext0.getOperand(0) != Ext1.getOperand(0))
17116 return SDValue();
17117 // Check that the source type has twice as many elements as the add's type,
17118 // and that the extracts are from the upper/lower halves of the same source.
17120 VT.getVectorNumElements() * 2)
17121 return SDValue();
17122 if ((Ext0.getConstantOperandVal(1) != 0 ||
17124 (Ext1.getConstantOperandVal(1) != 0 ||
17126 return SDValue();
17127 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
17129 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
17130 };
17131
17132 if (SDValue R = DetectAddExtract(A))
17133 return R;
17134
17135 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
17136 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
17137 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17138 A.getOperand(1));
17139 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
17140 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
17141 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17142 A.getOperand(0));
17143 return SDValue();
17144}
17145
17146// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
17147// UADDLV(concat), where the concat represents the 64-bit zext sources.
17149 // Look for add(zext(64-bit source), zext(64-bit source)), returning
17150 // UADDLV(concat(zext, zext)) if found.
17151 assert(A.getOpcode() == ISD::ADD);
17152 EVT VT = A.getValueType();
17153 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17154 return SDValue();
17155 SDValue Op0 = A.getOperand(0);
17156 SDValue Op1 = A.getOperand(1);
17157 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
17158 return SDValue();
17159 SDValue Ext0 = Op0.getOperand(0);
17160 SDValue Ext1 = Op1.getOperand(0);
17161 EVT ExtVT0 = Ext0.getValueType();
17162 EVT ExtVT1 = Ext1.getValueType();
17163 // Check zext VTs are the same and 64-bit length.
17164 if (ExtVT0 != ExtVT1 ||
17165 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
17166 return SDValue();
17167 // Get VT for concat of zext sources.
17168 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
17169 SDValue Concat =
17170 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
17171
17172 switch (VT.getSimpleVT().SimpleTy) {
17173 case MVT::v2i64:
17174 case MVT::v4i32:
17175 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
17176 case MVT::v8i16: {
17177 SDValue Uaddlv =
17178 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
17179 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
17180 }
17181 default:
17182 llvm_unreachable("Unhandled vector type");
17183 }
17184}
17185
17187 SDValue A = N->getOperand(0);
17188 if (A.getOpcode() == ISD::ADD) {
17189 if (SDValue R = performUADDVAddCombine(A, DAG))
17190 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
17191 else if (SDValue R = performUADDVZextCombine(A, DAG))
17192 return R;
17193 }
17194 return SDValue();
17195}
17196
17199 const AArch64Subtarget *Subtarget) {
17200 if (DCI.isBeforeLegalizeOps())
17201 return SDValue();
17202
17203 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
17204}
17205
17206SDValue
17207AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
17208 SelectionDAG &DAG,
17209 SmallVectorImpl<SDNode *> &Created) const {
17211 if (isIntDivCheap(N->getValueType(0), Attr))
17212 return SDValue(N,0); // Lower SDIV as SDIV
17213
17214 EVT VT = N->getValueType(0);
17215
17216 // For scalable and fixed types, mark them as cheap so we can handle it much
17217 // later. This allows us to handle larger than legal types.
17218 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17219 return SDValue(N, 0);
17220
17221 // fold (sdiv X, pow2)
17222 if ((VT != MVT::i32 && VT != MVT::i64) ||
17223 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17224 return SDValue();
17225
17226 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
17227}
17228
17229SDValue
17230AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
17231 SelectionDAG &DAG,
17232 SmallVectorImpl<SDNode *> &Created) const {
17234 if (isIntDivCheap(N->getValueType(0), Attr))
17235 return SDValue(N, 0); // Lower SREM as SREM
17236
17237 EVT VT = N->getValueType(0);
17238
17239 // For scalable and fixed types, mark them as cheap so we can handle it much
17240 // later. This allows us to handle larger than legal types.
17241 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17242 return SDValue(N, 0);
17243
17244 // fold (srem X, pow2)
17245 if ((VT != MVT::i32 && VT != MVT::i64) ||
17246 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17247 return SDValue();
17248
17249 unsigned Lg2 = Divisor.countr_zero();
17250 if (Lg2 == 0)
17251 return SDValue();
17252
17253 SDLoc DL(N);
17254 SDValue N0 = N->getOperand(0);
17255 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
17256 SDValue Zero = DAG.getConstant(0, DL, VT);
17257 SDValue CCVal, CSNeg;
17258 if (Lg2 == 1) {
17259 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
17260 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17261 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
17262
17263 Created.push_back(Cmp.getNode());
17264 Created.push_back(And.getNode());
17265 } else {
17266 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
17267 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17268
17269 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
17270 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17271 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
17272 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
17273 Negs.getValue(1));
17274
17275 Created.push_back(Negs.getNode());
17276 Created.push_back(AndPos.getNode());
17277 Created.push_back(AndNeg.getNode());
17278 }
17279
17280 return CSNeg;
17281}
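// Illustrative example (not from the original source): for (srem x, 4) the
// sequence above computes AndPos = x & 3 and AndNeg = (0 - x) & 3 (with SUBS
// also setting the flags), and CSNEG selects AndPos when x is positive and
// -AndNeg otherwise; e.g. x = -5 yields -(5 & 3) = -1, matching -5 srem 4.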
17282
17283static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
17284 switch(getIntrinsicID(S.getNode())) {
17285 default:
17286 break;
17287 case Intrinsic::aarch64_sve_cntb:
17288 return 8;
17289 case Intrinsic::aarch64_sve_cnth:
17290 return 16;
17291 case Intrinsic::aarch64_sve_cntw:
17292 return 32;
17293 case Intrinsic::aarch64_sve_cntd:
17294 return 64;
17295 }
17296 return {};
17297}
17298
17299/// Calculates what the pre-extend type is, based on the extension
17300/// operation node provided by \p Extend.
17301///
17302/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
17303/// pre-extend type is pulled directly from the operand, while other extend
17304/// operations need a bit more inspection to get this information.
17305///
17306/// \param Extend The SDNode from the DAG that represents the extend operation
17307///
17308/// \returns The type representing the \p Extend source type, or \p MVT::Other
17309/// if no valid type can be determined
17311 switch (Extend.getOpcode()) {
17312 case ISD::SIGN_EXTEND:
17313 case ISD::ZERO_EXTEND:
17314 return Extend.getOperand(0).getValueType();
17315 case ISD::AssertSext:
17316 case ISD::AssertZext:
17318 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
17319 if (!TypeNode)
17320 return MVT::Other;
17321 return TypeNode->getVT();
17322 }
17323 case ISD::AND: {
17325 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
17326 if (!Constant)
17327 return MVT::Other;
17328
17329 uint32_t Mask = Constant->getZExtValue();
17330
17331 if (Mask == UCHAR_MAX)
17332 return MVT::i8;
17333 else if (Mask == USHRT_MAX)
17334 return MVT::i16;
17335 else if (Mask == UINT_MAX)
17336 return MVT::i32;
17337
17338 return MVT::Other;
17339 }
17340 default:
17341 return MVT::Other;
17342 }
17343}
17344
17345/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
17346/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
17347/// SExt/ZExt rather than the scalar SExt/ZExt
17349 EVT VT = BV.getValueType();
17350 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
17352 return SDValue();
17353
17354 // Use the first item in the buildvector/shuffle to get the size of the
17355 // extend, and make sure it looks valid.
17356 SDValue Extend = BV->getOperand(0);
17357 unsigned ExtendOpcode = Extend.getOpcode();
17358 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
17359 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
17360 ExtendOpcode == ISD::AssertSext;
17361 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
17362 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
17363 return SDValue();
17364 // Shuffle inputs are vectors, so limit to SIGN_EXTEND and ZERO_EXTEND to ensure
17365 // calculatePreExtendType will work without issue.
17366 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
17367 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
17368 return SDValue();
17369
17370 // Restrict valid pre-extend data type
17371 EVT PreExtendType = calculatePreExtendType(Extend);
17372 if (PreExtendType == MVT::Other ||
17373 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
17374 return SDValue();
17375
17376 // Make sure all other operands are equally extended
17377 for (SDValue Op : drop_begin(BV->ops())) {
17378 if (Op.isUndef())
17379 continue;
17380 unsigned Opc = Op.getOpcode();
17381 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
17382 Opc == ISD::AssertSext;
17383 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
17384 return SDValue();
17385 }
17386
17387 SDValue NBV;
17388 SDLoc DL(BV);
17389 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
17390 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
17391 EVT PreExtendLegalType =
17392 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
17394 for (SDValue Op : BV->ops())
17395 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
17396 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
17397 PreExtendLegalType));
17398 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
17399 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
17400 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
17401 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
17402 BV.getOperand(1).isUndef()
17403 ? DAG.getUNDEF(PreExtendVT)
17404 : BV.getOperand(1).getOperand(0),
17405 cast<ShuffleVectorSDNode>(BV)->getMask());
17406 }
17407 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
17408}
17409
17410/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
17411/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
17413 // If the value type isn't a vector, none of the operands are going to be dups
17414 EVT VT = Mul->getValueType(0);
17415 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17416 return SDValue();
17417
17418 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
17419 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
17420
17421 // Neither operands have been changed, don't make any further changes
17422 if (!Op0 && !Op1)
17423 return SDValue();
17424
17425 SDLoc DL(Mul);
17426 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
17427 Op1 ? Op1 : Mul->getOperand(1));
17428}
17429
17430// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
17431// Same for other types with equivalent constants.
17433 EVT VT = N->getValueType(0);
17434 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
17435 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
17436 return SDValue();
17437 if (N->getOperand(0).getOpcode() != ISD::AND ||
17438 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
17439 return SDValue();
17440
17441 SDValue And = N->getOperand(0);
17442 SDValue Srl = And.getOperand(0);
17443
17444 APInt V1, V2, V3;
17445 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
17446 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
17448 return SDValue();
17449
17450 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
17451 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
17452 V3 != (HalfSize - 1))
17453 return SDValue();
17454
17455 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
17456 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
17457 VT.getVectorElementCount() * 2);
17458
17459 SDLoc DL(N);
17460 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
17461 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
17462 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
17463}
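// Illustrative example (not from the original source): on v4i32,
// And(Srl(X, 15), 0x10001) isolates the sign bit of each 16-bit half and the
// multiply by 0xffff smears it across that half, which is exactly what the
// v8i16 CMLTz (compare less than zero) of the same bits produces.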
17464
17467 const AArch64Subtarget *Subtarget) {
17468
17469 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
17470 return Ext;
17472 return Ext;
17473
17474 if (DCI.isBeforeLegalizeOps())
17475 return SDValue();
17476
17477 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
17478 // and in MachineCombiner pass, add+mul will be combined into madd.
17479 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
17480 SDLoc DL(N);
17481 EVT VT = N->getValueType(0);
17482 SDValue N0 = N->getOperand(0);
17483 SDValue N1 = N->getOperand(1);
17484 SDValue MulOper;
17485 unsigned AddSubOpc;
17486
17487 auto IsAddSubWith1 = [&](SDValue V) -> bool {
17488 AddSubOpc = V->getOpcode();
17489 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
17490 SDValue Opnd = V->getOperand(1);
17491 MulOper = V->getOperand(0);
17492 if (AddSubOpc == ISD::SUB)
17493 std::swap(Opnd, MulOper);
17494 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
17495 return C->isOne();
17496 }
17497 return false;
17498 };
17499
17500 if (IsAddSubWith1(N0)) {
17501 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
17502 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
17503 }
17504
17505 if (IsAddSubWith1(N1)) {
17506 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
17507 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
17508 }
17509
17510 // The below optimizations require a constant RHS.
17511 if (!isa<ConstantSDNode>(N1))
17512 return SDValue();
17513
17514 ConstantSDNode *C = cast<ConstantSDNode>(N1);
17515 const APInt &ConstValue = C->getAPIntValue();
17516
17517 // Allow the scaling to be folded into the `cnt` instruction by preventing
17518 // the scaling from being obscured here. This makes it easier to pattern match.
17519 if (IsSVECntIntrinsic(N0) ||
17520 (N0->getOpcode() == ISD::TRUNCATE &&
17521 (IsSVECntIntrinsic(N0->getOperand(0)))))
17522 if (ConstValue.sge(1) && ConstValue.sle(16))
17523 return SDValue();
17524
17525 // Multiplication of a power of two plus/minus one can be done more
17526 // cheaply as shift+add/sub. For now, this is true unilaterally. If
17527 // future CPUs have a cheaper MADD instruction, this may need to be
17528 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
17529 // 64-bit is 5 cycles, so this is always a win.
17530 // More aggressively, some multiplications N0 * C can be lowered to
17531 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
17532 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
17533 // TODO: lower more cases.
17534
17535 // TrailingZeroes is used to test if the mul can be lowered to
17536 // shift+add+shift.
17537 unsigned TrailingZeroes = ConstValue.countr_zero();
17538 if (TrailingZeroes) {
17539 // Conservatively do not lower to shift+add+shift if the mul might be
17540 // folded into smul or umul.
17541 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
17542 isZeroExtended(N0, DAG)))
17543 return SDValue();
17544 // Conservatively do not lower to shift+add+shift if the mul might be
17545 // folded into madd or msub.
17546 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
17547 N->use_begin()->getOpcode() == ISD::SUB))
17548 return SDValue();
17549 }
17550 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
17551 // and shift+add+shift.
17552 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
17553 unsigned ShiftAmt;
17554
17555 auto Shl = [&](SDValue N0, unsigned N1) {
17556 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
17557 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
17558 };
17559 auto Add = [&](SDValue N0, SDValue N1) {
17560 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
17561 };
17562 auto Sub = [&](SDValue N0, SDValue N1) {
17563 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
17564 };
17565 auto Negate = [&](SDValue N) {
17566 SDValue Zero = DAG.getConstant(0, DL, VT);
17567 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
17568 };
17569
17570 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), e.g.:
17571 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
17572 // the (2^N - 1) can't be executed via a single instruction.
17573 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
17574 unsigned BitWidth = C.getBitWidth();
17575 for (unsigned i = 1; i < BitWidth / 2; i++) {
17576 APInt Rem;
17577 APInt X(BitWidth, (1 << i) + 1);
17578 APInt::sdivrem(C, X, N, Rem);
17579 APInt NVMinus1 = N - 1;
17580 if (Rem == 0 && NVMinus1.isPowerOf2()) {
17581 M = X;
17582 return true;
17583 }
17584 }
17585 return false;
17586 };
17587
17588 // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), e.g.:
17589 // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1 as
17590 // the (2^N - 1) can't be executed via a single instruction.
17591 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
17592 APInt CVMinus1 = C - 1;
17593 if (CVMinus1.isNegative())
17594 return false;
17595 unsigned TrailingZeroes = CVMinus1.countr_zero();
17596 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
17597 if (SCVMinus1.isPowerOf2()) {
17598 unsigned BitWidth = SCVMinus1.getBitWidth();
17599 M = APInt(BitWidth, SCVMinus1.logBase2());
17600 N = APInt(BitWidth, TrailingZeroes);
17601 return true;
17602 }
17603 return false;
17604 };
17605
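  // Illustrative examples (not from the original source): C = 45 = (1+4)*(1+8)
  // gives MV = (x << 2) + x and the result (MV << 3) + MV = 45*x, while
  // C = 11 = (1+4)*2 + 1 gives MV = (x << 2) + x and the result
  // (MV << 1) + x = 11*x.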
17606 if (ConstValue.isNonNegative()) {
17607 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
17608 // (mul x, 2^N - 1) => (sub (shl x, N), x)
17609 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
17610 // (mul x, (2^M + 1) * (2^N + 1))
17611 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
17612 // (mul x, (2^M + 1) * 2^N + 1)
17613 // => MV = (add (shl x, M), x); (add (shl MV, N), x)
17614 APInt SCVMinus1 = ShiftedConstValue - 1;
17615 APInt SCVPlus1 = ShiftedConstValue + 1;
17616 APInt CVPlus1 = ConstValue + 1;
17617 APInt CVM, CVN;
17618 if (SCVMinus1.isPowerOf2()) {
17619 ShiftAmt = SCVMinus1.logBase2();
17620 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
17621 } else if (CVPlus1.isPowerOf2()) {
17622 ShiftAmt = CVPlus1.logBase2();
17623 return Sub(Shl(N0, ShiftAmt), N0);
17624 } else if (SCVPlus1.isPowerOf2()) {
17625 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17626 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
17627 }
17628 if (Subtarget->hasALULSLFast() &&
17629 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
17630 APInt CVMMinus1 = CVM - 1;
17631 APInt CVNMinus1 = CVN - 1;
17632 unsigned ShiftM1 = CVMMinus1.logBase2();
17633 unsigned ShiftN1 = CVNMinus1.logBase2();
17634 // ALULSLFast implies that shifts of up to 4 places are fast
17635 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
17636 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
17637 return Add(Shl(MVal, ShiftN1), MVal);
17638 }
17639 }
17640 if (Subtarget->hasALULSLFast() &&
17641 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
17642 unsigned ShiftM = CVM.getZExtValue();
17643 unsigned ShiftN = CVN.getZExtValue();
17644 // ALULSLFast implies that shifts of up to 4 places are fast
17645 if (ShiftM <= 4 && ShiftN <= 4) {
17646 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
17647 return Add(Shl(MVal, CVN.getZExtValue()), N0);
17648 }
17649 }
17650 } else {
17651 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
17652 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
17653 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
17654 APInt SCVPlus1 = -ShiftedConstValue + 1;
17655 APInt CVNegPlus1 = -ConstValue + 1;
17656 APInt CVNegMinus1 = -ConstValue - 1;
17657 if (CVNegPlus1.isPowerOf2()) {
17658 ShiftAmt = CVNegPlus1.logBase2();
17659 return Sub(N0, Shl(N0, ShiftAmt));
17660 } else if (CVNegMinus1.isPowerOf2()) {
17661 ShiftAmt = CVNegMinus1.logBase2();
17662 return Negate(Add(Shl(N0, ShiftAmt), N0));
17663 } else if (SCVPlus1.isPowerOf2()) {
17664 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17665 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
17666 }
17667 }
17668
17669 return SDValue();
17670}
17671
17673 SelectionDAG &DAG) {
17674 // Take advantage of vector comparisons producing 0 or -1 in each lane to
17675 // optimize away the operation when it is applied to a constant.
17676 //
17677 // The general transformation is:
17678 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
17679 // AND(VECTOR_CMP(x,y), constant2)
17680 // constant2 = UNARYOP(constant)
17681
17682 // Early exit if this isn't a vector operation, the operand of the
17683 // unary operation isn't a bitwise AND, or if the sizes of the operations
17684 // aren't the same.
17685 EVT VT = N->getValueType(0);
17686 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
17687 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
17688 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
17689 return SDValue();
17690
17691 // Now check that the other operand of the AND is a constant. We could
17692 // make the transformation for non-constant splats as well, but it's unclear
17693 // that would be a benefit as it would not eliminate any operations, just
17694 // perform one more step in scalar code before moving to the vector unit.
17695 if (BuildVectorSDNode *BV =
17696 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
17697 // Bail out if the vector isn't a constant.
17698 if (!BV->isConstant())
17699 return SDValue();
17700
17701 // Everything checks out. Build up the new and improved node.
17702 SDLoc DL(N);
17703 EVT IntVT = BV->getValueType(0);
17704 // Create a new constant of the appropriate type for the transformed
17705 // DAG.
17706 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
17707 // The AND node needs bitcasts to/from an integer vector type around it.
17708 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
17709 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
17710 N->getOperand(0)->getOperand(0), MaskConst);
17711 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
17712 return Res;
17713 }
17714
17715 return SDValue();
17716}
17717
17719 const AArch64Subtarget *Subtarget) {
17720 // First try to optimize away the conversion when it's conditionally from
17721 // a constant. Vectors only.
17723 return Res;
17724
17725 EVT VT = N->getValueType(0);
17726 if (VT != MVT::f32 && VT != MVT::f64)
17727 return SDValue();
17728
17729 // Only optimize when the source and destination types have the same width.
17730 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
17731 return SDValue();
17732
17733 // If the result of an integer load is only used by an integer-to-float
17734 // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
17735 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
17736 SDValue N0 = N->getOperand(0);
17737 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
17738 N0.hasOneUse() &&
17739 // Do not change the width of a volatile load.
17740 !cast<LoadSDNode>(N0)->isVolatile()) {
17741 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
17742 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
17743 LN0->getPointerInfo(), LN0->getAlign(),
17744 LN0->getMemOperand()->getFlags());
17745
17746 // Make sure successors of the original load stay after it by updating them
17747 // to use the new Chain.
17748 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
17749
17750 unsigned Opcode =
17752 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
17753 }
17754
17755 return SDValue();
17756}
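// Illustrative example (not from the original source): for
//   %l = load i32, ptr %p
//   %f = sitofp i32 %l to float
// the combine above loads the value directly into an FP register and emits a
// scalar SCVTF on it, avoiding a GPR-to-FPR transfer.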
17757
17758/// Fold a floating-point multiply by power of two into floating-point to
17759/// fixed-point conversion.
17762 const AArch64Subtarget *Subtarget) {
17763 if (!Subtarget->isNeonAvailable())
17764 return SDValue();
17765
17766 if (!N->getValueType(0).isSimple())
17767 return SDValue();
17768
17769 SDValue Op = N->getOperand(0);
17770 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
17771 return SDValue();
17772
17773 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
17774 return SDValue();
17775
17776 SDValue ConstVec = Op->getOperand(1);
17777 if (!isa<BuildVectorSDNode>(ConstVec))
17778 return SDValue();
17779
17780 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
17781 uint32_t FloatBits = FloatTy.getSizeInBits();
17782 if (FloatBits != 32 && FloatBits != 64 &&
17783 (FloatBits != 16 || !Subtarget->hasFullFP16()))
17784 return SDValue();
17785
17786 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
17787 uint32_t IntBits = IntTy.getSizeInBits();
17788 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17789 return SDValue();
17790
17791 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
17792 if (IntBits > FloatBits)
17793 return SDValue();
17794
17795 BitVector UndefElements;
17796 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17797 int32_t Bits = IntBits == 64 ? 64 : 32;
17798 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
17799 if (C == -1 || C == 0 || C > Bits)
17800 return SDValue();
17801
17802 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
17803 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
17804 return SDValue();
17805
17806 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
17807 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
17808 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
17809 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
17810 return SDValue();
17811 }
17812
17813 SDLoc DL(N);
17814 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
17815 N->getOpcode() == ISD::FP_TO_SINT_SAT);
17816 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
17817 : Intrinsic::aarch64_neon_vcvtfp2fxu;
17818 SDValue FixConv =
17820 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
17821 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
17822 // We can handle smaller integers by generating an extra trunc.
17823 if (IntBits < FloatBits)
17824 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
17825
17826 return FixConv;
17827}
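// Illustrative example (not from the original source): for v4f32,
// fp_to_sint(fmul x, splat(16.0)) is a conversion to a signed fixed-point
// number with 4 fractional bits, so it is lowered above to the
// aarch64_neon_vcvtfp2fxs intrinsic with a shift amount of 4.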
17828
17829/// Fold a floating-point divide by power of two into fixed-point to
17830/// floating-point conversion.
17833 const AArch64Subtarget *Subtarget) {
17834 if (!Subtarget->hasNEON())
17835 return SDValue();
17836
17837 SDValue Op = N->getOperand(0);
17838 unsigned Opc = Op->getOpcode();
17839 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
17840 !Op.getOperand(0).getValueType().isSimple() ||
17841 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
17842 return SDValue();
17843
17844 SDValue ConstVec = N->getOperand(1);
17845 if (!isa<BuildVectorSDNode>(ConstVec))
17846 return SDValue();
17847
17848 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17849 int32_t IntBits = IntTy.getSizeInBits();
17850 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17851 return SDValue();
17852
17853 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17854 int32_t FloatBits = FloatTy.getSizeInBits();
17855 if (FloatBits != 32 && FloatBits != 64)
17856 return SDValue();
17857
17858 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
17859 if (IntBits > FloatBits)
17860 return SDValue();
17861
17862 BitVector UndefElements;
17863 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17864 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
17865 if (C == -1 || C == 0 || C > FloatBits)
17866 return SDValue();
17867
17868 MVT ResTy;
17869 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17870 switch (NumLanes) {
17871 default:
17872 return SDValue();
17873 case 2:
17874 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
17875 break;
17876 case 4:
17877 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
17878 break;
17879 }
17880
17881 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
17882 return SDValue();
17883
17884 SDLoc DL(N);
17885 SDValue ConvInput = Op.getOperand(0);
17886 bool IsSigned = Opc == ISD::SINT_TO_FP;
17887 if (IntBits < FloatBits)
17888 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17889 ResTy, ConvInput);
17890
17891 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
17892 : Intrinsic::aarch64_neon_vcvtfxu2fp;
17893 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17894 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17895 DAG.getConstant(C, DL, MVT::i32));
17896}
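// Illustrative example (not from the original source): for v2f32,
// fdiv(sint_to_fp(x), splat(16.0)) is a signed fixed-point to floating-point
// conversion with 4 fractional bits, so it is lowered above to the
// aarch64_neon_vcvtfxs2fp intrinsic with a shift amount of 4.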
17897
17899 const AArch64TargetLowering &TLI) {
17900 EVT VT = N->getValueType(0);
17901 SelectionDAG &DAG = DCI.DAG;
17902 SDLoc DL(N);
17903 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17904
17905 if (!VT.isVector())
17906 return SDValue();
17907
17908 // The combining code works for NEON, SVE2 and SME.
17909 if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
17910 (VT.isScalableVector() && !Subtarget.hasSVE2()))
17911 return SDValue();
17912
17913 SDValue N0 = N->getOperand(0);
17914 if (N0.getOpcode() != ISD::AND)
17915 return SDValue();
17916
17917 SDValue N1 = N->getOperand(1);
17918 if (N1.getOpcode() != ISD::AND)
17919 return SDValue();
17920
17921 // InstCombine does (not (neg a)) => (add a -1).
17922 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
17923 // Loop over all combinations of AND operands.
17924 for (int i = 1; i >= 0; --i) {
17925 for (int j = 1; j >= 0; --j) {
17926 SDValue O0 = N0->getOperand(i);
17927 SDValue O1 = N1->getOperand(j);
17928 SDValue Sub, Add, SubSibling, AddSibling;
17929
17930 // Find a SUB and an ADD operand, one from each AND.
17931 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
17932 Sub = O0;
17933 Add = O1;
17934 SubSibling = N0->getOperand(1 - i);
17935 AddSibling = N1->getOperand(1 - j);
17936 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
17937 Add = O0;
17938 Sub = O1;
17939 AddSibling = N0->getOperand(1 - i);
17940 SubSibling = N1->getOperand(1 - j);
17941 } else
17942 continue;
17943
17945 continue;
17946
17947 // The all-ones constant is always the right-hand operand of the Add.
17948 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
17949 continue;
17950
17951 if (Sub.getOperand(1) != Add.getOperand(0))
17952 continue;
17953
17954 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
17955 }
17956 }
17957
17958 // (or (and a b) (and (not a) c)) => (bsl a b c)
17959 // We only have to look for constant vectors here since the general, variable
17960 // case can be handled in TableGen.
17961 unsigned Bits = VT.getScalarSizeInBits();
17962 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
17963 for (int i = 1; i >= 0; --i)
17964 for (int j = 1; j >= 0; --j) {
17965 APInt Val1, Val2;
17966
17967 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
17969 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
17970 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
17971 N0->getOperand(1 - i), N1->getOperand(1 - j));
17972 }
17973 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
17974 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
17975 if (!BVN0 || !BVN1)
17976 continue;
17977
17978 bool FoundMatch = true;
17979 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
17980 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
17981 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
17982 if (!CN0 || !CN1 ||
17983 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
17984 FoundMatch = false;
17985 break;
17986 }
17987 }
17988 if (FoundMatch)
17989 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
17990 N0->getOperand(1 - i), N1->getOperand(1 - j));
17991 }
17992
17993 return SDValue();
17994}
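// Illustrative example (not from the original source): for v8i16 with the
// complementary constant masks 0x00ff and 0xff00,
// or(and(x, splat(0x00ff)), and(y, splat(0xff00))) takes the low byte of each
// lane from x and the high byte from y, i.e. BSP(splat(0x00ff), x, y).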
17995
17996// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
17997// convert to csel(ccmp(.., cc0)), depending on cc1:
17998
17999// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18000// =>
18001// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
18002//
18003// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18004// =>
18005// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
18007 EVT VT = N->getValueType(0);
18008 SDValue CSel0 = N->getOperand(0);
18009 SDValue CSel1 = N->getOperand(1);
18010
18011 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
18012 CSel1.getOpcode() != AArch64ISD::CSEL)
18013 return SDValue();
18014
18015 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
18016 return SDValue();
18017
18018 if (!isNullConstant(CSel0.getOperand(0)) ||
18019 !isOneConstant(CSel0.getOperand(1)) ||
18020 !isNullConstant(CSel1.getOperand(0)) ||
18021 !isOneConstant(CSel1.getOperand(1)))
18022 return SDValue();
18023
18024 SDValue Cmp0 = CSel0.getOperand(3);
18025 SDValue Cmp1 = CSel1.getOperand(3);
18028 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
18029 return SDValue();
18030 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
18031 Cmp0.getOpcode() == AArch64ISD::SUBS) {
18032 std::swap(Cmp0, Cmp1);
18033 std::swap(CC0, CC1);
18034 }
18035
18036 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
18037 return SDValue();
18038
18039 SDLoc DL(N);
18040 SDValue CCmp, Condition;
18041 unsigned NZCV;
18042
18043 if (N->getOpcode() == ISD::AND) {
18045 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
18047 } else {
18049 Condition = DAG.getConstant(CC0, DL, MVT_CC);
18051 }
18052
18053 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
18054
18055 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
18056 if (Op1 && Op1->getAPIntValue().isNegative() &&
18057 Op1->getAPIntValue().sgt(-32)) {
18058 // CCMP accepts constants in the range [0, 31].
18059 // If Op1 is a constant in the range [-31, -1], we
18060 // can select CCMN instead to avoid the extra mov.
18061 SDValue AbsOp1 =
18062 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
18063 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
18064 NZCVOp, Condition, Cmp0);
18065 } else {
18066 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
18067 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
18068 }
18069 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
18070 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
18071 CCmp);
18072}
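// Illustrative example (not from the original source): if the second compare
// is SUBS(x1, -5), the constant lies in [-31, -1], so the code above emits
// CCMN x1, #5 rather than materializing -5 in a register for a CCMP.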
18073
18075 const AArch64Subtarget *Subtarget,
18076 const AArch64TargetLowering &TLI) {
18077 SelectionDAG &DAG = DCI.DAG;
18078 EVT VT = N->getValueType(0);
18079
18080 if (SDValue R = performANDORCSELCombine(N, DAG))
18081 return R;
18082
18083 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18084 return SDValue();
18085
18086 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
18087 return Res;
18088
18089 return SDValue();
18090}
18091
18093 if (!MemVT.getVectorElementType().isSimple())
18094 return false;
18095
18096 uint64_t MaskForTy = 0ull;
18097 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
18098 case MVT::i8:
18099 MaskForTy = 0xffull;
18100 break;
18101 case MVT::i16:
18102 MaskForTy = 0xffffull;
18103 break;
18104 case MVT::i32:
18105 MaskForTy = 0xffffffffull;
18106 break;
18107 default:
18108 return false;
18109 break;
18110 }
18111
18112 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
18113 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
18114 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
18115
18116 return false;
18117}
18118
18120 SDValue LeafOp = SDValue(N, 0);
18121 SDValue Op = N->getOperand(0);
18122 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
18123 LeafOp.getValueType() != Op.getValueType())
18124 Op = Op->getOperand(0);
18125 if (LeafOp.getValueType() == Op.getValueType())
18126 return Op;
18127 return SDValue();
18128}
18129
18132 SelectionDAG &DAG = DCI.DAG;
18133 SDValue Src = N->getOperand(0);
18134 unsigned Opc = Src->getOpcode();
18135
18136 // Zero/any extend of an unsigned unpack
18137 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
18138 SDValue UnpkOp = Src->getOperand(0);
18139 SDValue Dup = N->getOperand(1);
18140
18141 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
18142 return SDValue();
18143
18144 SDLoc DL(N);
18145 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
18146 if (!C)
18147 return SDValue();
18148
18149 uint64_t ExtVal = C->getZExtValue();
18150
18151 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
18152 return ((ExtVal == 0xFF && VT == MVT::i8) ||
18153 (ExtVal == 0xFFFF && VT == MVT::i16) ||
18154 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
18155 };
18156
18157 // If the mask is fully covered by the unpack, we don't need to push
18158 // a new AND onto the operand
18159 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
18160 if (MaskAndTypeMatch(EltTy))
18161 return Src;
18162
18163 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
18164 // to see if the mask is all-ones of size MemTy.
18165 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
18166 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
18167 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
18168 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
18169 if (MaskAndTypeMatch(EltTy))
18170 return Src;
18171 }
18172
18173 // Truncate to prevent a DUP with an over-wide constant.
18174 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
18175
18176 // Otherwise, make sure we propagate the AND to the operand
18177 // of the unpack
18178 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
18179 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
18180
18181 SDValue And = DAG.getNode(ISD::AND, DL,
18182 UnpkOp->getValueType(0), UnpkOp, Dup);
18183
18184 return DAG.getNode(Opc, DL, N->getValueType(0), And);
18185 }
18186
18187 if (DCI.isBeforeLegalizeOps())
18188 return SDValue();
18189
18190 // If both sides of AND operations are i1 splat_vectors then
18191 // we can produce just i1 splat_vector as the result.
18192 if (isAllActivePredicate(DAG, N->getOperand(0)))
18193 return N->getOperand(1);
18194 if (isAllActivePredicate(DAG, N->getOperand(1)))
18195 return N->getOperand(0);
18196
18198 return SDValue();
18199
18200 SDValue Mask = N->getOperand(1);
18201
18202 if (!Src.hasOneUse())
18203 return SDValue();
18204
18205 EVT MemVT;
18206
18207 // SVE load instructions perform an implicit zero-extend, which makes them
18208 // perfect candidates for combining.
18209 switch (Opc) {
18213 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
18214 break;
18230 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
18231 break;
18232 default:
18233 return SDValue();
18234 }
18235
18236 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
18237 return Src;
18238
18239 return SDValue();
18240}
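// Illustrative example (not from the original source): for
// and(uunpklo(x : nxv16i8 -> nxv8i16), splat(0xff)), the unpack already
// zero-extends each lane, so the mask is redundant and the unpack is returned
// unchanged above.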
18241
18242// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
18245
18246 // This function performs an optimization on a specific pattern involving
18247 // an AND operation and SETCC (Set Condition Code) node.
18248
18249 SDValue SetCC = N->getOperand(0);
18250 EVT VT = N->getValueType(0);
18251 SelectionDAG &DAG = DCI.DAG;
18252
18253 // If the current node (N) is used by any SELECT instruction, return an
18254 // empty SDValue and skip the optimization, since applying it there would
18255 // produce incorrect results.
18256 for (auto U : N->uses())
18257 if (U->getOpcode() == ISD::SELECT)
18258 return SDValue();
18259
18260 // Check if the operand is a SETCC node with floating-point comparison
18261 if (SetCC.getOpcode() == ISD::SETCC &&
18262 SetCC.getOperand(0).getValueType() == MVT::f32) {
18263
18264 SDValue Cmp;
18265     AArch64CC::CondCode CC;
18266
18267 // Check if the DAG is after legalization and if we can emit the conjunction
18268 if (!DCI.isBeforeLegalize() &&
18269 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
18270
18271       AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(CC);
18272
18273 SDLoc DL(N);
18274 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
18275 DAG.getConstant(0, DL, VT),
18276 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
18277 }
18278 }
18279 return SDValue();
18280}
18281
18282 static SDValue performANDCombine(SDNode *N,
18283                                  TargetLowering::DAGCombinerInfo &DCI) {
18284 SelectionDAG &DAG = DCI.DAG;
18285 SDValue LHS = N->getOperand(0);
18286 SDValue RHS = N->getOperand(1);
18287 EVT VT = N->getValueType(0);
18288
18289 if (SDValue R = performANDORCSELCombine(N, DAG))
18290 return R;
18291
18292 if (SDValue R = performANDSETCCCombine(N,DCI))
18293 return R;
18294
18295 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18296 return SDValue();
18297
18298 if (VT.isScalableVector())
18299 return performSVEAndCombine(N, DCI);
18300
18301 // The combining code below works only for NEON vectors. In particular, it
18302 // does not work for SVE when dealing with vectors wider than 128 bits.
18303 if (!VT.is64BitVector() && !VT.is128BitVector())
18304 return SDValue();
18305
18306 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
18307 if (!BVN)
18308 return SDValue();
18309
18310 // AND does not accept an immediate, so check if we can use a BIC immediate
18311 // instruction instead. We do this here instead of using a (and x, (mvni imm))
18312 // pattern in isel, because some immediates may be lowered to the preferred
18313 // (and x, (movi imm)) form, even though an mvni representation also exists.
18314 APInt DefBits(VT.getSizeInBits(), 0);
18315 APInt UndefBits(VT.getSizeInBits(), 0);
18316 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
18317 SDValue NewOp;
18318
18319 // Any bits known to already be 0 need not be cleared again, which can help
18320 // reduce the size of the immediate to one supported by the instruction.
18321 KnownBits Known = DAG.computeKnownBits(LHS);
18322 APInt ZeroSplat(VT.getSizeInBits(), 0);
18323 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
18324 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
18325 << (Known.Zero.getBitWidth() * I);
18326
18327 DefBits = ~(DefBits | ZeroSplat);
18328 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18329 DefBits, &LHS)) ||
18330 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18331 DefBits, &LHS)))
18332 return NewOp;
18333
18334 UndefBits = ~(UndefBits | ZeroSplat);
18335 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18336 UndefBits, &LHS)) ||
18337 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18338 UndefBits, &LHS)))
18339 return NewOp;
18340 }
18341
18342 return SDValue();
18343}
18344
18345 static SDValue performFADDCombine(SDNode *N,
18346                                   TargetLowering::DAGCombinerInfo &DCI) {
18347 SelectionDAG &DAG = DCI.DAG;
18348 SDValue LHS = N->getOperand(0);
18349 SDValue RHS = N->getOperand(1);
18350 EVT VT = N->getValueType(0);
18351 SDLoc DL(N);
18352
18353 if (!N->getFlags().hasAllowReassociation())
18354 return SDValue();
18355
18356   // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
18357 auto ReassocComplex = [&](SDValue A, SDValue B) {
18358 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
18359 return SDValue();
18360 unsigned Opc = A.getConstantOperandVal(0);
18361 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
18362 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
18363 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
18364 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
18365 return SDValue();
18366 SDValue VCMLA = DAG.getNode(
18367 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
18368 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
18369 A.getOperand(2), A.getOperand(3));
18370 VCMLA->setFlags(A->getFlags());
18371 return VCMLA;
18372 };
18373 if (SDValue R = ReassocComplex(LHS, RHS))
18374 return R;
18375 if (SDValue R = ReassocComplex(RHS, LHS))
18376 return R;
18377
18378 return SDValue();
18379}
18380
18381static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
18382 switch (Opcode) {
18383 case ISD::STRICT_FADD:
18384 case ISD::FADD:
18385 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
18386 case ISD::ADD:
18387 return VT == MVT::i64;
18388 default:
18389 return false;
18390 }
18391}
18392
18393static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
18394                         AArch64CC::CondCode Cond);
18395
18396 static bool isPredicateCCSettingOp(SDValue N) {
18397 if ((N.getOpcode() == ISD::SETCC) ||
18398 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18399 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
18400 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
18401 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
18402 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
18403 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
18404 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
18405 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
18406 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
18407 // get_active_lane_mask is lowered to a whilelo instruction.
18408 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
18409 return true;
18410
18411 return false;
18412}
18413
18414// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
18415// ... into: "ptrue p, all" + PTEST
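// i.e. the lane-0 value is read from the NZCV flags set by PTEST rather than
// by moving a predicate element into a general-purpose register.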
18416static SDValue
18417 performFirstTrueTestVectorCombine(SDNode *N,
18418                                   TargetLowering::DAGCombinerInfo &DCI,
18419                                   const AArch64Subtarget *Subtarget) {
18420 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18421 // Make sure PTEST can be legalised with illegal types.
18422 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18423 return SDValue();
18424
18425 SDValue N0 = N->getOperand(0);
18426 EVT VT = N0.getValueType();
18427
18428 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
18429 !isNullConstant(N->getOperand(1)))
18430 return SDValue();
18431
18432   // Restrict the DAG combine to cases where we're extracting from a
18433   // flag-setting operation.
18434 if (!isPredicateCCSettingOp(N0))
18435 return SDValue();
18436
18437 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
18438 SelectionDAG &DAG = DCI.DAG;
18439 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18440 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
18441}
18442
18443// Materialize : Idx = (add (mul vscale, NumEls), -1)
18444// i1 = extract_vector_elt t37, Constant:i64<Idx>
18445// ... into: "ptrue p, all" + PTEST
18446static SDValue
18447 performLastTrueTestVectorCombine(SDNode *N,
18448                                  TargetLowering::DAGCombinerInfo &DCI,
18449                                  const AArch64Subtarget *Subtarget) {
18450 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18451   // Make sure PTEST is legal for these types.
18452 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18453 return SDValue();
18454
18455 SDValue N0 = N->getOperand(0);
18456 EVT OpVT = N0.getValueType();
18457
18458 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18459 return SDValue();
18460
18461 // Idx == (add (mul vscale, NumEls), -1)
18462 SDValue Idx = N->getOperand(1);
18463 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
18464 return SDValue();
18465
18466 SDValue VS = Idx.getOperand(0);
18467 if (VS.getOpcode() != ISD::VSCALE)
18468 return SDValue();
18469
18470 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
18471 if (VS.getConstantOperandVal(0) != NumEls)
18472 return SDValue();
18473
18474 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
18475 SelectionDAG &DAG = DCI.DAG;
18476 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
18477 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
18478}
18479
18480static SDValue
18481 performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18482                                const AArch64Subtarget *Subtarget) {
18483 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18484 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
18485 return Res;
18486 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
18487 return Res;
18488
18489 SelectionDAG &DAG = DCI.DAG;
18490 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18491
18492 EVT VT = N->getValueType(0);
18493 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
18494 bool IsStrict = N0->isStrictFPOpcode();
18495
18496 // extract(dup x) -> x
18497 if (N0.getOpcode() == AArch64ISD::DUP)
18498 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
18499 : N0.getOperand(0);
18500
18501 // Rewrite for pairwise fadd pattern
18502 // (f32 (extract_vector_elt
18503 // (fadd (vXf32 Other)
18504 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
18505 // ->
18506 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
18507 // (extract_vector_elt (vXf32 Other) 1))
18508 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
18509 // we can only do this when it's used only by the extract_vector_elt.
18510 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
18511 (!IsStrict || N0.hasOneUse())) {
18512 SDLoc DL(N0);
18513 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
18514 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
18515
18516 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
18517 SDValue Other = N00;
18518
18519 // And handle the commutative case.
18520 if (!Shuffle) {
18521 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
18522 Other = N01;
18523 }
18524
18525 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
18526 Other == Shuffle->getOperand(0)) {
18527 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18528 DAG.getConstant(0, DL, MVT::i64));
18529 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18530 DAG.getConstant(1, DL, MVT::i64));
18531 if (!IsStrict)
18532 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
18533
18534 // For strict_fadd we need uses of the final extract_vector to be replaced
18535 // with the strict_fadd, but we also need uses of the chain output of the
18536 // original strict_fadd to use the chain output of the new strict_fadd as
18537 // otherwise it may not be deleted.
18538 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
18539 {VT, MVT::Other},
18540 {N0->getOperand(0), Extract1, Extract2});
18541 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
18542 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
18543 return SDValue(N, 0);
18544 }
18545 }
18546
18547 return SDValue();
18548}
18549
18550 static SDValue performConcatVectorsCombine(SDNode *N,
18551                                            TargetLowering::DAGCombinerInfo &DCI,
18552                                            SelectionDAG &DAG) {
18553 SDLoc dl(N);
18554 EVT VT = N->getValueType(0);
18555 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18556 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
18557
18558 if (VT.isScalableVector())
18559 return SDValue();
18560
18561 // Optimize concat_vectors of truncated vectors, where the intermediate
18562 // type is illegal, to avoid said illegality, e.g.,
18563 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
18564 // (v2i16 (truncate (v2i64)))))
18565 // ->
18566 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
18567 // (v4i32 (bitcast (v2i64))),
18568 // <0, 2, 4, 6>)))
18569 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
18570 // on both input and result type, so we might generate worse code.
18571 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
18572 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18573 N1Opc == ISD::TRUNCATE) {
18574 SDValue N00 = N0->getOperand(0);
18575 SDValue N10 = N1->getOperand(0);
18576 EVT N00VT = N00.getValueType();
18577
18578 if (N00VT == N10.getValueType() &&
18579 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
18580 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
18581 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
18582       SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
18583       for (size_t i = 0; i < Mask.size(); ++i)
18584 Mask[i] = i * 2;
18585 return DAG.getNode(ISD::TRUNCATE, dl, VT,
18586 DAG.getVectorShuffle(
18587 MidVT, dl,
18588 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
18589 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
18590 }
18591 }
18592
18593 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
18594 N->getOperand(0).getValueType() == MVT::v2i16 ||
18595 N->getOperand(0).getValueType() == MVT::v2i8) {
18596 EVT SrcVT = N->getOperand(0).getValueType();
18597 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
18598 // loads to prevent having to go through the v4i8 load legalization that
18599 // needs to extend each element into a larger type.
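    // e.g. concat(v4i8 load A, v4i8 load B) becomes
    // bitcast(build_vector(f32 load A, f32 load B)), keeping each 32-bit
    // access as a single scalar load.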
18600 if (N->getNumOperands() % 2 == 0 &&
18601 all_of(N->op_values(), [SrcVT](SDValue V) {
18602 if (V.getValueType() != SrcVT)
18603 return false;
18604 if (V.isUndef())
18605 return true;
18606 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
18607 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
18608 LD->getExtensionType() == ISD::NON_EXTLOAD;
18609 })) {
18610 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
18611 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
18612       SmallVector<SDValue> Ops;
18613
18614 for (unsigned i = 0; i < N->getNumOperands(); i++) {
18615 SDValue V = N->getOperand(i);
18616 if (V.isUndef())
18617 Ops.push_back(DAG.getUNDEF(FVT));
18618 else {
18619 LoadSDNode *LD = cast<LoadSDNode>(V);
18620 SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
18621 LD->getBasePtr(), LD->getMemOperand());
18622 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
18623 Ops.push_back(NewLoad);
18624 }
18625 }
18626 return DAG.getBitcast(N->getValueType(0),
18627 DAG.getBuildVector(NVT, dl, Ops));
18628 }
18629 }
18630
18631 // Canonicalise concat_vectors to replace concatenations of truncated nots
18632 // with nots of concatenated truncates. This in some cases allows for multiple
18633 // redundant negations to be eliminated.
18634 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
18635 // (v4i16 (truncate (not (v4i32)))))
18636 // ->
18637 // (not (concat_vectors (v4i16 (truncate (v4i32))),
18638 // (v4i16 (truncate (v4i32)))))
18639 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18640 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
18641 N->isOnlyUserOf(N1.getNode())) {
18642 auto isBitwiseVectorNegate = [](SDValue V) {
18643 return V->getOpcode() == ISD::XOR &&
18644 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
18645 };
18646 SDValue N00 = N0->getOperand(0);
18647 SDValue N10 = N1->getOperand(0);
18648 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
18649 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
18650 return DAG.getNOT(
18651 dl,
18652 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18653 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
18654 N00->getOperand(0)),
18655 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
18656 N10->getOperand(0))),
18657 VT);
18658 }
18659 }
18660
18661 // Wait till after everything is legalized to try this. That way we have
18662 // legal vector types and such.
18663 if (DCI.isBeforeLegalizeOps())
18664 return SDValue();
18665
18666   // Optimise concat_vectors of two identical binops with a 128-bit destination
18667   // size, combine into a binop of two concats of the source vectors. e.g:
18668 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
18669 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
18670 DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
18671 N1->hasOneUse()) {
18672 SDValue N00 = N0->getOperand(0);
18673 SDValue N01 = N0->getOperand(1);
18674 SDValue N10 = N1->getOperand(0);
18675 SDValue N11 = N1->getOperand(1);
18676
18677 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
18678 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
18679 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
18680 return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
18681 }
18682 }
18683
18684 auto IsRSHRN = [](SDValue Shr) {
18685 if (Shr.getOpcode() != AArch64ISD::VLSHR)
18686 return false;
18687 SDValue Op = Shr.getOperand(0);
18688 EVT VT = Op.getValueType();
18689 unsigned ShtAmt = Shr.getConstantOperandVal(1);
18690 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
18691 return false;
18692
18693 APInt Imm;
18694 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
18695 Imm = APInt(VT.getScalarSizeInBits(),
18696 Op.getOperand(1).getConstantOperandVal(0)
18697 << Op.getOperand(1).getConstantOperandVal(1));
18698 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
18699 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
18700 Imm = APInt(VT.getScalarSizeInBits(),
18701 Op.getOperand(1).getConstantOperandVal(0));
18702 else
18703 return false;
18704
18705 if (Imm != 1ULL << (ShtAmt - 1))
18706 return false;
18707 return true;
18708 };
18709
18710 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
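  // IsRSHRN above matched vlshr(add(x, 1 << (ShtAmt - 1)), ShtAmt), i.e. a
  // rounding narrowing shift, so the rounding add and the shift can be done
  // once on the concatenated vector.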
18711 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
18712 ((IsRSHRN(N1) &&
18713         N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
18714        N1.isUndef())) {
18715 SDValue X = N0.getOperand(0).getOperand(0);
18716 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
18717 : N1.getOperand(0).getOperand(0);
18718 EVT BVT =
18719 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
18720 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
18721 SDValue Add = DAG.getNode(
18722 ISD::ADD, dl, BVT, CC,
18723 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
18724 SDValue Shr =
18725 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
18726 return Shr;
18727 }
18728
18729 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
18730 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
18731 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
18732 N0.getOperand(1) == N1.getOperand(1)) {
18733 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
18734 DAG.getUNDEF(N0.getValueType()));
18735 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
18736 DAG.getUNDEF(N0.getValueType()));
18737 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
18738 }
18739
18740 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
18741 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
18742 // canonicalise to that.
18743 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
18744 assert(VT.getScalarSizeInBits() == 64);
18745 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
18746 DAG.getConstant(0, dl, MVT::i64));
18747 }
18748
18749 // Canonicalise concat_vectors so that the right-hand vector has as few
18750 // bit-casts as possible before its real operation. The primary matching
18751 // destination for these operations will be the narrowing "2" instructions,
18752 // which depend on the operation being performed on this right-hand vector.
18753 // For example,
18754 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
18755 // becomes
18756 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
18757
18758 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
18759 return SDValue();
18760 SDValue RHS = N1->getOperand(0);
18761 MVT RHSTy = RHS.getValueType().getSimpleVT();
18762 // If the RHS is not a vector, this is not the pattern we're looking for.
18763 if (!RHSTy.isVector())
18764 return SDValue();
18765
18766 LLVM_DEBUG(
18767 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
18768
18769 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
18770 RHSTy.getVectorNumElements() * 2);
18771 return DAG.getNode(ISD::BITCAST, dl, VT,
18772 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
18773 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
18774 RHS));
18775}
18776
18777static SDValue
18778 performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18779                                SelectionDAG &DAG) {
18780 if (DCI.isBeforeLegalizeOps())
18781 return SDValue();
18782
18783 EVT VT = N->getValueType(0);
18784 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
18785 return SDValue();
18786
18787 SDValue V = N->getOperand(0);
18788
18789 // NOTE: This combine exists in DAGCombiner, but that version's legality check
18790 // blocks this combine because the non-const case requires custom lowering.
18791 //
18792 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
18793 if (V.getOpcode() == ISD::SPLAT_VECTOR)
18794 if (isa<ConstantSDNode>(V.getOperand(0)))
18795 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
18796
18797 return SDValue();
18798}
18799
18800static SDValue
18801 performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18802                               SelectionDAG &DAG) {
18803 SDLoc DL(N);
18804 SDValue Vec = N->getOperand(0);
18805 SDValue SubVec = N->getOperand(1);
18806 uint64_t IdxVal = N->getConstantOperandVal(2);
18807 EVT VecVT = Vec.getValueType();
18808 EVT SubVT = SubVec.getValueType();
18809
18810 // Only do this for legal fixed vector types.
18811 if (!VecVT.isFixedLengthVector() ||
18812 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
18813 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
18814 return SDValue();
18815
18816 // Ignore widening patterns.
18817 if (IdxVal == 0 && Vec.isUndef())
18818 return SDValue();
18819
18820 // Subvector must be half the width and an "aligned" insertion.
18821 unsigned NumSubElts = SubVT.getVectorNumElements();
18822 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
18823 (IdxVal != 0 && IdxVal != NumSubElts))
18824 return SDValue();
18825
18826 // Fold insert_subvector -> concat_vectors
18827 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
18828 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
18829 SDValue Lo, Hi;
18830 if (IdxVal == 0) {
18831 Lo = SubVec;
18832 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18833 DAG.getVectorIdxConstant(NumSubElts, DL));
18834 } else {
18835 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18836 DAG.getVectorIdxConstant(0, DL));
18837 Hi = SubVec;
18838 }
18839 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
18840}
18841
18842 static SDValue tryCombineFixedPointConvert(SDNode *N,
18843                                            TargetLowering::DAGCombinerInfo &DCI,
18844                                            SelectionDAG &DAG) {
18845 // Wait until after everything is legalized to try this. That way we have
18846 // legal vector types and such.
18847 if (DCI.isBeforeLegalizeOps())
18848 return SDValue();
18849 // Transform a scalar conversion of a value from a lane extract into a
18850 // lane extract of a vector conversion. E.g., from foo1 to foo2:
18851 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
18852 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
18853 //
18854 // The second form interacts better with instruction selection and the
18855 // register allocator to avoid cross-class register copies that aren't
18856 // coalescable due to a lane reference.
18857
18858 // Check the operand and see if it originates from a lane extract.
18859 SDValue Op1 = N->getOperand(1);
18860   if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
18861     return SDValue();
18862
18863 // Yep, no additional predication needed. Perform the transform.
18864 SDValue IID = N->getOperand(0);
18865 SDValue Shift = N->getOperand(2);
18866 SDValue Vec = Op1.getOperand(0);
18867 SDValue Lane = Op1.getOperand(1);
18868 EVT ResTy = N->getValueType(0);
18869 EVT VecResTy;
18870 SDLoc DL(N);
18871
18872 // The vector width should be 128 bits by the time we get here, even
18873 // if it started as 64 bits (the extract_vector handling will have
18874 // done so). Bail if it is not.
18875 if (Vec.getValueSizeInBits() != 128)
18876 return SDValue();
18877
18878 if (Vec.getValueType() == MVT::v4i32)
18879 VecResTy = MVT::v4f32;
18880 else if (Vec.getValueType() == MVT::v2i64)
18881 VecResTy = MVT::v2f64;
18882 else
18883 return SDValue();
18884
18885 SDValue Convert =
18886 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
18887 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
18888}
18889
18890// AArch64 high-vector "long" operations are formed by performing the non-high
18891// version on an extract_subvector of each operand which gets the high half:
18892//
18893// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
18894//
18895// However, there are cases which don't have an extract_high explicitly, but
18896// have another operation that can be made compatible with one for free. For
18897// example:
18898//
18899// (dupv64 scalar) --> (extract_high (dup128 scalar))
18900//
18901// This routine does the actual conversion of such DUPs, once outer routines
18902// have determined that everything else is in order.
18903// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
18904// similarly here.
18905 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
18906   MVT VT = N.getSimpleValueType();
18907 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18908 N.getConstantOperandVal(1) == 0)
18909 N = N.getOperand(0);
18910
18911 switch (N.getOpcode()) {
18912 case AArch64ISD::DUP:
18917 case AArch64ISD::MOVI:
18923 break;
18924 default:
18925 // FMOV could be supported, but isn't very useful, as it would only occur
18926   // if you passed a bitcast floating-point immediate to an eligible long
18927 // integer op (addl, smull, ...).
18928 return SDValue();
18929 }
18930
18931 if (!VT.is64BitVector())
18932 return SDValue();
18933
18934 SDLoc DL(N);
18935 unsigned NumElems = VT.getVectorNumElements();
18936 if (N.getValueType().is64BitVector()) {
18937 MVT ElementTy = VT.getVectorElementType();
18938 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
18939 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
18940 }
18941
18942 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
18943 DAG.getConstant(NumElems, DL, MVT::i64));
18944}
18945
18946 static bool isEssentiallyExtractHighSubvector(SDValue N) {
18947   if (N.getOpcode() == ISD::BITCAST)
18948 N = N.getOperand(0);
18949 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
18950 return false;
18951 if (N.getOperand(0).getValueType().isScalableVector())
18952 return false;
18953 return N.getConstantOperandAPInt(1) ==
18954 N.getOperand(0).getValueType().getVectorNumElements() / 2;
18955}
18956
18957/// Helper structure to keep track of ISD::SET_CC operands.
18962};
18963
18964/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
18965 struct AArch64SetCCInfo {
18966   const SDValue *Cmp;
18967   AArch64CC::CondCode CC;
18968 };
18969
18970/// Helper structure to keep track of SetCC information.
18974};
18975
18976/// Helper structure to be able to read SetCC information. If set to
18977/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
18978/// GenericSetCCInfo.
18982};
18983
18984/// Check whether or not \p Op is a SET_CC operation, either a generic or
18985/// an
18986/// AArch64 lowered one.
18987/// \p SetCCInfo is filled accordingly.
18988 /// \post SetCCInfo is meaningful only when this function returns true.
18989/// \return True when Op is a kind of SET_CC operation.
18990 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
18991   // If this is a setcc, this is straightforward.
18992 if (Op.getOpcode() == ISD::SETCC) {
18993 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
18994 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
18995 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
18996 SetCCInfo.IsAArch64 = false;
18997 return true;
18998 }
18999 // Otherwise, check if this is a matching csel instruction.
19000 // In other words:
19001 // - csel 1, 0, cc
19002 // - csel 0, 1, !cc
19003 if (Op.getOpcode() != AArch64ISD::CSEL)
19004 return false;
19005 // Set the information about the operands.
19006 // TODO: we want the operands of the Cmp not the csel
19007 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
19008 SetCCInfo.IsAArch64 = true;
19009 SetCCInfo.Info.AArch64.CC =
19010 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19011
19012 // Check that the operands matches the constraints:
19013 // (1) Both operands must be constants.
19014 // (2) One must be 1 and the other must be 0.
19015 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
19016 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
19017
19018 // Check (1).
19019 if (!TValue || !FValue)
19020 return false;
19021
19022 // Check (2).
19023 if (!TValue->isOne()) {
19024 // Update the comparison when we are interested in !cc.
19025 std::swap(TValue, FValue);
19026 SetCCInfo.Info.AArch64.CC =
19027         AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
19028   }
19029 return TValue->isOne() && FValue->isZero();
19030}
19031
19032// Returns true if Op is setcc or zext of setcc.
19033static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
19034 if (isSetCC(Op, Info))
19035 return true;
19036 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
19037 isSetCC(Op->getOperand(0), Info));
19038}
19039
19040// The folding we want to perform is:
19041// (add x, [zext] (setcc cc ...) )
19042// -->
19043// (csel x, (add x, 1), !cc ...)
19044//
19045// The latter will get matched to a CSINC instruction.
19046 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
19047   assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
19048 SDValue LHS = Op->getOperand(0);
19049 SDValue RHS = Op->getOperand(1);
19050 SetCCInfoAndKind InfoAndKind;
19051
19052 // If both operands are a SET_CC, then we don't want to perform this
19053 // folding and create another csel as this results in more instructions
19054 // (and higher register usage).
19055 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
19056 isSetCCOrZExtSetCC(RHS, InfoAndKind))
19057 return SDValue();
19058
19059 // If neither operand is a SET_CC, give up.
19060 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
19061 std::swap(LHS, RHS);
19062 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
19063 return SDValue();
19064 }
19065
19066   // FIXME: This could be generalized to work for FP comparisons.
19067 EVT CmpVT = InfoAndKind.IsAArch64
19068 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
19069 : InfoAndKind.Info.Generic.Opnd0->getValueType();
19070 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
19071 return SDValue();
19072
19073 SDValue CCVal;
19074 SDValue Cmp;
19075 SDLoc dl(Op);
19076 if (InfoAndKind.IsAArch64) {
19077 CCVal = DAG.getConstant(
19078         AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
19079         MVT::i32);
19080 Cmp = *InfoAndKind.Info.AArch64.Cmp;
19081 } else
19082 Cmp = getAArch64Cmp(
19083 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
19084 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
19085 dl);
19086
19087 EVT VT = Op->getValueType(0);
19088 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
19089 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
19090}
19091
19092// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
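// Adding the vectors first means a single UADDV reduction is needed instead
// of two reductions followed by a scalar add.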
19093 static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
19094   EVT VT = N->getValueType(0);
19095 // Only scalar integer and vector types.
19096 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
19097 return SDValue();
19098
19099 SDValue LHS = N->getOperand(0);
19100 SDValue RHS = N->getOperand(1);
19101 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19102 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
19103 return SDValue();
19104
19105 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
19106 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
19107 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
19108 return SDValue();
19109
19110 SDValue Op1 = LHS->getOperand(0);
19111 SDValue Op2 = RHS->getOperand(0);
19112 EVT OpVT1 = Op1.getValueType();
19113 EVT OpVT2 = Op2.getValueType();
19114 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
19115 Op2.getOpcode() != AArch64ISD::UADDV ||
19116 OpVT1.getVectorElementType() != VT)
19117 return SDValue();
19118
19119 SDValue Val1 = Op1.getOperand(0);
19120 SDValue Val2 = Op2.getOperand(0);
19121 EVT ValVT = Val1->getValueType(0);
19122 SDLoc DL(N);
19123 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
19124 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
19125 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
19126 DAG.getConstant(0, DL, MVT::i64));
19127}
19128
19129/// Perform the scalar expression combine in the form of:
19130/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
19131/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
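/// For example, (add (csel c, 1, cc, cmp), b) yields b+c when cc holds and
/// b+1 otherwise, which is exactly CSINC(b+c, b, cc).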
19132 static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
19133   EVT VT = N->getValueType(0);
19134 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
19135 return SDValue();
19136
19137 SDValue LHS = N->getOperand(0);
19138 SDValue RHS = N->getOperand(1);
19139
19140   // Handle commutativity.
19141 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19142 LHS.getOpcode() != AArch64ISD::CSNEG) {
19143 std::swap(LHS, RHS);
19144 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19145 LHS.getOpcode() != AArch64ISD::CSNEG) {
19146 return SDValue();
19147 }
19148 }
19149
19150 if (!LHS.hasOneUse())
19151 return SDValue();
19152
19153 AArch64CC::CondCode AArch64CC =
19154 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
19155
19156   // The CSEL should have a constant one operand, and the CSNEG should have a
19157   // constant one or negative-one operand.
19158 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
19159 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
19160 if (!CTVal || !CFVal)
19161 return SDValue();
19162
19163 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
19164 (CTVal->isOne() || CFVal->isOne())) &&
19165 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
19166 (CTVal->isOne() || CFVal->isAllOnes())))
19167 return SDValue();
19168
19169 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
19170 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
19171 !CFVal->isOne()) {
19172 std::swap(CTVal, CFVal);
19173 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19174 }
19175
19176 SDLoc DL(N);
19177 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
19178 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
19179 !CFVal->isAllOnes()) {
19180 APInt C = -1 * CFVal->getAPIntValue();
19181 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
19182 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
19183 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19184 }
19185
19186   // It might be neutral for larger constants, as the immediate needs to be
19187   // materialized in a register.
19188 APInt ADDC = CTVal->getAPIntValue();
19189 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19190 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
19191 return SDValue();
19192
19193 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
19194 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
19195 "Unexpected constant value");
19196
19197 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
19198 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
19199 SDValue Cmp = LHS.getOperand(3);
19200
19201 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
19202}
19203
19204// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
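// UDOT/SDOT accumulate into their first operand, so a zero accumulator can be
// replaced by the value that was going to be added afterwards.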
19205 static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
19206   EVT VT = N->getValueType(0);
19207 if (N->getOpcode() != ISD::ADD)
19208 return SDValue();
19209
19210 SDValue Dot = N->getOperand(0);
19211 SDValue A = N->getOperand(1);
19212   // Handle commutativity
19213 auto isZeroDot = [](SDValue Dot) {
19214 return (Dot.getOpcode() == AArch64ISD::UDOT ||
19215 Dot.getOpcode() == AArch64ISD::SDOT) &&
19216            isZerosVector(Dot.getOperand(0).getNode());
19217   };
19218 if (!isZeroDot(Dot))
19219 std::swap(Dot, A);
19220 if (!isZeroDot(Dot))
19221 return SDValue();
19222
19223 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
19224 Dot.getOperand(2));
19225}
19226
19227 static bool isNegatedInteger(SDValue Op) {
19228   return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
19229}
19230
19231 static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
19232   SDLoc DL(Op);
19233 EVT VT = Op.getValueType();
19234 SDValue Zero = DAG.getConstant(0, DL, VT);
19235 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
19236}
19237
19238// Try to fold
19239//
19240// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
19241//
19242// The folding helps csel to be matched with csneg without generating
19243// redundant neg instruction, which includes negation of the csel expansion
19244// of abs node lowered by lowerABS.
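// e.g. (sub 0, (csel x, (sub 0, x))) becomes (csel (sub 0, x), x), which can
// then be selected as a single CSNEG.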
19245 static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
19246   if (!isNegatedInteger(SDValue(N, 0)))
19247 return SDValue();
19248
19249 SDValue CSel = N->getOperand(1);
19250 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
19251 return SDValue();
19252
19253 SDValue N0 = CSel.getOperand(0);
19254 SDValue N1 = CSel.getOperand(1);
19255
19256   // If neither operand is a negation, the fold isn't worthwhile, as it would
19257   // introduce two additional negations while removing only one.
19258 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
19259 return SDValue();
19260
19261 SDValue N0N = getNegatedInteger(N0, DAG);
19262 SDValue N1N = getNegatedInteger(N1, DAG);
19263
19264 SDLoc DL(N);
19265 EVT VT = CSel.getValueType();
19266 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
19267 CSel.getOperand(3));
19268}
19269
19270// The basic add/sub long vector instructions have variants with "2" on the end
19271// which act on the high-half of their inputs. They are normally matched by
19272// patterns like:
19273//
19274// (add (zeroext (extract_high LHS)),
19275// (zeroext (extract_high RHS)))
19276// -> uaddl2 vD, vN, vM
19277//
19278// However, if one of the extracts is something like a duplicate, this
19279// instruction can still be used profitably. This function puts the DAG into a
19280// more appropriate form for those patterns to trigger.
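// e.g. (add (zext (extract_high LHS)), (zext (v4i16 dup))) can still use
// uaddl2 once the 64-bit dup is rewritten as an extract_high of a 128-bit dup.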
19281 static SDValue performAddSubLongCombine(SDNode *N,
19282                                         TargetLowering::DAGCombinerInfo &DCI) {
19283   SelectionDAG &DAG = DCI.DAG;
19284 if (DCI.isBeforeLegalizeOps())
19285 return SDValue();
19286
19287 MVT VT = N->getSimpleValueType(0);
19288 if (!VT.is128BitVector()) {
19289 if (N->getOpcode() == ISD::ADD)
19290 return performSetccAddFolding(N, DAG);
19291 return SDValue();
19292 }
19293
19294 // Make sure both branches are extended in the same way.
19295 SDValue LHS = N->getOperand(0);
19296 SDValue RHS = N->getOperand(1);
19297 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
19298 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
19299 LHS.getOpcode() != RHS.getOpcode())
19300 return SDValue();
19301
19302 unsigned ExtType = LHS.getOpcode();
19303
19304   // It's only worth doing if at least one of the inputs is already an
19305   // extract, but we don't know which it'll be so we have to try both.
19306 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
19307 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
19308 if (!RHS.getNode())
19309 return SDValue();
19310
19311 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
19312 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
19313 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
19314 if (!LHS.getNode())
19315 return SDValue();
19316
19317 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
19318 }
19319
19320 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
19321}
19322
19323static bool isCMP(SDValue Op) {
19324 return Op.getOpcode() == AArch64ISD::SUBS &&
19325 !Op.getNode()->hasAnyUseOfValue(0);
19326}
19327
19328// (CSEL 1 0 CC Cond) => CC
19329// (CSEL 0 1 CC Cond) => !CC
19330static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
19331 if (Op.getOpcode() != AArch64ISD::CSEL)
19332 return std::nullopt;
19333 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19334 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
19335 return std::nullopt;
19336 SDValue OpLHS = Op.getOperand(0);
19337 SDValue OpRHS = Op.getOperand(1);
19338 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
19339 return CC;
19340 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
19341 return getInvertedCondCode(CC);
19342
19343 return std::nullopt;
19344}
19345
19346// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
19347// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
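// i.e. when the carry-in operand is just the carry flag materialised via CSET
// and compared again, drop the CSET/CMP pair and use the original flags.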
19348static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
19349 SDValue CmpOp = Op->getOperand(2);
19350 if (!isCMP(CmpOp))
19351 return SDValue();
19352
19353 if (IsAdd) {
19354 if (!isOneConstant(CmpOp.getOperand(1)))
19355 return SDValue();
19356 } else {
19357 if (!isNullConstant(CmpOp.getOperand(0)))
19358 return SDValue();
19359 }
19360
19361 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
19362 auto CC = getCSETCondCode(CsetOp);
19363 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
19364 return SDValue();
19365
19366 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
19367 Op->getOperand(0), Op->getOperand(1),
19368 CsetOp.getOperand(3));
19369}
19370
19371// (ADC x 0 cond) => (CINC x HS cond)
19372 static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
19373   SDValue LHS = N->getOperand(0);
19374 SDValue RHS = N->getOperand(1);
19375 SDValue Cond = N->getOperand(2);
19376
19377 if (!isNullConstant(RHS))
19378 return SDValue();
19379
19380 EVT VT = N->getValueType(0);
19381 SDLoc DL(N);
19382
19383 // (CINC x cc cond) <=> (CSINC x x !cc cond)
19384 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
19385 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
19386}
19387
19388// Transform vector add(zext i8 to i32, zext i8 to i32)
19389// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19390// This allows extra uses of saddl/uaddl at the lower vector widths, and less
19391// extends.
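// e.g. add(zext <16 x i8> to <16 x i32>, zext <16 x i8> to <16 x i32>) becomes
// sext(add(zext to <16 x i16>, zext to <16 x i16>) to <16 x i32>), halving the
// width at which the add itself is performed.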
19393 EVT VT = N->getValueType(0);
19394 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19395 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19396 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19397 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19398 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19399 N->getOperand(0).getOperand(0).getValueType() !=
19400 N->getOperand(1).getOperand(0).getValueType())
19401 return SDValue();
19402
19403 SDValue N0 = N->getOperand(0).getOperand(0);
19404 SDValue N1 = N->getOperand(1).getOperand(0);
19405 EVT InVT = N0.getValueType();
19406
19407 EVT S1 = InVT.getScalarType();
19408 EVT S2 = VT.getScalarType();
19409 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19410 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19411 SDLoc DL(N);
19412 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19413                                   S2 == MVT::i32 ? MVT::i16 : MVT::i32,
19414                                   VT.getVectorElementCount());
19415     SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19416 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19417 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19418 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
19419 }
19420 return SDValue();
19421}
19422
19423 static SDValue performBuildVectorCombine(SDNode *N,
19424                                          TargetLowering::DAGCombinerInfo &DCI,
19425                                          SelectionDAG &DAG) {
19426 SDLoc DL(N);
19427 EVT VT = N->getValueType(0);
19428
19429 if (VT == MVT::v4f16 || VT == MVT::v4bf16) {
19430 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
19431 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
19432 if (Elt0->getOpcode() == ISD::FP_ROUND &&
19433 Elt1->getOpcode() == ISD::FP_ROUND &&
19434 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19435 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19436 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
19437         Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19438         Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19439 // Constant index.
19440 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19441 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19442 Elt0->getOperand(0)->getOperand(0) ==
19443 Elt1->getOperand(0)->getOperand(0) &&
19444 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
19445 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
19446 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
19447 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
19448 SDValue HighLanes;
19449 if (Elt2->getOpcode() == ISD::UNDEF &&
19450 Elt3->getOpcode() == ISD::UNDEF) {
19451 HighLanes = DAG.getUNDEF(MVT::v2f32);
19452 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
19453 Elt3->getOpcode() == ISD::FP_ROUND &&
19454 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
19455 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
19456 Elt2->getConstantOperandVal(1) ==
19457 Elt3->getConstantOperandVal(1) &&
19458                    Elt2->getOperand(0)->getOpcode() ==
19459                        ISD::EXTRACT_VECTOR_ELT &&
19460                    Elt3->getOperand(0)->getOpcode() ==
19461                        ISD::EXTRACT_VECTOR_ELT &&
19462 // Constant index.
19463 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
19464 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
19465 Elt2->getOperand(0)->getOperand(0) ==
19466 Elt3->getOperand(0)->getOperand(0) &&
19467 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
19468 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
19469 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
19470 HighLanes =
19471 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
19472 }
19473 if (HighLanes) {
19474 SDValue DoubleToSingleSticky =
19475 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
19476 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
19477 DoubleToSingleSticky, HighLanes);
19478 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
19479 Elt0->getOperand(1));
19480 }
19481 }
19482 }
19483 }
19484
19485 if (VT == MVT::v2f64) {
19486 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19487 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
19488 Elt1->getOpcode() == ISD::FP_EXTEND &&
19489         Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19490         Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19491 Elt0->getOperand(0)->getOperand(0) ==
19492 Elt1->getOperand(0)->getOperand(0) &&
19493 // Constant index.
19494 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19495 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19496 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
19497 Elt1->getOperand(0)->getConstantOperandVal(1) &&
19498 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19499 // ResultType's known minimum vector length.
19500 Elt0->getOperand(0)->getConstantOperandVal(1) %
19501                 VT.getVectorMinNumElements() ==
19502             0) {
19503 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
19504 if (SrcVec.getValueType() == MVT::v4f16 ||
19505 SrcVec.getValueType() == MVT::v4bf16) {
19506 SDValue HalfToSingle =
19507 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
19508 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
19509 SDValue Extract = DAG.getNode(
19510             ISD::EXTRACT_SUBVECTOR, DL, VT.changeVectorElementType(MVT::f32),
19511             HalfToSingle, SubvectorIdx);
19512 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
19513 }
19514 }
19515 }
19516
19517 // A build vector of two extracted elements is equivalent to an
19518 // extract subvector where the inner vector is any-extended to the
19519 // extract_vector_elt VT.
19520 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
19521 // (extract_elt_iXX_to_i32 vec Idx+1))
19522 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
19523
19524 // For now, only consider the v2i32 case, which arises as a result of
19525 // legalization.
19526 if (VT != MVT::v2i32)
19527 return SDValue();
19528
19529 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19530 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
19531 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19532 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19533 // Constant index.
19534 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19535 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19536 // Both EXTRACT_VECTOR_ELT from same vector...
19537 Elt0->getOperand(0) == Elt1->getOperand(0) &&
19538 // ... and contiguous. First element's index +1 == second element's index.
19539 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
19540 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19541 // ResultType's known minimum vector length.
19542 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
19543 SDValue VecToExtend = Elt0->getOperand(0);
19544 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
19545 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
19546 return SDValue();
19547
19548 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
19549
19550 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
19551 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
19552 SubvectorIdx);
19553 }
19554
19555 return SDValue();
19556}
19557
19558 static SDValue performTruncateCombine(SDNode *N,
19559                                       SelectionDAG &DAG) {
19560 EVT VT = N->getValueType(0);
19561 SDValue N0 = N->getOperand(0);
19562 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
19563 N0.getOpcode() == AArch64ISD::DUP) {
19564 SDValue Op = N0.getOperand(0);
19565 if (VT.getScalarType() == MVT::i32 &&
19566 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
19567 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
19568 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
19569 }
19570
19571 return SDValue();
19572}
19573
19574 // Check whether a node is an extend or shift operand.
19575 static bool isExtendOrShiftOperand(SDValue N) {
19576   unsigned Opcode = N.getOpcode();
19577 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
19578 EVT SrcVT;
19579 if (Opcode == ISD::SIGN_EXTEND_INREG)
19580 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
19581 else
19582 SrcVT = N.getOperand(0).getValueType();
19583
19584 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
19585 } else if (Opcode == ISD::AND) {
19586 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
19587 if (!CSD)
19588 return false;
19589 uint64_t AndMask = CSD->getZExtValue();
19590 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
19591 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
19592 return isa<ConstantSDNode>(N.getOperand(1));
19593 }
19594
19595 return false;
19596}
19597
19598// (N - Y) + Z --> (Z - Y) + N
19599// when N is an extend or shift operand
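// Moving the extend/shift operand to the right-hand side lets it fold into
// the ADD's extended/shifted-register operand form.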
19600 static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
19601                                          SelectionDAG &DAG) {
19602 auto IsOneUseExtend = [](SDValue N) {
19603 return N.hasOneUse() && isExtendOrShiftOperand(N);
19604 };
19605
19606   // DAGCombiner will revert the combination when Z is a constant, causing an
19607   // infinite loop, so don't enable the combination when Z is constant.
19608   // If Z is a one-use extend or shift, we also can't do the optimization, as
19609   // it would keep re-triggering itself.
19610 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
19611 return SDValue();
19612
19613 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
19614 return SDValue();
19615
19616 SDValue Shift = SUB.getOperand(0);
19617 if (!IsOneUseExtend(Shift))
19618 return SDValue();
19619
19620 SDLoc DL(N);
19621 EVT VT = N->getValueType(0);
19622
19623 SDValue Y = SUB.getOperand(1);
19624 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
19625 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
19626}
19627
19629 SelectionDAG &DAG) {
19630 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
19631 // commutative.
19632 if (N->getOpcode() != ISD::ADD)
19633 return SDValue();
19634
19635 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
19636 // shifted register is only available for i32 and i64.
19637 EVT VT = N->getValueType(0);
19638 if (VT != MVT::i32 && VT != MVT::i64)
19639 return SDValue();
19640
19641 SDLoc DL(N);
19642 SDValue LHS = N->getOperand(0);
19643 SDValue RHS = N->getOperand(1);
19644
19645 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
19646 return Val;
19647 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
19648 return Val;
19649
19650 uint64_t LHSImm = 0, RHSImm = 0;
19651   // If both operands are shifted by an immediate and the shift amount of one
19652   // operand is not greater than 4, swap LHS and RHS to put the operand with
19653   // the smaller shift amount on the RHS.
19654 //
19655 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
19656 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
19657   // with LSL (shift > 4). For other processors, this swap is a no-op for both
19658   // performance and correctness.
19659 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
19660 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
19661 RHSImm > 4 && LHS.hasOneUse())
19662 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
19663
19664 return SDValue();
19665}
19666
19667// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
19668// This reassociates it back to allow the creation of more mls instructions.
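// e.g. sub(x, add(mul(a, b), mul(c, d))) becomes
// sub(sub(x, mul(a, b)), mul(c, d)), which can select two mls instructions.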
19669 static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
19670   if (N->getOpcode() != ISD::SUB)
19671 return SDValue();
19672
19673 SDValue Add = N->getOperand(1);
19674 SDValue X = N->getOperand(0);
19675 if (Add.getOpcode() != ISD::ADD)
19676 return SDValue();
19677
19678 if (!Add.hasOneUse())
19679 return SDValue();
19680   if (DAG.isConstantIntBuildVectorOrConstantInt(X))
19681     return SDValue();
19682
19683 SDValue M1 = Add.getOperand(0);
19684 SDValue M2 = Add.getOperand(1);
19685 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
19686 M1.getOpcode() != AArch64ISD::UMULL)
19687 return SDValue();
19688 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
19689       M2.getOpcode() != AArch64ISD::UMULL)
19690     return SDValue();
19691
19692 EVT VT = N->getValueType(0);
19693 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
19694 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
19695}
19696
19697// Combine into mla/mls.
19698// This works on the patterns of:
19699// add v1, (mul v2, v3)
19700// sub v1, (mul v2, v3)
19701// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
19702// It will transform the add/sub to a scalable version, so that we can
19703// make use of SVE's MLA/MLS that will be generated for that pattern
19704static SDValue
19706 SelectionDAG &DAG = DCI.DAG;
19707 // Make sure that the types are legal
19708 if (!DCI.isAfterLegalizeDAG())
19709 return SDValue();
19710 // Before using SVE's features, check first if it's available.
19711 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
19712 return SDValue();
19713
19714 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
19715 return SDValue();
19716
19717 if (!N->getValueType(0).isFixedLengthVector())
19718 return SDValue();
19719
19720 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
19721 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19722 return SDValue();
19723
19724 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
19725 return SDValue();
19726
19727 SDValue MulValue = Op1->getOperand(0);
19728 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
19729 return SDValue();
19730
19731 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
19732 return SDValue();
19733
19734 EVT ScalableVT = MulValue.getValueType();
19735 if (!ScalableVT.isScalableVector())
19736 return SDValue();
19737
19738 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
19739 SDValue NewValue =
19740 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
19741 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
19742 };
19743
19744 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
19745 return res;
19746 else if (N->getOpcode() == ISD::ADD)
19747 return performOpt(N->getOperand(1), N->getOperand(0));
19748
19749 return SDValue();
19750}
19751
19752 // Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
19753// help, for example, to produce ssra from sshr+add.
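// e.g. add(extract_elt(v1i64 (sshr x, imm), 0), i64 y) is rewritten as
// extract_elt(add(v1i64 sshr, scalar_to_vector(y)), 0), so the shift and add
// can be selected together as ssra.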
19754 static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
19755   EVT VT = N->getValueType(0);
19756 if (VT != MVT::i64)
19757 return SDValue();
19758 SDValue Op0 = N->getOperand(0);
19759 SDValue Op1 = N->getOperand(1);
19760
19761 // At least one of the operands should be an extract, and the other should be
19762 // something that is easy to convert to v1i64 type (in this case a load).
19763 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19764 Op0.getOpcode() != ISD::LOAD)
19765 return SDValue();
19766 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19767 Op1.getOpcode() != ISD::LOAD)
19768 return SDValue();
19769
19770 SDLoc DL(N);
19771 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19772 Op0.getOperand(0).getValueType() == MVT::v1i64) {
19773 Op0 = Op0.getOperand(0);
19774 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
19775 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19776 Op1.getOperand(0).getValueType() == MVT::v1i64) {
19777 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
19778 Op1 = Op1.getOperand(0);
19779 } else
19780 return SDValue();
19781
19782 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
19783 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
19784 DAG.getConstant(0, DL, MVT::i64));
19785}
19786
19787 static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
19788   SDValue BV = peekThroughOneUseBitcasts(B);
19789   if (!BV->hasOneUse())
19790 return false;
19791 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
19792 if (!Ld || !Ld->isSimple())
19793 return false;
19794 Loads.push_back(Ld);
19795 return true;
19796 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
19798 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
19799 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
19800 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
19801 return false;
19802 Loads.push_back(Ld);
19803 }
19804 return true;
19805 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
19806 // Try to find a tree of shuffles and concats from how IR shuffles of loads
19807 // are lowered. Note that this only comes up because we do not always visit
19808 // operands before uses. After that is fixed this can be removed and in the
19809 // meantime this is fairly specific to the lowering we expect from IR.
19810 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
19811 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
19812 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
19813 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
19814 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
19815 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
19816 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
19817 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
19818 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
19819 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
19820 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
19821 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19822 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19823 B.getOperand(1).getNumOperands() != 4)
19824 return false;
19825 auto SV1 = cast<ShuffleVectorSDNode>(B);
19826 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
19827 int NumElts = B.getValueType().getVectorNumElements();
19828 int NumSubElts = NumElts / 4;
19829 for (int I = 0; I < NumSubElts; I++) {
19830 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
19831 if (SV1->getMaskElt(I) != I ||
19832 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19833 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
19834 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
19835 return false;
19836 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
19837 if (SV2->getMaskElt(I) != I ||
19838 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19839 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
19840 return false;
19841 }
19842 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
19843 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
19844 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
19845 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
19846 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
19847 !Ld2->isSimple() || !Ld3->isSimple())
19848 return false;
19849 Loads.push_back(Ld0);
19850 Loads.push_back(Ld1);
19851 Loads.push_back(Ld2);
19852 Loads.push_back(Ld3);
19853 return true;
19854 }
19855 return false;
19856}
19857
19858 static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
19859 SelectionDAG &DAG,
19860 unsigned &NumSubLoads) {
19861 if (!Op0.hasOneUse() || !Op1.hasOneUse())
19862 return false;
19863
19864 SmallVector<LoadSDNode *> Loads0, Loads1;
19865 if (isLoadOrMultipleLoads(Op0, Loads0) &&
19866 isLoadOrMultipleLoads(Op1, Loads1)) {
19867 if (NumSubLoads && Loads0.size() != NumSubLoads)
19868 return false;
19869 NumSubLoads = Loads0.size();
19870 return Loads0.size() == Loads1.size() &&
19871 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
19872 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
19873 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
19874 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
19875 Size / 8, 1);
19876 });
19877 }
19878
19879 if (Op0.getOpcode() != Op1.getOpcode())
19880 return false;
19881
19882 switch (Op0.getOpcode()) {
19883 case ISD::ADD:
19884 case ISD::SUB:
19885 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
19886 DAG, NumSubLoads) &&
19887 areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
19888 DAG, NumSubLoads);
19889 case ISD::SIGN_EXTEND:
19890 case ISD::ANY_EXTEND:
19891 case ISD::ZERO_EXTEND:
19892 EVT XVT = Op0.getOperand(0).getValueType();
19893 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
19894 XVT.getScalarSizeInBits() != 32)
19895 return false;
19896 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
19897 DAG, NumSubLoads);
19898 }
19899 return false;
19900}
19901
19902// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
19903// into a single load of twice the size, from which the bottom and top parts
19904// are extracted so that the shl can use a shll2 instruction. The two loads in
19905// that example can also be larger trees of instructions, which are identical
19906// except for the leaves, which are all loads offset from the LHS, including
19907// buildvectors of multiple loads. For example the RHS tree could be
19908// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
19909// Whilst it can be common for the larger loads to replace LDP instructions
19910// (which doesn't gain anything on its own), the larger loads can help create
19911// more efficient code, and in buildvectors prevent the need for ld1 lane
19912// inserts, which can be slower than normal loads.
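// Illustrative sketch (assumed types, not a real test case): with consecutive
// v8i8 loads at p and p+8,
//   add(zext(v8i8 load p), shl(zext(v8i8 load p+8), splat(8)))
// can be rewritten to use a single v16i8 load of p, whose low and high halves
// are extended with ushll/ushll2 and the shl applied to the extended high
// half.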
19913 static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
19914 EVT VT = N->getValueType(0);
19915 if (!VT.isFixedLengthVector() ||
19916 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
19917 VT.getScalarSizeInBits() != 64))
19918 return SDValue();
19919
19920 SDValue Other = N->getOperand(0);
19921 SDValue Shift = N->getOperand(1);
19922 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
19923 std::swap(Shift, Other);
19924 APInt ShiftAmt;
19925 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
19926 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
19927 return SDValue();
19928
19929 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
19930 !ISD::isExtOpcode(Other.getOpcode()) ||
19931 Shift.getOperand(0).getOperand(0).getValueType() !=
19932 Other.getOperand(0).getValueType() ||
19933 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
19934 return SDValue();
19935
19936 SDValue Op0 = Other.getOperand(0);
19937 SDValue Op1 = Shift.getOperand(0).getOperand(0);
19938
19939 unsigned NumSubLoads = 0;
19940 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
19941 return SDValue();
19942
19943 // Attempt to rule out some unprofitable cases using heuristics (some working
19944 // around suboptimal code generation), notably if the extend would not be able
19945 // to use ushll2 instructions because the types are not large enough. Otherwise
19946 // zips will need to be created, which can increase the instruction count.
19947 unsigned NumElts = Op0.getValueType().getVectorNumElements();
19948 unsigned NumSubElts = NumElts / NumSubLoads;
19949 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
19950 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
19951 Op0.getValueType().getSizeInBits() < 128 &&
19953 return SDValue();
19954
19955 // Recreate the tree with the new combined loads.
19956 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
19957 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
19958 EVT DVT =
19959 Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
19960
19961 SmallVector<LoadSDNode *> Loads0, Loads1;
19962 if (isLoadOrMultipleLoads(Op0, Loads0) &&
19963 isLoadOrMultipleLoads(Op1, Loads1)) {
19964 EVT LoadVT = EVT::getVectorVT(
19965 *DAG.getContext(), Op0.getValueType().getScalarType(),
19966 Op0.getValueType().getVectorNumElements() / Loads0.size());
19967 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
19968
19969 SmallVector<SDValue> NewLoads;
19970 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
19971 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
19972 L0->getBasePtr(), L0->getPointerInfo(),
19973 L0->getOriginalAlign());
19974 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
19975 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
19976 NewLoads.push_back(Load);
19977 }
19978 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
19979 }
19980
19981 SmallVector<SDValue> Ops;
19982 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
19983 Ops.push_back(GenCombinedTree(O0, O1, DAG));
19984 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
19985 };
19986 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
19987
19988 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
19989 int Hi = NumSubElts, Lo = 0;
19990 for (unsigned i = 0; i < NumSubLoads; i++) {
19991 for (unsigned j = 0; j < NumSubElts; j++) {
19992 LowMask[i * NumSubElts + j] = Lo++;
19993 HighMask[i * NumSubElts + j] = Hi++;
19994 }
19995 Lo += NumSubElts;
19996 Hi += NumSubElts;
19997 }
19998 SDLoc DL(N);
19999 SDValue Ext0, Ext1;
20000 // Extract the top and bottom lanes, then extend the result. Possibly extend
20001 // the result then extract the lanes if the two operands match as it produces
20002 // slightly smaller code.
20003 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
20004 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
20005 NewOp, DAG.getConstant(0, DL, MVT::i64));
20006 SDValue SubH =
20007 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
20008 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20009 SDValue Extr0 =
20010 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
20011 SDValue Extr1 =
20012 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
20013 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
20014 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
20015 } else {
20016 EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
20017 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
20018 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20019 DAG.getConstant(0, DL, MVT::i64));
20020 SDValue SubH =
20021 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20022 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20023 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
20024 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
20025 }
20026 SDValue NShift =
20027 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
20028 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
20029}
20030
20031 static SDValue performAddSubCombine(SDNode *N,
20032 TargetLowering::DAGCombinerInfo &DCI) {
20033 // Try to change sum of two reductions.
20034 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
20035 return Val;
20036 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
20037 return Val;
20038 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
20039 return Val;
20040 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
20041 return Val;
20042 if (SDValue Val = performVectorAddSubExtCombine(N, DCI.DAG))
20043 return Val;
20044 if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
20045 return Val;
20046 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
20047 return Val;
20048 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
20049 return Val;
20050 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
20051 return Val;
20052
20053 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
20054 return Val;
20055
20056 return performAddSubLongCombine(N, DCI);
20057}
20058
20059// Massage DAGs which we can use the high-half "long" operations on into
20060// something isel will recognize better. E.g.
20061//
20062// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
20063//   (aarch64_neon_umull (extract_high (v2i64 vec))
20064//                       (extract_high (v2i64 (dup128 scalar))))
20065//
20066 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
20067 TargetLowering::DAGCombinerInfo &DCI,
20068 SelectionDAG &DAG) {
20069 if (DCI.isBeforeLegalizeOps())
20070 return SDValue();
20071
20072 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
20073 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
20074 assert(LHS.getValueType().is64BitVector() &&
20075 RHS.getValueType().is64BitVector() &&
20076 "unexpected shape for long operation");
20077
20078 // Either node could be a DUP, but it's not worth doing both of them (you'd
20079 // just as well use the non-high version) so look for a corresponding extract
20080 // operation on the other "wing".
20081 if (isEssentiallyExtractHighSubvector(LHS)) {
20082 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
20083 if (!RHS.getNode())
20084 return SDValue();
20085 } else if (isEssentiallyExtractHighSubvector(RHS)) {
20086 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
20087 if (!LHS.getNode())
20088 return SDValue();
20089 } else
20090 return SDValue();
20091
20092 if (IID == Intrinsic::not_intrinsic)
20093 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
20094
20095 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
20096 N->getOperand(0), LHS, RHS);
20097}
20098
20099static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
20100 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
20101 unsigned ElemBits = ElemTy.getSizeInBits();
20102
20103 int64_t ShiftAmount;
20104 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
20105 APInt SplatValue, SplatUndef;
20106 unsigned SplatBitSize;
20107 bool HasAnyUndefs;
20108 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
20109 HasAnyUndefs, ElemBits) ||
20110 SplatBitSize != ElemBits)
20111 return SDValue();
20112
20113 ShiftAmount = SplatValue.getSExtValue();
20114 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
20115 ShiftAmount = CVN->getSExtValue();
20116 } else
20117 return SDValue();
20118
20119 // If the shift amount is zero, remove the shift intrinsic.
20120 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
20121 return N->getOperand(1);
20122
20123 unsigned Opcode;
20124 bool IsRightShift;
20125 switch (IID) {
20126 default:
20127 llvm_unreachable("Unknown shift intrinsic");
20128 case Intrinsic::aarch64_neon_sqshl:
20129 Opcode = AArch64ISD::SQSHL_I;
20130 IsRightShift = false;
20131 break;
20132 case Intrinsic::aarch64_neon_uqshl:
20133 Opcode = AArch64ISD::UQSHL_I;
20134 IsRightShift = false;
20135 break;
20136 case Intrinsic::aarch64_neon_srshl:
20137 Opcode = AArch64ISD::SRSHR_I;
20138 IsRightShift = true;
20139 break;
20140 case Intrinsic::aarch64_neon_urshl:
20141 Opcode = AArch64ISD::URSHR_I;
20142 IsRightShift = true;
20143 break;
20144 case Intrinsic::aarch64_neon_sqshlu:
20145 Opcode = AArch64ISD::SQSHLU_I;
20146 IsRightShift = false;
20147 break;
20148 case Intrinsic::aarch64_neon_sshl:
20149 case Intrinsic::aarch64_neon_ushl:
20150 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
20151 // left shift in that case. For negative shifts we can use a VASHR/VLSHR as
20152 // appropriate.
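// For example (illustrative): ushl(x, splat(3)) becomes VSHL x, #3, while
// ushl(x, splat(-3)) becomes VLSHR x, #3.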
20153 if (ShiftAmount < 0) {
20154 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
20156 ShiftAmount = -ShiftAmount;
20157 } else
20158 Opcode = AArch64ISD::VSHL;
20159 IsRightShift = false;
20160 break;
20161 }
20162
20163 EVT VT = N->getValueType(0);
20164 SDValue Op = N->getOperand(1);
20165 SDLoc dl(N);
20166 if (VT == MVT::i64) {
20167 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
20168 VT = MVT::v1i64;
20169 }
20170
20171 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
20172 Op = DAG.getNode(Opcode, dl, VT, Op,
20173 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
20174 if (N->getValueType(0) == MVT::i64)
20175 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20176 DAG.getConstant(0, dl, MVT::i64));
20177 return Op;
20178 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
20179 Op = DAG.getNode(Opcode, dl, VT, Op,
20180 DAG.getConstant(ShiftAmount, dl, MVT::i32));
20181 if (N->getValueType(0) == MVT::i64)
20182 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20183 DAG.getConstant(0, dl, MVT::i64));
20184 return Op;
20185 }
20186
20187 return SDValue();
20188}
20189
20190// The CRC32[BH] instructions ignore the high bits of their data operand. Since
20191// the intrinsics must be legal and take an i32, this means there's almost
20192// certainly going to be a zext in the DAG which we can eliminate.
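// For example (illustrative): crc32b(crc, and(x, 0xff)) can drop the AND and
// use x directly, since CRC32B only reads the low 8 bits of its data operand.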
20193static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
20194 SDValue AndN = N->getOperand(2);
20195 if (AndN.getOpcode() != ISD::AND)
20196 return SDValue();
20197
20198 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
20199 if (!CMask || CMask->getZExtValue() != Mask)
20200 return SDValue();
20201
20202 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
20203 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
20204}
20205
20207 SelectionDAG &DAG) {
20208 SDLoc dl(N);
20209 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
20210 DAG.getNode(Opc, dl,
20211 N->getOperand(1).getSimpleValueType(),
20212 N->getOperand(1)),
20213 DAG.getConstant(0, dl, MVT::i64));
20214}
20215
20217 SDLoc DL(N);
20218 SDValue Op1 = N->getOperand(1);
20219 SDValue Op2 = N->getOperand(2);
20220 EVT ScalarTy = Op2.getValueType();
20221 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20222 ScalarTy = MVT::i32;
20223
20224 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
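// For example (illustrative): index_vector(5, 2) produces <5, 7, 9, ...>,
// i.e. 2 * <0, 1, 2, ...> + splat(5).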
20225 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
20226 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
20227 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
20228 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
20229 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
20230}
20231
20233 SDLoc dl(N);
20234 SDValue Scalar = N->getOperand(3);
20235 EVT ScalarTy = Scalar.getValueType();
20236
20237 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20238 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
20239
20240 SDValue Passthru = N->getOperand(1);
20241 SDValue Pred = N->getOperand(2);
20242 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
20243 Pred, Scalar, Passthru);
20244}
20245
20247 SDLoc dl(N);
20248 LLVMContext &Ctx = *DAG.getContext();
20249 EVT VT = N->getValueType(0);
20250
20251 assert(VT.isScalableVector() && "Expected a scalable vector.");
20252
20253 // Current lowering only supports the SVE-ACLE types.
20255 return SDValue();
20256
20257 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
20258 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
20259 EVT ByteVT =
20260 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
20261
20262 // Convert everything to the domain of EXT (i.e. bytes).
20263 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
20264 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
20265 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
20266 DAG.getConstant(ElemSize, dl, MVT::i32));
20267
20268 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
20269 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
20270}
20271
20274 SelectionDAG &DAG) {
20275 if (DCI.isBeforeLegalize())
20276 return SDValue();
20277
20278 SDValue Comparator = N->getOperand(3);
20279 if (Comparator.getOpcode() == AArch64ISD::DUP ||
20280 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
20281 unsigned IID = getIntrinsicID(N);
20282 EVT VT = N->getValueType(0);
20283 EVT CmpVT = N->getOperand(2).getValueType();
20284 SDValue Pred = N->getOperand(1);
20285 SDValue Imm;
20286 SDLoc DL(N);
20287
20288 switch (IID) {
20289 default:
20290 llvm_unreachable("Called with wrong intrinsic!");
20291 break;
20292
20293 // Signed comparisons
20294 case Intrinsic::aarch64_sve_cmpeq_wide:
20295 case Intrinsic::aarch64_sve_cmpne_wide:
20296 case Intrinsic::aarch64_sve_cmpge_wide:
20297 case Intrinsic::aarch64_sve_cmpgt_wide:
20298 case Intrinsic::aarch64_sve_cmplt_wide:
20299 case Intrinsic::aarch64_sve_cmple_wide: {
20300 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20301 int64_t ImmVal = CN->getSExtValue();
20302 if (ImmVal >= -16 && ImmVal <= 15)
20303 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20304 else
20305 return SDValue();
20306 }
20307 break;
20308 }
20309 // Unsigned comparisons
20310 case Intrinsic::aarch64_sve_cmphs_wide:
20311 case Intrinsic::aarch64_sve_cmphi_wide:
20312 case Intrinsic::aarch64_sve_cmplo_wide:
20313 case Intrinsic::aarch64_sve_cmpls_wide: {
20314 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20315 uint64_t ImmVal = CN->getZExtValue();
20316 if (ImmVal <= 127)
20317 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20318 else
20319 return SDValue();
20320 }
20321 break;
20322 }
20323 }
20324
20325 if (!Imm)
20326 return SDValue();
20327
20328 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
20329 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
20330 N->getOperand(2), Splat, DAG.getCondCode(CC));
20331 }
20332
20333 return SDValue();
20334}
20335
20336 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20337 AArch64CC::CondCode Cond) {
20338 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20339
20340 SDLoc DL(Op);
20341 assert(Op.getValueType().isScalableVector() &&
20342 TLI.isTypeLegal(Op.getValueType()) &&
20343 "Expected legal scalable vector type!");
20344 assert(Op.getValueType() == Pg.getValueType() &&
20345 "Expected same type for PTEST operands");
20346
20347 // Ensure target specific opcodes are using legal type.
20348 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
20349 SDValue TVal = DAG.getConstant(1, DL, OutVT);
20350 SDValue FVal = DAG.getConstant(0, DL, OutVT);
20351
20352 // Ensure operands have type nxv16i1.
20353 if (Op.getValueType() != MVT::nxv16i1) {
20356 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
20357 else
20358 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
20359 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
20360 }
20361
20362 // Set condition code (CC) flags.
20363 SDValue Test = DAG.getNode(
20365 DL, MVT::Other, Pg, Op);
20366
20367 // Convert CC to integer based on requested condition.
20368 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
20369 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
20370 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
20371 return DAG.getZExtOrTrunc(Res, DL, VT);
20372}
20373
20374 static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
20375 SelectionDAG &DAG) {
20376 SDLoc DL(N);
20377
20378 SDValue Pred = N->getOperand(1);
20379 SDValue VecToReduce = N->getOperand(2);
20380
20381 // NOTE: The integer reduction's result type is not always linked to the
20382 // operand's element type so we construct it from the intrinsic's result type.
20383 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
20384 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20385
20386 // SVE reductions set the whole vector register with the first element
20387 // containing the reduction result, which we'll now extract.
20388 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20389 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20390 Zero);
20391}
20392
20393 static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
20394 SelectionDAG &DAG) {
20395 SDLoc DL(N);
20396
20397 SDValue Pred = N->getOperand(1);
20398 SDValue VecToReduce = N->getOperand(2);
20399
20400 EVT ReduceVT = VecToReduce.getValueType();
20401 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20402
20403 // SVE reductions set the whole vector register with the first element
20404 // containing the reduction result, which we'll now extract.
20405 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20406 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20407 Zero);
20408}
20409
20410 static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
20411 SelectionDAG &DAG) {
20412 SDLoc DL(N);
20413
20414 SDValue Pred = N->getOperand(1);
20415 SDValue InitVal = N->getOperand(2);
20416 SDValue VecToReduce = N->getOperand(3);
20417 EVT ReduceVT = VecToReduce.getValueType();
20418
20419 // Ordered reductions use the first lane of the result vector as the
20420 // reduction's initial value.
20421 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20422 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
20423 DAG.getUNDEF(ReduceVT), InitVal, Zero);
20424
20425 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
20426
20427 // SVE reductions set the whole vector register with the first element
20428 // containing the reduction result, which we'll now extract.
20429 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20430 Zero);
20431}
20432
20433// If a merged operation has no inactive lanes we can relax it to a predicated
20434// or unpredicated operation, which potentially allows better isel (perhaps
20435// using immediate forms) or relaxing register reuse requirements.
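// For example (illustrative): an SVE add intrinsic whose governing predicate
// is all active can be emitted as a plain ISD::ADD, since no inactive lanes
// need to be merged.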
20436 static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
20437 SelectionDAG &DAG, bool UnpredOp = false,
20438 bool SwapOperands = false) {
20439 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
20440 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
20441 SDValue Pg = N->getOperand(1);
20442 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
20443 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
20444
20445 // ISD way to specify an all active predicate.
20446 if (isAllActivePredicate(DAG, Pg)) {
20447 if (UnpredOp)
20448 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
20449
20450 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
20451 }
20452
20453 // FUTURE: SplatVector(true)
20454 return SDValue();
20455}
20456
20457 static SDValue performIntrinsicCombine(SDNode *N,
20458 TargetLowering::DAGCombinerInfo &DCI,
20459 const AArch64Subtarget *Subtarget) {
20460 SelectionDAG &DAG = DCI.DAG;
20461 unsigned IID = getIntrinsicID(N);
20462 switch (IID) {
20463 default:
20464 break;
20465 case Intrinsic::get_active_lane_mask: {
20466 SDValue Res = SDValue();
20467 EVT VT = N->getValueType(0);
20468 if (VT.isFixedLengthVector()) {
20469 // We can use the SVE whilelo instruction to lower this intrinsic by
20470 // creating the appropriate sequence of scalable vector operations and
20471 // then extracting a fixed-width subvector from the scalable vector.
20472
20473 SDLoc DL(N);
20474 SDValue ID =
20475 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
20476
20477 EVT WhileVT = EVT::getVectorVT(
20478 *DAG.getContext(), MVT::i1,
20480
20481 // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
20482 EVT PromVT = getPromotedVTForPredicate(WhileVT);
20483
20484 // Get the fixed-width equivalent of PromVT for extraction.
20485 EVT ExtVT =
20488
20489 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
20490 N->getOperand(1), N->getOperand(2));
20491 Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
20492 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
20493 DAG.getConstant(0, DL, MVT::i64));
20494 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
20495 }
20496 return Res;
20497 }
20498 case Intrinsic::aarch64_neon_vcvtfxs2fp:
20499 case Intrinsic::aarch64_neon_vcvtfxu2fp:
20500 return tryCombineFixedPointConvert(N, DCI, DAG);
20501 case Intrinsic::aarch64_neon_saddv:
20503 case Intrinsic::aarch64_neon_uaddv:
20505 case Intrinsic::aarch64_neon_sminv:
20507 case Intrinsic::aarch64_neon_uminv:
20509 case Intrinsic::aarch64_neon_smaxv:
20511 case Intrinsic::aarch64_neon_umaxv:
20513 case Intrinsic::aarch64_neon_fmax:
20514 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
20515 N->getOperand(1), N->getOperand(2));
20516 case Intrinsic::aarch64_neon_fmin:
20517 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
20518 N->getOperand(1), N->getOperand(2));
20519 case Intrinsic::aarch64_neon_fmaxnm:
20520 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
20521 N->getOperand(1), N->getOperand(2));
20522 case Intrinsic::aarch64_neon_fminnm:
20523 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
20524 N->getOperand(1), N->getOperand(2));
20525 case Intrinsic::aarch64_neon_smull:
20526 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
20527 N->getOperand(1), N->getOperand(2));
20528 case Intrinsic::aarch64_neon_umull:
20529 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
20530 N->getOperand(1), N->getOperand(2));
20531 case Intrinsic::aarch64_neon_pmull:
20532 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
20533 N->getOperand(1), N->getOperand(2));
20534 case Intrinsic::aarch64_neon_sqdmull:
20535 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
20536 case Intrinsic::aarch64_neon_sqshl:
20537 case Intrinsic::aarch64_neon_uqshl:
20538 case Intrinsic::aarch64_neon_sqshlu:
20539 case Intrinsic::aarch64_neon_srshl:
20540 case Intrinsic::aarch64_neon_urshl:
20541 case Intrinsic::aarch64_neon_sshl:
20542 case Intrinsic::aarch64_neon_ushl:
20543 return tryCombineShiftImm(IID, N, DAG);
20544 case Intrinsic::aarch64_neon_sabd:
20545 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20546 N->getOperand(1), N->getOperand(2));
20547 case Intrinsic::aarch64_neon_uabd:
20548 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20549 N->getOperand(1), N->getOperand(2));
20550 case Intrinsic::aarch64_crc32b:
20551 case Intrinsic::aarch64_crc32cb:
20552 return tryCombineCRC32(0xff, N, DAG);
20553 case Intrinsic::aarch64_crc32h:
20554 case Intrinsic::aarch64_crc32ch:
20555 return tryCombineCRC32(0xffff, N, DAG);
20556 case Intrinsic::aarch64_sve_saddv:
20557 // There is no i64 version of SADDV because the sign is irrelevant.
20558 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
20560 else
20562 case Intrinsic::aarch64_sve_uaddv:
20564 case Intrinsic::aarch64_sve_smaxv:
20566 case Intrinsic::aarch64_sve_umaxv:
20568 case Intrinsic::aarch64_sve_sminv:
20570 case Intrinsic::aarch64_sve_uminv:
20572 case Intrinsic::aarch64_sve_orv:
20574 case Intrinsic::aarch64_sve_eorv:
20576 case Intrinsic::aarch64_sve_andv:
20578 case Intrinsic::aarch64_sve_index:
20579 return LowerSVEIntrinsicIndex(N, DAG);
20580 case Intrinsic::aarch64_sve_dup:
20581 return LowerSVEIntrinsicDUP(N, DAG);
20582 case Intrinsic::aarch64_sve_dup_x:
20583 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
20584 N->getOperand(1));
20585 case Intrinsic::aarch64_sve_ext:
20586 return LowerSVEIntrinsicEXT(N, DAG);
20587 case Intrinsic::aarch64_sve_mul_u:
20588 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
20589 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20590 case Intrinsic::aarch64_sve_smulh_u:
20591 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
20592 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20593 case Intrinsic::aarch64_sve_umulh_u:
20594 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
20595 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20596 case Intrinsic::aarch64_sve_smin_u:
20597 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
20598 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20599 case Intrinsic::aarch64_sve_umin_u:
20600 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
20601 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20602 case Intrinsic::aarch64_sve_smax_u:
20603 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
20604 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20605 case Intrinsic::aarch64_sve_umax_u:
20606 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
20607 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20608 case Intrinsic::aarch64_sve_lsl_u:
20609 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
20610 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20611 case Intrinsic::aarch64_sve_lsr_u:
20612 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
20613 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20614 case Intrinsic::aarch64_sve_asr_u:
20615 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
20616 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20617 case Intrinsic::aarch64_sve_fadd_u:
20618 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
20619 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20620 case Intrinsic::aarch64_sve_fdiv_u:
20621 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
20622 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20623 case Intrinsic::aarch64_sve_fmax_u:
20624 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
20625 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20626 case Intrinsic::aarch64_sve_fmaxnm_u:
20627 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
20628 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20629 case Intrinsic::aarch64_sve_fmla_u:
20630 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
20631 N->getOperand(1), N->getOperand(3), N->getOperand(4),
20632 N->getOperand(2));
20633 case Intrinsic::aarch64_sve_fmin_u:
20634 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
20635 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20636 case Intrinsic::aarch64_sve_fminnm_u:
20637 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
20638 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20639 case Intrinsic::aarch64_sve_fmul_u:
20640 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
20641 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20642 case Intrinsic::aarch64_sve_fsub_u:
20643 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
20644 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20645 case Intrinsic::aarch64_sve_add_u:
20646 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
20647 N->getOperand(3));
20648 case Intrinsic::aarch64_sve_sub_u:
20649 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
20650 N->getOperand(3));
20651 case Intrinsic::aarch64_sve_subr:
20652 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
20653 case Intrinsic::aarch64_sve_and_u:
20654 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
20655 N->getOperand(3));
20656 case Intrinsic::aarch64_sve_bic_u:
20657 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
20658 N->getOperand(2), N->getOperand(3));
20659 case Intrinsic::aarch64_sve_eor_u:
20660 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20661 N->getOperand(3));
20662 case Intrinsic::aarch64_sve_orr_u:
20663 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20664 N->getOperand(3));
20665 case Intrinsic::aarch64_sve_sabd_u:
20666 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20667 N->getOperand(2), N->getOperand(3));
20668 case Intrinsic::aarch64_sve_uabd_u:
20669 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20670 N->getOperand(2), N->getOperand(3));
20671 case Intrinsic::aarch64_sve_sdiv_u:
20672 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
20673 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20674 case Intrinsic::aarch64_sve_udiv_u:
20675 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
20676 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20677 case Intrinsic::aarch64_sve_sqadd:
20678 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
20679 case Intrinsic::aarch64_sve_sqsub_u:
20680 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20681 N->getOperand(2), N->getOperand(3));
20682 case Intrinsic::aarch64_sve_uqadd:
20683 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
20684 case Intrinsic::aarch64_sve_uqsub_u:
20685 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20686 N->getOperand(2), N->getOperand(3));
20687 case Intrinsic::aarch64_sve_sqadd_x:
20688 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
20689 N->getOperand(1), N->getOperand(2));
20690 case Intrinsic::aarch64_sve_sqsub_x:
20691 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20692 N->getOperand(1), N->getOperand(2));
20693 case Intrinsic::aarch64_sve_uqadd_x:
20694 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
20695 N->getOperand(1), N->getOperand(2));
20696 case Intrinsic::aarch64_sve_uqsub_x:
20697 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20698 N->getOperand(1), N->getOperand(2));
20699 case Intrinsic::aarch64_sve_asrd:
20700 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
20701 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20702 case Intrinsic::aarch64_sve_cmphs:
20703 if (!N->getOperand(2).getValueType().isFloatingPoint())
20705 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20706 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
20707 break;
20708 case Intrinsic::aarch64_sve_cmphi:
20709 if (!N->getOperand(2).getValueType().isFloatingPoint())
20711 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20712 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
20713 break;
20714 case Intrinsic::aarch64_sve_fcmpge:
20715 case Intrinsic::aarch64_sve_cmpge:
20717 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20718 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
20719 break;
20720 case Intrinsic::aarch64_sve_fcmpgt:
20721 case Intrinsic::aarch64_sve_cmpgt:
20723 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20724 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
20725 break;
20726 case Intrinsic::aarch64_sve_fcmpeq:
20727 case Intrinsic::aarch64_sve_cmpeq:
20729 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20730 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
20731 break;
20732 case Intrinsic::aarch64_sve_fcmpne:
20733 case Intrinsic::aarch64_sve_cmpne:
20735 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20736 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
20737 break;
20738 case Intrinsic::aarch64_sve_fcmpuo:
20740 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20741 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
20742 break;
20743 case Intrinsic::aarch64_sve_fadda:
20745 case Intrinsic::aarch64_sve_faddv:
20747 case Intrinsic::aarch64_sve_fmaxnmv:
20749 case Intrinsic::aarch64_sve_fmaxv:
20751 case Intrinsic::aarch64_sve_fminnmv:
20753 case Intrinsic::aarch64_sve_fminv:
20755 case Intrinsic::aarch64_sve_sel:
20756 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
20757 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20758 case Intrinsic::aarch64_sve_cmpeq_wide:
20759 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
20760 case Intrinsic::aarch64_sve_cmpne_wide:
20761 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
20762 case Intrinsic::aarch64_sve_cmpge_wide:
20763 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
20764 case Intrinsic::aarch64_sve_cmpgt_wide:
20765 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
20766 case Intrinsic::aarch64_sve_cmplt_wide:
20767 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
20768 case Intrinsic::aarch64_sve_cmple_wide:
20769 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
20770 case Intrinsic::aarch64_sve_cmphs_wide:
20771 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
20772 case Intrinsic::aarch64_sve_cmphi_wide:
20773 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
20774 case Intrinsic::aarch64_sve_cmplo_wide:
20775 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
20776 case Intrinsic::aarch64_sve_cmpls_wide:
20777 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
20778 case Intrinsic::aarch64_sve_ptest_any:
20779 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20781 case Intrinsic::aarch64_sve_ptest_first:
20782 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20784 case Intrinsic::aarch64_sve_ptest_last:
20785 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20787 }
20788 return SDValue();
20789}
20790
20791static bool isCheapToExtend(const SDValue &N) {
20792 unsigned OC = N->getOpcode();
20793 return OC == ISD::LOAD || OC == ISD::MLOAD ||
20794 ISD::isConstantSplatVectorAllZeros(N.getNode());
20795}
20796
20797static SDValue
20798 performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20799 SelectionDAG &DAG) {
20800 // If we have (sext (setcc A B)) and A and B are cheap to extend,
20801 // we can move the sext into the arguments and have the same result. For
20802 // example, if A and B are both loads, we can make those extending loads and
20803 // avoid an extra instruction. This pattern appears often in VLS code
20804 // generation where the inputs to the setcc have a different size to the
20805 // instruction that wants to use the result of the setcc.
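// For example (illustrative): sext(setcc(load A, load B)) can become
// setcc(ext A, ext B) at the wider type, and those extends can then fold into
// extending loads.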
20806 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
20807 N->getOperand(0)->getOpcode() == ISD::SETCC);
20808 const SDValue SetCC = N->getOperand(0);
20809
20810 const SDValue CCOp0 = SetCC.getOperand(0);
20811 const SDValue CCOp1 = SetCC.getOperand(1);
20812 if (!CCOp0->getValueType(0).isInteger() ||
20813 !CCOp1->getValueType(0).isInteger())
20814 return SDValue();
20815
20816 ISD::CondCode Code =
20817 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
20818
20819 ISD::NodeType ExtType =
20820 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20821
20822 if (isCheapToExtend(SetCC.getOperand(0)) &&
20823 isCheapToExtend(SetCC.getOperand(1))) {
20824 const SDValue Ext1 =
20825 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
20826 const SDValue Ext2 =
20827 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
20828
20829 return DAG.getSetCC(
20830 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
20831 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
20832 }
20833
20834 return SDValue();
20835}
20836
20836
20837 static SDValue performExtendCombine(SDNode *N,
20838 TargetLowering::DAGCombinerInfo &DCI,
20839 SelectionDAG &DAG) {
20840 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
20841 // we can convert that DUP into another extract_high (of a bigger DUP), which
20842 // helps the backend to decide that an sabdl2 would be useful, saving a real
20843 // extract_high operation.
20844 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
20845 (N->getOperand(0).getOpcode() == ISD::ABDU ||
20846 N->getOperand(0).getOpcode() == ISD::ABDS)) {
20847 SDNode *ABDNode = N->getOperand(0).getNode();
20848 SDValue NewABD =
20849 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
20850 if (!NewABD.getNode())
20851 return SDValue();
20852
20853 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
20854 }
20855
20856 if (N->getValueType(0).isFixedLengthVector() &&
20857 N->getOpcode() == ISD::SIGN_EXTEND &&
20858 N->getOperand(0)->getOpcode() == ISD::SETCC)
20859 return performSignExtendSetCCCombine(N, DCI, DAG);
20860
20861 return SDValue();
20862}
20863
20864 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
20865 SDValue SplatVal, unsigned NumVecElts) {
20866 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
20867 Align OrigAlignment = St.getAlign();
20868 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
20869
20870 // Create scalar stores. This is at least as good as the code sequence for a
20871 // split unaligned store which is a dup.s, ext.b, and two stores.
20872 // Most of the time the three stores should be replaced by store pair
20873 // instructions (stp).
20874 SDLoc DL(&St);
20875 SDValue BasePtr = St.getBasePtr();
20876 uint64_t BaseOffset = 0;
20877
20878 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
20879 SDValue NewST1 =
20880 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
20881 OrigAlignment, St.getMemOperand()->getFlags());
20882
20883 // As this is in ISel, we will not merge this add, which may degrade results.
20884 if (BasePtr->getOpcode() == ISD::ADD &&
20885 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
20886 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
20887 BasePtr = BasePtr->getOperand(0);
20888 }
20889
20890 unsigned Offset = EltOffset;
20891 while (--NumVecElts) {
20892 Align Alignment = commonAlignment(OrigAlignment, Offset);
20893 SDValue OffsetPtr =
20894 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
20895 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
20896 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
20897 PtrInfo.getWithOffset(Offset), Alignment,
20898 St.getMemOperand()->getFlags());
20899 Offset += EltOffset;
20900 }
20901 return NewST1;
20902}
20903
20904// Returns an SVE type that ContentTy can be trivially sign or zero extended
20905// into.
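// For example (illustrative): nxv4i8 and nxv4i16 both map to the nxv4i32
// container, while nxv2f64 maps to nxv2i64.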
20906static MVT getSVEContainerType(EVT ContentTy) {
20907 assert(ContentTy.isSimple() && "No SVE containers for extended types");
20908
20909 switch (ContentTy.getSimpleVT().SimpleTy) {
20910 default:
20911 llvm_unreachable("No known SVE container for this MVT type");
20912 case MVT::nxv2i8:
20913 case MVT::nxv2i16:
20914 case MVT::nxv2i32:
20915 case MVT::nxv2i64:
20916 case MVT::nxv2f32:
20917 case MVT::nxv2f64:
20918 return MVT::nxv2i64;
20919 case MVT::nxv4i8:
20920 case MVT::nxv4i16:
20921 case MVT::nxv4i32:
20922 case MVT::nxv4f32:
20923 return MVT::nxv4i32;
20924 case MVT::nxv8i8:
20925 case MVT::nxv8i16:
20926 case MVT::nxv8f16:
20927 case MVT::nxv8bf16:
20928 return MVT::nxv8i16;
20929 case MVT::nxv16i8:
20930 return MVT::nxv16i8;
20931 }
20932}
20933
20934static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
20935 SDLoc DL(N);
20936 EVT VT = N->getValueType(0);
20937
20939 return SDValue();
20940
20941 EVT ContainerVT = VT;
20942 if (ContainerVT.isInteger())
20943 ContainerVT = getSVEContainerType(ContainerVT);
20944
20945 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
20946 SDValue Ops[] = { N->getOperand(0), // Chain
20947 N->getOperand(2), // Pg
20948 N->getOperand(3), // Base
20949 DAG.getValueType(VT) };
20950
20951 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
20952 SDValue LoadChain = SDValue(Load.getNode(), 1);
20953
20954 if (ContainerVT.isInteger() && (VT != ContainerVT))
20955 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
20956
20957 return DAG.getMergeValues({ Load, LoadChain }, DL);
20958}
20959
20961 SDLoc DL(N);
20962 EVT VT = N->getValueType(0);
20963 EVT PtrTy = N->getOperand(3).getValueType();
20964
20965 EVT LoadVT = VT;
20966 if (VT.isFloatingPoint())
20967 LoadVT = VT.changeTypeToInteger();
20968
20969 auto *MINode = cast<MemIntrinsicSDNode>(N);
20970 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
20971 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
20972 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
20973 MINode->getOperand(2), PassThru,
20974 MINode->getMemoryVT(), MINode->getMemOperand(),
20976
20977 if (VT.isFloatingPoint()) {
20978 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
20979 return DAG.getMergeValues(Ops, DL);
20980 }
20981
20982 return L;
20983}
20984
20985template <unsigned Opcode>
20987 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
20989 "Unsupported opcode.");
20990 SDLoc DL(N);
20991 EVT VT = N->getValueType(0);
20992
20993 EVT LoadVT = VT;
20994 if (VT.isFloatingPoint())
20995 LoadVT = VT.changeTypeToInteger();
20996
20997 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
20998 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
20999 SDValue LoadChain = SDValue(Load.getNode(), 1);
21000
21001 if (VT.isFloatingPoint())
21002 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
21003
21004 return DAG.getMergeValues({Load, LoadChain}, DL);
21005}
21006
21008 SDLoc DL(N);
21009 SDValue Data = N->getOperand(2);
21010 EVT DataVT = Data.getValueType();
21011 EVT HwSrcVt = getSVEContainerType(DataVT);
21012 SDValue InputVT = DAG.getValueType(DataVT);
21013
21014 if (DataVT.isFloatingPoint())
21015 InputVT = DAG.getValueType(HwSrcVt);
21016
21017 SDValue SrcNew;
21018 if (Data.getValueType().isFloatingPoint())
21019 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
21020 else
21021 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
21022
21023 SDValue Ops[] = { N->getOperand(0), // Chain
21024 SrcNew,
21025 N->getOperand(4), // Base
21026 N->getOperand(3), // Pg
21027 InputVT
21028 };
21029
21030 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
21031}
21032
21034 SDLoc DL(N);
21035
21036 SDValue Data = N->getOperand(2);
21037 EVT DataVT = Data.getValueType();
21038 EVT PtrTy = N->getOperand(4).getValueType();
21039
21040 if (DataVT.isFloatingPoint())
21042
21043 auto *MINode = cast<MemIntrinsicSDNode>(N);
21044 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
21045 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
21046 MINode->getMemoryVT(), MINode->getMemOperand(),
21047 ISD::UNINDEXED, false, false);
21048}
21049
21050/// Replace a splat of zeros being stored to a vector by scalar stores of
21051/// WZR/XZR. The load store optimizer pass will merge them to store pair stores.
21052/// This should be better than a movi to create the vector zero followed by a
21053/// vector store if the zero constant is not re-used, since one instruction and
21054/// one register live range will be removed.
21055///
21056/// For example, the final generated code should be:
21057///
21058/// stp xzr, xzr, [x0]
21059///
21060/// instead of:
21061///
21062/// movi v0.2d, #0
21063/// str q0, [x0]
21064///
21065 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21066 SDValue StVal = St.getValue();
21067 EVT VT = StVal.getValueType();
21068
21069 // Avoid scalarizing zero splat stores for scalable vectors.
21070 if (VT.isScalableVector())
21071 return SDValue();
21072
21073 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
21074 // 2, 3 or 4 i32 elements.
21075 int NumVecElts = VT.getVectorNumElements();
21076 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
21077 VT.getVectorElementType().getSizeInBits() == 64) ||
21078 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
21079 VT.getVectorElementType().getSizeInBits() == 32)))
21080 return SDValue();
21081
21082 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
21083 return SDValue();
21084
21085 // If the zero constant has more than one use then the vector store could be
21086 // better since the constant mov will be amortized and stp q instructions
21087 // should be able to be formed.
21088 if (!StVal.hasOneUse())
21089 return SDValue();
21090
21091 // If the store is truncating then it's going down to i16 or smaller, which
21092 // means it can be implemented in a single store anyway.
21093 if (St.isTruncatingStore())
21094 return SDValue();
21095
21096 // If the immediate offset of the address operand is too large for the stp
21097 // instruction, then bail out.
21098 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
21099 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
21100 if (Offset < -512 || Offset > 504)
21101 return SDValue();
21102 }
21103
21104 for (int I = 0; I < NumVecElts; ++I) {
21105 SDValue EltVal = StVal.getOperand(I);
21106 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
21107 return SDValue();
21108 }
21109
21110 // Use a CopyFromReg WZR/XZR here to prevent
21111 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
21112 SDLoc DL(&St);
21113 unsigned ZeroReg;
21114 EVT ZeroVT;
21115 if (VT.getVectorElementType().getSizeInBits() == 32) {
21116 ZeroReg = AArch64::WZR;
21117 ZeroVT = MVT::i32;
21118 } else {
21119 ZeroReg = AArch64::XZR;
21120 ZeroVT = MVT::i64;
21121 }
21122 SDValue SplatVal =
21123 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
21124 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21125}
21126
21127/// Replace a splat of a scalar being stored to a vector by scalar stores of the
21128/// scalar value. The load store optimizer pass will merge them to store pair
21129/// stores. This has better performance than a splat of the scalar followed by a
21130/// split vector store. Even if the stores are not merged it is four stores vs a
21131/// dup, followed by an ext.b and two stores.
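/// For example (illustrative), storing a splat of w1 into a v4i32 at [x0] can
/// end up as:
///
///   stp w1, w1, [x0]
///   stp w1, w1, [x0, #8]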
21132 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21133 SDValue StVal = St.getValue();
21134 EVT VT = StVal.getValueType();
21135
21136 // Don't replace floating point stores, they possibly won't be transformed to
21137 // stp because of the store pair suppress pass.
21138 if (VT.isFloatingPoint())
21139 return SDValue();
21140
21141 // We can express a splat as store pair(s) for 2 or 4 elements.
21142 unsigned NumVecElts = VT.getVectorNumElements();
21143 if (NumVecElts != 4 && NumVecElts != 2)
21144 return SDValue();
21145
21146 // If the store is truncating then it's going down to i16 or smaller, which
21147 // means it can be implemented in a single store anyway.
21148 if (St.isTruncatingStore())
21149 return SDValue();
21150
21151 // Check that this is a splat.
21152 // Make sure that each of the relevant vector element locations are inserted
21153 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
21154 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
21155 SDValue SplatVal;
21156 for (unsigned I = 0; I < NumVecElts; ++I) {
21157 // Check for insert vector elements.
21158 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
21159 return SDValue();
21160
21161 // Check that same value is inserted at each vector element.
21162 if (I == 0)
21163 SplatVal = StVal.getOperand(1);
21164 else if (StVal.getOperand(1) != SplatVal)
21165 return SDValue();
21166
21167 // Check insert element index.
21168 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
21169 if (!CIndex)
21170 return SDValue();
21171 uint64_t IndexVal = CIndex->getZExtValue();
21172 if (IndexVal >= NumVecElts)
21173 return SDValue();
21174 IndexNotInserted.reset(IndexVal);
21175
21176 StVal = StVal.getOperand(0);
21177 }
21178 // Check that all vector element locations were inserted to.
21179 if (IndexNotInserted.any())
21180 return SDValue();
21181
21182 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21183}
21184
21186 SelectionDAG &DAG,
21187 const AArch64Subtarget *Subtarget) {
21188
21189 StoreSDNode *S = cast<StoreSDNode>(N);
21190 if (S->isVolatile() || S->isIndexed())
21191 return SDValue();
21192
21193 SDValue StVal = S->getValue();
21194 EVT VT = StVal.getValueType();
21195
21196 if (!VT.isFixedLengthVector())
21197 return SDValue();
21198
21199 // If we get a splat of zeros, convert this vector store to a store of
21200 // scalars. They will be merged into store pairs of xzr thereby removing one
21201 // instruction and one register.
21202 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
21203 return ReplacedZeroSplat;
21204
21205 // FIXME: The logic for deciding if an unaligned store should be split should
21206 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
21207 // a call to that function here.
21208
21209 if (!Subtarget->isMisaligned128StoreSlow())
21210 return SDValue();
21211
21212 // Don't split at -Oz.
21214 return SDValue();
21215
21216 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
21217 // those up regresses performance on micro-benchmarks and olden/bh.
21218 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
21219 return SDValue();
21220
21221 // Split unaligned 16B stores. They are terrible for performance.
21222 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
21223 // extensions can use this to mark that it does not want splitting to happen
21224 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
21225 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
21226 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
21227 S->getAlign() <= Align(2))
21228 return SDValue();
21229
21230 // If we get a splat of a scalar convert this vector store to a store of
21231 // scalars. They will be merged into store pairs thereby removing two
21232 // instructions.
21233 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
21234 return ReplacedSplat;
21235
21236 SDLoc DL(S);
21237
21238 // Split VT into two.
21239 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
21240 unsigned NumElts = HalfVT.getVectorNumElements();
21241 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21242 DAG.getConstant(0, DL, MVT::i64));
21243 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21244 DAG.getConstant(NumElts, DL, MVT::i64));
21245 SDValue BasePtr = S->getBasePtr();
21246 SDValue NewST1 =
21247 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
21248 S->getAlign(), S->getMemOperand()->getFlags());
21249 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21250 DAG.getConstant(8, DL, MVT::i64));
21251 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
21252 S->getPointerInfo(), S->getAlign(),
21253 S->getMemOperand()->getFlags());
21254}
21255
21256 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
21257 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
21258
21259 // splice(pg, op1, undef) -> op1
21260 if (N->getOperand(2).isUndef())
21261 return N->getOperand(1);
21262
21263 return SDValue();
21264}
21265
21267 const AArch64Subtarget *Subtarget) {
21268 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
21269 N->getOpcode() == AArch64ISD::UUNPKLO) &&
21270 "Unexpected Opcode!");
21271
21272 // uunpklo/hi undef -> undef
21273 if (N->getOperand(0).isUndef())
21274 return DAG.getUNDEF(N->getValueType(0));
21275
21276 // If this is a masked load followed by an UUNPKLO, fold this into a masked
21277 // extending load. We can do this even if this is already a masked
21278 // {z,}extload.
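// For example (illustrative): uunpklo(masked_load nxv8i16, ptrue) can become a
// masked zero-extending load producing nxv4i32 directly, provided the same
// number of active lanes can still be described at the doubled element size
// (the predicate pattern check below).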
21279 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
21280 N->getOpcode() == AArch64ISD::UUNPKLO) {
21281 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
21282 SDValue Mask = MLD->getMask();
21283 SDLoc DL(N);
21284
21285 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
21286 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
21287 (MLD->getPassThru()->isUndef() ||
21288 isZerosVector(MLD->getPassThru().getNode()))) {
21289 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
21290 unsigned PgPattern = Mask->getConstantOperandVal(0);
21291 EVT VT = N->getValueType(0);
21292
21293 // Ensure we can double the size of the predicate pattern
21294 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
21295 if (NumElts &&
21296 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
21297 Mask =
21298 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
21299 SDValue PassThru = DAG.getConstant(0, DL, VT);
21300 SDValue NewLoad = DAG.getMaskedLoad(
21301 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
21302 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
21303 MLD->getAddressingMode(), ISD::ZEXTLOAD);
21304
21305 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
21306
21307 return NewLoad;
21308 }
21309 }
21310 }
21311
21312 return SDValue();
21313}
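// Worked example (illustrative): uunpklo(masked_load<nxv8i16>(p, ptrue(VL8)))
// with an undef or zero passthru becomes a single zero-extending masked load
// of nxv4i32 under an nxv4i1 ptrue(VL8), provided 8 x 32 bits still fits in
// the minimum SVE vector length so the doubled predicate pattern is valid.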
21314
21315static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
21316 if (N->getOpcode() != AArch64ISD::UZP1)
21317 return false;
21318 SDValue Op0 = N->getOperand(0);
21319 EVT SrcVT = Op0->getValueType(0);
21320 EVT DstVT = N->getValueType(0);
21321 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
21322 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
21323 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
21324}
21325
21326// Try to combine rounding shifts where the operands come from an extend, and
21327// the result is truncated and combined into one vector.
21328// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
21329static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
21330 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
21331 SDValue Op0 = N->getOperand(0);
21332 SDValue Op1 = N->getOperand(1);
21333 EVT ResVT = N->getValueType(0);
21334
21335 unsigned RshOpc = Op0.getOpcode();
21336 if (RshOpc != AArch64ISD::RSHRNB_I)
21337 return SDValue();
21338
21339 // Same op code and imm value?
21340 SDValue ShiftValue = Op0.getOperand(1);
21341 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
21342 return SDValue();
21343
21344 // Same unextended operand value?
21345 SDValue Lo = Op0.getOperand(0);
21346 SDValue Hi = Op1.getOperand(0);
21347 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
21348 Hi.getOpcode() != AArch64ISD::UUNPKHI)
21349 return SDValue();
21350 SDValue OrigArg = Lo.getOperand(0);
21351 if (OrigArg != Hi.getOperand(0))
21352 return SDValue();
21353
21354 SDLoc DL(N);
21355 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
21356 getPredicateForVector(DAG, DL, ResVT), OrigArg,
21357 ShiftValue);
21358}
21359
21360// Try to simplify:
21361// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
21362// t2 = nxv8i16 srl(t1, ShiftValue)
21363// to
21364// t1 = nxv8i16 rshrnb(X, shiftvalue).
21365// rshrnb will zero the top half bits of each element. Therefore, this combine
21366// should only be performed when a following instruction with the rshrnb
21367// as an operand does not care about the top half of each element. For example,
21368// a uzp1 or a truncating store.
21369static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
21370 const AArch64Subtarget *Subtarget) {
21371 EVT VT = Srl->getValueType(0);
21372 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
21373 return SDValue();
21374
21375 EVT ResVT;
21376 if (VT == MVT::nxv8i16)
21377 ResVT = MVT::nxv16i8;
21378 else if (VT == MVT::nxv4i32)
21379 ResVT = MVT::nxv8i16;
21380 else if (VT == MVT::nxv2i64)
21381 ResVT = MVT::nxv4i32;
21382 else
21383 return SDValue();
21384
21385 SDLoc DL(Srl);
21386 unsigned ShiftValue;
21387 SDValue RShOperand;
21388 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
21389 return SDValue();
21390 SDValue Rshrnb = DAG.getNode(
21391 AArch64ISD::RSHRNB_I, DL, ResVT,
21392 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
21393 return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
21394}
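// Worked example (illustrative): with X : nxv8i16 and ShiftValue == 8,
// srl(add(X, 0x80), 8) rounds each 16-bit lane to its high byte. RSHRNB_I
// produces exactly those rounded bytes in the even lanes of an nxv16i8
// value (odd lanes zero), and the bitcast back to nxv8i16 is harmless as
// long as the consumer ignores the top half of each element.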
21395
21396static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
21397 const AArch64Subtarget *Subtarget) {
21398 SDLoc DL(N);
21399 SDValue Op0 = N->getOperand(0);
21400 SDValue Op1 = N->getOperand(1);
21401 EVT ResVT = N->getValueType(0);
21402
21403 // uzp1(x, undef) -> concat(truncate(x), undef)
21404 if (Op1.getOpcode() == ISD::UNDEF) {
21405 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
21406 switch (ResVT.getSimpleVT().SimpleTy) {
21407 default:
21408 break;
21409 case MVT::v16i8:
21410 BCVT = MVT::v8i16;
21411 HalfVT = MVT::v8i8;
21412 break;
21413 case MVT::v8i16:
21414 BCVT = MVT::v4i32;
21415 HalfVT = MVT::v4i16;
21416 break;
21417 case MVT::v4i32:
21418 BCVT = MVT::v2i64;
21419 HalfVT = MVT::v2i32;
21420 break;
21421 }
21422 if (BCVT != MVT::Other) {
21423 SDValue BC = DAG.getBitcast(BCVT, Op0);
21424 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
21425 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
21426 DAG.getUNDEF(HalfVT));
21427 }
21428 }
21429
21430 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
21431 return Urshr;
21432
21433 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget))
21434 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
21435
21436 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget))
21437 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
21438
21439 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
21440 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
21441 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21442 SDValue X = Op0.getOperand(0).getOperand(0);
21443 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
21444 }
21445 }
21446
21447 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
21448 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
21449 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21450 SDValue Z = Op1.getOperand(0).getOperand(1);
21451 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
21452 }
21453 }
21454
21455 // These optimizations only work on little endian.
21456 if (!DAG.getDataLayout().isLittleEndian())
21457 return SDValue();
21458
21459 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
21460 // Example:
21461 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
21462 // to
21463 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
21464 if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) &&
21465 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
21466 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
21467 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
21468 Op1.getOperand(0));
21469 }
21470 }
21471
21472 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
21473 return SDValue();
21474
21475 SDValue SourceOp0 = peekThroughBitcasts(Op0);
21476 SDValue SourceOp1 = peekThroughBitcasts(Op1);
21477
21478 // truncating uzp1(x, y) -> xtn(concat (x, y))
21479 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
21480 EVT Op0Ty = SourceOp0.getValueType();
21481 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
21482 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
21483 SDValue Concat =
21484 DAG.getNode(ISD::CONCAT_VECTORS, DL,
21485 Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
21486 SourceOp0, SourceOp1);
21487 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
21488 }
21489 }
21490
21491 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
21492 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
21493 SourceOp1.getOpcode() != ISD::TRUNCATE)
21494 return SDValue();
21495 SourceOp0 = SourceOp0.getOperand(0);
21496 SourceOp1 = SourceOp1.getOperand(0);
21497
21498 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
21499 !SourceOp0.getValueType().isSimple())
21500 return SDValue();
21501
21502 EVT ResultTy;
21503
21504 switch (SourceOp0.getSimpleValueType().SimpleTy) {
21505 case MVT::v2i64:
21506 ResultTy = MVT::v4i32;
21507 break;
21508 case MVT::v4i32:
21509 ResultTy = MVT::v8i16;
21510 break;
21511 case MVT::v8i16:
21512 ResultTy = MVT::v16i8;
21513 break;
21514 default:
21515 return SDValue();
21516 }
21517
21518 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
21519 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
21520 SDValue UzpResult =
21521 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
21522
21523 EVT BitcastResultTy;
21524
21525 switch (ResVT.getSimpleVT().SimpleTy) {
21526 case MVT::v2i32:
21527 BitcastResultTy = MVT::v2i64;
21528 break;
21529 case MVT::v4i16:
21530 BitcastResultTy = MVT::v4i32;
21531 break;
21532 case MVT::v8i8:
21533 BitcastResultTy = MVT::v8i16;
21534 break;
21535 default:
21536 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
21537 }
21538
21539 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
21540 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
21541}
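// Worked example (illustrative): a v8i8 uzp1 whose operands are bitcasts of
// two v4i16 values is rewritten as truncate(concat_vectors(x, y) : v8i16),
// which keeps the low byte of every 16-bit element, i.e. exactly the bytes
// the little-endian uzp1 would have selected, and typically maps onto a
// single XTN/UZP1 instruction.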
21542
21543static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
21544 unsigned Opc = N->getOpcode();
21545
21546 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
21547 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
21548 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
21549 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
21550 "Invalid opcode.");
21551
21552 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
21553 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21554 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
21555 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21556 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
21557 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
21558 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
21559 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
21560
21561 SDLoc DL(N);
21562 SDValue Chain = N->getOperand(0);
21563 SDValue Pg = N->getOperand(1);
21564 SDValue Base = N->getOperand(2);
21565 SDValue Offset = N->getOperand(3);
21566 SDValue Ty = N->getOperand(4);
21567
21568 EVT ResVT = N->getValueType(0);
21569
21570 const auto OffsetOpc = Offset.getOpcode();
21571 const bool OffsetIsZExt =
21572 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
21573 const bool OffsetIsSExt =
21574 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
21575
21576 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
21577 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
21578 SDValue ExtPg = Offset.getOperand(0);
21579 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
21580 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
21581
21582 // If the predicate for the sign- or zero-extended offset is the
21583 // same as the predicate used for this load and the sign-/zero-extension
21584 // was from a 32-bit value, fold the extension into the gather itself.
21585 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
21586 SDValue UnextendedOffset = Offset.getOperand(1);
21587
21588 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
21589 if (Signed)
21590 NewOpc = getSignExtendedGatherOpcode(NewOpc);
21591
21592 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
21593 {Chain, Pg, Base, UnextendedOffset, Ty});
21594 }
21595 }
21596
21597 return SDValue();
21598}
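// Worked example (illustrative): a GLD1 gather whose vector offsets are the
// result of a predicated zero-extend from i32, using the same predicate as
// the gather itself, is rewritten to the corresponding UXTW gather opcode on
// the unextended i32 offsets, so the extension happens in the addressing
// mode and the separate extend node disappears.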
21599
21600/// Optimize a vector shift instruction and its operand if shifted out
21601/// bits are not used.
21602static SDValue performVectorShiftCombine(SDNode *N,
21603 const AArch64TargetLowering &TLI,
21604 TargetLowering::DAGCombinerInfo &DCI) {
21605 assert(N->getOpcode() == AArch64ISD::VASHR ||
21606 N->getOpcode() == AArch64ISD::VLSHR);
21607
21608 SDValue Op = N->getOperand(0);
21609 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
21610
21611 unsigned ShiftImm = N->getConstantOperandVal(1);
21612 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
21613
21614 // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
21615 if (N->getOpcode() == AArch64ISD::VASHR &&
21616 Op.getOpcode() == AArch64ISD::VSHL &&
21617 N->getOperand(1) == Op.getOperand(1))
21618 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
21619 return Op.getOperand(0);
21620
21621 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
21622 APInt DemandedMask = ~ShiftedOutBits;
21623
21624 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
21625 return SDValue(N, 0);
21626
21627 return SDValue();
21628}
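// Worked example (illustrative): VASHR(VSHL(x, 24), 24) on v4i32 is the usual
// sign_extend_inreg idiom; if ComputeNumSignBits shows x already has more
// than 24 sign bits, the shift pair is dropped and x is used directly.
// Otherwise the low ShiftImm bits of the shifted operand are never observed,
// so SimplifyDemandedBits may simplify whatever computes them.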
21629
21630static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
21631 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
21632 // This transform works in partnership with performSetCCPunpkCombine to
21633 // remove unnecessary transfer of predicates into standard registers and back
21634 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
21635 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
21636 MVT::i1) {
21637 SDValue CC = N->getOperand(0)->getOperand(0);
21638 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
21639 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
21640 DAG.getVectorIdxConstant(0, SDLoc(N)));
21641 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
21642 }
21643
21644 return SDValue();
21645}
21646
21647/// Target-specific DAG combine function for post-increment LD1 (lane) and
21648/// post-increment LD1R.
21649static SDValue performPostLD1Combine(SDNode *N,
21650 TargetLowering::DAGCombinerInfo &DCI,
21651 bool IsLaneOp) {
21652 if (DCI.isBeforeLegalizeOps())
21653 return SDValue();
21654
21655 SelectionDAG &DAG = DCI.DAG;
21656 EVT VT = N->getValueType(0);
21657
21658 if (!VT.is128BitVector() && !VT.is64BitVector())
21659 return SDValue();
21660
21661 unsigned LoadIdx = IsLaneOp ? 1 : 0;
21662 SDNode *LD = N->getOperand(LoadIdx).getNode();
21663 // If it is not LOAD, can not do such combine.
21664 if (LD->getOpcode() != ISD::LOAD)
21665 return SDValue();
21666
21667 // The vector lane must be a constant in the LD1LANE opcode.
21668 SDValue Lane;
21669 if (IsLaneOp) {
21670 Lane = N->getOperand(2);
21671 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
21672 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
21673 return SDValue();
21674 }
21675
21676 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
21677 EVT MemVT = LoadSDN->getMemoryVT();
21678 // Check if memory operand is the same type as the vector element.
21679 if (MemVT != VT.getVectorElementType())
21680 return SDValue();
21681
21682 // Check if there are other uses. If so, do not combine as it will introduce
21683 // an extra load.
21684 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
21685 ++UI) {
21686 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
21687 continue;
21688 if (*UI != N)
21689 return SDValue();
21690 }
21691
21692 // If there is one use and it can splat the value, prefer that operation.
21693 // TODO: This could be expanded to more operations if they reliably use the
21694 // index variants.
21695 if (N->hasOneUse()) {
21696 unsigned UseOpc = N->use_begin()->getOpcode();
21697 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
21698 return SDValue();
21699 }
21700
21701 SDValue Addr = LD->getOperand(1);
21702 SDValue Vector = N->getOperand(0);
21703 // Search for a use of the address operand that is an increment.
21704 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
21705 Addr.getNode()->use_end(); UI != UE; ++UI) {
21706 SDNode *User = *UI;
21707 if (User->getOpcode() != ISD::ADD
21708 || UI.getUse().getResNo() != Addr.getResNo())
21709 continue;
21710
21711 // If the increment is a constant, it must match the memory ref size.
21712 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
21713 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
21714 uint32_t IncVal = CInc->getZExtValue();
21715 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
21716 if (IncVal != NumBytes)
21717 continue;
21718 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
21719 }
21720
21721 // To avoid cycle construction make sure that neither the load nor the add
21722 // are predecessors to each other or the Vector.
21723 SmallPtrSet<const SDNode *, 32> Visited;
21724 SmallVector<const SDNode *, 16> Worklist;
21725 Visited.insert(Addr.getNode());
21726 Worklist.push_back(User);
21727 Worklist.push_back(LD);
21728 Worklist.push_back(Vector.getNode());
21729 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
21730 SDNode::hasPredecessorHelper(User, Visited, Worklist))
21731 continue;
21732
21733 SmallVector<SDValue, 8> Ops;
21734 Ops.push_back(LD->getOperand(0)); // Chain
21735 if (IsLaneOp) {
21736 Ops.push_back(Vector); // The vector to be inserted
21737 Ops.push_back(Lane); // The lane to be inserted in the vector
21738 }
21739 Ops.push_back(Addr);
21740 Ops.push_back(Inc);
21741
21742 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
21743 SDVTList SDTys = DAG.getVTList(Tys);
21744 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
21745 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
21746 MemVT,
21747 LoadSDN->getMemOperand());
21748
21749 // Update the uses.
21750 SDValue NewResults[] = {
21751 SDValue(LD, 0), // The result of load
21752 SDValue(UpdN.getNode(), 2) // Chain
21753 };
21754 DCI.CombineTo(LD, NewResults);
21755 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
21756 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
21757
21758 break;
21759 }
21760 return SDValue();
21761}
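// Worked example (illustrative): a scalar f32 load that only feeds a vector
// duplicate (or lane insert), where the same address is separately
// incremented by 4 bytes, is replaced by LD1DUPpost (or LD1LANEpost) so the
// post-increment write-back result takes the place of the standalone ADD.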
21762
21763/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
21764/// address translation.
21765static bool performTBISimplification(SDValue Addr,
21766 TargetLowering::DAGCombinerInfo &DCI,
21767 SelectionDAG &DAG) {
21768 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
21769 KnownBits Known;
21770 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
21771 !DCI.isBeforeLegalizeOps());
21772 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21773 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
21774 DCI.CommitTargetLoweringOpt(TLO);
21775 return true;
21776 }
21777 return false;
21778}
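// Worked example (illustrative): with top-byte-ignore enabled, only address
// bits [55:0] are demanded, so an AND that merely clears a pointer tag held
// in bits [63:56] of a load/store address can be removed by
// SimplifyDemandedBits above.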
21779
21780static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
21781 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
21782 "Expected STORE dag node in input!");
21783
21784 if (auto Store = dyn_cast<StoreSDNode>(N)) {
21785 if (!Store->isTruncatingStore() || Store->isIndexed())
21786 return SDValue();
21787 SDValue Ext = Store->getValue();
21788 auto ExtOpCode = Ext.getOpcode();
21789 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
21790 ExtOpCode != ISD::ANY_EXTEND)
21791 return SDValue();
21792 SDValue Orig = Ext->getOperand(0);
21793 if (Store->getMemoryVT() != Orig.getValueType())
21794 return SDValue();
21795 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
21796 Store->getBasePtr(), Store->getMemOperand());
21797 }
21798
21799 return SDValue();
21800}
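// Worked example (illustrative): truncstore i8 (zext i8 %v to i32), %p writes
// exactly the original i8, so it is rewritten as a plain i8 store of %v; the
// same reasoning applies to sign- and any-extends because the extended bits
// are discarded by the truncating store anyway.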
21801
21802// A custom combine to lower load <3 x i8> as the more efficient sequence
21803// below:
21804// ldrb wX, [x0, #2]
21805// ldrh wY, [x0]
21806// orr wX, wY, wX, lsl #16
21807// fmov s0, wX
21808//
21809// Note that an alternative sequence with even fewer (although usually more
21810// complex/expensive) instructions would be:
21811// ld1r.4h { v0 }, [x0], #2
21812// ld1.b { v0 }[2], [x0]
21813//
21814// Generating this sequence unfortunately results in noticeably worse codegen
21815// for code that extends the loaded v3i8, due to legalization breaking vector
21816// shuffle detection in a way that is very difficult to work around.
21817// TODO: Revisit once v3i8 legalization has been improved in general.
21818static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
21819 EVT MemVT = LD->getMemoryVT();
21820 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
21821 LD->getOriginalAlign() >= 4)
21822 return SDValue();
21823
21824 SDLoc DL(LD);
21825 MachineFunction &MF = DAG.getMachineFunction();
21826 SDValue Chain = LD->getChain();
21827 SDValue BasePtr = LD->getBasePtr();
21828 MachineMemOperand *MMO = LD->getMemOperand();
21829 assert(LD->getOffset().isUndef() && "undef offset expected");
21830
21831 // Load 2 x i8, then 1 x i8.
21832 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
21833 TypeSize Offset2 = TypeSize::getFixed(2);
21834 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
21835 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
21836 MF.getMachineMemOperand(MMO, 2, 1));
21837
21838 // Extend to i32.
21839 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
21840 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
21841
21842 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
21843 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
21844 DAG.getConstant(16, DL, MVT::i32));
21845 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
21846 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
21847
21848 // Extract v3i8 again.
21849 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
21850 DAG.getConstant(0, DL, MVT::i64));
21851 SDValue TokenFactor = DAG.getNode(
21852 ISD::TokenFactor, DL, MVT::Other,
21853 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
21854 return DAG.getMergeValues({Extract, TokenFactor}, DL);
21855}
21856
21857// Perform TBI simplification if supported by the target and try to break up
21858 // nontemporal loads larger than 256 bits for odd types so LDNPQ 256-bit
21859// load instructions can be selected.
21860static SDValue performLOADCombine(SDNode *N,
21861 TargetLowering::DAGCombinerInfo &DCI,
21862 SelectionDAG &DAG,
21863 const AArch64Subtarget *Subtarget) {
21864 if (Subtarget->supportsAddressTopByteIgnored())
21865 performTBISimplification(N->getOperand(1), DCI, DAG);
21866
21867 LoadSDNode *LD = cast<LoadSDNode>(N);
21868 if (LD->isVolatile() || !Subtarget->isLittleEndian())
21869 return SDValue(N, 0);
21870
21871 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
21872 return Res;
21873
21874 if (!LD->isNonTemporal())
21875 return SDValue(N, 0);
21876
21877 EVT MemVT = LD->getMemoryVT();
21878 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
21879 MemVT.getSizeInBits() % 256 == 0 ||
21880 256 % MemVT.getScalarSizeInBits() != 0)
21881 return SDValue(N, 0);
21882
21883 SDLoc DL(LD);
21884 SDValue Chain = LD->getChain();
21885 SDValue BasePtr = LD->getBasePtr();
21886 SDNodeFlags Flags = LD->getFlags();
21887 SmallVector<SDValue, 4> LoadOps;
21888 SmallVector<SDValue, 4> LoadOpsChain;
21889 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
21890 // and a final scalar/vector load of less than 256 bits. This way we can use
21891 // 256-bit loads and reduce the number of load instructions generated.
21892 MVT NewVT =
21893 MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
21894 256 / MemVT.getVectorElementType().getSizeInBits());
21895 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
21896 // Create all 256-bit loads starting from offset 0 up to (Num256Loads - 1) * 32 bytes.
21897 for (unsigned I = 0; I < Num256Loads; I++) {
21898 unsigned PtrOffset = I * 32;
21899 SDValue NewPtr = DAG.getMemBasePlusOffset(
21900 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21901 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21902 SDValue NewLoad = DAG.getLoad(
21903 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
21904 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
21905 LoadOps.push_back(NewLoad);
21906 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
21907 }
21908
21909 // Process remaining bits of the load operation.
21910 // This is done by creating an UNDEF vector to match the size of the
21911 // 256-bit loads and inserting the remaining load to it. We extract the
21912 // original load type at the end using EXTRACT_SUBVECTOR instruction.
21913 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
21914 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
21915 MVT RemainingVT = MVT::getVectorVT(
21916 MemVT.getVectorElementType().getSimpleVT(),
21917 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
21918 SDValue NewPtr = DAG.getMemBasePlusOffset(
21919 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21920 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21921 SDValue RemainingLoad =
21922 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
21923 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
21924 LD->getMemOperand()->getFlags(), LD->getAAInfo());
21925 SDValue UndefVector = DAG.getUNDEF(NewVT);
21926 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
21927 SDValue ExtendedRemainingLoad =
21928 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
21929 {UndefVector, RemainingLoad, InsertIdx});
21930 LoadOps.push_back(ExtendedRemainingLoad);
21931 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
21932 EVT ConcatVT =
21933 EVT::getVectorVT(*DAG.getContext(), NewVT.getVectorElementType(),
21934 LoadOps.size() * NewVT.getVectorNumElements());
21935 SDValue ConcatVectors =
21936 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
21937 // Extract the original vector type size.
21938 SDValue ExtractSubVector =
21939 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
21940 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
21941 SDValue TokenFactor =
21942 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
21943 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
21944}
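// Worked example (illustrative): a non-temporal load of v12i32 (384 bits)
// becomes one 256-bit v8i32 load at offset 0 plus a v4i32 load at byte offset
// 32; the remainder is inserted into an undef v8i32, the pieces are
// concatenated, the original v12i32 is extracted at index 0, and the load
// chains are joined with a TokenFactor.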
21945
21946static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
21947 EVT VecVT = Op.getValueType();
21948 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
21949 "Need boolean vector type.");
21950
21951 if (Depth > 3)
21952 return EVT();
21953
21954 // We can get the base type from a vector compare or truncate.
21955 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
21956 return Op.getOperand(0).getValueType();
21957
21958 // If an operand is a bool vector, continue looking.
21959 EVT BaseVT;
21960 for (SDValue Operand : Op->op_values()) {
21961 if (Operand.getValueType() != VecVT)
21962 continue;
21963
21964 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
21965 if (!BaseVT.isSimple())
21966 BaseVT = OperandVT;
21967 else if (OperandVT != BaseVT)
21968 return EVT();
21969 }
21970
21971 return BaseVT;
21972}
21973
21974// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
21975// iN, we can use a trick that extracts the i^th bit from the i^th element and
21976// then performs a vector add to get a scalar bitmask. This requires that each
21977// element's bits are either all 1 or all 0.
21978static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
21979 SDLoc DL(N);
21980 SDValue ComparisonResult(N, 0);
21981 EVT VecVT = ComparisonResult.getValueType();
21982 assert(VecVT.isVector() && "Must be a vector type");
21983
21984 unsigned NumElts = VecVT.getVectorNumElements();
21985 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
21986 return SDValue();
21987
21988 if (VecVT.getVectorElementType() != MVT::i1 &&
21989 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
21990 return SDValue();
21991
21992 // If we can find the original types to work on instead of a vector of i1,
21993 // we can avoid extend/extract conversion instructions.
21994 if (VecVT.getVectorElementType() == MVT::i1) {
21995 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
21996 if (!VecVT.isSimple()) {
21997 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
21998 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
21999 }
22000 }
22001 VecVT = VecVT.changeVectorElementTypeToInteger();
22002
22003 // Large vectors don't map directly to this conversion, so to avoid too many
22004 // edge cases, we don't apply it here. The conversion will likely still be
22005 // applied later via multiple smaller vectors, whose results are concatenated.
22006 if (VecVT.getSizeInBits() > 128)
22007 return SDValue();
22008
22009 // Ensure that all elements' bits are either 0s or 1s.
22010 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
22011
22012 SmallVector<SDValue, 16> MaskConstants;
22013 if (VecVT == MVT::v16i8) {
22014 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
22015 // per entry. We split it into two halves, apply the mask, zip the halves to
22016 // create 8x 16-bit values, and then perform the vector reduce.
22017 for (unsigned Half = 0; Half < 2; ++Half) {
22018 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
22019 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
22020 }
22021 }
22022 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22023 SDValue RepresentativeBits =
22024 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22025
22026 SDValue UpperRepresentativeBits =
22027 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
22028 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
22029 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
22030 RepresentativeBits, UpperRepresentativeBits);
22031 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
22032 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
22033 }
22034
22035 // All other vector sizes.
22036 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
22037 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
22038 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
22039 }
22040
22041 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22042 SDValue RepresentativeBits =
22043 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22044 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
22045 NumElts, VecVT.getVectorElementType().getSizeInBits()));
22046 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
22047}
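// Worked example (illustrative): for a v4i32 setcc result, each lane is
// sign-extended to all-ones or all-zeros, ANDed with the mask {1, 2, 4, 8},
// and VECREDUCE_ADD then yields a scalar whose low four bits are the
// per-lane comparison results, i.e. the desired bitmask.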
22048
22049static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
22050 StoreSDNode *Store) {
22051 if (!Store->isTruncatingStore())
22052 return SDValue();
22053
22054 SDLoc DL(Store);
22055 SDValue VecOp = Store->getValue();
22056 EVT VT = VecOp.getValueType();
22057 EVT MemVT = Store->getMemoryVT();
22058
22059 if (!MemVT.isVector() || !VT.isVector() ||
22060 MemVT.getVectorElementType() != MVT::i1)
22061 return SDValue();
22062
22063 // If we are storing a vector that we are currently building, let
22064 // `scalarizeVectorStore()` handle this more efficiently.
22065 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
22066 return SDValue();
22067
22068 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
22069 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
22070 if (!VectorBits)
22071 return SDValue();
22072
22073 EVT StoreVT =
22074 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
22075 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
22076 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
22077 Store->getMemOperand());
22078}
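// Worked example (illustrative): a truncating store of an <8 x i1> comparison
// result is emitted as a single byte store of the 8-bit lane bitmask computed
// by vectorToScalarBitmask, instead of storing the eight i1 lanes separately.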
22079
22080static bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
22081 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
22082 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
22083 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
22084}
22085
22086// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
22087static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
22088 const AArch64Subtarget *Subtarget) {
22089 SDValue Value = ST->getValue();
22090 EVT ValueVT = Value.getValueType();
22091
22092 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
22093 Value.getOpcode() != ISD::TRUNCATE ||
22094 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
22095 return SDValue();
22096
22097 assert(ST->getOffset().isUndef() && "undef offset expected");
22098 SDLoc DL(ST);
22099 auto WideVT = EVT::getVectorVT(
22100 *DAG.getContext(),
22101 Value->getOperand(0).getValueType().getVectorElementType(), 4);
22102 SDValue UndefVector = DAG.getUNDEF(WideVT);
22103 SDValue WideTrunc = DAG.getNode(
22104 ISD::INSERT_SUBVECTOR, DL, WideVT,
22105 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
22106 SDValue Cast = DAG.getNode(
22107 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
22108 WideTrunc);
22109
22110 MachineFunction &MF = DAG.getMachineFunction();
22111 SDValue Chain = ST->getChain();
22112 MachineMemOperand *MMO = ST->getMemOperand();
22113 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
22114 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22115 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
22116 TypeSize Offset2 = TypeSize::getFixed(2);
22117 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
22118 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
22119
22120 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22121 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
22122 TypeSize Offset1 = TypeSize::getFixed(1);
22123 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
22124 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
22125
22126 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22127 DAG.getConstant(0, DL, MVT::i64));
22128 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
22129 MF.getMachineMemOperand(MMO, 0, 1));
22130 return Chain;
22131}
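// Worked example (illustrative): store <3 x i8> (trunc <3 x i32> %v) widens
// %v to <4 x i32>, bitcasts it to v16i8, and stores bytes 8, 4 and 0 (the
// little-endian low bytes of lanes 2, 1 and 0, scaled by IdxScale) to offsets
// 2, 1 and 0 of the store address as three byte stores.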
22132
22133static SDValue performSTORECombine(SDNode *N,
22134 TargetLowering::DAGCombinerInfo &DCI,
22135 SelectionDAG &DAG,
22136 const AArch64Subtarget *Subtarget) {
22137 StoreSDNode *ST = cast<StoreSDNode>(N);
22138 SDValue Chain = ST->getChain();
22139 SDValue Value = ST->getValue();
22140 SDValue Ptr = ST->getBasePtr();
22141 EVT ValueVT = Value.getValueType();
22142
22143 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
22144 EVT EltVT = VT.getVectorElementType();
22145 return EltVT == MVT::f32 || EltVT == MVT::f64;
22146 };
22147
22148 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
22149 return Res;
22150
22151 // If this is an FP_ROUND followed by a store, fold this into a truncating
22152 // store. We can do this even if this is already a truncstore.
22153 // We purposefully don't care about legality of the nodes here as we know
22154 // they can be split down into something legal.
22155 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
22156 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
22157 Subtarget->useSVEForFixedLengthVectors() &&
22158 ValueVT.isFixedLengthVector() &&
22159 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
22160 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
22161 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
22162 ST->getMemoryVT(), ST->getMemOperand());
22163
22164 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
22165 return Split;
22166
22167 if (Subtarget->supportsAddressTopByteIgnored() &&
22168 performTBISimplification(N->getOperand(2), DCI, DAG))
22169 return SDValue(N, 0);
22170
22171 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
22172 return Store;
22173
22174 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
22175 return Store;
22176
22177 if (ST->isTruncatingStore()) {
22178 EVT StoreVT = ST->getMemoryVT();
22179 if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
22180 return SDValue();
22181 if (SDValue Rshrnb =
22182 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
22183 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
22184 StoreVT, ST->getMemOperand());
22185 }
22186 }
22187
22188 return SDValue();
22189}
22190
22191static SDValue performMSTORECombine(SDNode *N,
22192 TargetLowering::DAGCombinerInfo &DCI,
22193 SelectionDAG &DAG,
22194 const AArch64Subtarget *Subtarget) {
22195 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
22196 SDValue Value = MST->getValue();
22197 SDValue Mask = MST->getMask();
22198 SDLoc DL(N);
22199
22200 // If this is a UZP1 followed by a masked store, fold this into a masked
22201 // truncating store. We can do this even if this is already a masked
22202 // truncstore.
22203 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
22204 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22205 Value.getValueType().isInteger()) {
22206 Value = Value.getOperand(0);
22207 if (Value.getOpcode() == ISD::BITCAST) {
22208 EVT HalfVT =
22209 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
22210 EVT InVT = Value.getOperand(0).getValueType();
22211
22212 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
22213 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22214 unsigned PgPattern = Mask->getConstantOperandVal(0);
22215
22216 // Ensure we can double the size of the predicate pattern
22217 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
22218 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
22219 MinSVESize) {
22220 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
22221 PgPattern);
22222 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
22223 MST->getBasePtr(), MST->getOffset(), Mask,
22224 MST->getMemoryVT(), MST->getMemOperand(),
22225 MST->getAddressingMode(),
22226 /*IsTruncating=*/true);
22227 }
22228 }
22229 }
22230 }
22231
22232 if (MST->isTruncatingStore()) {
22233 EVT ValueVT = Value->getValueType(0);
22234 EVT MemVT = MST->getMemoryVT();
22235 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
22236 return SDValue();
22237 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
22238 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
22239 MST->getOffset(), MST->getMask(),
22240 MST->getMemoryVT(), MST->getMemOperand(),
22241 MST->getAddressingMode(), true);
22242 }
22243 }
22244
22245 return SDValue();
22246}
22247
22248/// \return true if part of the index was folded into the Base.
22249static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
22250 SDLoc DL, SelectionDAG &DAG) {
22251 // This function assumes a vector of i64 indices.
22252 EVT IndexVT = Index.getValueType();
22253 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
22254 return false;
22255
22256 // Simplify:
22257 // BasePtr = Ptr
22258 // Index = X + splat(Offset)
22259 // ->
22260 // BasePtr = Ptr + Offset * scale.
22261 // Index = X
22262 if (Index.getOpcode() == ISD::ADD) {
22263 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
22264 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22265 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22266 Index = Index.getOperand(0);
22267 return true;
22268 }
22269 }
22270
22271 // Simplify:
22272 // BasePtr = Ptr
22273 // Index = (X + splat(Offset)) << splat(Shift)
22274 // ->
22275 // BasePtr = Ptr + (Offset << Shift) * scale)
22276 // Index = X << splat(shift)
22277 if (Index.getOpcode() == ISD::SHL &&
22278 Index.getOperand(0).getOpcode() == ISD::ADD) {
22279 SDValue Add = Index.getOperand(0);
22280 SDValue ShiftOp = Index.getOperand(1);
22281 SDValue OffsetOp = Add.getOperand(1);
22282 if (auto Shift = DAG.getSplatValue(ShiftOp))
22283 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
22284 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
22285 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22286 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22287 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
22288 Add.getOperand(0), ShiftOp);
22289 return true;
22290 }
22291 }
22292
22293 return false;
22294}
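// Worked example (illustrative): a gather/scatter with BasePtr = %p,
// Index = X + splat(4) and Scale = 8 is rewritten as BasePtr = %p + 32 with
// Index = X, which keeps the per-lane index values smaller and makes them
// easier to shrink to i32 later.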
22295
22296// Analyse the specified address returning true if a more optimal addressing
22297// mode is available. When returning true all parameters are updated to reflect
22298// their recommended values.
22300 SDValue &BasePtr, SDValue &Index,
22301 SelectionDAG &DAG) {
22302 // Try to iteratively fold parts of the index into the base pointer to
22303 // simplify the index as much as possible.
22304 bool Changed = false;
22305 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
22306 Changed = true;
22307
22308 // Only consider element types that are pointer sized as smaller types can
22309 // be easily promoted.
22310 EVT IndexVT = Index.getValueType();
22311 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
22312 return Changed;
22313
22314 // Can indices be trivially shrunk?
22315 EVT DataVT = N->getOperand(1).getValueType();
22316 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
22317 // will later be re-extended to 64 bits in legalization
22318 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
22319 return Changed;
22320 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
22321 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22322 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
22323 return true;
22324 }
22325
22326 // Match:
22327 // Index = step(const)
22328 int64_t Stride = 0;
22329 if (Index.getOpcode() == ISD::STEP_VECTOR) {
22330 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
22331 }
22332 // Match:
22333 // Index = step(const) << shift(const)
22334 else if (Index.getOpcode() == ISD::SHL &&
22335 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
22336 SDValue RHS = Index.getOperand(1);
22337 if (auto *Shift =
22338 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
22339 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
22340 Stride = Step << Shift->getZExtValue();
22341 }
22342 }
22343
22344 // Return early because no supported pattern is found.
22345 if (Stride == 0)
22346 return Changed;
22347
22348 if (Stride < std::numeric_limits<int32_t>::min() ||
22349 Stride > std::numeric_limits<int32_t>::max())
22350 return Changed;
22351
22352 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
22353 unsigned MaxVScale =
22354 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
22355 int64_t LastElementOffset =
22356 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
22357
22358 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
22359 LastElementOffset > std::numeric_limits<int32_t>::max())
22360 return Changed;
22361
22362 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22363 // Stride does not scale explicitly by 'Scale', because it happens in
22364 // the gather/scatter addressing mode.
22365 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
22366 return true;
22367}
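// Worked example (illustrative): Index = step_vector(4) over nxv2i64 is
// re-emitted as a 32-bit step_vector with stride 4 once both the stride and
// the offset of the last element (for the largest possible vscale) are known
// to fit in a signed 32-bit value, enabling the 32-bit-index gather/scatter
// addressing forms.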
22368
22369static SDValue performMaskedGatherScatterCombine(
22370 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
22371 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
22372 assert(MGS && "Can only combine gather load or scatter store nodes");
22373
22374 if (!DCI.isBeforeLegalize())
22375 return SDValue();
22376
22377 SDLoc DL(MGS);
22378 SDValue Chain = MGS->getChain();
22379 SDValue Scale = MGS->getScale();
22380 SDValue Index = MGS->getIndex();
22381 SDValue Mask = MGS->getMask();
22382 SDValue BasePtr = MGS->getBasePtr();
22383 ISD::MemIndexType IndexType = MGS->getIndexType();
22384
22385 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
22386 return SDValue();
22387
22388 // Here we catch such cases early and change MGATHER's IndexType to allow
22389 // the use of an Index that's more legalisation friendly.
22390 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
22391 SDValue PassThru = MGT->getPassThru();
22392 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
22393 return DAG.getMaskedGather(
22394 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
22395 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
22396 }
22397 auto *MSC = cast<MaskedScatterSDNode>(MGS);
22398 SDValue Data = MSC->getValue();
22399 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
22400 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
22401 Ops, MSC->getMemOperand(), IndexType,
22402 MSC->isTruncatingStore());
22403}
22404
22405/// Target-specific DAG combine function for NEON load/store intrinsics
22406/// to merge base address updates.
22407static SDValue performNEONPostLDSTCombine(SDNode *N,
22408 TargetLowering::DAGCombinerInfo &DCI,
22409 SelectionDAG &DAG) {
22410 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
22411 return SDValue();
22412
22413 unsigned AddrOpIdx = N->getNumOperands() - 1;
22414 SDValue Addr = N->getOperand(AddrOpIdx);
22415
22416 // Search for a use of the address operand that is an increment.
22417 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
22418 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
22419 SDNode *User = *UI;
22420 if (User->getOpcode() != ISD::ADD ||
22421 UI.getUse().getResNo() != Addr.getResNo())
22422 continue;
22423
22424 // Check that the add is independent of the load/store. Otherwise, folding
22425 // it would create a cycle.
22426 SmallPtrSet<const SDNode *, 32> Visited;
22427 SmallVector<const SDNode *, 16> Worklist;
22428 Visited.insert(Addr.getNode());
22429 Worklist.push_back(N);
22430 Worklist.push_back(User);
22431 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
22432 SDNode::hasPredecessorHelper(User, Visited, Worklist))
22433 continue;
22434
22435 // Find the new opcode for the updating load/store.
22436 bool IsStore = false;
22437 bool IsLaneOp = false;
22438 bool IsDupOp = false;
22439 unsigned NewOpc = 0;
22440 unsigned NumVecs = 0;
22441 unsigned IntNo = N->getConstantOperandVal(1);
22442 switch (IntNo) {
22443 default: llvm_unreachable("unexpected intrinsic for Neon base update");
22444 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
22445 NumVecs = 2; break;
22446 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
22447 NumVecs = 3; break;
22448 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
22449 NumVecs = 4; break;
22450 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
22451 NumVecs = 2; IsStore = true; break;
22452 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
22453 NumVecs = 3; IsStore = true; break;
22454 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
22455 NumVecs = 4; IsStore = true; break;
22456 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
22457 NumVecs = 2; break;
22458 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
22459 NumVecs = 3; break;
22460 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
22461 NumVecs = 4; break;
22462 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
22463 NumVecs = 2; IsStore = true; break;
22464 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
22465 NumVecs = 3; IsStore = true; break;
22466 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
22467 NumVecs = 4; IsStore = true; break;
22468 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
22469 NumVecs = 2; IsDupOp = true; break;
22470 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
22471 NumVecs = 3; IsDupOp = true; break;
22472 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
22473 NumVecs = 4; IsDupOp = true; break;
22474 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
22475 NumVecs = 2; IsLaneOp = true; break;
22476 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
22477 NumVecs = 3; IsLaneOp = true; break;
22478 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
22479 NumVecs = 4; IsLaneOp = true; break;
22480 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
22481 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
22482 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
22483 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
22484 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
22485 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
22486 }
22487
22488 EVT VecTy;
22489 if (IsStore)
22490 VecTy = N->getOperand(2).getValueType();
22491 else
22492 VecTy = N->getValueType(0);
22493
22494 // If the increment is a constant, it must match the memory ref size.
22495 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
22496 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
22497 uint32_t IncVal = CInc->getZExtValue();
22498 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
22499 if (IsLaneOp || IsDupOp)
22500 NumBytes /= VecTy.getVectorNumElements();
22501 if (IncVal != NumBytes)
22502 continue;
22503 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
22504 }
22505 SmallVector<SDValue, 8> Ops;
22506 Ops.push_back(N->getOperand(0)); // Incoming chain
22507 // Load lane and store have vector list as input.
22508 if (IsLaneOp || IsStore)
22509 for (unsigned i = 2; i < AddrOpIdx; ++i)
22510 Ops.push_back(N->getOperand(i));
22511 Ops.push_back(Addr); // Base register
22512 Ops.push_back(Inc);
22513
22514 // Return Types.
22515 EVT Tys[6];
22516 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
22517 unsigned n;
22518 for (n = 0; n < NumResultVecs; ++n)
22519 Tys[n] = VecTy;
22520 Tys[n++] = MVT::i64; // Type of write back register
22521 Tys[n] = MVT::Other; // Type of the chain
22522 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
22523
22524 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
22525 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
22526 MemInt->getMemoryVT(),
22527 MemInt->getMemOperand());
22528
22529 // Update the uses.
22530 std::vector<SDValue> NewResults;
22531 for (unsigned i = 0; i < NumResultVecs; ++i) {
22532 NewResults.push_back(SDValue(UpdN.getNode(), i));
22533 }
22534 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
22535 DCI.CombineTo(N, NewResults);
22536 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
22537
22538 break;
22539 }
22540 return SDValue();
22541}
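// Worked example (illustrative): st2 {v0.4s, v1.4s}, [x0] followed by
// add x0, x0, #32 is merged into the post-indexed form
// st2 {v0.4s, v1.4s}, [x0], #32, since the increment equals the access size
// (2 vectors x 16 bytes).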
22542
22543// Checks to see if the value is the prescribed width and returns information
22544// about its extension mode.
22545static
22546bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
22547 ExtType = ISD::NON_EXTLOAD;
22548 switch(V.getNode()->getOpcode()) {
22549 default:
22550 return false;
22551 case ISD::LOAD: {
22552 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
22553 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
22554 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
22555 ExtType = LoadNode->getExtensionType();
22556 return true;
22557 }
22558 return false;
22559 }
22560 case ISD::AssertSext: {
22561 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22562 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22563 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22564 ExtType = ISD::SEXTLOAD;
22565 return true;
22566 }
22567 return false;
22568 }
22569 case ISD::AssertZext: {
22570 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22571 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22572 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22573 ExtType = ISD::ZEXTLOAD;
22574 return true;
22575 }
22576 return false;
22577 }
22578 case ISD::Constant:
22579 case ISD::TargetConstant: {
22580 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
22581 1LL << (width - 1);
22582 }
22583 }
22584
22585 return true;
22586}
22587
22588// This function does a whole lot of voodoo to determine if the tests are
22589// equivalent without and with a mask. Essentially what happens is that given a
22590// DAG resembling:
22591//
22592// +-------------+ +-------------+ +-------------+ +-------------+
22593// | Input | | AddConstant | | CompConstant| | CC |
22594// +-------------+ +-------------+ +-------------+ +-------------+
22595// | | | |
22596// V V | +----------+
22597// +-------------+ +----+ | |
22598// | ADD | |0xff| | |
22599// +-------------+ +----+ | |
22600// | | | |
22601// V V | |
22602// +-------------+ | |
22603// | AND | | |
22604// +-------------+ | |
22605// | | |
22606// +-----+ | |
22607// | | |
22608// V V V
22609// +-------------+
22610// | CMP |
22611// +-------------+
22612//
22613// The AND node may be safely removed for some combinations of inputs. In
22614// particular we need to take into account the extension type of the Input,
22615// the exact values of AddConstant, CompConstant, and CC, along with the nominal
22616// width of the input (this can work for any width inputs, the above graph is
22617 // specific to 8 bits).
22618//
22619// The specific equations were worked out by generating output tables for each
22620 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
22621// problem was simplified by working with 4 bit inputs, which means we only
22622// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
22623// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
22624// patterns present in both extensions (0,7). For every distinct set of
22625// AddConstant and CompConstants bit patterns we can consider the masked and
22626// unmasked versions to be equivalent if the result of this function is true for
22627 // all 16 distinct bit patterns of the current extension type of Input (w0).
22628//
22629// sub w8, w0, w1
22630// and w10, w8, #0x0f
22631// cmp w8, w2
22632// cset w9, AArch64CC
22633// cmp w10, w2
22634// cset w11, AArch64CC
22635// cmp w9, w11
22636// cset w0, eq
22637// ret
22638//
22639// Since the above function shows when the outputs are equivalent it defines
22640// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
22641// would be expensive to run during compiles. The equations below were written
22642// in a test harness that confirmed they gave equivalent outputs to the above
22643 // function for all inputs, so they can be used to determine whether the
22644 // removal is legal instead.
22645//
22646 // isEquivalentMaskless() is the test for whether the AND can be removed. It is
22647 // factored out of the DAG recognition because the DAG can take several forms.
22648
22649static bool isEquivalentMaskless(unsigned CC, unsigned width,
22650 ISD::LoadExtType ExtType, int AddConstant,
22651 int CompConstant) {
22652 // By being careful about our equations and only writing them in terms of
22653 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
22654 // make them generally applicable to all bit widths.
22655 int MaxUInt = (1 << width);
22656
22657 // For the purposes of these comparisons sign extending the type is
22658 // equivalent to zero extending the add and displacing it by half the integer
22659 // width. Provided we are careful and make sure our equations are valid over
22660 // the whole range we can just adjust the input and avoid writing equations
22661 // for sign extended inputs.
22662 if (ExtType == ISD::SEXTLOAD)
22663 AddConstant -= (1 << (width-1));
22664
22665 switch(CC) {
22666 case AArch64CC::LE:
22667 case AArch64CC::GT:
22668 if ((AddConstant == 0) ||
22669 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
22670 (AddConstant >= 0 && CompConstant < 0) ||
22671 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
22672 return true;
22673 break;
22674 case AArch64CC::LT:
22675 case AArch64CC::GE:
22676 if ((AddConstant == 0) ||
22677 (AddConstant >= 0 && CompConstant <= 0) ||
22678 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
22679 return true;
22680 break;
22681 case AArch64CC::HI:
22682 case AArch64CC::LS:
22683 if ((AddConstant >= 0 && CompConstant < 0) ||
22684 (AddConstant <= 0 && CompConstant >= -1 &&
22685 CompConstant < AddConstant + MaxUInt))
22686 return true;
22687 break;
22688 case AArch64CC::PL:
22689 case AArch64CC::MI:
22690 if ((AddConstant == 0) ||
22691 (AddConstant > 0 && CompConstant <= 0) ||
22692 (AddConstant < 0 && CompConstant <= AddConstant))
22693 return true;
22694 break;
22695 case AArch64CC::LO:
22696 case AArch64CC::HS:
22697 if ((AddConstant >= 0 && CompConstant <= 0) ||
22698 (AddConstant <= 0 && CompConstant >= 0 &&
22699 CompConstant <= AddConstant + MaxUInt))
22700 return true;
22701 break;
22702 case AArch64CC::EQ:
22703 case AArch64CC::NE:
22704 if ((AddConstant > 0 && CompConstant < 0) ||
22705 (AddConstant < 0 && CompConstant >= 0 &&
22706 CompConstant < AddConstant + MaxUInt) ||
22707 (AddConstant >= 0 && CompConstant >= 0 &&
22708 CompConstant >= AddConstant) ||
22709 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
22710 return true;
22711 break;
22712 case AArch64CC::VS:
22713 case AArch64CC::VC:
22714 case AArch64CC::AL:
22715 case AArch64CC::NV:
22716 return true;
22717 case AArch64CC::Invalid:
22718 break;
22719 }
22720
22721 return false;
22722}
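// Worked example (illustrative): with width == 8, a zero-extended input and
// AddConstant == 0, the value being compared already lies in [0, 255], so
// masking it with 0xff cannot change any comparison result; this is why
// several of the clauses above accept AddConstant == 0 unconditionally.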
22723
22724 // (X & C) >u Mask --> (X & (C & ~Mask)) != 0
22725 // (X & C) <u Pow2 --> (X & (C & ~(Pow2 - 1))) == 0
22726static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
22727 SDNode *AndNode, SelectionDAG &DAG,
22728 unsigned CCIndex, unsigned CmpIndex,
22729 unsigned CC) {
22730 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
22731 if (!SubsC)
22732 return SDValue();
22733
22734 APInt SubsAP = SubsC->getAPIntValue();
22735 if (CC == AArch64CC::HI) {
22736 if (!SubsAP.isMask())
22737 return SDValue();
22738 } else if (CC == AArch64CC::LO) {
22739 if (!SubsAP.isPowerOf2())
22740 return SDValue();
22741 } else
22742 return SDValue();
22743
22744 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
22745 if (!AndC)
22746 return SDValue();
22747
22748 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
22749
22750 SDLoc DL(N);
22751 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
22752 SDValue ANDS = DAG.getNode(
22753 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
22754 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
22755 SDValue AArch64_CC =
22756 DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
22757 N->getOperand(CCIndex)->getValueType(0));
22758
22759 // For now, only performCSELCombine and performBRCONDCombine call this
22760 // function. And both of them pass 2 for CCIndex, 3 for CmpIndex with 4
22761 // operands. So just initialize the ops directly to simplify the code. If we
22762 // ever have a case with a different CCIndex or CmpIndex, we will need a loop
22763 // to rewrite the code here.
22764 // TODO: Do we need to assert number of operand is 4 here?
22765 assert((CCIndex == 2 && CmpIndex == 3) &&
22766 "Expected CCIndex to be 2 and CmpIndex to be 3.");
22767 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
22768 ANDS.getValue(1)};
22769 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
22770}
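// Worked example (illustrative): (x & 0xf) >u 7 is true exactly when bit 3 of
// x is set, so it becomes ANDS of x with 0x8 (0xf & ~0x7) checked with NE;
// likewise (x & 0xf) <u 4 becomes ANDS with 0xc (0xf & ~0x3) checked with EQ.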
22771
22772static
22773SDValue performCONDCombine(SDNode *N,
22774 TargetLowering::DAGCombinerInfo &DCI,
22775 SelectionDAG &DAG, unsigned CCIndex,
22776 unsigned CmpIndex) {
22777 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
22778 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
22779 unsigned CondOpcode = SubsNode->getOpcode();
22780
22781 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0))
22782 return SDValue();
22783
22784 // There is a SUBS feeding this condition. Is it fed by a mask we can
22785 // use?
22786
22787 SDNode *AndNode = SubsNode->getOperand(0).getNode();
22788 unsigned MaskBits = 0;
22789
22790 if (AndNode->getOpcode() != ISD::AND)
22791 return SDValue();
22792
22793 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
22794 CmpIndex, CC))
22795 return Val;
22796
22797 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
22798 uint32_t CNV = CN->getZExtValue();
22799 if (CNV == 255)
22800 MaskBits = 8;
22801 else if (CNV == 65535)
22802 MaskBits = 16;
22803 }
22804
22805 if (!MaskBits)
22806 return SDValue();
22807
22808 SDValue AddValue = AndNode->getOperand(0);
22809
22810 if (AddValue.getOpcode() != ISD::ADD)
22811 return SDValue();
22812
22813 // The basic dag structure is correct, grab the inputs and validate them.
22814
22815 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
22816 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
22817 SDValue SubsInputValue = SubsNode->getOperand(1);
22818
22819 // The mask is present and all of the values originate from a smaller type, so
22820 // let's see if the mask is superfluous.
22821
22822 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
22823 !isa<ConstantSDNode>(SubsInputValue.getNode()))
22824 return SDValue();
22825
22826 ISD::LoadExtType ExtType;
22827
22828 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
22829 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
22830 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
22831 return SDValue();
22832
22833 if (!isEquivalentMaskless(CC, MaskBits, ExtType,
22834 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
22835 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
22836 return SDValue();
22837
22838 // The AND is not necessary, remove it.
22839
22840 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
22841 SubsNode->getValueType(1));
22842 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
22843
22844 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
22845 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
22846
22847 return SDValue(N, 0);
22848}
22849
22850// Optimize compare with zero and branch.
22851 static SDValue performBRCONDCombine(SDNode *N,
22852 TargetLowering::DAGCombinerInfo &DCI,
22853 SelectionDAG &DAG) {
22854 MachineFunction &MF = DAG.getMachineFunction();
22855 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
22856 // will not be produced, as they are conditional branch instructions that do
22857 // not set flags.
22858 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
22859 return SDValue();
22860
22861 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
22862 N = NV.getNode();
22863 SDValue Chain = N->getOperand(0);
22864 SDValue Dest = N->getOperand(1);
22865 SDValue CCVal = N->getOperand(2);
22866 SDValue Cmp = N->getOperand(3);
22867
22868 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
22869 unsigned CC = CCVal->getAsZExtVal();
22870 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
22871 return SDValue();
22872
22873 unsigned CmpOpc = Cmp.getOpcode();
22874 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
22875 return SDValue();
22876
22877 // Only attempt folding if there is only one use of the flag and no use of the
22878 // value.
22879 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
22880 return SDValue();
22881
22882 SDValue LHS = Cmp.getOperand(0);
22883 SDValue RHS = Cmp.getOperand(1);
22884
22885 assert(LHS.getValueType() == RHS.getValueType() &&
22886 "Expected the value type to be the same for both operands!");
22887 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
22888 return SDValue();
22889
22890 if (isNullConstant(LHS))
22891 std::swap(LHS, RHS);
22892
22893 if (!isNullConstant(RHS))
22894 return SDValue();
22895
22896 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
22897 LHS.getOpcode() == ISD::SRL)
22898 return SDValue();
22899
22900 // Fold the compare into the branch instruction.
22901 SDValue BR;
22902 if (CC == AArch64CC::EQ)
22903 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22904 else
22905 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22906
22907 // Do not add new nodes to DAG combiner worklist.
22908 DCI.CombineTo(N, BR, false);
22909
22910 return SDValue();
22911}
22912
22913 static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
22914 unsigned CC = N->getConstantOperandVal(2);
22915 SDValue SUBS = N->getOperand(3);
22916 SDValue Zero, CTTZ;
22917
22918 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
22919 Zero = N->getOperand(0);
22920 CTTZ = N->getOperand(1);
22921 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
22922 Zero = N->getOperand(1);
22923 CTTZ = N->getOperand(0);
22924 } else
22925 return SDValue();
22926
22927 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
22928 (CTTZ.getOpcode() == ISD::TRUNCATE &&
22929 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
22930 return SDValue();
22931
22932 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
22933 "Illegal type in CTTZ folding");
22934
22935 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
22936 return SDValue();
22937
22938 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
22939 ? CTTZ.getOperand(0).getOperand(0)
22940 : CTTZ.getOperand(0);
22941
22942 if (X != SUBS.getOperand(0))
22943 return SDValue();
22944
22945 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
22946 ? CTTZ.getOperand(0).getValueSizeInBits()
22947 : CTTZ.getValueSizeInBits();
22948 SDValue BitWidthMinusOne =
22949 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
22950 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
22951 BitWidthMinusOne);
22952}
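// Illustrative sketch of the scalar identity behind the CTTZ fold (helper
// names made up; not called by the lowering). It assumes cttz(0) returns the
// bit width, which is what the RBIT+CLZ expansion used for ISD::CTTZ on
// AArch64 produces, and uses <cstdint>/<cassert> already included above.
static unsigned CttzWithDefinedZero(uint32_t X) {
  for (unsigned I = 0; I < 32; ++I)
    if (X & (1u << I))
      return I;
  return 32; // cttz(0) == bit width
}

static void checkCSELOfCTTZ(uint32_t X) {
  unsigned Select = (X == 0) ? 0 : CttzWithDefinedZero(X); // CSEL form
  unsigned Masked = CttzWithDefinedZero(X) & 31;           // AND form
  assert(Select == Masked);
}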
22953
22954// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
22955// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
22956// Where x and y are constants and x != y
22957
22958// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
22959// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
22960// Where x and y are constants and x != y
22961 static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
22962 SDValue L = Op->getOperand(0);
22963 SDValue R = Op->getOperand(1);
22964 AArch64CC::CondCode OpCC =
22965 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
22966
22967 SDValue OpCmp = Op->getOperand(3);
22968 if (!isCMP(OpCmp))
22969 return SDValue();
22970
22971 SDValue CmpLHS = OpCmp.getOperand(0);
22972 SDValue CmpRHS = OpCmp.getOperand(1);
22973
22974 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
22975 std::swap(CmpLHS, CmpRHS);
22976 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
22977 return SDValue();
22978
22979 SDValue X = CmpLHS->getOperand(0);
22980 SDValue Y = CmpLHS->getOperand(1);
22981 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
22982 return SDValue();
22983 }
22984
22985 // If one of the constants is an opaque constant, the x and y SDNodes can still
22986 // be distinct even though the underlying values are equal, so compare the
22987 // APInt values here to make sure the code is correct.
22988 ConstantSDNode *CX = cast<ConstantSDNode>(X);
22989 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
22990 if (CX->getAPIntValue() == CY->getAPIntValue())
22991 return SDValue();
22992
22993 AArch64CC::CondCode CC =
22994 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
22995 SDValue Cond = CmpLHS->getOperand(3);
22996
22997 if (CmpRHS == Y)
22998 CC = AArch64CC::getInvertedCondCode(CC);
22999 else if (CmpRHS != X)
23000 return SDValue();
23001
23002 if (OpCC == AArch64CC::NE)
23003 CC = AArch64CC::getInvertedCondCode(CC);
23004 else if (OpCC != AArch64CC::EQ)
23005 return SDValue();
23006
23007 SDLoc DL(Op);
23008 EVT VT = Op->getValueType(0);
23009
23010 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
23011 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
23012}
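// Illustrative sketch of the boolean identity used above (helper name made up;
// not called by the lowering): for distinct constants X != Y, comparing the
// inner select against X reproduces its condition, and comparing it against Y
// reproduces the inverted condition.
static void checkCSELOfCSEL(int L, int R, int X, int Y, bool Cond2) {
  assert(X != Y && "the combine only fires for distinct constants");
  int Inner = Cond2 ? X : Y;           // CSEL x, y, cc2, cond
  int OuterEqX = (Inner == X) ? L : R; // CSEL l, r, EQ, (CMP inner, x)
  int OuterEqY = (Inner == Y) ? L : R; // CSEL l, r, EQ, (CMP inner, y)
  assert(OuterEqX == (Cond2 ? L : R)); // == CSEL l, r, cc2, cond
  assert(OuterEqY == (Cond2 ? R : L)); // == CSEL l, r, !cc2, cond
}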
23013
23014// Optimize CSEL instructions
23015 static SDValue performCSELCombine(SDNode *N,
23016 TargetLowering::DAGCombinerInfo &DCI,
23017 SelectionDAG &DAG) {
23018 // CSEL x, x, cc -> x
23019 if (N->getOperand(0) == N->getOperand(1))
23020 return N->getOperand(0);
23021
23022 if (SDValue R = foldCSELOfCSEL(N, DAG))
23023 return R;
23024
23025 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
23026 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
23027 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
23028 return Folded;
23029
23030 return performCONDCombine(N, DCI, DAG, 2, 3);
23031}
23032
23033 // Try to re-use an already extended operand of a vector SetCC feeding an
23034 // extended select. Doing so avoids requiring another full extension of the
23035// SET_CC result when lowering the select.
23036 static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
23037 EVT Op0MVT = Op->getOperand(0).getValueType();
23038 if (!Op0MVT.isVector() || Op->use_empty())
23039 return SDValue();
23040
23041 // Make sure that all uses of Op are VSELECTs with result matching types where
23042 // the result type has a larger element type than the SetCC operand.
23043 SDNode *FirstUse = *Op->use_begin();
23044 if (FirstUse->getOpcode() != ISD::VSELECT)
23045 return SDValue();
23046 EVT UseMVT = FirstUse->getValueType(0);
23047 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
23048 return SDValue();
23049 if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
23050 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
23051 }))
23052 return SDValue();
23053
23054 APInt V;
23055 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
23056 return SDValue();
23057
23058 SDLoc DL(Op);
23059 SDValue Op0ExtV;
23060 SDValue Op1ExtV;
23061 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
23062 // Check if the first operand of the SET_CC is already extended. If it is,
23063 // split the SET_CC and re-use the extended version of the operand.
23064 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
23065 Op->getOperand(0));
23066 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
23067 Op->getOperand(0));
23068 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23069 Op0ExtV = SDValue(Op0SExt, 0);
23070 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
23071 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23072 Op0ExtV = SDValue(Op0ZExt, 0);
23073 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
23074 } else
23075 return SDValue();
23076
23077 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
23078 Op0ExtV, Op1ExtV, Op->getOperand(2));
23079}
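// Illustrative sketch of why re-using the extended operands is safe (helper
// name made up; not called by the lowering): sign extension preserves signed
// and equality predicates, zero extension preserves unsigned and equality
// predicates, so the SETCC can be rebuilt directly on the wider values.
static void checkWidenedCompare(int8_t A, int8_t B) {
  assert((A < B) == (static_cast<int32_t>(A) < static_cast<int32_t>(B)));
  assert((A == B) == (static_cast<int32_t>(A) == static_cast<int32_t>(B)));
  uint8_t UA = static_cast<uint8_t>(A), UB = static_cast<uint8_t>(B);
  assert((UA < UB) == (static_cast<uint32_t>(UA) < static_cast<uint32_t>(UB)));
}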
23080
23081static SDValue
23082 performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23083 SelectionDAG &DAG) {
23084 SDValue Vec = N->getOperand(0);
23085 if (DCI.isBeforeLegalize() &&
23086 Vec.getValueType().getVectorElementType() == MVT::i1 &&
23089 SDLoc DL(N);
23090 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
23091 DAG);
23092 }
23093
23094 return SDValue();
23095}
23096
23097 static SDValue performSETCCCombine(SDNode *N,
23098 TargetLowering::DAGCombinerInfo &DCI,
23099 SelectionDAG &DAG) {
23100 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
23101 SDValue LHS = N->getOperand(0);
23102 SDValue RHS = N->getOperand(1);
23103 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
23104 SDLoc DL(N);
23105 EVT VT = N->getValueType(0);
23106
23107 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
23108 return V;
23109
23110 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
23111 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
23112 LHS->getOpcode() == AArch64ISD::CSEL &&
23113 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
23114 LHS->hasOneUse()) {
23115 // Invert CSEL's condition.
23116 auto OldCond =
23117 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
23118 auto NewCond = getInvertedCondCode(OldCond);
23119
23120 // csel 0, 1, !cond, X
23121 SDValue CSEL =
23122 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
23123 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
23124 LHS.getOperand(3));
23125 return DAG.getZExtOrTrunc(CSEL, DL, VT);
23126 }
23127
23128 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
23129 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
23130 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
23131 LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
23132 LHS->hasOneUse()) {
23133 EVT TstVT = LHS->getValueType(0);
23134 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
23135 // this pattern will get better opt in emitComparison
23136 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
23137 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
23138 DAG.getConstant(TstImm, DL, TstVT));
23139 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
23140 }
23141 }
23142
23143 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
23144 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
23145 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
23146 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
23147 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
23148 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
23149 (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
23150 LHS->getOpcode() == ISD::BITCAST) {
23151 EVT ToVT = LHS->getValueType(0);
23152 EVT FromVT = LHS->getOperand(0).getValueType();
23153 if (FromVT.isFixedLengthVector() &&
23154 FromVT.getVectorElementType() == MVT::i1) {
23155 bool IsNull = isNullConstant(RHS);
23156 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
23157 DL, MVT::i1, LHS->getOperand(0));
23158 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
23159 LHS);
23160 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
23161 }
23162 }
23163
23164 // Try to perform the memcmp when the result is tested for [in]equality with 0
23165 if (SDValue V = performOrXorChainCombine(N, DAG))
23166 return V;
23167
23168 return SDValue();
23169}
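// Illustrative sketch of the "(srl x, imm) != 0 --> (x & (-1 << imm)) != 0"
// rewrite above, on 64-bit scalars (helper name made up; not called by the
// lowering). Assumes imm < 64, which the combine checks via
// LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits().
static void checkSrlToAndTest(uint64_t X, unsigned Imm) {
  assert(Imm < 64 && "shift amount must be in range");
  uint64_t TstImm = ~0ULL << Imm; // same mask as -1ULL << Imm
  assert(((X >> Imm) != 0) == ((X & TstImm) != 0));
}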
23170
23171// Replace a flag-setting operator (eg ANDS) with the generic version
23172// (eg AND) if the flag is unused.
23173 static SDValue performFlagSettingCombine(SDNode *N,
23174 TargetLowering::DAGCombinerInfo &DCI,
23175 unsigned GenericOpcode) {
23176 SDLoc DL(N);
23177 SDValue LHS = N->getOperand(0);
23178 SDValue RHS = N->getOperand(1);
23179 EVT VT = N->getValueType(0);
23180
23181 // If the flag result isn't used, convert back to a generic opcode.
23182 if (!N->hasAnyUseOfValue(1)) {
23183 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
23184 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
23185 DL);
23186 }
23187
23188 // Combine identical generic nodes into this node, re-using the result.
23189 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
23190 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
23191 DCI.CombineTo(Generic, SDValue(N, 0));
23192
23193 return SDValue();
23194}
23195
23196 static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
23197 // setcc_merge_zero pred
23198 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
23199 // => extract_subvector (inner setcc_merge_zero)
23200 SDValue Pred = N->getOperand(0);
23201 SDValue LHS = N->getOperand(1);
23202 SDValue RHS = N->getOperand(2);
23203 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23204
23205 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
23206 LHS->getOpcode() != ISD::SIGN_EXTEND)
23207 return SDValue();
23208
23209 SDValue Extract = LHS->getOperand(0);
23210 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23211 Extract->getValueType(0) != N->getValueType(0) ||
23212 Extract->getConstantOperandVal(1) != 0)
23213 return SDValue();
23214
23215 SDValue InnerSetCC = Extract->getOperand(0);
23216 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
23217 return SDValue();
23218
23219 // By this point we've effectively got
23220 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
23221 // lanes are already zero then the trunc(sext()) sequence is redundant and we
23222 // can operate on A directly.
23223 SDValue InnerPred = InnerSetCC.getOperand(0);
23224 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
23225 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
23226 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
23227 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
23228 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
23229 return Extract;
23230
23231 return SDValue();
23232}
23233
23234static SDValue
23235 performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
23236 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23237 "Unexpected opcode!");
23238
23239 SelectionDAG &DAG = DCI.DAG;
23240 SDValue Pred = N->getOperand(0);
23241 SDValue LHS = N->getOperand(1);
23242 SDValue RHS = N->getOperand(2);
23243 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23244
23245 if (SDValue V = performSetCCPunpkCombine(N, DAG))
23246 return V;
23247
23248 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
23249 LHS->getOpcode() == ISD::SIGN_EXTEND &&
23250 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
23251 // setcc_merge_zero(
23252 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
23253 // => setcc_merge_zero(pred, ...)
23254 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23255 LHS->getOperand(0)->getOperand(0) == Pred)
23256 return LHS->getOperand(0);
23257
23258 // setcc_merge_zero(
23259 // all_active, extend(nxvNi1 ...), != splat(0))
23260 // -> nxvNi1 ...
23261 if (isAllActivePredicate(DAG, Pred))
23262 return LHS->getOperand(0);
23263
23264 // setcc_merge_zero(
23265 // pred, extend(nxvNi1 ...), != splat(0))
23266 // -> nxvNi1 and(pred, ...)
23267 if (DCI.isAfterLegalizeDAG())
23268 // Do this after legalization to allow more folds on setcc_merge_zero
23269 // to be recognized.
23270 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
23271 LHS->getOperand(0), Pred);
23272 }
23273
23274 return SDValue();
23275}
23276
23277// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
23278// as well as whether the test should be inverted. This code is required to
23279// catch these cases (as opposed to standard dag combines) because
23280// AArch64ISD::TBZ is matched during legalization.
23281static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
23282 SelectionDAG &DAG) {
23283
23284 if (!Op->hasOneUse())
23285 return Op;
23286
23287 // We don't handle undef/constant-fold cases below, as they should have
23288 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
23289 // etc.)
23290
23291 // (tbz (trunc x), b) -> (tbz x, b)
23292 // This case is just here to enable more of the below cases to be caught.
23293 if (Op->getOpcode() == ISD::TRUNCATE &&
23294 Bit < Op->getValueType(0).getSizeInBits()) {
23295 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23296 }
23297
23298 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
23299 if (Op->getOpcode() == ISD::ANY_EXTEND &&
23300 Bit < Op->getOperand(0).getValueSizeInBits()) {
23301 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23302 }
23303
23304 if (Op->getNumOperands() != 2)
23305 return Op;
23306
23307 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
23308 if (!C)
23309 return Op;
23310
23311 switch (Op->getOpcode()) {
23312 default:
23313 return Op;
23314
23315 // (tbz (and x, m), b) -> (tbz x, b)
23316 case ISD::AND:
23317 if ((C->getZExtValue() >> Bit) & 1)
23318 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23319 return Op;
23320
23321 // (tbz (shl x, c), b) -> (tbz x, b-c)
23322 case ISD::SHL:
23323 if (C->getZExtValue() <= Bit &&
23324 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23325 Bit = Bit - C->getZExtValue();
23326 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23327 }
23328 return Op;
23329
23330 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
23331 case ISD::SRA:
23332 Bit = Bit + C->getZExtValue();
23333 if (Bit >= Op->getValueType(0).getSizeInBits())
23334 Bit = Op->getValueType(0).getSizeInBits() - 1;
23335 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23336
23337 // (tbz (srl x, c), b) -> (tbz x, b+c)
23338 case ISD::SRL:
23339 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23340 Bit = Bit + C->getZExtValue();
23341 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23342 }
23343 return Op;
23344
23345 // (tbz (xor x, -1), b) -> (tbnz x, b)
23346 case ISD::XOR:
23347 if ((C->getZExtValue() >> Bit) & 1)
23348 Invert = !Invert;
23349 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23350 }
23351}
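// Illustrative sketch of the bit-test identities used above, checked on
// 64-bit scalars with all shift amounts assumed in range (helper names made
// up; not called by the lowering). The SRA case additionally saturates the
// bit index to the sign bit, which is not shown here.
static bool TestBit(uint64_t V, unsigned Bit) { return (V >> Bit) & 1; }

static void checkTestBitRewrites(uint64_t X, unsigned Bit, unsigned C) {
  if (C <= Bit && Bit < 64) // (tbz (shl x, c), b) -> (tbz x, b-c)
    assert(TestBit(X << C, Bit) == TestBit(X, Bit - C));
  if (Bit + C < 64)         // (tbz (srl x, c), b) -> (tbz x, b+c)
    assert(TestBit(X >> C, Bit) == TestBit(X, Bit + C));
  if (Bit < 64)             // (tbz (xor x, -1), b) -> (tbnz x, b)
    assert(TestBit(~X, Bit) == !TestBit(X, Bit));
}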
23352
23353// Optimize test single bit zero/non-zero and branch.
23354 static SDValue performTBZCombine(SDNode *N,
23355 TargetLowering::DAGCombinerInfo &DCI,
23356 SelectionDAG &DAG) {
23357 unsigned Bit = N->getConstantOperandVal(2);
23358 bool Invert = false;
23359 SDValue TestSrc = N->getOperand(1);
23360 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
23361
23362 if (TestSrc == NewTestSrc)
23363 return SDValue();
23364
23365 unsigned NewOpc = N->getOpcode();
23366 if (Invert) {
23367 if (NewOpc == AArch64ISD::TBZ)
23368 NewOpc = AArch64ISD::TBNZ;
23369 else {
23370 assert(NewOpc == AArch64ISD::TBNZ);
23371 NewOpc = AArch64ISD::TBZ;
23372 }
23373 }
23374
23375 SDLoc DL(N);
23376 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
23377 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
23378}
23379
23380// Swap vselect operands where it may allow a predicated operation to achieve
23381// the `sel`.
23382//
23383// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
23384// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
23385 static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
23386 auto SelectA = N->getOperand(1);
23387 auto SelectB = N->getOperand(2);
23388 auto NTy = N->getValueType(0);
23389
23390 if (!NTy.isScalableVector())
23391 return SDValue();
23392 SDValue SetCC = N->getOperand(0);
23393 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
23394 return SDValue();
23395
23396 switch (SelectB.getOpcode()) {
23397 default:
23398 return SDValue();
23399 case ISD::FMUL:
23400 case ISD::FSUB:
23401 case ISD::FADD:
23402 break;
23403 }
23404 if (SelectA != SelectB.getOperand(0))
23405 return SDValue();
23406
23407 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
23408 ISD::CondCode InverseCC =
23409 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
23410 auto InverseSetCC =
23411 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
23412 SetCC.getOperand(1), InverseCC);
23413
23414 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
23415 {InverseSetCC, SelectB, SelectA});
23416}
23417
23418// vselect (v1i1 setcc) ->
23419// vselect (v1iXX setcc) (XX is the size of the compared operand type)
23420// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
23421// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
23422// such VSELECT.
23423 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
23424 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
23425 return SwapResult;
23426
23427 SDValue N0 = N->getOperand(0);
23428 EVT CCVT = N0.getValueType();
23429
23430 if (isAllActivePredicate(DAG, N0))
23431 return N->getOperand(1);
23432
23433 if (isAllInactivePredicate(N0))
23434 return N->getOperand(2);
23435
23436 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
23437 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
23438 // supported types.
23439 SDValue SetCC = N->getOperand(0);
23440 if (SetCC.getOpcode() == ISD::SETCC &&
23441 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
23442 SDValue CmpLHS = SetCC.getOperand(0);
23443 EVT VT = CmpLHS.getValueType();
23444 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
23445 SDNode *SplatLHS = N->getOperand(1).getNode();
23446 SDNode *SplatRHS = N->getOperand(2).getNode();
23447 APInt SplatLHSVal;
23448 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
23449 VT.isSimple() &&
23450 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
23451 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
23452 VT.getSimpleVT().SimpleTy) &&
23453 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
23454 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
23456 unsigned NumElts = VT.getVectorNumElements();
23458 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
23459 VT.getScalarType()));
23460 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
23461
23462 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
23463 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
23464 return Or;
23465 }
23466 }
23467
23468 EVT CmpVT = N0.getOperand(0).getValueType();
23469 if (N0.getOpcode() != ISD::SETCC ||
23471 CCVT.getVectorElementType() != MVT::i1 ||
23473 return SDValue();
23474
23475 EVT ResVT = N->getValueType(0);
23476 // Only combine when the result type is of the same size as the compared
23477 // operands.
23478 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
23479 return SDValue();
23480
23481 SDValue IfTrue = N->getOperand(1);
23482 SDValue IfFalse = N->getOperand(2);
23483 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
23484 N0.getOperand(0), N0.getOperand(1),
23485 cast<CondCodeSDNode>(N0.getOperand(2))->get());
23486 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
23487 IfTrue, IfFalse);
23488}
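// Illustrative sketch of the scalar form of the sign pattern handled above
// (helper name made up; not called by the lowering). It assumes an arithmetic
// shift right for signed values, which is what ISD::SRA provides.
static void checkVSelectSignPattern(int32_t X) {
  int32_t Selected = (X > -1) ? 1 : -1; // VSELECT (setgt x, -1), 1, -1
  int32_t Lowered = (X >> 31) | 1;      // OR (ASR x, 31), 1
  assert(Selected == Lowered);
}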
23489
23490/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
23491/// the compare-mask instructions rather than going via NZCV, even if LHS and
23492/// RHS are really scalar. This replaces any scalar setcc in the above pattern
23493/// with a vector one followed by a DUP shuffle on the result.
23494 static SDValue performSelectCombine(SDNode *N,
23495 TargetLowering::DAGCombinerInfo &DCI) {
23496 SelectionDAG &DAG = DCI.DAG;
23497 SDValue N0 = N->getOperand(0);
23498 EVT ResVT = N->getValueType(0);
23499
23500 if (N0.getOpcode() != ISD::SETCC)
23501 return SDValue();
23502
23503 if (ResVT.isScalableVT())
23504 return SDValue();
23505
23506 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
23507 // scalar SetCCResultType. We also don't expect vectors, because we assume
23508 // that selects fed by vector SETCCs are canonicalized to VSELECT.
23509 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
23510 "Scalar-SETCC feeding SELECT has unexpected result type!");
23511
23512 // If NumMaskElts == 0, the comparison is larger than select result. The
23513 // largest real NEON comparison is 64-bits per lane, which means the result is
23514 // at most 32-bits and an illegal vector. Just bail out for now.
23515 EVT SrcVT = N0.getOperand(0).getValueType();
23516
23517 // Don't try to do this optimization when the setcc itself has i1 operands.
23518 // There are no legal vectors of i1, so this would be pointless. v1f16 is
23519 // ruled out to prevent the creation of setcc that need to be scalarized.
23520 if (SrcVT == MVT::i1 ||
23521 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
23522 return SDValue();
23523
23524 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
23525 if (!ResVT.isVector() || NumMaskElts == 0)
23526 return SDValue();
23527
23528 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
23529 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
23530
23531 // Also bail out if the vector CCVT isn't the same size as ResVT.
23532 // This can happen if the SETCC operand size doesn't divide the ResVT size
23533 // (e.g., f64 vs v3f32).
23534 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
23535 return SDValue();
23536
23537 // Make sure we didn't create illegal types, if we're not supposed to.
23538 assert(DCI.isBeforeLegalize() ||
23539 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
23540
23541 // First perform a vector comparison, where lane 0 is the one we're interested
23542 // in.
23543 SDLoc DL(N0);
23544 SDValue LHS =
23545 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
23546 SDValue RHS =
23547 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
23548 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
23549
23550 // Now duplicate the comparison mask we want across all other lanes.
23551 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
23552 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
23553 Mask = DAG.getNode(ISD::BITCAST, DL,
23554 ResVT.changeVectorElementTypeToInteger(), Mask);
23555
23556 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
23557}
23558
23559 static SDValue performDUPCombine(SDNode *N,
23560 TargetLowering::DAGCombinerInfo &DCI) {
23561 EVT VT = N->getValueType(0);
23562 SDLoc DL(N);
23563 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
23564 // 128bit vector version.
23565 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
23567 SmallVector<SDValue> Ops(N->ops());
23568 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
23569 DCI.DAG.getVTList(LVT), Ops)) {
23570 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
23571 DCI.DAG.getConstant(0, DL, MVT::i64));
23572 }
23573 }
23574
23575 if (N->getOpcode() == AArch64ISD::DUP) {
23576 if (DCI.isAfterLegalizeDAG()) {
23577 // If scalar dup's operand is extract_vector_elt, try to combine them into
23578 // duplane. For example,
23579 //
23580 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
23581 // t18: v4i32 = AArch64ISD::DUP t21
23582 // ==>
23583 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
23584 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
23585 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23586 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
23587 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
23588 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
23589 EXTRACT_VEC_ELT.getOperand(1));
23590 }
23591 }
23592 }
23593
23594 return performPostLD1Combine(N, DCI, false);
23595 }
23596
23597 return SDValue();
23598}
23599
23600/// Get rid of unnecessary NVCASTs (that don't change the type).
23601 static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
23602 if (N->getValueType(0) == N->getOperand(0).getValueType())
23603 return N->getOperand(0);
23604 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
23605 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
23606 N->getOperand(0).getOperand(0));
23607
23608 return SDValue();
23609}
23610
23611// If all users of the globaladdr are of the form (globaladdr + constant), find
23612// the smallest constant, fold it into the globaladdr's offset and rewrite the
23613// globaladdr as (globaladdr + constant) - constant.
23614 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
23615 const AArch64Subtarget *Subtarget,
23616 const TargetMachine &TM) {
23617 auto *GN = cast<GlobalAddressSDNode>(N);
23618 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
23619 AArch64II::MO_NO_FLAG)
23620 return SDValue();
23621
23622 uint64_t MinOffset = -1ull;
23623 for (SDNode *N : GN->uses()) {
23624 if (N->getOpcode() != ISD::ADD)
23625 return SDValue();
23626 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
23627 if (!C)
23628 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
23629 if (!C)
23630 return SDValue();
23631 MinOffset = std::min(MinOffset, C->getZExtValue());
23632 }
23633 uint64_t Offset = MinOffset + GN->getOffset();
23634
23635 // Require that the new offset is larger than the existing one. Otherwise, we
23636 // can end up oscillating between two possible DAGs, for example,
23637 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
23638 if (Offset <= uint64_t(GN->getOffset()))
23639 return SDValue();
23640
23641 // Check whether folding this offset is legal. It must not go out of bounds of
23642 // the referenced object to avoid violating the code model, and must be
23643 // smaller than 2^20 because this is the largest offset expressible in all
23644 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
23645 // stores an immediate signed 21 bit offset.)
23646 //
23647 // This check also prevents us from folding negative offsets, which will end
23648 // up being treated in the same way as large positive ones. They could also
23649 // cause code model violations, and aren't really common enough to matter.
23650 if (Offset >= (1 << 20))
23651 return SDValue();
23652
23653 const GlobalValue *GV = GN->getGlobal();
23654 Type *T = GV->getValueType();
23655 if (!T->isSized() ||
23657 return SDValue();
23658
23659 SDLoc DL(GN);
23660 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
23661 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
23662 DAG.getConstant(MinOffset, DL, MVT::i64));
23663}
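// Illustrative sketch of why folding the smallest user offset into the global
// and compensating with a subtraction is value-preserving (helper name made
// up; not called by the lowering). MinOff stands for the smallest constant
// over all (globaladdr + constant) users.
static void checkGlobalOffsetFold(uint64_t G, uint64_t OldOff, uint64_t MinOff,
                                  uint64_t UserConst) {
  uint64_t Before = (G + OldOff) + UserConst;        // original user address
  uint64_t NewGlobal = G + (OldOff + MinOff);        // globaladdr with folded offset
  uint64_t After = (NewGlobal - MinOff) + UserConst; // (SUB result) + user constant
  assert(Before == After);
}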
23664
23665 static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
23666 const AArch64Subtarget *Subtarget) {
23667 SDValue BR = N->getOperand(0);
23668 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
23669 !BR.getValueType().isScalarInteger())
23670 return SDValue();
23671
23672 SDLoc DL(N);
23673 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
23674}
23675
23676 // Turns the vector of indices into a vector of byte offsets by scaling Offset
23677 // by (BitWidth / 8).
23678 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
23679 SDLoc DL, unsigned BitWidth) {
23680 assert(Offset.getValueType().isScalableVector() &&
23681 "This method is only for scalable vectors of offsets");
23682
23683 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
23684 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
23685
23686 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
23687}
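// Illustrative sketch of the per-lane scaling performed above (helper name
// made up; not called by the lowering): shifting an index left by
// log2(BitWidth / 8) is the same as multiplying it by the element size in
// bytes, assuming that size is a power of two (1, 2, 4 or 8 here).
static void checkScaledOffset(uint64_t Index, unsigned BitWidth) {
  unsigned SizeInBytes = BitWidth / 8;
  assert(isPowerOf2_32(SizeInBytes) && "element sizes are 8/16/32/64 bits");
  uint64_t Shifted = Index << Log2_32(SizeInBytes); // what the SHL node computes
  assert(Shifted == Index * SizeInBytes);
}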
23688
23689/// Check if the value of \p OffsetInBytes can be used as an immediate for
23690/// the gather load/prefetch and scatter store instructions with vector base and
23691/// immediate offset addressing mode:
23692///
23693/// [<Zn>.[S|D]{, #<imm>}]
23694///
23695/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
23696inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
23697 unsigned ScalarSizeInBytes) {
23698 // The immediate is not a multiple of the scalar size.
23699 if (OffsetInBytes % ScalarSizeInBytes)
23700 return false;
23701
23702 // The immediate is out of range.
23703 if (OffsetInBytes / ScalarSizeInBytes > 31)
23704 return false;
23705
23706 return true;
23707}
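// Illustrative sketch of the resulting immediate range (helper name made up;
// not called by the lowering): for a given element size there are exactly 32
// legal immediates, 0, S, 2*S, ..., 31*S. For example, 32-bit elements
// (ScalarSizeInBytes == 4) allow 0, 4, 8, ..., 124.
static void checkSVEVecImmAddrModeRange(unsigned ScalarSizeInBytes) {
  assert(ScalarSizeInBytes > 0);
  unsigned Count = 0;
  for (unsigned Imm = 0; Imm <= 31 * ScalarSizeInBytes; ++Imm)
    if (isValidImmForSVEVecImmAddrMode(Imm, ScalarSizeInBytes))
      ++Count;
  assert(Count == 32); // k = 0 .. 31
}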
23708
23709/// Check if the value of \p Offset represents a valid immediate for the SVE
23710 /// gather load/prefetch and scatter store instructions with vector base and
23711/// immediate offset addressing mode:
23712///
23713/// [<Zn>.[S|D]{, #<imm>}]
23714///
23715/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
23716 static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
23717 unsigned ScalarSizeInBytes) {
23718 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
23719 return OffsetConst && isValidImmForSVEVecImmAddrMode(
23720 OffsetConst->getZExtValue(), ScalarSizeInBytes);
23721}
23722
23723 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
23724 unsigned Opcode,
23725 bool OnlyPackedOffsets = true) {
23726 const SDValue Src = N->getOperand(2);
23727 const EVT SrcVT = Src->getValueType(0);
23728 assert(SrcVT.isScalableVector() &&
23729 "Scatter stores are only possible for SVE vectors");
23730
23731 SDLoc DL(N);
23732 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
23733
23734 // Make sure that source data will fit into an SVE register
23736 return SDValue();
23737
23738 // For FPs, ACLE only supports _packed_ single and double precision types.
23739 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
23740 if (SrcElVT.isFloatingPoint())
23741 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
23742 ((Opcode != AArch64ISD::SST1Q_PRED &&
23743 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
23744 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
23745 return SDValue();
23746
23747 // Depending on the addressing mode, this is either a pointer or a vector of
23748 // pointers (that fits into one register)
23749 SDValue Base = N->getOperand(4);
23750 // Depending on the addressing mode, this is either a single offset or a
23751 // vector of offsets (that fits into one register)
23752 SDValue Offset = N->getOperand(5);
23753
23754 // For "scalar + vector of indices", just scale the indices. This only
23755 // applies to non-temporal scatters because there's no instruction that takes
23756 // indices.
23757 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
23758 Offset =
23760 Opcode = AArch64ISD::SSTNT1_PRED;
23761 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
23762 Offset =
23764 Opcode = AArch64ISD::SST1Q_PRED;
23765 }
23766
23767 // In the case of non-temporal gather loads there's only one SVE instruction
23768 // per data-size: "scalar + vector", i.e.
23769 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23770 // Since we do have intrinsics that allow the arguments to be in a different
23771 // order, we may need to swap them to match the spec.
23772 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
23773 Offset.getValueType().isVector())
23775
23776 // SST1_IMM requires that the offset is an immediate that is:
23777 // * a multiple of #SizeInBytes,
23778 // * in the range [0, 31 x #SizeInBytes],
23779 // where #SizeInBytes is the size in bytes of the stored items. For
23780 // immediates outside that range and non-immediate scalar offsets use SST1 or
23781 // SST1_UXTW instead.
23782 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
23784 SrcVT.getScalarSizeInBits() / 8)) {
23785 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23787 else
23788 Opcode = AArch64ISD::SST1_PRED;
23789
23791 }
23792 }
23793
23794 auto &TLI = DAG.getTargetLoweringInfo();
23795 if (!TLI.isTypeLegal(Base.getValueType()))
23796 return SDValue();
23797
23798 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
23799 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
23800 // nxv2i64. Legalize accordingly.
23801 if (!OnlyPackedOffsets &&
23802 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23803 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23804
23805 if (!TLI.isTypeLegal(Offset.getValueType()))
23806 return SDValue();
23807
23808 // Source value type that is representable in hardware
23809 EVT HwSrcVt = getSVEContainerType(SrcVT);
23810
23811 // Keep the original type of the input data to store - this is needed to be
23812 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
23813 // FP values we want the integer equivalent, so just use HwSrcVt.
23814 SDValue InputVT = DAG.getValueType(SrcVT);
23815 if (SrcVT.isFloatingPoint())
23816 InputVT = DAG.getValueType(HwSrcVt);
23817
23818 SDVTList VTs = DAG.getVTList(MVT::Other);
23819 SDValue SrcNew;
23820
23821 if (Src.getValueType().isFloatingPoint())
23822 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
23823 else
23824 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
23825
23826 SDValue Ops[] = {N->getOperand(0), // Chain
23827 SrcNew,
23828 N->getOperand(3), // Pg
23829 Base,
23830 Offset,
23831 InputVT};
23832
23833 return DAG.getNode(Opcode, DL, VTs, Ops);
23834}
23835
23836 static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
23837 unsigned Opcode,
23838 bool OnlyPackedOffsets = true) {
23839 const EVT RetVT = N->getValueType(0);
23840 assert(RetVT.isScalableVector() &&
23841 "Gather loads are only possible for SVE vectors");
23842
23843 SDLoc DL(N);
23844
23845 // Make sure that the loaded data will fit into an SVE register
23847 return SDValue();
23848
23849 // Depending on the addressing mode, this is either a pointer or a vector of
23850 // pointers (that fits into one register)
23851 SDValue Base = N->getOperand(3);
23852 // Depending on the addressing mode, this is either a single offset or a
23853 // vector of offsets (that fits into one register)
23854 SDValue Offset = N->getOperand(4);
23855
23856 // For "scalar + vector of indices", scale the indices to obtain unscaled
23857 // offsets. This applies to non-temporal and quadword gathers, which do not
23858 // have an addressing mode with scaled offset.
23861 RetVT.getScalarSizeInBits());
23863 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
23865 RetVT.getScalarSizeInBits());
23867 }
23868
23869 // In the case of non-temporal gather loads and quadword gather loads there's
23870 // only one addressing mode : "vector + scalar", e.g.
23871 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23872 // Since we do have intrinsics that allow the arguments to be in a different
23873 // order, we may need to swap them to match the spec.
23874 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
23875 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
23876 Offset.getValueType().isVector())
23878
23879 // GLD{FF}1_IMM requires that the offset is an immediate that is:
23880 // * a multiple of #SizeInBytes,
23881 // * in the range [0, 31 x #SizeInBytes],
23882 // where #SizeInBytes is the size in bytes of the loaded items. For
23883 // immediates outside that range and non-immediate scalar offsets use
23884 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
23885 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
23888 RetVT.getScalarSizeInBits() / 8)) {
23889 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23890 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23893 else
23894 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23897
23899 }
23900 }
23901
23902 auto &TLI = DAG.getTargetLoweringInfo();
23903 if (!TLI.isTypeLegal(Base.getValueType()))
23904 return SDValue();
23905
23906 // Some gather load variants allow unpacked offsets, but only as nxv2i32
23907 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
23908 // nxv2i64. Legalize accordingly.
23909 if (!OnlyPackedOffsets &&
23910 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23911 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23912
23913 // Return value type that is representable in hardware
23914 EVT HwRetVt = getSVEContainerType(RetVT);
23915
23916 // Keep the original output value type around - this is needed to be able to
23917 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
23918 // values we want the integer equivalent, so just use HwRetVT.
23919 SDValue OutVT = DAG.getValueType(RetVT);
23920 if (RetVT.isFloatingPoint())
23921 OutVT = DAG.getValueType(HwRetVt);
23922
23923 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
23924 SDValue Ops[] = {N->getOperand(0), // Chain
23925 N->getOperand(2), // Pg
23926 Base, Offset, OutVT};
23927
23928 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
23929 SDValue LoadChain = SDValue(Load.getNode(), 1);
23930
23931 if (RetVT.isInteger() && (RetVT != HwRetVt))
23932 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
23933
23934 // If the original return value was FP, bitcast accordingly. Doing it here
23935 // means that we can avoid adding TableGen patterns for FPs.
23936 if (RetVT.isFloatingPoint())
23937 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
23938
23939 return DAG.getMergeValues({Load, LoadChain}, DL);
23940}
23941
23942static SDValue
23944 SelectionDAG &DAG) {
23945 SDLoc DL(N);
23946 SDValue Src = N->getOperand(0);
23947 unsigned Opc = Src->getOpcode();
23948
23949 // Sign extend of an unsigned unpack -> signed unpack
23950 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
23951
23952 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
23954
23955 // Push the sign extend to the operand of the unpack
23956 // This is necessary where, for example, the operand of the unpack
23957 // is another unpack:
23958 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
23959 // ->
23960 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
23961 // ->
23962 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
23963 SDValue ExtOp = Src->getOperand(0);
23964 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
23965 EVT EltTy = VT.getVectorElementType();
23966 (void)EltTy;
23967
23968 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
23969 "Sign extending from an invalid type");
23970
23971 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
23972
23974 ExtOp, DAG.getValueType(ExtVT));
23975
23976 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
23977 }
23978
23979 if (DCI.isBeforeLegalizeOps())
23980 return SDValue();
23981
23983 return SDValue();
23984
23985 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
23986 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
23987 unsigned NewOpc;
23988 unsigned MemVTOpNum = 4;
23989 switch (Opc) {
23992 MemVTOpNum = 3;
23993 break;
23996 MemVTOpNum = 3;
23997 break;
24000 MemVTOpNum = 3;
24001 break;
24004 break;
24007 break;
24010 break;
24013 break;
24016 break;
24019 break;
24022 break;
24025 break;
24028 break;
24031 break;
24034 break;
24037 break;
24040 break;
24043 break;
24046 break;
24047 default:
24048 return SDValue();
24049 }
24050
24051 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
24052 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
24053
24054 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
24055 return SDValue();
24056
24057 EVT DstVT = N->getValueType(0);
24058 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
24059
24061 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
24062 Ops.push_back(Src->getOperand(I));
24063
24064 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
24065 DCI.CombineTo(N, ExtLoad);
24066 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
24067
24068 // Return N so it doesn't get rechecked
24069 return SDValue(N, 0);
24070}
24071
24072/// Legalize the gather prefetch (scalar + vector addressing mode) when the
24073/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
24074/// != nxv2i32) do not need legalization.
24076 const unsigned OffsetPos = 4;
24077 SDValue Offset = N->getOperand(OffsetPos);
24078
24079 // Not an unpacked vector, bail out.
24080 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
24081 return SDValue();
24082
24083 // Extend the unpacked offset vector to 64-bit lanes.
24084 SDLoc DL(N);
24085 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
24086 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24087 // Replace the offset operand with the 64-bit one.
24088 Ops[OffsetPos] = Offset;
24089
24090 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24091}
24092
24093/// Combines a node carrying the intrinsic
24094/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
24095/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
24096/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
24097/// sve gather prefetch instruction with vector plus immediate addressing mode.
24099 unsigned ScalarSizeInBytes) {
24100 const unsigned ImmPos = 4, OffsetPos = 3;
24101 // No need to combine the node if the immediate is valid...
24102 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
24103 return SDValue();
24104
24105 // ...otherwise swap the offset base with the offset...
24106 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24107 std::swap(Ops[ImmPos], Ops[OffsetPos]);
24108 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
24109 // `aarch64_sve_prfb_gather_uxtw_index`.
24110 SDLoc DL(N);
24111 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
24112 MVT::i64);
24113
24114 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24115}
24116
24117// Return true if the vector operation can guarantee only the first lane of its
24118// result contains data, with all bits in other lanes set to zero.
24120 switch (Op.getOpcode()) {
24121 default:
24122 return false;
24138 return true;
24139 }
24140}
24141
24143 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
24144 SDValue InsertVec = N->getOperand(0);
24145 SDValue InsertElt = N->getOperand(1);
24146 SDValue InsertIdx = N->getOperand(2);
24147
24148 // We only care about inserts into the first element...
24149 if (!isNullConstant(InsertIdx))
24150 return SDValue();
24151 // ...of a zero'd vector...
24153 return SDValue();
24154 // ...where the inserted data was previously extracted...
24155 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24156 return SDValue();
24157
24158 SDValue ExtractVec = InsertElt.getOperand(0);
24159 SDValue ExtractIdx = InsertElt.getOperand(1);
24160
24161 // ...from the first element of a vector.
24162 if (!isNullConstant(ExtractIdx))
24163 return SDValue();
24164
24165 // If we get here we are effectively trying to zero lanes 1-N of a vector.
24166
24167 // Ensure there's no type conversion going on.
24168 if (N->getValueType(0) != ExtractVec.getValueType())
24169 return SDValue();
24170
24171 if (!isLanes1toNKnownZero(ExtractVec))
24172 return SDValue();
24173
24174 // The explicit zeroing is redundant.
24175 return ExtractVec;
24176}
24177
24178static SDValue
24181 return Res;
24182
24183 return performPostLD1Combine(N, DCI, true);
24184}
24185
24187 EVT Ty = N->getValueType(0);
24188 if (Ty.isInteger())
24189 return SDValue();
24190
24193 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
24195 return SDValue();
24196
24197 SDLoc DL(N);
24198 SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
24199 DL, ExtIntTy);
24200 SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
24201 DL, ExtIntTy);
24202 SDValue Idx = N->getOperand(2);
24203 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
24204 SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
24205 return DAG.getBitcast(Ty, Trunc);
24206}
24207
24210 const AArch64Subtarget *Subtarget) {
24211 SDValue N0 = N->getOperand(0);
24212 EVT VT = N->getValueType(0);
24213
24214 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
24215 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
24216 return SDValue();
24217
24218 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
24219 EVT EltVT = VT.getVectorElementType();
24220 return EltVT == MVT::f32 || EltVT == MVT::f64;
24221 };
24222
24223 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
24224 // We purposefully don't care about legality of the nodes here as we know
24225 // they can be split down into something legal.
24226 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
24227 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
24228 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
24229 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
24230 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
24231 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
24232 LN0->getChain(), LN0->getBasePtr(),
24233 N0.getValueType(), LN0->getMemOperand());
24234 DCI.CombineTo(N, ExtLoad);
24235 DCI.CombineTo(
24236 N0.getNode(),
24237 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
24238 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
24239 ExtLoad.getValue(1));
24240 return SDValue(N, 0); // Return N so it doesn't get rechecked!
24241 }
24242
24243 return SDValue();
24244}
24245
24247 const AArch64Subtarget *Subtarget) {
24248 EVT VT = N->getValueType(0);
24249
24250 // Don't expand for NEON, SVE2 or SME
24251 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
24252 return SDValue();
24253
24254 SDLoc DL(N);
24255
24256 SDValue Mask = N->getOperand(0);
24257 SDValue In1 = N->getOperand(1);
24258 SDValue In2 = N->getOperand(2);
24259
24260 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
24261 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
24262 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
24263 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
24264}
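// Illustrative sketch of the lane-wise identity behind the expansion above
// (helper name made up; not called by the lowering), shown on plain integers
// where each bit of Mask plays the role of one predicate lane:
// (Mask & In1) | (~Mask & In2) picks In1 where Mask is set and In2 elsewhere.
static void checkBitwiseSelect(uint64_t Mask, uint64_t In1, uint64_t In2) {
  uint64_t Expanded = (Mask & In1) | (~Mask & In2);
  for (unsigned Bit = 0; Bit < 64; ++Bit) {
    uint64_t Want = ((Mask >> Bit) & 1) ? ((In1 >> Bit) & 1) : ((In2 >> Bit) & 1);
    assert(((Expanded >> Bit) & 1) == Want);
  }
}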
24265
24267 EVT VT = N->getValueType(0);
24268
24269 SDValue Insert = N->getOperand(0);
24270 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
24271 return SDValue();
24272
24273 if (!Insert.getOperand(0).isUndef())
24274 return SDValue();
24275
24276 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
24277 uint64_t IdxDupLane = N->getConstantOperandVal(1);
24278 if (IdxInsert != 0 || IdxDupLane != 0)
24279 return SDValue();
24280
24281 SDValue Bitcast = Insert.getOperand(1);
24282 if (Bitcast.getOpcode() != ISD::BITCAST)
24283 return SDValue();
24284
24285 SDValue Subvec = Bitcast.getOperand(0);
24286 EVT SubvecVT = Subvec.getValueType();
24287 if (!SubvecVT.is128BitVector())
24288 return SDValue();
24289 EVT NewSubvecVT =
24291
24292 SDLoc DL(N);
24293 SDValue NewInsert =
24294 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
24295 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
24296 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
24297 NewInsert, N->getOperand(1));
24298 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
24299}
24300
24301// Try to combine mull with uzp1.
24304 SelectionDAG &DAG) {
24305 if (DCI.isBeforeLegalizeOps())
24306 return SDValue();
24307
24308 SDValue LHS = N->getOperand(0);
24309 SDValue RHS = N->getOperand(1);
24310
24311 SDValue ExtractHigh;
24312 SDValue ExtractLow;
24313 SDValue TruncHigh;
24314 SDValue TruncLow;
24315 SDLoc DL(N);
24316
24317 // Check the operands are trunc and extract_high.
24319 RHS.getOpcode() == ISD::TRUNCATE) {
24320 TruncHigh = RHS;
24321 if (LHS.getOpcode() == ISD::BITCAST)
24322 ExtractHigh = LHS.getOperand(0);
24323 else
24324 ExtractHigh = LHS;
24326 LHS.getOpcode() == ISD::TRUNCATE) {
24327 TruncHigh = LHS;
24328 if (RHS.getOpcode() == ISD::BITCAST)
24329 ExtractHigh = RHS.getOperand(0);
24330 else
24331 ExtractHigh = RHS;
24332 } else
24333 return SDValue();
24334
24335 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
24336 // with uzp1.
24337 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
24338 SDValue TruncHighOp = TruncHigh.getOperand(0);
24339 EVT TruncHighOpVT = TruncHighOp.getValueType();
24340 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
24341 DAG.isSplatValue(TruncHighOp, false))
24342 return SDValue();
24343
24344 // Check there is other extract_high with same source vector.
24345 // For example,
24346 //
24347 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
24348 // t12: v4i16 = truncate t11
24349 // t31: v4i32 = AArch64ISD::SMULL t18, t12
24350 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
24351 // t16: v4i16 = truncate t15
24352 // t30: v4i32 = AArch64ISD::SMULL t23, t1
24353 //
24354 // This dagcombine assumes the two extract_high nodes use the same source
24355 // vector, in order to detect the pair of the mull. If they use different
24356 // source vectors, this code will not work.
24357 bool HasFoundMULLow = true;
24358 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
24359 if (ExtractHighSrcVec->use_size() != 2)
24360 HasFoundMULLow = false;
24361
24362 // Find ExtractLow.
24363 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
24364 if (User == ExtractHigh.getNode())
24365 continue;
24366
24367 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
24369 HasFoundMULLow = false;
24370 break;
24371 }
24372
24373 ExtractLow.setNode(User);
24374 }
24375
24376 if (!ExtractLow || !ExtractLow->hasOneUse())
24377 HasFoundMULLow = false;
24378
24379 // Check ExtractLow's user.
24380 if (HasFoundMULLow) {
24381 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
24382 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
24383 HasFoundMULLow = false;
24384 } else {
24385 if (ExtractLowUser->getOperand(0) == ExtractLow) {
24386 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
24387 TruncLow = ExtractLowUser->getOperand(1);
24388 else
24389 HasFoundMULLow = false;
24390 } else {
24391 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
24392 TruncLow = ExtractLowUser->getOperand(0);
24393 else
24394 HasFoundMULLow = false;
24395 }
24396 }
24397 }
24398
24399 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
24400 // with uzp1.
24401 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
24402 EVT TruncHighVT = TruncHigh.getValueType();
24403 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
24404 SDValue TruncLowOp =
24405 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
24406 EVT TruncLowOpVT = TruncLowOp.getValueType();
24407 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
24408 DAG.isSplatValue(TruncLowOp, false)))
24409 return SDValue();
24410
24411 // Create uzp1, extract_high and extract_low.
24412 if (TruncHighOpVT != UZP1VT)
24413 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
24414 if (TruncLowOpVT != UZP1VT)
24415 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
24416
24417 SDValue UZP1 =
24418 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
24419 SDValue HighIdxCst =
24420 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
24421 SDValue NewTruncHigh =
24422 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
24423 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
24424
24425 if (HasFoundMULLow) {
24426 EVT TruncLowVT = TruncLow.getValueType();
24427 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
24428 UZP1, ExtractLow.getOperand(1));
24429 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
24430 }
24431
24432 return SDValue(N, 0);
24433}
24434
24435 static SDValue performMULLCombine(SDNode *N,
24436 TargetLowering::DAGCombinerInfo &DCI,
24437 SelectionDAG &DAG) {
24438 if (SDValue Val =
24439 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
24440 return Val;
24441
24442 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
24443 return Val;
24444
24445 return SDValue();
24446}
24447
24448 static SDValue
24449 performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24450 SelectionDAG &DAG) {
24451 // Let's do below transform.
24452 //
24453 // t34: v4i32 = AArch64ISD::UADDLV t2
24454 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
24455 // t7: i64 = zero_extend t35
24456 // t20: v1i64 = scalar_to_vector t7
24457 // ==>
24458 // t34: v4i32 = AArch64ISD::UADDLV t2
24459 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
24460 // t40: v1i64 = AArch64ISD::NVCAST t39
24461 if (DCI.isBeforeLegalizeOps())
24462 return SDValue();
24463
24464 EVT VT = N->getValueType(0);
24465 if (VT != MVT::v1i64)
24466 return SDValue();
24467
24468 SDValue ZEXT = N->getOperand(0);
24469 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
24470 return SDValue();
24471
24472 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
24473 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
24474 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
24475 return SDValue();
24476
24477 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
24478 return SDValue();
24479
24480 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
24481 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
24482 UADDLV.getValueType() != MVT::v4i32 ||
24483 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
24484 return SDValue();
24485
24486 // Let's generate new sequence with AArch64ISD::NVCAST.
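// NVCAST is a no-op reinterpretation between vector types of the same overall
// size, so instead of moving the UADDLV result to a GPR, zero-extending and
// moving it back, we take the low v2i32 half (lane 1 is already zero because
// UADDLV only writes the low element) and view it as v1i64.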
24487 SDLoc DL(N);
24488 SDValue EXTRACT_SUBVEC =
24489 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
24490 DAG.getConstant(0, DL, MVT::i64));
24491 SDValue NVCAST =
24492 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
24493
24494 return NVCAST;
24495}
24496
24497 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
24498 DAGCombinerInfo &DCI) const {
24499 SelectionDAG &DAG = DCI.DAG;
24500 switch (N->getOpcode()) {
24501 default:
24502 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
24503 break;
24504 case ISD::VECREDUCE_AND:
24505 case ISD::VECREDUCE_OR:
24506 case ISD::VECREDUCE_XOR:
24507 return performVecReduceBitwiseCombine(N, DCI, DAG);
24508 case ISD::ADD:
24509 case ISD::SUB:
24510 return performAddSubCombine(N, DCI);
24511 case ISD::BUILD_VECTOR:
24512 return performBuildVectorCombine(N, DCI, DAG);
24513 case ISD::TRUNCATE:
24514 return performTruncateCombine(N, DAG);
24515 case AArch64ISD::ANDS:
24516 return performFlagSettingCombine(N, DCI, ISD::AND);
24517 case AArch64ISD::ADC:
24518 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24519 return R;
24520 return foldADCToCINC(N, DAG);
24521 case AArch64ISD::SBC:
24522 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
24523 case AArch64ISD::ADCS:
24524 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24525 return R;
24526 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
24527 case AArch64ISD::SBCS:
24528 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
24529 return R;
24530 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
24531 case AArch64ISD::BICi: {
24532 APInt DemandedBits =
24533 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
24534 APInt DemandedElts =
24535 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
24536
24538 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
24539 return SDValue();
24540
24541 break;
24542 }
24543 case ISD::XOR:
24544 return performXorCombine(N, DAG, DCI, Subtarget);
24545 case ISD::MUL:
24546 return performMulCombine(N, DAG, DCI, Subtarget);
24547 case ISD::SINT_TO_FP:
24548 case ISD::UINT_TO_FP:
24549 return performIntToFpCombine(N, DAG, Subtarget);
24550 case ISD::FP_TO_SINT:
24551 case ISD::FP_TO_UINT:
24552 case ISD::FP_TO_SINT_SAT:
24553 case ISD::FP_TO_UINT_SAT:
24554 return performFpToIntCombine(N, DAG, DCI, Subtarget);
24555 case ISD::FDIV:
24556 return performFDivCombine(N, DAG, DCI, Subtarget);
24557 case ISD::OR:
24558 return performORCombine(N, DCI, Subtarget, *this);
24559 case ISD::AND:
24560 return performANDCombine(N, DCI);
24561 case ISD::FADD:
24562 return performFADDCombine(N, DCI);
24563 case ISD::INTRINSIC_WO_CHAIN:
24564 return performIntrinsicCombine(N, DCI, Subtarget);
24565 case ISD::ANY_EXTEND:
24566 case ISD::ZERO_EXTEND:
24567 case ISD::SIGN_EXTEND:
24568 return performExtendCombine(N, DCI, DAG);
24569 case ISD::SIGN_EXTEND_INREG:
24570 return performSignExtendInRegCombine(N, DCI, DAG);
24571 case ISD::CONCAT_VECTORS:
24572 return performConcatVectorsCombine(N, DCI, DAG);
24573 case ISD::EXTRACT_SUBVECTOR:
24574 return performExtractSubvectorCombine(N, DCI, DAG);
24575 case ISD::INSERT_SUBVECTOR:
24576 return performInsertSubvectorCombine(N, DCI, DAG);
24577 case ISD::SELECT:
24578 return performSelectCombine(N, DCI);
24579 case ISD::VSELECT:
24580 return performVSelectCombine(N, DCI.DAG);
24581 case ISD::SETCC:
24582 return performSETCCCombine(N, DCI, DAG);
24583 case ISD::LOAD:
24584 return performLOADCombine(N, DCI, DAG, Subtarget);
24585 case ISD::STORE:
24586 return performSTORECombine(N, DCI, DAG, Subtarget);
24587 case ISD::MSTORE:
24588 return performMSTORECombine(N, DCI, DAG, Subtarget);
24589 case ISD::MGATHER:
24590 case ISD::MSCATTER:
24591 return performMaskedGatherScatterCombine(N, DCI, DAG);
24592 case ISD::VECTOR_SPLICE:
24593 return performSVESpliceCombine(N, DAG);
24594 case ISD::FP_EXTEND:
24595 return performFPExtendCombine(N, DAG, DCI, Subtarget);
24596 case AArch64ISD::BRCOND:
24597 return performBRCONDCombine(N, DCI, DAG);
24598 case AArch64ISD::TBNZ:
24599 case AArch64ISD::TBZ:
24600 return performTBZCombine(N, DCI, DAG);
24601 case AArch64ISD::CSEL:
24602 return performCSELCombine(N, DCI, DAG);
24603 case AArch64ISD::DUP:
24604 case AArch64ISD::DUPLANE8:
24605 case AArch64ISD::DUPLANE16:
24606 case AArch64ISD::DUPLANE32:
24607 case AArch64ISD::DUPLANE64:
24608 return performDUPCombine(N, DCI);
24609 case AArch64ISD::DUPLANE128:
24610 return performDupLane128Combine(N, DAG);
24611 case AArch64ISD::NVCAST:
24612 return performNVCASTCombine(N, DAG);
24613 case AArch64ISD::SPLICE:
24614 return performSpliceCombine(N, DAG);
24615 case AArch64ISD::UUNPKLO:
24616 case AArch64ISD::UUNPKHI:
24617 return performUnpackCombine(N, DAG, Subtarget);
24618 case AArch64ISD::UZP1:
24619 return performUzpCombine(N, DAG, Subtarget);
24620 case AArch64ISD::SETCC_MERGE_ZERO:
24621 return performSetccMergeZeroCombine(N, DCI);
24638 return performGLD1Combine(N, DAG);
24639 case AArch64ISD::VASHR:
24640 case AArch64ISD::VLSHR:
24641 return performVectorShiftCombine(N, *this, DCI);
24642 case AArch64ISD::SUNPKLO:
24643 return performSunpkloCombine(N, DAG);
24644 case AArch64ISD::BSP:
24645 return performBSPExpandForSVE(N, DAG, Subtarget);
24646 case ISD::INSERT_VECTOR_ELT:
24647 return performInsertVectorEltCombine(N, DCI);
24648 case ISD::EXTRACT_VECTOR_ELT:
24649 return performExtractVectorEltCombine(N, DCI, Subtarget);
24650 case ISD::VECREDUCE_ADD:
24651 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
24652 case AArch64ISD::UADDV:
24653 return performUADDVCombine(N, DAG);
24654 case AArch64ISD::SMULL:
24655 case AArch64ISD::UMULL:
24656 case AArch64ISD::PMULL:
24657 return performMULLCombine(N, DCI, DAG);
24658 case ISD::INTRINSIC_VOID:
24659 case ISD::INTRINSIC_W_CHAIN:
24660 switch (N->getConstantOperandVal(1)) {
24661 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
24662 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
24663 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
24664 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
24665 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
24666 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
24667 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
24668 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
24669 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
24670 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
24671 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
24672 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
24673 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
24674 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
24675 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
24676 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
24677 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
24678 case Intrinsic::aarch64_neon_ld2:
24679 case Intrinsic::aarch64_neon_ld3:
24680 case Intrinsic::aarch64_neon_ld4:
24681 case Intrinsic::aarch64_neon_ld1x2:
24682 case Intrinsic::aarch64_neon_ld1x3:
24683 case Intrinsic::aarch64_neon_ld1x4:
24684 case Intrinsic::aarch64_neon_ld2lane:
24685 case Intrinsic::aarch64_neon_ld3lane:
24686 case Intrinsic::aarch64_neon_ld4lane:
24687 case Intrinsic::aarch64_neon_ld2r:
24688 case Intrinsic::aarch64_neon_ld3r:
24689 case Intrinsic::aarch64_neon_ld4r:
24690 case Intrinsic::aarch64_neon_st2:
24691 case Intrinsic::aarch64_neon_st3:
24692 case Intrinsic::aarch64_neon_st4:
24693 case Intrinsic::aarch64_neon_st1x2:
24694 case Intrinsic::aarch64_neon_st1x3:
24695 case Intrinsic::aarch64_neon_st1x4:
24696 case Intrinsic::aarch64_neon_st2lane:
24697 case Intrinsic::aarch64_neon_st3lane:
24698 case Intrinsic::aarch64_neon_st4lane:
24699 return performNEONPostLDSTCombine(N, DCI, DAG);
24700 case Intrinsic::aarch64_sve_ldnt1:
24701 return performLDNT1Combine(N, DAG);
24702 case Intrinsic::aarch64_sve_ld1rq:
24703 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
24704 case Intrinsic::aarch64_sve_ld1ro:
24705 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
24706 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
24708 case Intrinsic::aarch64_sve_ldnt1_gather:
24710 case Intrinsic::aarch64_sve_ldnt1_gather_index:
24711 return performGatherLoadCombine(N, DAG,
24713 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
24715 case Intrinsic::aarch64_sve_ld1:
24717 case Intrinsic::aarch64_sve_ldnf1:
24719 case Intrinsic::aarch64_sve_ldff1:
24721 case Intrinsic::aarch64_sve_st1:
24722 return performST1Combine(N, DAG);
24723 case Intrinsic::aarch64_sve_stnt1:
24724 return performSTNT1Combine(N, DAG);
24725 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
24727 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
24729 case Intrinsic::aarch64_sve_stnt1_scatter:
24731 case Intrinsic::aarch64_sve_stnt1_scatter_index:
24733 case Intrinsic::aarch64_sve_ld1_gather:
24735 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
24736 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
24738 case Intrinsic::aarch64_sve_ld1q_gather_index:
24739 return performGatherLoadCombine(N, DAG,
24741 case Intrinsic::aarch64_sve_ld1_gather_index:
24742 return performGatherLoadCombine(N, DAG,
24744 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
24746 /*OnlyPackedOffsets=*/false);
24747 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
24749 /*OnlyPackedOffsets=*/false);
24750 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
24751 return performGatherLoadCombine(N, DAG,
24753 /*OnlyPackedOffsets=*/false);
24754 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
24755 return performGatherLoadCombine(N, DAG,
24757 /*OnlyPackedOffsets=*/false);
24758 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
24760 case Intrinsic::aarch64_sve_ldff1_gather:
24762 case Intrinsic::aarch64_sve_ldff1_gather_index:
24763 return performGatherLoadCombine(N, DAG,
24765 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
24766 return performGatherLoadCombine(N, DAG,
24768 /*OnlyPackedOffsets=*/false);
24769 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
24770 return performGatherLoadCombine(N, DAG,
24772 /*OnlyPackedOffsets=*/false);
24773 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
24774 return performGatherLoadCombine(N, DAG,
24776 /*OnlyPackedOffsets=*/false);
24777 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
24778 return performGatherLoadCombine(N, DAG,
24780 /*OnlyPackedOffsets=*/false);
24781 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
24782 return performGatherLoadCombine(N, DAG,
24784 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
24785 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
24787 case Intrinsic::aarch64_sve_st1q_scatter_index:
24789 case Intrinsic::aarch64_sve_st1_scatter:
24791 case Intrinsic::aarch64_sve_st1_scatter_index:
24793 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
24795 /*OnlyPackedOffsets=*/false);
24796 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
24798 /*OnlyPackedOffsets=*/false);
24799 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
24800 return performScatterStoreCombine(N, DAG,
24802 /*OnlyPackedOffsets=*/false);
24803 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
24804 return performScatterStoreCombine(N, DAG,
24806 /*OnlyPackedOffsets=*/false);
24807 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
24809 case Intrinsic::aarch64_rndr:
24810 case Intrinsic::aarch64_rndrrs: {
24811 unsigned IntrinsicID = N->getConstantOperandVal(1);
24812 auto Register =
24813 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
24814 : AArch64SysReg::RNDRRS);
24815 SDLoc DL(N);
24816 SDValue A = DAG.getNode(
24817 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
24818 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
24819 SDValue B = DAG.getNode(
24820 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
24821 DAG.getConstant(0, DL, MVT::i32),
24822 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
24823 return DAG.getMergeValues(
24824 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
24825 }
24826 case Intrinsic::aarch64_sme_ldr_zt:
24827 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
24828 DAG.getVTList(MVT::Other), N->getOperand(0),
24829 N->getOperand(2), N->getOperand(3));
24830 case Intrinsic::aarch64_sme_str_zt:
24831 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
24832 DAG.getVTList(MVT::Other), N->getOperand(0),
24833 N->getOperand(2), N->getOperand(3));
24834 default:
24835 break;
24836 }
24837 break;
24838 case ISD::GlobalAddress:
24839 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
24840 case ISD::CTLZ:
24841 return performCTLZCombine(N, DAG, Subtarget);
24842 case ISD::SCALAR_TO_VECTOR:
24843 return performScalarToVectorCombine(N, DCI, DAG);
24844 }
24845 return SDValue();
24846}
24847
24848 // Check if the return value is used only as a return value, as otherwise
24849// we can't perform a tail-call. In particular, we need to check for
24850// target ISD nodes that are returns and any other "odd" constructs
24851// that the generic analysis code won't necessarily catch.
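// Concretely: the value's single use must be a glue-free CopyToReg (or an
// FP_EXTEND), and every user of that node must be an AArch64ISD::RET_GLUE;
// only then is it safe to rewire the chain for a tail call.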
24852bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
24853 SDValue &Chain) const {
24854 if (N->getNumValues() != 1)
24855 return false;
24856 if (!N->hasNUsesOfValue(1, 0))
24857 return false;
24858
24859 SDValue TCChain = Chain;
24860 SDNode *Copy = *N->use_begin();
24861 if (Copy->getOpcode() == ISD::CopyToReg) {
24862 // If the copy has a glue operand, we conservatively assume it isn't safe to
24863 // perform a tail call.
24864 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
24865 MVT::Glue)
24866 return false;
24867 TCChain = Copy->getOperand(0);
24868 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
24869 return false;
24870
24871 bool HasRet = false;
24872 for (SDNode *Node : Copy->uses()) {
24873 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
24874 return false;
24875 HasRet = true;
24876 }
24877
24878 if (!HasRet)
24879 return false;
24880
24881 Chain = TCChain;
24882 return true;
24883}
24884
24885 // Return whether an instruction can potentially be optimized to a tail
24886// call. This will cause the optimizers to attempt to move, or duplicate,
24887// return instructions to help enable tail call optimizations for this
24888// instruction.
24889bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
24890 return CI->isTailCall();
24891}
24892
24893bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
24894 Register Offset, bool IsPre,
24895 MachineRegisterInfo &MRI) const {
24896 auto CstOffset = getIConstantVRegVal(Offset, MRI);
24897 if (!CstOffset || CstOffset->isZero())
24898 return false;
24899
24900 // All of the indexed addressing mode instructions take a signed 9 bit
24901 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
24902 // encodes the sign/indexing direction.
24903 return isInt<9>(CstOffset->getSExtValue());
24904}
24905
24906bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
24907 SDValue &Base,
24908 SDValue &Offset,
24909 SelectionDAG &DAG) const {
24910 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
24911 return false;
24912
24913 // Non-null if there is exactly one user of the loaded value (ignoring chain).
24914 SDNode *ValOnlyUser = nullptr;
24915 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
24916 ++UI) {
24917 if (UI.getUse().getResNo() == 1)
24918 continue; // Ignore chain.
24919 if (ValOnlyUser == nullptr)
24920 ValOnlyUser = *UI;
24921 else {
24922 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
24923 break;
24924 }
24925 }
24926
24927 auto IsUndefOrZero = [](SDValue V) {
24928 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
24929 };
24930
24931 // If the only user of the value is a scalable vector splat, it is
24932 // preferable to do a replicating load (ld1r*).
24933 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
24934 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
24935 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
24936 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
24937 return false;
24938
24939 Base = Op->getOperand(0);
24940 // All of the indexed addressing mode instructions take a signed
24941 // 9 bit immediate offset.
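// For example, the writeback forms "ldr x0, [x1, #16]!" (pre-index) and
// "ldr x0, [x1], #16" (post-index) both require the offset to fit in the
// signed 9-bit range [-256, 255].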
24942 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
24943 int64_t RHSC = RHS->getSExtValue();
24944 if (Op->getOpcode() == ISD::SUB)
24945 RHSC = -(uint64_t)RHSC;
24946 if (!isInt<9>(RHSC))
24947 return false;
24948 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
24949 // when dealing with subtraction.
24950 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
24951 return true;
24952 }
24953 return false;
24954}
24955
24956bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
24957 SDValue &Offset,
24958 ISD::MemIndexedMode &AM,
24959 SelectionDAG &DAG) const {
24960 EVT VT;
24961 SDValue Ptr;
24962 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
24963 VT = LD->getMemoryVT();
24964 Ptr = LD->getBasePtr();
24965 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
24966 VT = ST->getMemoryVT();
24967 Ptr = ST->getBasePtr();
24968 } else
24969 return false;
24970
24971 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
24972 return false;
24973 AM = ISD::PRE_INC;
24974 return true;
24975}
24976
24977bool AArch64TargetLowering::getPostIndexedAddressParts(
24978 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
24979 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
24980 EVT VT;
24981 SDValue Ptr;
24982 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
24983 VT = LD->getMemoryVT();
24984 Ptr = LD->getBasePtr();
24985 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
24986 VT = ST->getMemoryVT();
24987 Ptr = ST->getBasePtr();
24988 } else
24989 return false;
24990
24991 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
24992 return false;
24993 // Post-indexing updates the base, so it's not a valid transform
24994 // if that's not the same as the load's pointer.
24995 if (Ptr != Base)
24996 return false;
24997 AM = ISD::POST_INC;
24998 return true;
24999}
25000
25001 static void replaceBoolVectorBitcast(SDNode *N,
25002 SmallVectorImpl<SDValue> &Results,
25003 SelectionDAG &DAG) {
25004 SDLoc DL(N);
25005 SDValue Op = N->getOperand(0);
25006 EVT VT = N->getValueType(0);
25007 [[maybe_unused]] EVT SrcVT = Op.getValueType();
25008 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25009 "Must be bool vector.");
25010
25011 // Special handling for Clang's __builtin_convertvector. For vectors with <8
25012 // elements, it adds a vector concatenation with undef(s). If we encounter
25013 // this here, we can skip the concat.
25014 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
25015 bool AllUndef = true;
25016 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
25017 AllUndef &= Op.getOperand(I).isUndef();
25018
25019 if (AllUndef)
25020 Op = Op.getOperand(0);
25021 }
25022
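// vectorToScalarBitmask packs the boolean vector into an integer bitmask
// (roughly, element i becomes bit i), which is then zero-extended or
// truncated to the result type.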
25023 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
25024 if (VectorBits)
25025 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
25026}
25027
25028 static void CustomNonLegalBITCASTResults(SDNode *N,
25029 SmallVectorImpl<SDValue> &Results,
25030 SelectionDAG &DAG, EVT ExtendVT,
25031 EVT CastVT) {
25032 SDLoc DL(N);
25033 SDValue Op = N->getOperand(0);
25034 EVT VT = N->getValueType(0);
25035
25036 // Use SCALAR_TO_VECTOR for lane zero
25037 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
25038 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
25039 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
25040 Results.push_back(
25041 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
25042}
25043
25044void AArch64TargetLowering::ReplaceBITCASTResults(
25045 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25046 SDLoc DL(N);
25047 SDValue Op = N->getOperand(0);
25048 EVT VT = N->getValueType(0);
25049 EVT SrcVT = Op.getValueType();
25050
25051 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
25052 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
25053 return;
25054 }
25055
25056 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
25057 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
25058 return;
25059 }
25060
25061 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
25062 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
25063 return;
25064 }
25065
25066 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
25067 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
25068 "Expected fp->int bitcast!");
25069
25070 // Bitcasting between unpacked vector types of different element counts is
25071 // not a NOP because the live elements are laid out differently.
25072 // 01234567
25073 // e.g. nxv2i32 = XX??XX??
25074 // nxv4f16 = X?X?X?X?
25075 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
25076 return;
25077
25078 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
25079 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
25080 return;
25081 }
25082
25083 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25084 !VT.isVector())
25085 return replaceBoolVectorBitcast(N, Results, DAG);
25086
25087 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
25088 return;
25089
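// Remaining case: i16 <- bitcast (f16/bf16). Widen the half value to f32 via
// the hsub subregister, bitcast that to i32, and truncate to i16, so the
// conversion is expressed without an illegal i16 bitcast.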
25090 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
25091 DAG.getUNDEF(MVT::i32), Op);
25092 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
25093 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
25094}
25095
25096 static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
25097 SelectionDAG &DAG,
25098 const AArch64Subtarget *Subtarget) {
25099 EVT VT = N->getValueType(0);
25100 if (!VT.is256BitVector() ||
25102 !N->getFlags().hasAllowReassociation()) ||
25103 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
25104 VT.getScalarType() == MVT::bf16)
25105 return;
25106
25107 SDValue X = N->getOperand(0);
25108 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
25109 if (!Shuf) {
25110 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
25111 X = N->getOperand(1);
25112 if (!Shuf)
25113 return;
25114 }
25115
25116 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
25117 return;
25118
25119 // Check the mask is 1,0,3,2,5,4,...
25120 ArrayRef<int> Mask = Shuf->getMask();
25121 for (int I = 0, E = Mask.size(); I < E; I++)
25122 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
25123 return;
25124
25125 SDLoc DL(N);
25126 auto LoHi = DAG.SplitVector(X, DL);
25127 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
25128 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
25129 LoHi.first, LoHi.second);
25130
25131 // Shuffle the elements back into order.
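// ADDP(Lo, Hi) leaves one pairwise sum per lane, whereas the original node
// produced each sum in both lanes of its pair, so duplicate every lane
// (mask 0,0,1,1,2,2,...) to rebuild the full-width result.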
25132 SmallVector<int> NMask;
25133 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
25134 NMask.push_back(I);
25135 NMask.push_back(I);
25136 }
25137 Results.push_back(
25138 DAG.getVectorShuffle(VT, DL,
25139 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
25140 DAG.getUNDEF(LoHi.first.getValueType())),
25141 DAG.getUNDEF(VT), NMask));
25142}
25143
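// Helper used by the reduction cases in ReplaceNodeResults below: split the
// wide vector operand in half, combine the halves with InterOp (e.g.
// ISD::ADD), then apply the across-vector reduction AcrossOp to the
// narrower result.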
25144 static void ReplaceReductionResults(SDNode *N,
25145 SmallVectorImpl<SDValue> &Results,
25146 SelectionDAG &DAG, unsigned InterOp,
25147 unsigned AcrossOp) {
25148 EVT LoVT, HiVT;
25149 SDValue Lo, Hi;
25150 SDLoc dl(N);
25151 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
25152 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25153 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
25154 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
25155 Results.push_back(SplitVal);
25156}
25157
25158void AArch64TargetLowering::ReplaceExtractSubVectorResults(
25159 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25160 SDValue In = N->getOperand(0);
25161 EVT InVT = In.getValueType();
25162
25163 // Common code will handle these just fine.
25164 if (!InVT.isScalableVector() || !InVT.isInteger())
25165 return;
25166
25167 SDLoc DL(N);
25168 EVT VT = N->getValueType(0);
25169
25170 // The following checks bail if this is not a halving operation.
25171
25172 ElementCount ResEC = VT.getVectorElementCount();
25173
25174 if (InVT.getVectorElementCount() != (ResEC * 2))
25175 return;
25176
25177 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
25178 if (!CIndex)
25179 return;
25180
25181 unsigned Index = CIndex->getZExtValue();
25182 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
25183 return;
25184
25185 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
25186 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
25187
25188 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
25189 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
25190}
25191
25192// Create an even/odd pair of X registers holding integer value V.
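// The pair is formed with REG_SEQUENCE over the XSeqPairs register class
// (sube64 holds the low half, subo64 the high half, swapped for big-endian),
// which is the operand form the CASP family of instructions expects.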
25193 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
25194 SDLoc dl(V.getNode());
25195 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
25196 if (DAG.getDataLayout().isBigEndian())
25197 std::swap (VLo, VHi);
25198 SDValue RegClass =
25199 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
25200 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
25201 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
25202 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
25203 return SDValue(
25204 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
25205}
25206
25207 static void ReplaceCMP_SWAP_128Results(SDNode *N,
25208 SmallVectorImpl<SDValue> &Results,
25209 SelectionDAG &DAG,
25210 const AArch64Subtarget *Subtarget) {
25211 assert(N->getValueType(0) == MVT::i128 &&
25212 "AtomicCmpSwap on types less than 128 should be legal");
25213
25214 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25215 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
25216 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
25217 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
25218 SDValue Ops[] = {
25219 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
25220 createGPRPairNode(DAG, N->getOperand(3)), // Store value
25221 N->getOperand(1), // Ptr
25222 N->getOperand(0), // Chain in
25223 };
25224
25225 unsigned Opcode;
25226 switch (MemOp->getMergedOrdering()) {
25227 case AtomicOrdering::Monotonic:
25228 Opcode = AArch64::CASPX;
25229 break;
25230 case AtomicOrdering::Acquire:
25231 Opcode = AArch64::CASPAX;
25232 break;
25233 case AtomicOrdering::Release:
25234 Opcode = AArch64::CASPLX;
25235 break;
25236 case AtomicOrdering::AcquireRelease:
25237 case AtomicOrdering::SequentiallyConsistent:
25238 Opcode = AArch64::CASPALX;
25239 break;
25240 default:
25241 llvm_unreachable("Unexpected ordering!");
25242 }
25243
25244 MachineSDNode *CmpSwap = DAG.getMachineNode(
25245 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
25246 DAG.setNodeMemRefs(CmpSwap, {MemOp});
25247
25248 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
25249 if (DAG.getDataLayout().isBigEndian())
25250 std::swap(SubReg1, SubReg2);
25251 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
25252 SDValue(CmpSwap, 0));
25253 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
25254 SDValue(CmpSwap, 0));
25255 Results.push_back(
25256 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25257 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
25258 return;
25259 }
25260
25261 unsigned Opcode;
25262 switch (MemOp->getMergedOrdering()) {
25263 case AtomicOrdering::Monotonic:
25264 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
25265 break;
25266 case AtomicOrdering::Acquire:
25267 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
25268 break;
25269 case AtomicOrdering::Release:
25270 Opcode = AArch64::CMP_SWAP_128_RELEASE;
25271 break;
25272 case AtomicOrdering::AcquireRelease:
25273 case AtomicOrdering::SequentiallyConsistent:
25274 Opcode = AArch64::CMP_SWAP_128;
25275 break;
25276 default:
25277 llvm_unreachable("Unexpected ordering!");
25278 }
25279
25280 SDLoc DL(N);
25281 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
25282 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
25283 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
25284 New.first, New.second, N->getOperand(0)};
25285 SDNode *CmpSwap = DAG.getMachineNode(
25286 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
25287 Ops);
25288 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
25289
25290 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25291 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
25292 Results.push_back(SDValue(CmpSwap, 3));
25293}
25294
25295static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
25296 AtomicOrdering Ordering) {
25297 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
25298 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
25299 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
25300 // ATOMIC_LOAD_CLR at any point.
25301 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
25302 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
25303 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
25304 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
25305
25306 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25307 // The operand will need to be XORed in a separate step.
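// LDCLRP atomically performs "memory AND NOT(operand)", so the caller
// (ReplaceATOMIC_LOAD_128Results) must invert the value first to get a
// plain atomic AND.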
25308 switch (Ordering) {
25309 case AtomicOrdering::Monotonic:
25310 return AArch64::LDCLRP;
25311 break;
25312 case AtomicOrdering::Acquire:
25313 return AArch64::LDCLRPA;
25314 break;
25315 case AtomicOrdering::Release:
25316 return AArch64::LDCLRPL;
25317 break;
25318 case AtomicOrdering::AcquireRelease:
25319 case AtomicOrdering::SequentiallyConsistent:
25320 return AArch64::LDCLRPAL;
25321 break;
25322 default:
25323 llvm_unreachable("Unexpected ordering!");
25324 }
25325 }
25326
25327 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
25328 switch (Ordering) {
25329 case AtomicOrdering::Monotonic:
25330 return AArch64::LDSETP;
25331 break;
25332 case AtomicOrdering::Acquire:
25333 return AArch64::LDSETPA;
25334 break;
25335 case AtomicOrdering::Release:
25336 return AArch64::LDSETPL;
25337 break;
25338 case AtomicOrdering::AcquireRelease:
25339 case AtomicOrdering::SequentiallyConsistent:
25340 return AArch64::LDSETPAL;
25341 break;
25342 default:
25343 llvm_unreachable("Unexpected ordering!");
25344 }
25345 }
25346
25347 if (ISDOpcode == ISD::ATOMIC_SWAP) {
25348 switch (Ordering) {
25349 case AtomicOrdering::Monotonic:
25350 return AArch64::SWPP;
25351 break;
25352 case AtomicOrdering::Acquire:
25353 return AArch64::SWPPA;
25354 break;
25355 case AtomicOrdering::Release:
25356 return AArch64::SWPPL;
25357 break;
25358 case AtomicOrdering::AcquireRelease:
25359 case AtomicOrdering::SequentiallyConsistent:
25360 return AArch64::SWPPAL;
25361 break;
25362 default:
25363 llvm_unreachable("Unexpected ordering!");
25364 }
25365 }
25366
25367 llvm_unreachable("Unexpected ISDOpcode!");
25368}
25369
25372 SelectionDAG &DAG,
25373 const AArch64Subtarget *Subtarget) {
25374 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
25375 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
25376 // rather than the CASP instructions, because CASP has register classes for
25377 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
25378 // to present them as single operands. LSE128 instructions use the GPR64
25379 // register class (because the pair does not have to be sequential), like
25380 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
25381
25382 assert(N->getValueType(0) == MVT::i128 &&
25383 "AtomicLoadXXX on types less than 128 should be legal");
25384
25385 if (!Subtarget->hasLSE128())
25386 return;
25387
25388 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25389 const SDValue &Chain = N->getOperand(0);
25390 const SDValue &Ptr = N->getOperand(1);
25391 const SDValue &Val128 = N->getOperand(2);
25392 std::pair<SDValue, SDValue> Val2x64 =
25393 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
25394
25395 const unsigned ISDOpcode = N->getOpcode();
25396 const unsigned MachineOpcode =
25397 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
25398
25399 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
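// Invert both halves (XOR with all-ones) so LDCLRP's AND-NOT semantics
// implement the requested atomic AND.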
25400 SDLoc dl(Val128);
25401 Val2x64.first =
25402 DAG.getNode(ISD::XOR, dl, MVT::i64,
25403 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
25404 Val2x64.second =
25405 DAG.getNode(ISD::XOR, dl, MVT::i64,
25406 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
25407 }
25408
25409 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
25410 if (DAG.getDataLayout().isBigEndian())
25411 std::swap(Ops[0], Ops[1]);
25412
25413 MachineSDNode *AtomicInst =
25414 DAG.getMachineNode(MachineOpcode, SDLoc(N),
25415 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
25416
25417 DAG.setNodeMemRefs(AtomicInst, {MemOp});
25418
25419 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
25420 if (DAG.getDataLayout().isBigEndian())
25421 std::swap(Lo, Hi);
25422
25423 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25424 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
25425}
25426
25427void AArch64TargetLowering::ReplaceNodeResults(
25428 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25429 switch (N->getOpcode()) {
25430 default:
25431 llvm_unreachable("Don't know how to custom expand this");
25432 case ISD::BITCAST:
25433 ReplaceBITCASTResults(N, Results, DAG);
25434 return;
25435 case ISD::VECREDUCE_ADD:
25436 case ISD::VECREDUCE_SMAX:
25437 case ISD::VECREDUCE_SMIN:
25438 case ISD::VECREDUCE_UMAX:
25439 case ISD::VECREDUCE_UMIN:
25440 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
25441 return;
25442 case ISD::ADD:
25443 case ISD::FADD:
25444 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
25445 return;
25446
25447 case ISD::CTPOP:
25448 case ISD::PARITY:
25449 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
25450 Results.push_back(Result);
25451 return;
25452 case AArch64ISD::SADDV:
25453 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
25454 return;
25455 case AArch64ISD::UADDV:
25456 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
25457 return;
25458 case AArch64ISD::SMINV:
25459 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
25460 return;
25461 case AArch64ISD::UMINV:
25462 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
25463 return;
25464 case AArch64ISD::SMAXV:
25465 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
25466 return;
25467 case AArch64ISD::UMAXV:
25468 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
25469 return;
25470 case ISD::MULHS:
25472 Results.push_back(
25473 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
25474 return;
25475 case ISD::MULHU:
25477 Results.push_back(
25478 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
25479 return;
25480 case ISD::FP_TO_UINT:
25481 case ISD::FP_TO_SINT:
25484 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
25485 // Let normal code take care of it by not adding anything to Results.
25486 return;
25487 case ISD::ATOMIC_CMP_SWAP:
25488 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
25489 return;
25490 case ISD::ATOMIC_LOAD_CLR:
25491 assert(N->getValueType(0) != MVT::i128 &&
25492 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
25493 break;
25494 case ISD::ATOMIC_LOAD_AND:
25495 case ISD::ATOMIC_LOAD_OR:
25496 case ISD::ATOMIC_SWAP: {
25497 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
25498 "Expected 128-bit atomicrmw.");
25499 // These need custom type legalisation so we go directly to instruction.
25500 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
25501 return;
25502 }
25503 case ISD::ATOMIC_LOAD:
25504 case ISD::LOAD: {
25505 MemSDNode *LoadNode = cast<MemSDNode>(N);
25506 EVT MemVT = LoadNode->getMemoryVT();
25507 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
25508 // targets.
25509 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
25510 MemVT.getSizeInBits() == 256u &&
25511 (MemVT.getScalarSizeInBits() == 8u ||
25512 MemVT.getScalarSizeInBits() == 16u ||
25513 MemVT.getScalarSizeInBits() == 32u ||
25514 MemVT.getScalarSizeInBits() == 64u)) {
25515
25518 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25519 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25520 MVT::Other}),
25521 {LoadNode->getChain(), LoadNode->getBasePtr()},
25522 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25523
25524 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
25525 Result.getValue(0), Result.getValue(1));
25526 Results.append({Pair, Result.getValue(2) /* Chain */});
25527 return;
25528 }
25529
25530 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
25531 LoadNode->getMemoryVT() != MVT::i128) {
25532 // Only volatile or atomic i128 loads need custom handling here; everything
25533 // else is left to the normal path and AArch64's load/store optimizer.
25534 return;
25535 }
25536
25537 if (SDValue(N, 0).getValueType() == MVT::i128) {
25538 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
25539 bool isLoadAcquire =
25541 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
25542
25543 if (isLoadAcquire)
25544 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
25545
25547 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25548 {LoadNode->getChain(), LoadNode->getBasePtr()},
25549 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25550
25551 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
25552
25553 SDValue Pair =
25554 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
25555 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
25556 Results.append({Pair, Result.getValue(2) /* Chain */});
25557 }
25558 return;
25559 }
25560 case ISD::EXTRACT_SUBVECTOR:
25561 ReplaceExtractSubVectorResults(N, Results, DAG);
25562 return;
25563 case ISD::INSERT_SUBVECTOR:
25564 case ISD::CONCAT_VECTORS:
25565 // Custom lowering has been requested for INSERT_SUBVECTOR and
25566 // CONCAT_VECTORS -- but delegate to common code for result type
25567 // legalisation
25568 return;
25569 case ISD::INTRINSIC_WO_CHAIN: {
25570 EVT VT = N->getValueType(0);
25571 assert((VT == MVT::i8 || VT == MVT::i16) &&
25572 "custom lowering for unexpected type");
25573
25574 Intrinsic::ID IntID =
25575 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
25576 switch (IntID) {
25577 default:
25578 return;
25579 case Intrinsic::aarch64_sve_clasta_n: {
25580 SDLoc DL(N);
25581 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25582 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
25583 N->getOperand(1), Op2, N->getOperand(3));
25584 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25585 return;
25586 }
25587 case Intrinsic::aarch64_sve_clastb_n: {
25588 SDLoc DL(N);
25589 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25590 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
25591 N->getOperand(1), Op2, N->getOperand(3));
25592 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25593 return;
25594 }
25595 case Intrinsic::aarch64_sve_lasta: {
25596 SDLoc DL(N);
25597 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
25598 N->getOperand(1), N->getOperand(2));
25599 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25600 return;
25601 }
25602 case Intrinsic::aarch64_sve_lastb: {
25603 SDLoc DL(N);
25604 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
25605 N->getOperand(1), N->getOperand(2));
25606 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25607 return;
25608 }
25609 }
25610 }
25611 case ISD::READ_REGISTER: {
25612 SDLoc DL(N);
25613 assert(N->getValueType(0) == MVT::i128 &&
25614 "READ_REGISTER custom lowering is only for 128-bit sysregs");
25615 SDValue Chain = N->getOperand(0);
25616 SDValue SysRegName = N->getOperand(1);
25617
25618 SDValue Result = DAG.getNode(
25619 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25620 Chain, SysRegName);
25621
25622 // Sysregs are not endian. Result.getValue(0) always contains the lower half
25623 // of the 128-bit System Register value.
25624 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25625 Result.getValue(0), Result.getValue(1));
25626 Results.push_back(Pair);
25627 Results.push_back(Result.getValue(2)); // Chain
25628 return;
25629 }
25630 }
25631}
25632
25633 bool AArch64TargetLowering::useLoadStackGuardNode() const {
25634 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
25635 return false;
25636 return true;
25637}
25638
25639unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
25640 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
25641 // reciprocal if there are three or more FDIVs.
25642 return 3;
25643}
25644
25645 TargetLoweringBase::LegalizeTypeAction
25646 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
25647 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
25648 // v4i16, v2i32 instead of to promote.
25649 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
25650 VT == MVT::v1f32)
25651 return TypeWidenVector;
25652
25653 return TargetLoweringBase::getPreferredVectorAction(VT);
25654}
25655
25656// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
25657// provided the address is 16-byte aligned.
25658 bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
25659 if (!Subtarget->hasLSE2())
25660 return false;
25661
25662 if (auto LI = dyn_cast<LoadInst>(I))
25663 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25664 LI->getAlign() >= Align(16);
25665
25666 if (auto SI = dyn_cast<StoreInst>(I))
25667 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25668 SI->getAlign() >= Align(16);
25669
25670 return false;
25671}
25672
25673 bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
25674 if (!Subtarget->hasLSE128())
25675 return false;
25676
25677 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
25678 // will clobber the two registers.
25679 if (const auto *SI = dyn_cast<StoreInst>(I))
25680 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25681 SI->getAlign() >= Align(16) &&
25682 (SI->getOrdering() == AtomicOrdering::Release ||
25683 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
25684
25685 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
25686 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25687 RMW->getAlign() >= Align(16) &&
25688 (RMW->getOperation() == AtomicRMWInst::Xchg ||
25689 RMW->getOperation() == AtomicRMWInst::And ||
25690 RMW->getOperation() == AtomicRMWInst::Or);
25691
25692 return false;
25693}
25694
25695 bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
25696 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
25697 return false;
25698
25699 if (auto LI = dyn_cast<LoadInst>(I))
25700 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25701 LI->getAlign() >= Align(16) &&
25702 LI->getOrdering() == AtomicOrdering::Acquire;
25703
25704 if (auto SI = dyn_cast<StoreInst>(I))
25705 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25706 SI->getAlign() >= Align(16) &&
25707 SI->getOrdering() == AtomicOrdering::Release;
25708
25709 return false;
25710}
25711
25712 bool AArch64TargetLowering::shouldInsertFencesForAtomic(
25713 const Instruction *I) const {
25714 if (isOpSuitableForRCPC3(I))
25715 return false;
25716 if (isOpSuitableForLSE128(I))
25717 return false;
25718 if (isOpSuitableForLDPSTP(I))
25719 return true;
25720 return false;
25721}
25722
25724 const Instruction *I) const {
25725 // Store-Release instructions only provide seq_cst guarantees when paired with
25726 // Load-Acquire instructions. MSVC CRT does not use these instructions to
25727 // implement seq_cst loads and stores, so we need additional explicit fences
25728 // after memory writes.
25729 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25730 return false;
25731
25732 switch (I->getOpcode()) {
25733 default:
25734 return false;
25735 case Instruction::AtomicCmpXchg:
25736 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
25738 case Instruction::AtomicRMW:
25739 return cast<AtomicRMWInst>(I)->getOrdering() ==
25741 case Instruction::Store:
25742 return cast<StoreInst>(I)->getOrdering() ==
25744 }
25745}
25746
25747// Loads and stores less than 128-bits are already atomic; ones above that
25748// are doomed anyway, so defer to the default libcall and blame the OS when
25749// things go wrong.
25752 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
25753 if (Size != 128)
25754 return AtomicExpansionKind::None;
25755 if (isOpSuitableForRCPC3(SI))
25756 return AtomicExpansionKind::None;
25757 if (isOpSuitableForLSE128(SI))
25758 return AtomicExpansionKind::Expand;
25759 if (isOpSuitableForLDPSTP(SI))
25760 return AtomicExpansionKind::None;
25761 return AtomicExpansionKind::Expand;
25762}
25763
25764// Loads and stores less than 128-bits are already atomic; ones above that
25765// are doomed anyway, so defer to the default libcall and blame the OS when
25766// things go wrong.
25769 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
25770
25771 if (Size != 128)
25773 if (isOpSuitableForRCPC3(LI))
25775 // No LSE128 loads
25776 if (isOpSuitableForLDPSTP(LI))
25778
25779 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25780 // implement atomicrmw without spilling. If the target address is also on the
25781 // stack and close enough to the spill slot, this can lead to a situation
25782 // where the monitor always gets cleared and the atomic operation can never
25783 // succeed. So at -O0 lower this operation to a CAS loop.
25784 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25786
25787 // Using CAS for an atomic load has a better chance of succeeding under high
25788 // contention situations. So use it if available.
25789 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
25791}
25792
25793// The "default" for integer RMW operations is to expand to an LL/SC loop.
25794// However, with the LSE instructions (or outline-atomics mode, which provides
25795// library routines in place of the LSE-instructions), we can directly emit many
25796// operations instead.
25797//
25798// Floating-point operations are always emitted to a cmpxchg loop, because they
25799// may trigger a trap which aborts an LLSC sequence.
25802 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
25803 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
25804
25805 if (AI->isFloatingPointOperation())
25807
25808 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
25812 if (CanUseLSE128)
25814
25815 // Nand is not supported in LSE.
25816 // Leave 128 bits to LLSC or CmpXChg.
25817 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
25818 if (Subtarget->hasLSE())
25820 if (Subtarget->outlineAtomics()) {
25821 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
25822 // Don't outline them unless
25823 // (1) high level <atomic> support approved:
25824 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
25825 // (2) low level libgcc and compiler-rt support implemented by:
25826 // min/max outline atomics helpers
25827 if (AI->getOperation() != AtomicRMWInst::Min &&
25832 }
25833 }
25834 }
25835
25836 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25837 // implement atomicrmw without spilling. If the target address is also on the
25838 // stack and close enough to the spill slot, this can lead to a situation
25839 // where the monitor always gets cleared and the atomic operation can never
25840 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
25841 // we have a single CAS instruction that can replace the loop.
25843 Subtarget->hasLSE())
25845
25847}
25848
25851 AtomicCmpXchgInst *AI) const {
25852 // If subtarget has LSE, leave cmpxchg intact for codegen.
25853 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
25855 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25856 // implement cmpxchg without spilling. If the address being exchanged is also
25857 // on the stack and close enough to the spill slot, this can lead to a
25858 // situation where the monitor always gets cleared and the atomic operation
25859 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
25860 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25862
25863 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
25864 // it.
25866 if (Size > 64)
25868
25870}
25871
25873 Type *ValueTy, Value *Addr,
25874 AtomicOrdering Ord) const {
25875 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25876 bool IsAcquire = isAcquireOrStronger(Ord);
25877
25878 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp
25879 // intrinsic must return {i64, i64} and we have to recombine them into a
25880 // single i128 here.
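// i.e. result = zext(lo) | (zext(hi) << 64).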
25881 if (ValueTy->getPrimitiveSizeInBits() == 128) {
25883 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
25885
25886 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
25887
25888 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
25889 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
25890 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
25891 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
25892 return Builder.CreateOr(
25893 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
25894 }
25895
25896 Type *Tys[] = { Addr->getType() };
25898 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
25899 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
25900
25901 const DataLayout &DL = M->getDataLayout();
25902 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
25903 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
25904 CI->addParamAttr(
25905 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
25906 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
25907
25908 return Builder.CreateBitCast(Trunc, ValueTy);
25909}
25910
25912 IRBuilderBase &Builder) const {
25913 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25914 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
25915}
25916
25918 Value *Val, Value *Addr,
25919 AtomicOrdering Ord) const {
25920 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25921 bool IsRelease = isReleaseOrStronger(Ord);
25922
25923 // Since the intrinsics must have legal type, the i128 intrinsics take two
25924 // parameters: "i64, i64". We must marshal Val into the appropriate form
25925 // before the call.
25926 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
25928 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
25930 Type *Int64Ty = Type::getInt64Ty(M->getContext());
25931
25932 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
25933 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
25934 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
25935 }
25936
25938 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
25939 Type *Tys[] = { Addr->getType() };
25940 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
25941
25942 const DataLayout &DL = M->getDataLayout();
25943 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
25944 Val = Builder.CreateBitCast(Val, IntValTy);
25945
25946 CallInst *CI = Builder.CreateCall(
25947 Stxr, {Builder.CreateZExtOrBitCast(
25948 Val, Stxr->getFunctionType()->getParamType(0)),
25949 Addr});
25950 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
25951 Attribute::ElementType, Val->getType()));
25952 return CI;
25953}
25954
25955 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
25956 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
25957 const DataLayout &DL) const {
25958 if (!Ty->isArrayTy()) {
25959 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
25960 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
25961 }
25962
25963 // All non-aggregate members of the type must have the same type.
25964 SmallVector<EVT> ValueVTs;
25965 ComputeValueVTs(*this, DL, Ty, ValueVTs);
25966 return all_equal(ValueVTs);
25967}
25968
25969bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
25970 EVT) const {
25971 return false;
25972}
25973
25974static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
25975 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
25976 Function *ThreadPointerFunc =
25977 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
25978 return IRB.CreatePointerCast(
25979 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
25980 Offset),
25981 IRB.getPtrTy(0));
25982}
25983
25985 // Android provides a fixed TLS slot for the stack cookie. See the definition
25986 // of TLS_SLOT_STACK_GUARD in
25987 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
25988 if (Subtarget->isTargetAndroid())
25989 return UseTlsOffset(IRB, 0x28);
25990
25991 // Fuchsia is similar.
25992 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
25993 if (Subtarget->isTargetFuchsia())
25994 return UseTlsOffset(IRB, -0x10);
25995
25997}
25998
26000 // MSVC CRT provides functionality for stack protection.
26001 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
26002 // MSVC CRT has a global variable holding security cookie.
26003 M.getOrInsertGlobal("__security_cookie",
26004 PointerType::getUnqual(M.getContext()));
26005
26006 // MSVC CRT has a function to validate security cookie.
26007 FunctionCallee SecurityCheckCookie =
26008 M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
26009 Type::getVoidTy(M.getContext()),
26010 PointerType::getUnqual(M.getContext()));
26011 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
26012 F->setCallingConv(CallingConv::Win64);
26013 F->addParamAttr(0, Attribute::AttrKind::InReg);
26014 }
26015 return;
26016 }
26018}
26019
26021 // MSVC CRT has a global variable holding security cookie.
26022 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26023 return M.getGlobalVariable("__security_cookie");
26025}
26026
26028 // MSVC CRT has a function to validate security cookie.
26029 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26030 return M.getFunction(Subtarget->getSecurityCheckCookieName());
26032}
26033
26034Value *
26036 // Android provides a fixed TLS slot for the SafeStack pointer. See the
26037 // definition of TLS_SLOT_SAFESTACK in
26038 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
26039 if (Subtarget->isTargetAndroid())
26040 return UseTlsOffset(IRB, 0x48);
26041
26042 // Fuchsia is similar.
26043 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
26044 if (Subtarget->isTargetFuchsia())
26045 return UseTlsOffset(IRB, -0x8);
26046
26048}
26049
26051 const Instruction &AndI) const {
26052 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
26053 // this is likely to fold the and/cmp/br into a single tbz instruction. It
26054 // may be beneficial to sink in other cases, but we would have to check that
26055 // the cmp would not get folded into the br to form a cbz for these to be
26056 // beneficial.
26057 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
26058 if (!Mask)
26059 return false;
26060 return Mask->getValue().isPowerOf2();
26061}
26062
26066 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
26067 SelectionDAG &DAG) const {
26068 // Does baseline recommend not to perform the fold by default?
26070 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
26071 return false;
26072 // Else, if this is a vector shift, prefer 'shl'.
26073 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
26074}
26075
26078 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
26080 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
26083 ExpansionFactor);
26084}
26085
26087 // Update IsSplitCSR in AArch64FunctionInfo.
26088 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
26089 AFI->setIsSplitCSR(true);
26090}
26091
26093 MachineBasicBlock *Entry,
26094 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
26095 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
26096 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
26097 if (!IStart)
26098 return;
26099
26100 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
26101 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
26102 MachineBasicBlock::iterator MBBI = Entry->begin();
26103 for (const MCPhysReg *I = IStart; *I; ++I) {
26104 const TargetRegisterClass *RC = nullptr;
26105 if (AArch64::GPR64RegClass.contains(*I))
26106 RC = &AArch64::GPR64RegClass;
26107 else if (AArch64::FPR64RegClass.contains(*I))
26108 RC = &AArch64::FPR64RegClass;
26109 else
26110 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
26111
26112 Register NewVR = MRI->createVirtualRegister(RC);
26113 // Create copy from CSR to a virtual register.
26114 // FIXME: this currently does not emit CFI pseudo-instructions, it works
26115 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
26116 // nounwind. If we want to generalize this later, we may need to emit
26117 // CFI pseudo-instructions.
26118 assert(Entry->getParent()->getFunction().hasFnAttribute(
26119 Attribute::NoUnwind) &&
26120 "Function should be nounwind in insertCopiesSplitCSR!");
26121 Entry->addLiveIn(*I);
26122 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
26123 .addReg(*I);
26124
26125 // Insert the copy-back instructions right before the terminator.
26126 for (auto *Exit : Exits)
26127 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
26128 TII->get(TargetOpcode::COPY), *I)
26129 .addReg(NewVR);
26130 }
26131}
26132
26134 // Integer division on AArch64 is expensive. However, when aggressively
26135 // optimizing for code size, we prefer to use a div instruction, as it is
26136 // usually smaller than the alternative sequence.
26137 // The exception to this is vector division. Since AArch64 doesn't have vector
26138 // integer division, leaving the division as-is is a loss even in terms of
26139 // size, because it will have to be scalarized, while the alternative code
26140 // sequence can be performed in vector form.
26141 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
26142 return OptSize && !VT.isVector();
26143}
26144
26146 // We want inc-of-add for scalars and sub-of-not for vectors.
26147 return VT.isScalarInteger();
26148}
26149
26151 EVT VT) const {
26152 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
26153 // legalize.
26154 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
26155 return false;
26156 if (FPVT == MVT::v8bf16)
26157 return false;
26158 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
26159}
26160
26164 const TargetInstrInfo *TII) const {
26165 assert(MBBI->isCall() && MBBI->getCFIType() &&
26166 "Invalid call instruction for a KCFI check");
26167
26168 switch (MBBI->getOpcode()) {
26169 case AArch64::BLR:
26170 case AArch64::BLRNoIP:
26171 case AArch64::TCRETURNri:
26172 case AArch64::TCRETURNrix16x17:
26173 case AArch64::TCRETURNrix17:
26174 case AArch64::TCRETURNrinotx16:
26175 break;
26176 default:
26177 llvm_unreachable("Unexpected CFI call opcode");
26178 }
26179
26180 MachineOperand &Target = MBBI->getOperand(0);
26181 assert(Target.isReg() && "Invalid target operand for an indirect call");
26182 Target.setIsRenamable(false);
26183
26184 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
26185 .addReg(Target.getReg())
26186 .addImm(MBBI->getCFIType())
26187 .getInstr();
26188}
26189
26191 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
26192}
26193
26194unsigned
26196 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
26197 return getPointerTy(DL).getSizeInBits();
26198
26199 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
26200}
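// --- Illustrative sketch (added for exposition; not part of the original
// file). The arithmetic above appears to compute the va_list size in bits:
// Darwin and Windows use a plain char* va_list (one pointer), while the
// standard AAPCS64 va_list is { __stack, __gr_top, __vr_top, __gr_offs,
// __vr_offs }, i.e. three pointers plus two 32-bit offsets. With explicit
// numbers (assuming a 64-bit pointer):
#include <cassert>

static unsigned vaListSizeInBitsSketch(bool DarwinOrWindows,
                                       unsigned PtrBits = 64) {
  if (DarwinOrWindows)
    return PtrBits;               // va_list is just a pointer.
  return 3 * PtrBits + 2 * 32;    // 3 pointers + 2 x i32 offsets.
}

static void vaListSizeExample() {
  assert(vaListSizeInBitsSketch(/*DarwinOrWindows=*/true) == 64);
  assert(vaListSizeInBitsSketch(/*DarwinOrWindows=*/false) == 256); // 32 bytes
}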
26201
26202void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
26203 MachineFrameInfo &MFI = MF.getFrameInfo();
26204 // If we have any vulnerable SVE stack objects then the stack protector
26205 // needs to be placed at the top of the SVE stack area, as the SVE locals
26206 // are placed above the other locals, so we allocate it as if it were a
26207 // scalable vector.
26208 // FIXME: It may be worthwhile having a specific interface for this rather
26209 // than doing it here in finalizeLowering.
26210 if (MFI.hasStackProtectorIndex()) {
26211 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
26217 break;
26218 }
26219 }
26220 }
26223}
26224
26225// Unlike X86, we let frame lowering assign offsets to all catch objects.
26227 return false;
26228}
26229
26230bool AArch64TargetLowering::shouldLocalize(
26231 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
26232 auto &MF = *MI.getMF();
26233 auto &MRI = MF.getRegInfo();
26234 auto maxUses = [](unsigned RematCost) {
26235 // A cost of 1 means remats are basically free.
26236 if (RematCost == 1)
26237 return std::numeric_limits<unsigned>::max();
26238 if (RematCost == 2)
26239 return 2U;
26240
26241 // Remat is too expensive, only sink if there's one user.
26242 if (RematCost > 2)
26243 return 1U;
26244 llvm_unreachable("Unexpected remat cost");
26245 };
26246
26247 unsigned Opc = MI.getOpcode();
26248 switch (Opc) {
26249 case TargetOpcode::G_GLOBAL_VALUE: {
26250 // On Darwin, TLS global vars get selected into function calls, which
26251 // we don't want localized, as they can get moved into the middle of
26252 // another call sequence.
26253 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
26254 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
26255 return false;
26256 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
26257 }
26258 case TargetOpcode::G_FCONSTANT:
26259 case TargetOpcode::G_CONSTANT: {
26260 const ConstantInt *CI;
26261 unsigned AdditionalCost = 0;
26262
26263 if (Opc == TargetOpcode::G_CONSTANT)
26264 CI = MI.getOperand(1).getCImm();
26265 else {
26266 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
26267 // We try to estimate cost of 32/64b fpimms, as they'll likely be
26268 // materialized as integers.
26269 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
26270 break;
26271 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
26272 bool OptForSize =
26275 OptForSize))
26276 return true; // Constant should be cheap.
26277 CI =
26278 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
26279 // FP materialization also costs an extra move, from gpr to fpr.
26280 AdditionalCost = 1;
26281 }
26282 APInt Imm = CI->getValue();
26285 assert(Cost.isValid() && "Expected a valid imm cost");
26286
26287 unsigned RematCost = *Cost.getValue();
26288 RematCost += AdditionalCost;
26289 Register Reg = MI.getOperand(0).getReg();
26290 unsigned MaxUses = maxUses(RematCost);
26291 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
26292 if (MaxUses == std::numeric_limits<unsigned>::max())
26293 --MaxUses;
26294 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
26295 }
26296 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
26297 // localizable.
26298 case AArch64::ADRP:
26299 case AArch64::G_ADD_LOW:
26300 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
26301 case TargetOpcode::G_PTR_ADD:
26302 return true;
26303 default:
26304 break;
26305 }
26307}
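// --- Illustrative sketch (added for exposition; not part of the original
// file). The maxUses lambda in shouldLocalize maps the rematerialization cost
// of a constant to the largest number of user instructions for which
// localizing it is still considered profitable. A standalone mirror of that
// policy (a cost of 0 is not expected by the original lambda):
#include <cassert>
#include <limits>

static unsigned maxUsesForRematCost(unsigned RematCost) {
  if (RematCost == 1)
    return std::numeric_limits<unsigned>::max(); // remat is basically free
  if (RematCost == 2)
    return 2U;                                   // cheap: allow two users
  return 1U;                                     // expensive: only sink for one user
}

static void rematPolicyExample() {
  assert(maxUsesForRematCost(1) == std::numeric_limits<unsigned>::max());
  assert(maxUsesForRematCost(2) == 2U);
  assert(maxUsesForRematCost(5) == 1U);
}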
26308
26310 if (Inst.getType()->isScalableTy())
26311 return true;
26312
26313 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
26314 if (Inst.getOperand(i)->getType()->isScalableTy())
26315 return true;
26316
26317 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
26318 if (AI->getAllocatedType()->isScalableTy())
26319 return true;
26320 }
26321
26322 // Checks to allow the use of SME instructions
26323 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
26324 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
26325 auto CalleeAttrs = SMEAttrs(*Base);
26326 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
26327 CallerAttrs.requiresLazySave(CalleeAttrs) ||
26328 CallerAttrs.requiresPreservingZT0(CalleeAttrs))
26329 return true;
26330 }
26331 return false;
26332}
26333
26334// Return the largest legal scalable vector type that matches VT's element type.
26338 "Expected legal fixed length vector!");
26339 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26340 default:
26341 llvm_unreachable("unexpected element type for SVE container");
26342 case MVT::i8:
26343 return EVT(MVT::nxv16i8);
26344 case MVT::i16:
26345 return EVT(MVT::nxv8i16);
26346 case MVT::i32:
26347 return EVT(MVT::nxv4i32);
26348 case MVT::i64:
26349 return EVT(MVT::nxv2i64);
26350 case MVT::bf16:
26351 return EVT(MVT::nxv8bf16);
26352 case MVT::f16:
26353 return EVT(MVT::nxv8f16);
26354 case MVT::f32:
26355 return EVT(MVT::nxv4f32);
26356 case MVT::f64:
26357 return EVT(MVT::nxv2f64);
26358 }
26359}
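// --- Illustrative sketch (added for exposition; not part of the original
// file). The switch above always picks the "packed" SVE container, i.e. the
// scalable type whose 128-bit granule is completely filled by the element
// type: nxv16i8, nxv8i16/nxv8f16/nxv8bf16, nxv4i32/nxv4f32, nxv2i64/nxv2f64.
// The lane count per 128-bit granule is simply 128 / element-bits:
#include <cassert>

static unsigned packedSVELanesPerGranule(unsigned EltBits) {
  return 128 / EltBits;
}

static void containerExample() {
  assert(packedSVELanesPerGranule(8) == 16);  // nxv16i8
  assert(packedSVELanesPerGranule(16) == 8);  // nxv8i16, nxv8f16, nxv8bf16
  assert(packedSVELanesPerGranule(32) == 4);  // nxv4i32, nxv4f32
  assert(packedSVELanesPerGranule(64) == 2);  // nxv2i64, nxv2f64
}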
26360
26361// Return a PTRUE with active lanes corresponding to the extent of VT.
26363 EVT VT) {
26366 "Expected legal fixed length vector!");
26367
26368 std::optional<unsigned> PgPattern =
26370 assert(PgPattern && "Unexpected element count for SVE predicate");
26371
26372 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
26373 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
26374 // variants of instructions when available.
26375 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
26376 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
26377 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
26378 if (MaxSVESize && MinSVESize == MaxSVESize &&
26379 MaxSVESize == VT.getSizeInBits())
26380 PgPattern = AArch64SVEPredPattern::all;
26381
26382 MVT MaskVT;
26383 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26384 default:
26385 llvm_unreachable("unexpected element type for SVE predicate");
26386 case MVT::i8:
26387 MaskVT = MVT::nxv16i1;
26388 break;
26389 case MVT::i16:
26390 case MVT::f16:
26391 case MVT::bf16:
26392 MaskVT = MVT::nxv8i1;
26393 break;
26394 case MVT::i32:
26395 case MVT::f32:
26396 MaskVT = MVT::nxv4i1;
26397 break;
26398 case MVT::i64:
26399 case MVT::f64:
26400 MaskVT = MVT::nxv2i1;
26401 break;
26402 }
26403
26404 return getPTrue(DAG, DL, MaskVT, *PgPattern);
26405}
26406
26408 EVT VT) {
26410 "Expected legal scalable vector!");
26411 auto PredTy = VT.changeVectorElementType(MVT::i1);
26412 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
26413}
26414
26416 if (VT.isFixedLengthVector())
26417 return getPredicateForFixedLengthVector(DAG, DL, VT);
26418
26419 return getPredicateForScalableVector(DAG, DL, VT);
26420}
26421
26422// Grow V to consume an entire SVE register.
26424 assert(VT.isScalableVector() &&
26425 "Expected to convert into a scalable vector!");
26426 assert(V.getValueType().isFixedLengthVector() &&
26427 "Expected a fixed length vector operand!");
26428 SDLoc DL(V);
26429 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26430 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
26431}
26432
26433// Shrink V so it's just big enough to maintain a VT's worth of data.
26436 "Expected to convert into a fixed length vector!");
26437 assert(V.getValueType().isScalableVector() &&
26438 "Expected a scalable vector operand!");
26439 SDLoc DL(V);
26440 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26441 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
26442}
26443
26444// Convert all fixed length vector loads larger than NEON to masked_loads.
26445SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
26446 SDValue Op, SelectionDAG &DAG) const {
26447 auto Load = cast<LoadSDNode>(Op);
26448
26449 SDLoc DL(Op);
26450 EVT VT = Op.getValueType();
26451 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26452 EVT LoadVT = ContainerVT;
26453 EVT MemVT = Load->getMemoryVT();
26454
26455 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26456
26457 if (VT.isFloatingPoint()) {
26458 LoadVT = ContainerVT.changeTypeToInteger();
26459 MemVT = MemVT.changeTypeToInteger();
26460 }
26461
26462 SDValue NewLoad = DAG.getMaskedLoad(
26463 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
26464 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
26465 Load->getAddressingMode(), Load->getExtensionType());
26466
26467 SDValue Result = NewLoad;
26468 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
26469 EVT ExtendVT = ContainerVT.changeVectorElementType(
26470 Load->getMemoryVT().getVectorElementType());
26471
26472 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
26474 Pg, Result, DAG.getUNDEF(ContainerVT));
26475 } else if (VT.isFloatingPoint()) {
26476 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
26477 }
26478
26479 Result = convertFromScalableVector(DAG, VT, Result);
26480 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26481 return DAG.getMergeValues(MergedValues, DL);
26482}
26483
26485 SelectionDAG &DAG) {
26486 SDLoc DL(Mask);
26487 EVT InVT = Mask.getValueType();
26488 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26489
26490 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
26491
26492 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26493 return Pg;
26494
26495 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
26496 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
26497
26499 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
26500}
26501
26502 // Convert all fixed length vector masked loads larger than NEON to SVE masked loads.
26503SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
26504 SDValue Op, SelectionDAG &DAG) const {
26505 auto Load = cast<MaskedLoadSDNode>(Op);
26506
26507 SDLoc DL(Op);
26508 EVT VT = Op.getValueType();
26509 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26510
26511 SDValue Mask = Load->getMask();
26512 // If this is an extending load and the mask type is not the same as
26513 // the load's type, then we have to extend the mask type.
26514 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
26515 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
26516 "Incorrect mask type");
26517 Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
26518 }
26520
26521 SDValue PassThru;
26522 bool IsPassThruZeroOrUndef = false;
26523
26524 if (Load->getPassThru()->isUndef()) {
26525 PassThru = DAG.getUNDEF(ContainerVT);
26526 IsPassThruZeroOrUndef = true;
26527 } else {
26528 if (ContainerVT.isInteger())
26529 PassThru = DAG.getConstant(0, DL, ContainerVT);
26530 else
26531 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
26532 if (isZerosVector(Load->getPassThru().getNode()))
26533 IsPassThruZeroOrUndef = true;
26534 }
26535
26536 SDValue NewLoad = DAG.getMaskedLoad(
26537 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
26538 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
26539 Load->getAddressingMode(), Load->getExtensionType());
26540
26541 SDValue Result = NewLoad;
26542 if (!IsPassThruZeroOrUndef) {
26543 SDValue OldPassThru =
26544 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
26545 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
26546 }
26547
26548 Result = convertFromScalableVector(DAG, VT, Result);
26549 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26550 return DAG.getMergeValues(MergedValues, DL);
26551}
26552
26553// Convert all fixed length vector stores larger than NEON to masked_stores.
26554SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
26555 SDValue Op, SelectionDAG &DAG) const {
26556 auto Store = cast<StoreSDNode>(Op);
26557
26558 SDLoc DL(Op);
26559 EVT VT = Store->getValue().getValueType();
26560 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26561 EVT MemVT = Store->getMemoryVT();
26562
26563 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26564 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26565
26566 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
26567 EVT TruncVT = ContainerVT.changeVectorElementType(
26568 Store->getMemoryVT().getVectorElementType());
26569 MemVT = MemVT.changeTypeToInteger();
26570 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
26571 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
26572 DAG.getUNDEF(TruncVT));
26573 NewValue =
26574 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26575 } else if (VT.isFloatingPoint()) {
26576 MemVT = MemVT.changeTypeToInteger();
26577 NewValue =
26578 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26579 }
26580
26581 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
26582 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
26583 Store->getMemOperand(), Store->getAddressingMode(),
26584 Store->isTruncatingStore());
26585}
26586
26587SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
26588 SDValue Op, SelectionDAG &DAG) const {
26589 auto *Store = cast<MaskedStoreSDNode>(Op);
26590
26591 SDLoc DL(Op);
26592 EVT VT = Store->getValue().getValueType();
26593 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26594
26595 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26597
26598 return DAG.getMaskedStore(
26599 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
26600 Mask, Store->getMemoryVT(), Store->getMemOperand(),
26601 Store->getAddressingMode(), Store->isTruncatingStore());
26602}
26603
26604SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
26605 SDValue Op, SelectionDAG &DAG) const {
26606 SDLoc dl(Op);
26607 EVT VT = Op.getValueType();
26608 EVT EltVT = VT.getVectorElementType();
26609
26610 bool Signed = Op.getOpcode() == ISD::SDIV;
26611 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
26612
26613 bool Negated;
26614 uint64_t SplatVal;
26615 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
26616 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26617 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
26618 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
26619
26620 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
26621 SDValue Res =
26622 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
26623 if (Negated)
26624 Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
26625 DAG.getConstant(0, dl, ContainerVT), Res);
26626
26627 return convertFromScalableVector(DAG, VT, Res);
26628 }
26629
26630 // Scalable vector i32/i64 DIV is supported.
26631 if (EltVT == MVT::i32 || EltVT == MVT::i64)
26632 return LowerToPredicatedOp(Op, DAG, PredOpcode);
26633
26634 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
26635 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
26636 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
26637 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
26638
26639 // If the wider type is legal: extend, op, and truncate.
26640 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
26641 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
26642 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
26643 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
26644 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
26645 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
26646 }
26647
26648 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
26649 &ExtendOpcode](SDValue Op) {
26650 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
26651 SDValue IdxHalf =
26652 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
26653 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
26654 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
26655 return std::pair<SDValue, SDValue>(
26656 {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
26657 DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
26658 };
26659
26660 // If wider type is not legal: split, extend, op, trunc and concat.
26661 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
26662 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
26663 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
26664 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
26665 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
26666 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
26667 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
26668}
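// --- Illustrative sketch (added for exposition; not part of the original
// file). For i8/i16 element types, the lowering above widens to i32 before
// dividing; when the single widened type is not legal it splits each operand
// into low/high halves, extends each half, divides, truncates and
// concatenates. The same data movement on plain arrays:
#include <array>
#include <cassert>
#include <cstdint>

static std::array<int8_t, 8> sdiv_v8i8(const std::array<int8_t, 8> &A,
                                       const std::array<int8_t, 8> &B) {
  std::array<int8_t, 8> R{};
  for (unsigned Half = 0; Half < 2; ++Half) {    // lo half, then hi half
    for (unsigned I = 0; I < 4; ++I) {
      unsigned Idx = Half * 4 + I;
      int32_t Lhs = A[Idx];                      // sign-extend each half to i32
      int32_t Rhs = B[Idx];
      R[Idx] = static_cast<int8_t>(Lhs / Rhs);   // divide wide, truncate back
    }
  }
  return R;                                      // "concat" of the two halves
}

static void narrowDivExample() {
  std::array<int8_t, 8> A{100, -100, 7, 8, 9, 10, 11, 12};
  std::array<int8_t, 8> B{3, 3, 2, 2, 2, 2, 2, 2};
  std::array<int8_t, 8> R = sdiv_v8i8(A, B);
  assert(R[0] == 33 && R[1] == -33 && R[2] == 3);
}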
26669
26670SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
26671 SDValue Op, SelectionDAG &DAG) const {
26672 EVT VT = Op.getValueType();
26673 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26674
26675 SDLoc DL(Op);
26676 SDValue Val = Op.getOperand(0);
26677 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26678 Val = convertToScalableVector(DAG, ContainerVT, Val);
26679
26680 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
26681 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
26682
26683 // Repeatedly unpack Val until the result is of the desired element type.
26684 switch (ContainerVT.getSimpleVT().SimpleTy) {
26685 default:
26686 llvm_unreachable("unimplemented container type");
26687 case MVT::nxv16i8:
26688 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
26689 if (VT.getVectorElementType() == MVT::i16)
26690 break;
26691 [[fallthrough]];
26692 case MVT::nxv8i16:
26693 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
26694 if (VT.getVectorElementType() == MVT::i32)
26695 break;
26696 [[fallthrough]];
26697 case MVT::nxv4i32:
26698 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
26699 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
26700 break;
26701 }
26702
26703 return convertFromScalableVector(DAG, VT, Val);
26704}
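// --- Illustrative sketch (added for exposition; not part of the original
// file). Each SUNPKLO/UUNPKLO step above keeps the low half of the lanes and
// doubles the element width, so an i8->i32 extend takes two steps
// (nxv16i8 -> nxv8i16 -> nxv4i32). A rough model of that data movement
// (register layout details omitted):
#include <cassert>
#include <cstdint>
#include <vector>

struct LanesSketch {
  unsigned EltBits;            // current element width
  std::vector<int64_t> Vals;   // lane values, low lanes first
};

static LanesSketch unpackLo(const LanesSketch &In) {
  // Keep the low half of the lanes; each value is now held in an element of
  // twice the width (sign/zero extension does not change the value itself).
  return {In.EltBits * 2,
          std::vector<int64_t>(In.Vals.begin(),
                               In.Vals.begin() + In.Vals.size() / 2)};
}

static void extendExample() {
  // A v4i8 value living in the low lanes of a 128-bit nxv16i8 container.
  LanesSketch V{8, {-1, 2, -3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}};
  V = unpackLo(V);             // nxv8i16
  V = unpackLo(V);             // nxv4i32
  assert(V.EltBits == 32 && V.Vals.size() == 4);
  assert(V.Vals[0] == -1 && V.Vals[2] == -3);
}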
26705
26706SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
26707 SDValue Op, SelectionDAG &DAG) const {
26708 EVT VT = Op.getValueType();
26709 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26710
26711 SDLoc DL(Op);
26712 SDValue Val = Op.getOperand(0);
26713 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26714 Val = convertToScalableVector(DAG, ContainerVT, Val);
26715
26716 // Repeatedly truncate Val until the result is of the desired element type.
26717 switch (ContainerVT.getSimpleVT().SimpleTy) {
26718 default:
26719 llvm_unreachable("unimplemented container type");
26720 case MVT::nxv2i64:
26721 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
26722 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
26723 if (VT.getVectorElementType() == MVT::i32)
26724 break;
26725 [[fallthrough]];
26726 case MVT::nxv4i32:
26727 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
26728 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
26729 if (VT.getVectorElementType() == MVT::i16)
26730 break;
26731 [[fallthrough]];
26732 case MVT::nxv8i16:
26733 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
26734 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
26735 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
26736 break;
26737 }
26738
26739 return convertFromScalableVector(DAG, VT, Val);
26740}
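// --- Illustrative sketch (added for exposition; not part of the original
// file). Each truncation step above bitcasts to the next narrower element
// type and applies UZP1 of the value with itself, which keeps the
// even-indexed narrow elements; on a little-endian layout those are the low
// halves of the wide elements. A rough model of one step on plain data:
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint32_t> narrowStep(const std::vector<uint64_t> &Wide) {
  std::vector<uint32_t> Out;
  for (uint64_t V : Wide)
    Out.push_back(static_cast<uint32_t>(V)); // keep the low (even) half
  return Out;
}

static void truncateExample() {
  std::vector<uint64_t> V64{0x1122334455667788ULL, 0xAABBCCDDEEFF0011ULL};
  std::vector<uint32_t> V32 = narrowStep(V64);
  assert(V32[0] == 0x55667788u && V32[1] == 0xEEFF0011u);
}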
26741
26742SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
26743 SDValue Op, SelectionDAG &DAG) const {
26744 EVT VT = Op.getValueType();
26745 EVT InVT = Op.getOperand(0).getValueType();
26746 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
26747
26748 SDLoc DL(Op);
26749 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26750 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26751
26752 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
26753}
26754
26755SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
26756 SDValue Op, SelectionDAG &DAG) const {
26757 EVT VT = Op.getValueType();
26758 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26759
26760 SDLoc DL(Op);
26761 EVT InVT = Op.getOperand(0).getValueType();
26762 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26763 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26764
26765 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
26766 Op.getOperand(1), Op.getOperand(2));
26767
26768 return convertFromScalableVector(DAG, VT, ScalableRes);
26769}
26770
26771// Convert vector operation 'Op' to an equivalent predicated operation whereby
26772// the original operation's type is used to construct a suitable predicate.
26773// NOTE: The results for inactive lanes are undefined.
26774SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
26775 SelectionDAG &DAG,
26776 unsigned NewOp) const {
26777 EVT VT = Op.getValueType();
26778 SDLoc DL(Op);
26779 auto Pg = getPredicateForVector(DAG, DL, VT);
26780
26781 if (VT.isFixedLengthVector()) {
26782 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
26783 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26784
26785 // Create list of operands by converting existing ones to scalable types.
26787 for (const SDValue &V : Op->op_values()) {
26788 if (isa<CondCodeSDNode>(V)) {
26789 Operands.push_back(V);
26790 continue;
26791 }
26792
26793 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
26794 EVT VTArg = VTNode->getVT().getVectorElementType();
26795 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
26796 Operands.push_back(DAG.getValueType(NewVTArg));
26797 continue;
26798 }
26799
26800 assert(isTypeLegal(V.getValueType()) &&
26801 "Expected only legal fixed-width types");
26802 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
26803 }
26804
26805 if (isMergePassthruOpcode(NewOp))
26806 Operands.push_back(DAG.getUNDEF(ContainerVT));
26807
26808 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
26809 return convertFromScalableVector(DAG, VT, ScalableRes);
26810 }
26811
26812 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
26813
26815 for (const SDValue &V : Op->op_values()) {
26816 assert((!V.getValueType().isVector() ||
26817 V.getValueType().isScalableVector()) &&
26818 "Only scalable vectors are supported!");
26819 Operands.push_back(V);
26820 }
26821
26822 if (isMergePassthruOpcode(NewOp))
26823 Operands.push_back(DAG.getUNDEF(VT));
26824
26825 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
26826}
26827
26828// If a fixed length vector operation has no side effects when applied to
26829// undefined elements, we can safely use scalable vectors to perform the same
26830// operation without needing to worry about predication.
26831SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
26832 SelectionDAG &DAG) const {
26833 EVT VT = Op.getValueType();
26835 "Only expected to lower fixed length vector operation!");
26836 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26837
26838 // Create list of operands by converting existing ones to scalable types.
26840 for (const SDValue &V : Op->op_values()) {
26841 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
26842
26843 // Pass through non-vector operands.
26844 if (!V.getValueType().isVector()) {
26845 Ops.push_back(V);
26846 continue;
26847 }
26848
26849 // "cast" fixed length vector to a scalable vector.
26850 assert(V.getValueType().isFixedLengthVector() &&
26851 isTypeLegal(V.getValueType()) &&
26852 "Only fixed length vectors are supported!");
26853 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
26854 }
26855
26856 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
26857 return convertFromScalableVector(DAG, VT, ScalableRes);
26858}
26859
26860SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
26861 SelectionDAG &DAG) const {
26862 SDLoc DL(ScalarOp);
26863 SDValue AccOp = ScalarOp.getOperand(0);
26864 SDValue VecOp = ScalarOp.getOperand(1);
26865 EVT SrcVT = VecOp.getValueType();
26866 EVT ResVT = SrcVT.getVectorElementType();
26867
26868 EVT ContainerVT = SrcVT;
26869 if (SrcVT.isFixedLengthVector()) {
26870 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
26871 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
26872 }
26873
26874 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
26875 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26876
26877 // Convert operands to scalable vector form.
26878 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
26879 DAG.getUNDEF(ContainerVT), AccOp, Zero);
26880
26881 // Perform reduction.
26882 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
26883 Pg, AccOp, VecOp);
26884
26885 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
26886}
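// --- Illustrative sketch (added for exposition; not part of the original
// file). VECREDUCE_SEQ_FADD must preserve the left-to-right association, which
// is why it lowers to FADDA (a strictly ordered accumulate) rather than a tree
// reduction. Ordering is observable in floating point:
#include <cassert>
#include <vector>

static float orderedFAdd(float Acc, const std::vector<float> &V) {
  for (float X : V)
    Acc = Acc + X;               // ((Acc + v0) + v1) + ... as FADDA would
  return Acc;
}

static void faddaExample() {
  // (1e8f + 1.0f) rounds back to 1e8f, so the ordered result is exactly 0.0f;
  // a reassociated reduction could produce 1.0f instead.
  std::vector<float> V{1.0f, -1e8f};
  assert(orderedFAdd(1e8f, V) == 0.0f);
}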
26887
26888SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
26889 SelectionDAG &DAG) const {
26890 SDLoc DL(ReduceOp);
26891 SDValue Op = ReduceOp.getOperand(0);
26892 EVT OpVT = Op.getValueType();
26893 EVT VT = ReduceOp.getValueType();
26894
26895 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
26896 return SDValue();
26897
26898 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
26899
26900 switch (ReduceOp.getOpcode()) {
26901 default:
26902 return SDValue();
26903 case ISD::VECREDUCE_OR:
26904 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
26905 // The predicate can be 'Op' because
26906 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
26907 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
26908 else
26909 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
26910 case ISD::VECREDUCE_AND: {
26911 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
26912 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
26913 }
26914 case ISD::VECREDUCE_XOR: {
26915 SDValue ID =
26916 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
26917 if (OpVT == MVT::nxv1i1) {
26918 // Emulate a CNTP on .Q using .D and a different governing predicate.
26919 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
26920 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
26921 }
26922 SDValue Cntp =
26923 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
26924 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
26925 }
26926 }
26927
26928 return SDValue();
26929}
26930
26931SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
26932 SDValue ScalarOp,
26933 SelectionDAG &DAG) const {
26934 SDLoc DL(ScalarOp);
26935 SDValue VecOp = ScalarOp.getOperand(0);
26936 EVT SrcVT = VecOp.getValueType();
26937
26939 SrcVT,
26940 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
26941 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
26942 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
26943 }
26944
26945 // UADDV always returns an i64 result.
26946 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
26947 SrcVT.getVectorElementType();
26948 EVT RdxVT = SrcVT;
26949 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
26950 RdxVT = getPackedSVEVectorVT(ResVT);
26951
26952 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
26953 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
26954 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
26955 Rdx, DAG.getConstant(0, DL, MVT::i64));
26956
26957 // The VEC_REDUCE nodes expect an element-sized result.
26958 if (ResVT != ScalarOp.getValueType())
26959 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
26960
26961 return Res;
26962}
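// --- Illustrative sketch (added for exposition; not part of the original
// file). As noted above, UADDV_PRED always produces an i64 result, so the
// lowering extracts lane 0 as i64 and then truncates to the scalar type the
// VECREDUCE node expects:
#include <cassert>
#include <cstdint>
#include <vector>

static uint8_t uaddvToI8(const std::vector<uint8_t> &V) {
  uint64_t Acc = 0;                  // the instruction result is always i64
  for (uint8_t X : V)
    Acc += X;
  return static_cast<uint8_t>(Acc);  // truncate to the expected element size
}

static void uaddvExample() {
  std::vector<uint8_t> V(16, 200);   // true sum is 3200
  assert(uaddvToI8(V) == static_cast<uint8_t>(3200)); // i.e. 3200 % 256 == 128
}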
26963
26964SDValue
26965AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
26966 SelectionDAG &DAG) const {
26967 EVT VT = Op.getValueType();
26968 SDLoc DL(Op);
26969
26970 EVT InVT = Op.getOperand(1).getValueType();
26971 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26972 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
26973 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
26974
26975 // Convert the mask to a predicate (NOTE: We don't need to worry about
26976 // inactive lanes since VSELECT is safe when given undefined elements).
26977 EVT MaskVT = Op.getOperand(0).getValueType();
26978 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
26979 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
26981 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
26982
26983 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
26984 Mask, Op1, Op2);
26985
26986 return convertFromScalableVector(DAG, VT, ScalableRes);
26987}
26988
26989SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
26990 SDValue Op, SelectionDAG &DAG) const {
26991 SDLoc DL(Op);
26992 EVT InVT = Op.getOperand(0).getValueType();
26993 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26994
26995 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
26996 "Only expected to lower fixed length vector operation!");
26997 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
26998 "Expected integer result of the same bit length as the inputs!");
26999
27000 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
27001 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
27002 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
27003
27004 EVT CmpVT = Pg.getValueType();
27005 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
27006 {Pg, Op1, Op2, Op.getOperand(2)});
27007
27008 EVT PromoteVT = ContainerVT.changeTypeToInteger();
27009 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
27010 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
27011}
27012
27013SDValue
27014AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
27015 SelectionDAG &DAG) const {
27016 SDLoc DL(Op);
27017 auto SrcOp = Op.getOperand(0);
27018 EVT VT = Op.getValueType();
27019 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27020 EVT ContainerSrcVT =
27021 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
27022
27023 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
27024 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
27025 return convertFromScalableVector(DAG, VT, Op);
27026}
27027
27028SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
27029 SDValue Op, SelectionDAG &DAG) const {
27030 SDLoc DL(Op);
27031 unsigned NumOperands = Op->getNumOperands();
27032
27033 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
27034 "Unexpected number of operands in CONCAT_VECTORS");
27035
27036 auto SrcOp1 = Op.getOperand(0);
27037 auto SrcOp2 = Op.getOperand(1);
27038 EVT VT = Op.getValueType();
27039 EVT SrcVT = SrcOp1.getValueType();
27040
27041 if (NumOperands > 2) {
27043 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27044 for (unsigned I = 0; I < NumOperands; I += 2)
27045 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
27046 Op->getOperand(I), Op->getOperand(I + 1)));
27047
27048 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
27049 }
27050
27051 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27052
27054 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
27055 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
27056
27057 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
27058
27059 return convertFromScalableVector(DAG, VT, Op);
27060}
27061
27062SDValue
27063AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
27064 SelectionDAG &DAG) const {
27065 EVT VT = Op.getValueType();
27066 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27067
27068 SDLoc DL(Op);
27069 SDValue Val = Op.getOperand(0);
27070 SDValue Pg = getPredicateForVector(DAG, DL, VT);
27071 EVT SrcVT = Val.getValueType();
27072 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27073 EVT ExtendVT = ContainerVT.changeVectorElementType(
27074 SrcVT.getVectorElementType());
27075
27076 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27077 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
27078
27079 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
27080 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
27081 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
27082 Pg, Val, DAG.getUNDEF(ContainerVT));
27083
27084 return convertFromScalableVector(DAG, VT, Val);
27085}
27086
27087SDValue
27088AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
27089 SelectionDAG &DAG) const {
27090 EVT VT = Op.getValueType();
27091 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27092
27093 SDLoc DL(Op);
27094 SDValue Val = Op.getOperand(0);
27095 EVT SrcVT = Val.getValueType();
27096 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27097 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
27099 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
27100
27101 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27102 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
27103 Op.getOperand(1), DAG.getUNDEF(RoundVT));
27104 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
27105 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27106
27107 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27108 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27109}
27110
27111SDValue
27112AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
27113 SelectionDAG &DAG) const {
27114 EVT VT = Op.getValueType();
27115 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27116
27117 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
27118 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
27120
27121 SDLoc DL(Op);
27122 SDValue Val = Op.getOperand(0);
27123 EVT SrcVT = Val.getValueType();
27124 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27125 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27126
27127 if (VT.bitsGE(SrcVT)) {
27129
27130 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
27131 VT.changeTypeToInteger(), Val);
27132
27133 // Safe to use a larger than specified operand because by promoting the
27134 // value nothing has changed from an arithmetic point of view.
27135 Val =
27136 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
27137 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27138 DAG.getUNDEF(ContainerDstVT));
27139 return convertFromScalableVector(DAG, VT, Val);
27140 } else {
27141 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
27142 ContainerDstVT.getVectorElementType());
27144
27145 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27146 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27147 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
27148 Val = convertFromScalableVector(DAG, SrcVT, Val);
27149
27150 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27151 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27152 }
27153}
27154
27155SDValue
27156AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
27157 SelectionDAG &DAG) const {
27158 SDLoc DL(Op);
27159 EVT OpVT = Op.getValueType();
27160 assert(OpVT.isScalableVector() &&
27161 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
27162 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
27163 Op.getOperand(1));
27164 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
27165 Op.getOperand(1));
27166 return DAG.getMergeValues({Even, Odd}, DL);
27167}
27168
27169SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
27170 SelectionDAG &DAG) const {
27171 SDLoc DL(Op);
27172 EVT OpVT = Op.getValueType();
27173 assert(OpVT.isScalableVector() &&
27174 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
27175
27176 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
27177 Op.getOperand(1));
27178 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
27179 Op.getOperand(1));
27180 return DAG.getMergeValues({Lo, Hi}, DL);
27181}
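// --- Illustrative sketch (added for exposition; not part of the original
// file). VECTOR_INTERLEAVE maps to ZIP1/ZIP2 (interleave the low/high halves
// of the two inputs) and VECTOR_DEINTERLEAVE to UZP1/UZP2 (gather the
// even/odd lanes), and the two are inverses of each other:
#include <array>
#include <cassert>

static void zipUzpExample() {
  std::array<int, 4> A{0, 1, 2, 3}, B{10, 11, 12, 13};
  std::array<int, 4> Lo{}, Hi{};
  for (int I = 0; I < 2; ++I) {        // ZIP1: interleave the low halves
    Lo[2 * I] = A[I];
    Lo[2 * I + 1] = B[I];
  }
  for (int I = 0; I < 2; ++I) {        // ZIP2: interleave the high halves
    Hi[2 * I] = A[I + 2];
    Hi[2 * I + 1] = B[I + 2];
  }
  assert(Lo == (std::array<int, 4>{0, 10, 1, 11}));
  assert(Hi == (std::array<int, 4>{2, 12, 3, 13}));
  // UZP1 (even lanes) and UZP2 (odd lanes) of {Lo, Hi} recover A and B.
  std::array<int, 4> Even{Lo[0], Lo[2], Hi[0], Hi[2]};
  std::array<int, 4> Odd{Lo[1], Lo[3], Hi[1], Hi[3]};
  assert(Even == A && Odd == B);
}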
27182
27183SDValue
27184AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
27185 SelectionDAG &DAG) const {
27186 EVT VT = Op.getValueType();
27187 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27188
27189 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
27190 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
27192
27193 SDLoc DL(Op);
27194 SDValue Val = Op.getOperand(0);
27195 EVT SrcVT = Val.getValueType();
27196 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27197 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27198
27199 if (VT.bitsGT(SrcVT)) {
27200 EVT CvtVT = ContainerDstVT.changeVectorElementType(
27201 ContainerSrcVT.getVectorElementType());
27203
27204 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27205 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
27206
27207 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
27208 Val = getSVESafeBitCast(CvtVT, Val, DAG);
27209 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27210 DAG.getUNDEF(ContainerDstVT));
27211 return convertFromScalableVector(DAG, VT, Val);
27212 } else {
27213 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
27215
27216 // Safe to use a larger than specified result since an fp_to_int where the
27217 // result doesn't fit into the destination is undefined.
27218 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27219 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27220 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27221
27222 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
27223 }
27224}
27225
27227 ArrayRef<int> ShuffleMask, EVT VT,
27228 EVT ContainerVT, SelectionDAG &DAG) {
27229 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
27230 SDLoc DL(Op);
27231 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
27232 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
27233 bool IsSingleOp =
27234 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
27235
27236 if (!Subtarget.isNeonAvailable() && !MinSVESize)
27237 MinSVESize = 128;
27238
27239 // Bail out on two-operand shuffles when SVE2 is unavailable or when not all
27240 // index values can be represented.
27241 if (!IsSingleOp && !Subtarget.hasSVE2())
27242 return SDValue();
27243
27244 EVT VTOp1 = Op.getOperand(0).getValueType();
27245 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
27246 unsigned IndexLen = MinSVESize / BitsPerElt;
27247 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
27248 uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
27249 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
27250 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
27251 bool MinMaxEqual = (MinSVESize == MaxSVESize);
27252 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
27253 "Incorrectly legalised shuffle operation");
27254
27256 // If MinSVESize is not equal to MaxSVESize then we need to know which
27257 // TBL mask element needs adjustment.
27258 SmallVector<SDValue, 8> AddRuntimeVLMask;
27259
27260 // Bail out for 8-bit element types, because with a 2048-bit SVE register
27261 // size, 8 bits are only sufficient to index into the first source vector.
27262 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
27263 return SDValue();
27264
27265 for (int Index : ShuffleMask) {
27266 // Handle poison index values.
27267 if (Index < 0)
27268 Index = 0;
27269 // If the mask refers to elements in the second operand, then we have to
27270 // offset the index by the number of elements in a vector. If this number
27271 // is not known at compile-time, we need to maintain a mask with 'VL' values
27272 // to add at runtime.
27273 if ((unsigned)Index >= ElementsPerVectorReg) {
27274 if (MinMaxEqual) {
27275 Index += IndexLen - ElementsPerVectorReg;
27276 } else {
27277 Index = Index - ElementsPerVectorReg;
27278 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
27279 }
27280 } else if (!MinMaxEqual)
27281 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27282 // For 8-bit elements with a 1024-bit SVE register, MaxOffset (255) might
27283 // point to the last element of the second operand of the shufflevector,
27284 // so we reject any index that reaches MaxOffset.
27285 if ((unsigned)Index >= MaxOffset)
27286 return SDValue();
27287 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
27288 }
27289
27290 // Pad the tail of the mask with out-of-range indices so TBL zeroes those
27291 // lanes; padding with index 0 would instead duplicate the first lane into
27292 // the excess elements. Note that for i8 elements an out-of-range index can
27293 // still be valid with a 2048-bit vector register size.
27294 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
27295 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
27296 if (!MinMaxEqual)
27297 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27298 }
27299
27300 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
27301 SDValue VecMask =
27302 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27303 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
27304
27305 SDValue Shuffle;
27306 if (IsSingleOp)
27307 Shuffle =
27308 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27309 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
27310 Op1, SVEMask);
27311 else if (Subtarget.hasSVE2()) {
27312 if (!MinMaxEqual) {
27313 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
27314 SDValue VScale = (BitsPerElt == 64)
27315 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
27316 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
27317 SDValue VecMask =
27318 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27319 SDValue MulByMask = DAG.getNode(
27320 ISD::MUL, DL, MaskType,
27321 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
27322 DAG.getBuildVector(MaskType, DL,
27323 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
27324 SDValue UpdatedVecMask =
27325 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
27326 SVEMask = convertToScalableVector(
27327 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
27328 }
27329 Shuffle =
27330 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27331 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
27332 Op1, Op2, SVEMask);
27333 }
27334 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
27335 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
27336}
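// --- Illustrative sketch (added for exposition; not part of the original
// file). It models the mask construction above for the simple case where the
// exact register length is known (MinSVESize == MaxSVESize): indices that
// refer to the second operand are rebased by IndexLen - ElementsPerVectorReg,
// and the unused tail is padded with out-of-range indices so TBL zeroes those
// lanes:
#include <cassert>
#include <vector>

static std::vector<unsigned> buildTBLMask(const std::vector<int> &ShuffleMask,
                                          unsigned ElementsPerVectorReg,
                                          unsigned IndexLen,
                                          unsigned MaxOffset) {
  std::vector<unsigned> TBLMask;
  for (int Index : ShuffleMask) {
    if (Index < 0)
      Index = 0;                                 // poison index: any lane will do
    unsigned Idx = static_cast<unsigned>(Index);
    if (Idx >= ElementsPerVectorReg)
      Idx += IndexLen - ElementsPerVectorReg;    // rebase into the second register
    TBLMask.push_back(Idx);
  }
  while (TBLMask.size() < IndexLen)
    TBLMask.push_back(MaxOffset);                // out-of-range => lane reads as zero
  return TBLMask;
}

static void tblMaskExample() {
  // v4i32 shuffle <0, 5, 2, 7> with a 256-bit register: IndexLen == 8.
  std::vector<unsigned> M =
      buildTBLMask({0, 5, 2, 7}, /*ElementsPerVectorReg=*/4, /*IndexLen=*/8,
                   /*MaxOffset=*/0xFFFFFFFFu);
  assert((M == std::vector<unsigned>{0, 9, 2, 11, 0xFFFFFFFFu, 0xFFFFFFFFu,
                                     0xFFFFFFFFu, 0xFFFFFFFFu}));
}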
27337
27338SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
27339 SDValue Op, SelectionDAG &DAG) const {
27340 EVT VT = Op.getValueType();
27341 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27342
27343 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
27344 auto ShuffleMask = SVN->getMask();
27345
27346 SDLoc DL(Op);
27347 SDValue Op1 = Op.getOperand(0);
27348 SDValue Op2 = Op.getOperand(1);
27349
27350 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27351 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
27352 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
27353
27354 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
27355 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
27356 return MVT::i32;
27357 return ScalarTy;
27358 };
27359
27360 if (SVN->isSplat()) {
27361 unsigned Lane = std::max(0, SVN->getSplatIndex());
27362 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27363 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27364 DAG.getConstant(Lane, DL, MVT::i64));
27365 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
27366 return convertFromScalableVector(DAG, VT, Op);
27367 }
27368
27369 bool ReverseEXT = false;
27370 unsigned Imm;
27371 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
27372 Imm == VT.getVectorNumElements() - 1) {
27373 if (ReverseEXT)
27374 std::swap(Op1, Op2);
27375 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27376 SDValue Scalar = DAG.getNode(
27377 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27378 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
27379 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
27380 return convertFromScalableVector(DAG, VT, Op);
27381 }
27382
27383 for (unsigned LaneSize : {64U, 32U, 16U}) {
27384 if (isREVMask(ShuffleMask, VT, LaneSize)) {
27385 EVT NewVT =
27387 unsigned RevOp;
27388 unsigned EltSz = VT.getScalarSizeInBits();
27389 if (EltSz == 8)
27391 else if (EltSz == 16)
27393 else
27395
27396 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27397 Op = LowerToPredicatedOp(Op, DAG, RevOp);
27398 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27399 return convertFromScalableVector(DAG, VT, Op);
27400 }
27401 }
27402
27403 if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 &&
27404 isREVMask(ShuffleMask, VT, 128)) {
27405 if (!VT.isFloatingPoint())
27406 return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27407
27409 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27410 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27411 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27412 return convertFromScalableVector(DAG, VT, Op);
27413 }
27414
27415 unsigned WhichResult;
27416 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27418 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
27419
27420 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
27421 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27423 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27424 }
27425
27426 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27428 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
27429
27430 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27431 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27433 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27434 }
27435
27436 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
27437 // represents the same logical operation as performed by a ZIP instruction. In
27438 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
27439 // equivalent to an AArch64 instruction. There's the extra component of
27440 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
27441 // only operated on 64/128bit vector types that have a direct mapping to a
27442 // target register and so an exact mapping is implied.
27443 // However, when using SVE for fixed length vectors, most legal vector types
27444 // are actually sub-vectors of a larger SVE register. When mapping
27445 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
27446 // how the mask's indices translate. Specifically, when the mapping requires
27447 // an exact meaning for a specific vector index (e.g. Index X is the last
27448 // vector element in the register) then such mappings are often only safe when
27449 // the exact SVE register size is known. The main exception to this is when
27450 // indices are logically relative to the first element of either
27451 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
27452 // when converting from fixed-length to scalable vector types (i.e. the start
27453 // of a fixed length vector is always the start of a scalable vector).
27454 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
27455 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
27456 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
27457 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
27458 Op2.isUndef()) {
27459 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
27460 return convertFromScalableVector(DAG, VT, Op);
27461 }
27462
27463 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
27465 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
27466
27467 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
27468 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27470 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27471 }
27472
27473 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
27475 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
27476
27477 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27478 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27480 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27481 }
27482 }
27483
27484 // Avoid producing TBL instruction if we don't know SVE register minimal size,
27485 // unless NEON is not available and we can assume minimal SVE register size is
27486 // 128-bits.
27487 if (MinSVESize || !Subtarget->isNeonAvailable())
27488 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
27489 DAG);
27490
27491 return SDValue();
27492}
27493
27494SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
27495 SelectionDAG &DAG) const {
27496 SDLoc DL(Op);
27497 EVT InVT = Op.getValueType();
27498
27499 assert(VT.isScalableVector() && isTypeLegal(VT) &&
27500 InVT.isScalableVector() && isTypeLegal(InVT) &&
27501 "Only expect to cast between legal scalable vector types!");
27502 assert(VT.getVectorElementType() != MVT::i1 &&
27503 InVT.getVectorElementType() != MVT::i1 &&
27504 "For predicate bitcasts, use getSVEPredicateBitCast");
27505
27506 if (InVT == VT)
27507 return Op;
27508
27510 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
27511
27512 // Safe bitcasting between unpacked vector types of different element counts
27513 // is currently unsupported because the following is missing the necessary
27514 // work to ensure the result's elements live where they're supposed to within
27515 // an SVE register.
27516 // 01234567
27517 // e.g. nxv2i32 = XX??XX??
27518 // nxv4f16 = X?X?X?X?
27520 VT == PackedVT || InVT == PackedInVT) &&
27521 "Unexpected bitcast!");
27522
27523 // Pack input if required.
27524 if (InVT != PackedInVT)
27525 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
27526
27527 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
27528
27529 // Unpack result if required.
27530 if (VT != PackedVT)
27532
27533 return Op;
27534}
27535
27537 SDValue N) const {
27538 return ::isAllActivePredicate(DAG, N);
27539}
27540
27542 return ::getPromotedVTForPredicate(VT);
27543}
27544
27545bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
27546 SDValue Op, const APInt &OriginalDemandedBits,
27547 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
27548 unsigned Depth) const {
27549
27550 unsigned Opc = Op.getOpcode();
27551 switch (Opc) {
27552 case AArch64ISD::VSHL: {
27553 // Match (VSHL (VLSHR Val X) X)
27554 SDValue ShiftL = Op;
27555 SDValue ShiftR = Op->getOperand(0);
27556 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
27557 return false;
27558
27559 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
27560 return false;
27561
27562 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
27563 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
27564
27565 // Other cases can be handled as well, but this is not
27566 // implemented.
27567 if (ShiftRBits != ShiftLBits)
27568 return false;
27569
27570 unsigned ScalarSize = Op.getScalarValueSizeInBits();
27571 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
27572
27573 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
27574 APInt UnusedBits = ~OriginalDemandedBits;
27575
27576 if ((ZeroBits & UnusedBits) != ZeroBits)
27577 return false;
27578
27579 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
27580 // used - simplify to just Val.
27581 return TLO.CombineTo(Op, ShiftR->getOperand(0));
27582 }
27583 case AArch64ISD::BICi: {
27584 // Fold BICi if all destination bits already known to be zeroed
27585 SDValue Op0 = Op.getOperand(0);
27586 KnownBits KnownOp0 =
27587 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
27588 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
27589 uint64_t BitsToClear = Op->getConstantOperandVal(1)
27590 << Op->getConstantOperandVal(2);
27591 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
27592 if (APInt(Known.getBitWidth(), BitsToClear)
27593 .isSubsetOf(AlreadyZeroedBitsToClear))
27594 return TLO.CombineTo(Op, Op0);
27595
27596 Known = KnownOp0 &
27597 KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear));
27598
27599 return false;
27600 }
27602 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
27603 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
27604 if (!MaxSVEVectorSizeInBits)
27605 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
27606 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
27607 // The SVE count intrinsics don't support the multiplier immediate so we
27608 // don't have to account for that here. The value returned may be slightly
27609 // over the true required bits, as this is based on the "ALL" pattern. The
27610 // other patterns are also exposed by these intrinsics, but they all
27611 // return a value that's strictly less than "ALL".
27612 unsigned RequiredBits = llvm::bit_width(MaxElements);
27613 unsigned BitWidth = Known.Zero.getBitWidth();
27614 if (RequiredBits < BitWidth)
27615 Known.Zero.setHighBits(BitWidth - RequiredBits);
27616 return false;
27617 }
27618 }
27619 }
27620
27622 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
27623}
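// --- Illustrative sketch (added for exposition; not part of the original
// file). The VSHL case above recognises (VSHL (VLSHR Val, N), N), which only
// zeroes the low N bits of Val; if no user demands those bits, the pair can be
// replaced by Val itself:
#include <cassert>
#include <cstdint>

static void shiftPairDemandedBitsExample() {
  uint32_t Val = 0x12345678u;
  unsigned N = 8;
  uint32_t ShiftPair = (Val >> N) << N;      // zeroes the low 8 bits only
  uint32_t DemandedBits = 0xFFFFFF00u;       // users never read the low 8 bits
  // Under that demand, the shift pair is indistinguishable from Val.
  assert((ShiftPair & DemandedBits) == (Val & DemandedBits));
}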
27624
27625bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
27626 return Op.getOpcode() == AArch64ISD::DUP ||
27627 Op.getOpcode() == AArch64ISD::MOVI ||
27628 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
27629 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
27631}
27632
27634 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
27635 Subtarget->hasComplxNum();
27636}
27637
27640 auto *VTy = dyn_cast<VectorType>(Ty);
27641 if (!VTy)
27642 return false;
27643
27644 // If the vector is scalable, SVE is enabled, implying support for complex
27645 // numbers. Otherwise, we need to ensure complex number support is available.
27646 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
27647 return false;
27648
27649 auto *ScalarTy = VTy->getScalarType();
27650 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
27651
27652 // We can only process vectors that have a bit size of 128 or higher (or
27653 // exactly 64 bits for Neon). Additionally, these vectors must have a
27654 // power-of-2 size, as we later split them into the smallest supported size
27655 // and merge them back together after applying the complex operation.
27656 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
27657 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
27658 !llvm::isPowerOf2_32(VTyWidth))
27659 return false;
27660
27661 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
27662 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
27663 return 8 <= ScalarWidth && ScalarWidth <= 64;
27664 }
27665
27666 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
27667 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
27668}
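// Editorial note (illustrative sketch, not part of the upstream source):
// examples of how the checks above classify types, assuming the named feature
// is present on the subtarget:
//   <4 x float>        128-bit NEON, FCMA available        -> supported
//   <2 x float>        64-bit NEON, FCMA available         -> supported
//   <vscale x 4 x i32> scalable i32, SVE2 available        -> supported
//   <3 x float>        96 bits, not a power of two         -> not supported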
27669
27670Value *AArch64TargetLowering::createComplexDeinterleavingIR(
27671 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
27672 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
27673 Value *Accumulator) const {
27674 VectorType *Ty = cast<VectorType>(InputA->getType());
27675 bool IsScalable = Ty->isScalableTy();
27676 bool IsInt = Ty->getElementType()->isIntegerTy();
27677
27678 unsigned TyWidth =
27679 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
27680
27681 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
27682 "Vector type must be either 64 bits or a power of 2 that is at least 128 bits");
27683
27684 if (TyWidth > 128) {
27685 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
27686 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
27687 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
27688 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
27689 auto *UpperSplitA =
27690 B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
27691 auto *UpperSplitB =
27692 B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
27693 Value *LowerSplitAcc = nullptr;
27694 Value *UpperSplitAcc = nullptr;
27695 if (Accumulator) {
27696 LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
27697 UpperSplitAcc =
27698 B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
27699 }
27700 auto *LowerSplitInt = createComplexDeinterleavingIR(
27701 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
27702 auto *UpperSplitInt = createComplexDeinterleavingIR(
27703 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
27704
27705 auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
27706 B.getInt64(0));
27707 return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
27708 }
27709
27710 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
27711 if (Accumulator == nullptr)
27712 Accumulator = Constant::getNullValue(Ty);
27713
27714 if (IsScalable) {
27715 if (IsInt)
27716 return B.CreateIntrinsic(
27717 Intrinsic::aarch64_sve_cmla_x, Ty,
27718 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27719
27720 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27721 return B.CreateIntrinsic(
27722 Intrinsic::aarch64_sve_fcmla, Ty,
27723 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27724 }
27725
27726 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
27727 Intrinsic::aarch64_neon_vcmla_rot90,
27728 Intrinsic::aarch64_neon_vcmla_rot180,
27729 Intrinsic::aarch64_neon_vcmla_rot270};
27730
27731
27732 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
27733 {Accumulator, InputA, InputB});
27734 }
27735
27736 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
27737 if (IsScalable) {
27738 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
27739 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
27740 if (IsInt)
27741 return B.CreateIntrinsic(
27742 Intrinsic::aarch64_sve_cadd_x, Ty,
27743 {InputA, InputB, B.getInt32((int)Rotation * 90)});
27744
27745 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27746 return B.CreateIntrinsic(
27747 Intrinsic::aarch64_sve_fcadd, Ty,
27748 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
27749 }
27750 return nullptr;
27751 }
27752
27753 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
27754 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
27755 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
27756 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
27757 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
27758
27759 if (IntId == Intrinsic::not_intrinsic)
27760 return nullptr;
27761
27762 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
27763 }
27764
27765 return nullptr;
27766}
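// Editorial note (illustrative sketch, not part of the upstream source): for a
// hypothetical <vscale x 8 x float> operand (256 bits per vscale increment,
// i.e. TyWidth == 256), the branch above splits each input with
// CreateExtractVector into two <vscale x 4 x float> halves at indices 0 and
// Stride = 4, emits the operation per half (e.g. llvm.aarch64.sve.fcmla), and
// stitches the results back together with CreateInsertVector.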
27767
27768bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
27769 unsigned Opc = N->getOpcode();
27770 if (ISD::isExtOpcode(Opc)) {
27771 if (any_of(N->uses(),
27772 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
27773 return false;
27774 }
27775 return true;
27776}
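// Editorial note (illustrative sketch, not part of the upstream source): the
// exception above keeps a splatted extend in vector form when it feeds a
// multiply, so that patterns such as mul(zext(<8 x i8>), zext(<8 x i8>)) can
// still be matched as a widening multiply (umull/smull) instead of being
// rebuilt from a scalarised extend.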
27777
27778unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
27779 return Subtarget->getMinimumJumpTableEntries();
27780}
27781
27782MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
27783 CallingConv::ID CC,
27784 EVT VT) const {
27785 bool NonUnitFixedLengthVector =
27786 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27787 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27788 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
27789
27790 EVT VT1;
27791 MVT RegisterVT;
27792 unsigned NumIntermediates;
27793 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
27794 RegisterVT);
27795 return RegisterVT;
27796}
27797
27798unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
27799 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
27800 bool NonUnitFixedLengthVector =
27801 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27802 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27803 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
27804
27805 EVT VT1;
27806 MVT VT2;
27807 unsigned NumIntermediates;
27808 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
27809 NumIntermediates, VT2);
27810}
27811
27812unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
27813 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
27814 unsigned &NumIntermediates, MVT &RegisterVT) const {
27815 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
27816 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
27817 if (!RegisterVT.isFixedLengthVector() ||
27818 RegisterVT.getFixedSizeInBits() <= 128)
27819 return NumRegs;
27820
27821 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
27822 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
27823 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
27824
27825 // A size mismatch here implies either type promotion or widening and would
27826 // have resulted in scalarisation if larger vectors had not been available.
27827 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
27828 EVT EltTy = VT.getVectorElementType();
27829 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
27830 if (!isTypeLegal(NewVT))
27831 NewVT = EltTy;
27832
27833 IntermediateVT = NewVT;
27834 NumIntermediates = VT.getVectorNumElements();
27835 RegisterVT = getRegisterType(Context, NewVT);
27836 return NumIntermediates;
27837 }
27838
27839 // SVE VLS support does not introduce a new ABI so we should use NEON sized
27840 // types for vector arguments and returns.
27841
27842 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
27843 NumIntermediates *= NumSubRegs;
27844 NumRegs *= NumSubRegs;
27845
27846 switch (RegisterVT.getVectorElementType().SimpleTy) {
27847 default:
27848 llvm_unreachable("unexpected element type for vector");
27849 case MVT::i8:
27850 IntermediateVT = RegisterVT = MVT::v16i8;
27851 break;
27852 case MVT::i16:
27853 IntermediateVT = RegisterVT = MVT::v8i16;
27854 break;
27855 case MVT::i32:
27856 IntermediateVT = RegisterVT = MVT::v4i32;
27857 break;
27858 case MVT::i64:
27859 IntermediateVT = RegisterVT = MVT::v2i64;
27860 break;
27861 case MVT::f16:
27862 IntermediateVT = RegisterVT = MVT::v8f16;
27863 break;
27864 case MVT::f32:
27865 IntermediateVT = RegisterVT = MVT::v4f32;
27866 break;
27867 case MVT::f64:
27868 IntermediateVT = RegisterVT = MVT::v2f64;
27869 break;
27870 case MVT::bf16:
27871 IntermediateVT = RegisterVT = MVT::v8bf16;
27872 break;
27873 }
27874
27875 return NumRegs;
27876}
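// Editorial note (illustrative sketch, not part of the upstream source): as a
// worked example, assuming SVE fixed-length lowering with 256-bit vectors, a
// RegisterVT of v8i32 (256 bits) whose total size matches VT is re-split above
// into NumSubRegs = 256 / 128 = 2 pieces, RegisterVT and IntermediateVT become
// v4i32, and both NumIntermediates and NumRegs are doubled so arguments are
// still passed in NEON-sized (128-bit) registers.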
27877
27878bool AArch64TargetLowering::hasInlineStackProbe(
27879 const MachineFunction &MF) const {
27880 return !Subtarget->isTargetWindows() &&
27881 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
27882}
27883
27884#ifndef NDEBUG
27885void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
27886 switch (N->getOpcode()) {
27887 default:
27888 break;
27889 case AArch64ISD::SUNPKLO:
27890 case AArch64ISD::SUNPKHI:
27891 case AArch64ISD::UUNPKLO:
27892 case AArch64ISD::UUNPKHI: {
27893 assert(N->getNumValues() == 1 && "Expected one result!");
27894 assert(N->getNumOperands() == 1 && "Expected one operand!");
27895 EVT VT = N->getValueType(0);
27896 EVT OpVT = N->getOperand(0).getValueType();
27897 assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
27898 VT.isInteger() && "Expected integer vectors!");
27899 assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
27900 "Expected vectors of equal size!");
27901 // TODO: Enable assert once bogus creations have been fixed.
27902 // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 &&
27903 // "Expected result vector with half the lanes of its input!");
27904 break;
27905 }
27906 case AArch64ISD::TRN1:
27907 case AArch64ISD::TRN2:
27908 case AArch64ISD::UZP1:
27909 case AArch64ISD::UZP2:
27910 case AArch64ISD::ZIP1:
27911 case AArch64ISD::ZIP2: {
27912 assert(N->getNumValues() == 1 && "Expected one result!");
27913 assert(N->getNumOperands() == 2 && "Expected two operands!");
27914 EVT VT = N->getValueType(0);
27915 EVT Op0VT = N->getOperand(0).getValueType();
27916 EVT Op1VT = N->getOperand(1).getValueType();
27917 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
27918 "Expected vectors!");
27919 // TODO: Enable assert once bogus creations have been fixed.
27920 // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
27921 break;
27922 }
27923 }
27924}
27925#endif
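// Editorial note (illustrative, not part of the upstream source): the verifier
// above is compiled only in asserts builds because of the NDEBUG guard. As an
// example of what it catches, a ZIP1/ZIP2 node accidentally created with a
// single operand would fail the "Expected two operands!" assertion when the
// node is verified.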
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II, StoreInst *SI) const override
Lower an interleave intrinsic to a target specific store intrinsic.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, SDValue PStateSM=SDValue()) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
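The overrides above are the hooks that the generic SelectionDAG and IR passes query during lowering. As a hedged, purely illustrative sketch (not the authoritative AArch64 rule implemented in this file), an isLegalAddImmediate-style hook usually just tests whether the immediate fits the encoding of the target's add instruction:

#include <cstdint>

// Hypothetical helper, for illustration only: accept a 12-bit unsigned
// immediate, optionally shifted left by 12 -- the general shape of such
// checks, not the exact policy implemented in this file.
static bool isAddImmEncodableSketch(int64_t Imm) {
  uint64_t V = static_cast<uint64_t>(Imm);
  bool Fits12 = (V >> 12) == 0;                               // imm12
  bool Fits12Shl12 = (V & 0xfffULL) == 0 && (V >> 24) == 0;   // imm12 << 12
  return Fits12 || Fits12Shl12;
}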
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:207
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
static void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1860
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:349
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:187
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:307
APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1898
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1144
APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1905
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1589
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:197
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1703
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
bool isMask(unsigned numBits) const
Definition: APInt.h:466
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:312
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1235
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:319
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:367
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
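A brief, hedged example of how a few of the APInt helpers above compose; the widths and values are illustrative only:

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

void apintSketch() {
  APInt Mask = APInt::getLowBitsSet(/*numBits=*/32, /*loBitsSet=*/8); // 0xff
  assert(Mask.isMask(8));              // exactly the low 8 bits are set
  assert(Mask.countr_zero() == 0);     // no trailing zero bits
  APInt Wide = Mask.zext(64);          // zero-extend to 64 bits
  assert(Wide.getBitWidth() == 64 && Wide.getZExtValue() == 0xff);
  APInt Sign = APInt::getSignMask(32); // 0x80000000
  assert(Sign.isPowerOf2() && Sign.logBase2() == 31);
}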
an instruction to allocate memory on the stack
Definition: Instructions.h:59
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ And
*p = old & v
Definition: Instructions.h:768
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
bool isFloatingPointOperation() const
Definition: Instructions.h:922
BinOp getOperation() const
Definition: Instructions.h:845
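As a hedged sketch of how a target hook might consult these accessors (this is not the actual AArch64 expansion policy):

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Illustrative only: float RMW ops and Nand usually have no single native
// instruction, so a backend might choose to expand them to a CAS loop.
static bool prefersCmpXchgLoopSketch(const AtomicRMWInst *AI) {
  return AI->isFloatingPointOperation() ||
         AI->getOperation() == AtomicRMWInst::Nand;
}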
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:93
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
A "pseudo-class" with methods for operating on BUILD_VECTORs.
ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
unsigned arg_size() const
Definition: InstrTypes.h:1685
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1871
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:205
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:145
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:110
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:238
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
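A minimal, hedged illustration of the DataLayout queries listed above (the results depend on the module's data layout string):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

void dataLayoutSketch(const DataLayout &DL, LLVMContext &Ctx) {
  Type *I64 = Type::getInt64Ty(Ctx);
  TypeSize Size = DL.getTypeAllocSize(I64);   // typically 8 bytes
  Align PrefAlign = DL.getPrefTypeAlign(I64); // typically 8
  bool LE = DL.isLittleEndian();
  (void)Size; (void)PrefAlign; (void)LE;
}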
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:299
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:307
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:685
bool empty() const
Definition: Function.h:809
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:202
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:682
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:264
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1907
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:340
arg_iterator arg_end()
Definition: Function.h:827
arg_iterator arg_begin()
Definition: Function.h:818
size_t size() const
Definition: Function.h:808
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:356
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:675
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:263
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:529
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2137
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1037
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2472
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1881
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2523
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1045
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:539
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2170
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1214
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2516
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2067
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2122
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1437
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:476
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2081
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:491
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2127
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1416
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2021
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
LLVMContext & getContext() const
Definition: IRBuilder.h:176
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2117
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2007
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1497
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:569
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2412
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:516
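A hedged sketch combining a few of the IRBuilder creation calls above; packing two i8 values into an i16 is purely illustrative:

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *packTwoBytesSketch(IRBuilderBase &B, Value *Lo8, Value *Hi8) {
  Type *I16 = B.getIntNTy(16);
  Value *Lo = B.CreateZExt(Lo8, I16, "lo");
  Value *Hi = B.CreateZExt(Hi8, I16, "hi");
  Value *HiShifted = B.CreateShl(Hi, ConstantInt::get(I16, 8), "hi.shl");
  return B.CreateOr(Lo, HiShifted, "packed");
}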
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
This instruction inserts a single (scalar) element into a VectorType value.
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
Definition: Instruction.cpp:83
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:87
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
Definition: LowLevelType.h:203
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:184
Value * getPointerOperand()
Definition: Instructions.h:280
Type * getPointerOperandType() const
Definition: Instructions.h:283
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
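A small, hedged example of constructing and querying an MVT with the helpers above (the header path has moved between LLVM releases, so treat the include as an assumption):

#include "llvm/CodeGenTypes/MachineValueType.h" // path differs in older LLVM
#include <cassert>
using namespace llvm;

void mvtSketch() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
  assert(V4I32.isVector() && V4I32.is128BitVector());
  assert(V4I32.getVectorElementType() == MVT::i32);
  assert(V4I32.getScalarSizeInBits() == 32);
}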
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
int getStackProtectorIndex() const
Return the index for the stack protector object.
int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
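A hedged sketch of reserving a fixed-size scratch slot through the MachineFrameInfo API above; the size and alignment are illustrative:

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

int createScratchSlotSketch(MachineFunction &MF) {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  // Returns a nonnegative frame index identifying the new object.
  return MFI.CreateStackObject(/*Size=*/16, Align(16), /*isSpillSlot=*/false);
}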
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
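A hedged sketch of the BuildMI chaining style that these operand helpers support; the opcode and registers are placeholders rather than real AArch64 instructions:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

void emitThreeOperandSketch(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator InsertPt,
                            const DebugLoc &DL, const TargetInstrInfo &TII,
                            unsigned Opcode, Register Dst, Register Src) {
  BuildMI(MBB, InsertPt, DL, TII.get(Opcode), Dst)
      .addReg(Src)
      .addImm(0); // immediate operand, e.g. a shift amount of zero
}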
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:60
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition: Module.cpp:690
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:136
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1827
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
void dump() const
Dump this node, for debugging.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Return true if the type of the node is undefined.
void setFlags(SDNodeFlags NewFlags)
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
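A hedged sketch of the kind of operand walk these SDValue accessors enable in a DAG combine; the opcode test is illustrative:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Matches (add x, x), i.e. an ADD whose two operands are the same value.
static bool isAddOfSameValueSketch(SDValue V) {
  return V.getOpcode() == ISD::ADD && V.getNumOperands() == 2 &&
         V.getOperand(0) == V.getOperand(1);
}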
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasZAState() const
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:586
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:713
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:474
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:732
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:480
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:659
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:773
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:845
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:739
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
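As a hedged sketch of building a node with the SelectionDAG helpers above (the opcode and operands are illustrative; DL and VT would come from the node being lowered):

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Builds (or X, 1) in the requested value type.
static SDValue buildOrOneSketch(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                                SDValue X) {
  SDValue One = DAG.getConstant(1, DL, VT);
  return DAG.getNode(ISD::OR, DL, VT, X, One);
}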
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
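A hedged example of classifying a mask with the static helpers above; the mask values are made up for illustration:

#include "llvm/IR/Instructions.h"
#include <cassert>
using namespace llvm;

void classifyMaskSketch() {
  int Rev[] = {3, 2, 1, 0};
  assert(ShuffleVectorInst::isSingleSourceMask(Rev, /*NumSrcElts=*/4));
  assert(ShuffleVectorInst::isReverseMask(Rev, /*NumSrcElts=*/4));
}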
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:456
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:670
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
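A hedged sketch of the StringRef helpers listed above; the string format is invented for illustration:

#include "llvm/ADT/StringRef.h"
using namespace llvm;

// Parses the digits in positions [1, 3) of strings like "v16b".
// getAsInteger returns true on failure, so the result is negated.
static bool parseLaneCountSketch(StringRef S, unsigned &Out) {
  StringRef Digits = S.slice(1, 3); // "16" for "v16b"
  return !Digits.getAsInteger(/*Radix=*/10, Out);
}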
StringRef save(const char *S)
Definition: StringSaver.h:30
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
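A minimal, hedged example of the StringSwitch pattern above; the cases are illustrative, not taken from this file:

#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

static unsigned parseElementWidthSketch(StringRef S) {
  return StringSwitch<unsigned>(S)
      .Case("b", 8)
      .Case("h", 16)
      .Case("s", 32)
      .Case("d", 64)
      .Default(0);
}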
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
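The configuration hooks above are normally invoked from a target's TargetLowering constructor. The fragment below is a hedged sketch of that pattern only; the register class name, subtarget accessor, and the concrete actions chosen are illustrative, not AArch64's actual settings:
  // Inside a hypothetical XXXTargetLowering constructor:
  addRegisterClass(MVT::i64, &XXX::GPR64RegClass);             // i64 lives in the GPR64 class
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);          // split into SDIV + MUL + SUB
  setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i1, Promote); // no native i1 sign-extending load
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);             // no direct f64->f16 truncating store
  computeRegisterProperties(Subtarget->getRegisterInfo());     // derive remaining properties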
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
virtual bool useLoadStackGuardNode() const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op; at this point, we know that only the DemandedBits bits of the result of Op are ever used downstream.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:662
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:629
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static Type * getHalfTy(LLVMContext &C)
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:252
static Type * getBFloatTy(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition: Type.h:377
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
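These Type queries compose naturally; for instance, to test whether some Ty (an llvm::Type*) is f32 or a vector of f32:
  llvm::Type *Scalar = Ty->getScalarType();                      // element type for vectors, Ty otherwise
  bool IsF32Like = Scalar->getTypeID() == llvm::Type::FloatTyID; // true for float and <N x float>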
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:182
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:454
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:472
Type * getElementType() const
Definition: DerivedTypes.h:436
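VectorType::get takes an ElementCount, so fixed and scalable vectors are built the same way; a sketch assuming Ctx is an existing LLVMContext:
  llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
  auto *V4I32   = llvm::VectorType::get(I32, llvm::ElementCount::getFixed(4));    // <4 x i32>
  auto *NxV4I32 = llvm::VectorType::get(I32, llvm::ElementCount::getScalable(4)); // <vscale x 4 x i32>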
This class represents zero extension of integer types.
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:239
self_iterator getIterator()
Definition: ilist_node.h:109
#define UINT64_MAX
Definition: DataTypes.h:77
#define INT64_MAX
Definition: DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
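A hedged sketch of how these two helpers pair up when materializing an AND/ORR/EOR immediate; 0xFF00FF00FF00FF00 is a repeating bit pattern and therefore encodable:
  uint64_t Imm = 0xFF00FF00FF00FF00ULL;
  if (AArch64_AM::isLogicalImmediate(Imm, /*regSize=*/64)) {
    uint64_t Enc = AArch64_AM::encodeLogicalImmediate(Imm, /*regSize=*/64);
    (void)Enc; // would become the immediate field of an ANDXri/ORRXri/EORXri
  }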
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:265
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:221
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:224
@ CFGuard_Check
Special calling convention on Windows for calling the Control Flow Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:241
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:159
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:255
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:260
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:236
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1132
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1128
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:476
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1345
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1376
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:250
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:559
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:714
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1161
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1247
@ STRICT_FCEIL
Definition: ISDOpcodes.h:426
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1037
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:783
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:436
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:543
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1361
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1365
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:688
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1375
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:477
@ STRICT_FLOG2
Definition: ISDOpcodes.h:421
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1273
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:903
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1274
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:939
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:411
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1406
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:885
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:662
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:450
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:620
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:101
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1358
@ WRITE_REGISTER
Definition: ISDOpcodes.h:119
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1227
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1362
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:994
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:758
@ STRICT_LROUND
Definition: ISDOpcodes.h:431
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:930
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1083
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:327
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1058
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1062
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the same...
Definition: ISDOpcodes.h:586
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:646
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:349
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ STRICT_FPOWI
Definition: ISDOpcodes.h:413
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1243
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:211
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1377
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:627
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1157
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:323
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:430
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1370
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:880
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1272
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:600
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1271
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:573
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:435
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:424
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:118
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1217
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:856
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:425
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:742
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1335
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1254
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:971
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1221
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:331
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1047
@ STRICT_LRINT
Definition: ISDOpcodes.h:433
@ ConstantPool
Definition: ISDOpcodes.h:82
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:798
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:674
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:591
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ STRICT_FROUND
Definition: ISDOpcodes.h:428
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:736
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:449
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1378
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:427
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:429
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:94
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1269
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:443
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:465
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:442
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:990
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1270
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1188
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:158
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:470
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1214
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:657
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:400
@ STRICT_FLOG10
Definition: ISDOpcodes.h:420
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:524
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_LLRINT
Definition: ISDOpcodes.h:434
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:612
@ STRICT_FEXP2
Definition: ISDOpcodes.h:418
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1268
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:106
@ STRICT_LLROUND
Definition: ISDOpcodes.h:432
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:831
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:423
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:855
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1366
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1152
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1076
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:763
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:493
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:340
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:422
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the sa...
Definition: ISDOpcodes.h:580
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:313
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:515
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1600
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
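These node predicates are the usual guards in a DAG combine; a sketch assuming N is an integer ISD::ADD node being visited and the surrounding function returns an SDValue (namespace qualifiers omitted):
  SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
  if (ISD::isConstantSplatVectorAllZeros(RHS.getNode()))
    return LHS;  // x + 0 -> x for integer vectors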
bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1491
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1478
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1529
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1509
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1480
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1469
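Sketch of how a target intrinsic call is materialized through this interface; M (a Module), Builder (an IRBuilder), and the use of the SV_ALL (31) pattern operand for llvm.aarch64.sve.cntb are assumptions of this example:
  llvm::Function *CntB =
      llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::aarch64_sve_cntb);
  llvm::Value *BytesPerVector =
      Builder.CreateCall(CntB, {Builder.getInt32(/*SV_ALL=*/31)});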
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:777
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:836
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
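The PatternMatch combinators above nest to describe IR shapes declaratively; for example, recognizing (zext X) * (zext Y), the kind of form widening-multiply combines look for (V is an assumed llvm::Value* being inspected):
  using namespace llvm::PatternMatch;
  llvm::Value *X, *Y;
  bool IsWideningMul = match(V, m_Mul(m_ZExt(m_Value(X)), m_ZExt(m_Value(Y))));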
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
@ Offset
Definition: DWP.cpp:456
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:853
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool isUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
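For reference, a concrete mask that satisfies isZIPMask for v4i32 (zip1 interleaves the low halves of the two inputs; namespace qualifiers omitted):
  int Zip1Mask[] = {0, 4, 1, 5};     // lanes 0,1 of the LHS interleaved with lanes 0,1 of the RHS
  unsigned WhichResult;
  bool IsZip = isZIPMask(Zip1Mask, MVT::v4i32, WhichResult);  // true; WhichResult selects zip1 vs zip2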
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:293
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:239
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:343
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1507
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:330
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:269
unsigned M1(unsigned Val)
Definition: VE.h:376
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:257
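These bit-twiddling predicates typically pair up when matching bitfield-style immediates; a small worked example on 0x0000FF00 (requires "llvm/Support/MathExtras.h" and "llvm/ADT/bit.h"):
  uint64_t V = 0x0000FF00;
  bool Shifted = llvm::isShiftedMask_64(V);  // true: 8 contiguous ones starting at bit 8
  unsigned Lsb = llvm::countr_zero(V);       // 8, the shift amount
  bool Pow2    = llvm::isPowerOf2_64(V);     // false: more than one bit set
  unsigned Lg  = llvm::Log2_64(0x100);       // 8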
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
static unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:244
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2051
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
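A quick worked example of the alignment helpers above (requires "llvm/Support/Alignment.h"):
  llvm::Align A(16);
  uint64_t Padded = llvm::alignTo(100, A);                   // 112, the next multiple of 16
  llvm::Align B   = llvm::commonAlignment(A, /*Offset=*/8);  // Align(8): a 16-aligned base plus 8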
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2039
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Default
The result values are uniform if and only if all operands are uniform.
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:292
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:387
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:274
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:340
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:112
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the number of elements in the vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:429
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:183
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:282
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:173
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
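To make the EVT helpers above concrete, a hedged sketch of a few common queries and transformations; Ctx is an assumed LLVMContext reference:
  EVT V4I32 = EVT::getVectorVT(Ctx, MVT::i32, 4);
  unsigned NumElts = V4I32.getVectorNumElements();      // 4
  bool Is128 = V4I32.is128BitVector();                  // true
  EVT V4F32 = V4I32.changeVectorElementType(MVT::f32);  // v4f32
  EVT V2I32 = V4I32.getHalfNumVectorElementsVT(Ctx);    // v2i32
  EVT V4I64 = V4I32.widenIntegerVectorElementType(Ctx); // v4i64
  TypeSize Bits = V4I32.getSizeInBits();                // fixed 128 bits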
Describes a register that needs to be forwarded from the prologue to a musttail call.
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:297
static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:434
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:376
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:292
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:307
static KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:291
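A hedged sketch of how these helpers compose; the widths and constants are illustrative only:
  KnownBits C = KnownBits::makeConstant(APInt(32, 0xFF));
  KnownBits Amt = KnownBits::makeConstant(APInt(32, 4));
  KnownBits Shifted = KnownBits::shl(C, Amt);        // known constant 0xFF0
  unsigned MaxActive = Shifted.countMaxActiveBits(); // 12
  KnownBits Low16 = Shifted.trunc(16);               // known bits of the low 16 bits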
Structure used to represent pair of argument number after call lowering and register used to transfer...
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
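A hedged sketch of building pointer info for stack accesses; MF and FI are assumed to come from the surrounding lowering code:
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachinePointerInfo PartInfo = PtrInfo.getWithOffset(8); // same slot, +8 bytes
  MachinePointerInfo SPRel = MachinePointerInfo::getStack(MF, /*Offset=*/16);
  unsigned AS = PtrInfo.getAddrSpace(); // IR address space of the slot (usually 0)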
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::OutputArg, 32 > Outs
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64