LLVM 23.0.0git
AArch64ISelLowering.cpp
Go to the documentation of this file.
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
20#include "AArch64Subtarget.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/ArrayRef.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringRef.h"
33#include "llvm/ADT/Twine.h"
61#include "llvm/IR/Attributes.h"
62#include "llvm/IR/Constants.h"
63#include "llvm/IR/DataLayout.h"
64#include "llvm/IR/DebugLoc.h"
66#include "llvm/IR/Function.h"
68#include "llvm/IR/GlobalValue.h"
69#include "llvm/IR/IRBuilder.h"
70#include "llvm/IR/Instruction.h"
73#include "llvm/IR/Intrinsics.h"
74#include "llvm/IR/IntrinsicsAArch64.h"
75#include "llvm/IR/Module.h"
77#include "llvm/IR/Type.h"
78#include "llvm/IR/Use.h"
79#include "llvm/IR/Value.h"
84#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <bitset>
96#include <cassert>
97#include <cctype>
98#include <cstdint>
99#include <cstdlib>
100#include <iterator>
101#include <limits>
102#include <optional>
103#include <tuple>
104#include <utility>
105#include <vector>
106
107using namespace llvm;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in future when both implementations will be based off MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP use ALU ports, and data dependency will become the
143// bottleneck after this transform on high end CPU. So this max leaf node
144 // limitation is a guard to ensure that cmp+ccmp will be profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// By turning this on, we will not fallback to DAG ISel when encountering
149 // scalable vector types for all instructions, even if SVE is not yet supported
150// with some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157// TODO: This option should be removed once we switch to always using PTRADD in
158// the SelectionDAG.
160 "aarch64-use-featcpa-codegen", cl::Hidden,
161 cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
162 "SelectionDAG for FEAT_CPA"),
163 cl::init(false));
164
165/// Value type used for condition codes.
166constexpr MVT CondCodeVT = MVT::i32;
167
168/// Value type used for NZCV flags.
169constexpr MVT FlagsVT = MVT::i32;
170
171static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
172 AArch64::X3, AArch64::X4, AArch64::X5,
173 AArch64::X6, AArch64::X7};
174static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
175 AArch64::Q3, AArch64::Q4, AArch64::Q5,
176 AArch64::Q6, AArch64::Q7};
177
179
181
182static inline EVT getPackedSVEVectorVT(EVT VT) {
183 switch (VT.getSimpleVT().SimpleTy) {
184 default:
185 llvm_unreachable("unexpected element type for vector");
186 case MVT::i8:
187 return MVT::nxv16i8;
188 case MVT::i16:
189 return MVT::nxv8i16;
190 case MVT::i32:
191 return MVT::nxv4i32;
192 case MVT::i64:
193 return MVT::nxv2i64;
194 case MVT::f16:
195 return MVT::nxv8f16;
196 case MVT::f32:
197 return MVT::nxv4f32;
198 case MVT::f64:
199 return MVT::nxv2f64;
200 case MVT::bf16:
201 return MVT::nxv8bf16;
202 }
203}
204
205// NOTE: Currently there's only a need to return integer vector types. If this
206// changes then just add an extra "type" parameter.
208 switch (EC.getKnownMinValue()) {
209 default:
210 llvm_unreachable("unexpected element count for vector");
211 case 16:
212 return MVT::nxv16i8;
213 case 8:
214 return MVT::nxv8i16;
215 case 4:
216 return MVT::nxv4i32;
217 case 2:
218 return MVT::nxv2i64;
219 }
220}
221
223 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
224 "Expected scalable predicate vector type!");
225 switch (VT.getVectorMinNumElements()) {
226 default:
227 llvm_unreachable("unexpected element count for vector");
228 case 2:
229 return MVT::nxv2i64;
230 case 4:
231 return MVT::nxv4i32;
232 case 8:
233 return MVT::nxv8i16;
234 case 16:
235 return MVT::nxv16i8;
236 }
237}
238
239/// Returns true if VT's elements occupy the lowest bit positions of its
240/// associated register class without any intervening space.
241///
242/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
243/// same register class, but only nxv8f16 can be treated as a packed vector.
244static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
246 "Expected legal vector type!");
247 return VT.isFixedLengthVector() ||
249}
250
251static inline bool isPackedPredicateType(EVT VT, SelectionDAG &DAG) {
253 "Expected legal type!");
254 return VT == MVT::nxv16i1;
255}
256
257/// Returns true if the conceptual representation for \p VT does not map
258/// directly to its physical register representation, meaning there are gaps
259/// between elements in the register. In practice, the vector elements will be
260/// strided by a power of two and placed starting from lane 0. For example,
261/// nxv8i1 or nxv2f32 are unpacked types.
262///
263///\pre VT is a legal type.
264static inline bool isUnpackedType(EVT VT, SelectionDAG &DAG) {
265 bool Res = !isPackedVectorType(VT, DAG) && !isPackedPredicateType(VT, DAG);
266 assert((!Res || VT.isScalableVector()) &&
267 "Unexpected fixed-size unpacked type.");
268 return Res;
269}
270
271// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
272// predicate and end with a passthru value matching the result type.
273static bool isMergePassthruOpcode(unsigned Opc) {
274 switch (Opc) {
275 default:
276 return false;
277 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
278 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
279 case AArch64ISD::REVH_MERGE_PASSTHRU:
280 case AArch64ISD::REVW_MERGE_PASSTHRU:
281 case AArch64ISD::REVD_MERGE_PASSTHRU:
282 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
283 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
284 case AArch64ISD::DUP_MERGE_PASSTHRU:
285 case AArch64ISD::ABS_MERGE_PASSTHRU:
286 case AArch64ISD::NEG_MERGE_PASSTHRU:
287 case AArch64ISD::FNEG_MERGE_PASSTHRU:
288 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
289 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
290 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
291 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
292 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
293 case AArch64ISD::FRINT_MERGE_PASSTHRU:
294 case AArch64ISD::FRINT32_MERGE_PASSTHRU:
295 case AArch64ISD::FRINT64_MERGE_PASSTHRU:
296 case AArch64ISD::FROUND_MERGE_PASSTHRU:
297 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
298 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
299 case AArch64ISD::FTRUNC32_MERGE_PASSTHRU:
300 case AArch64ISD::FTRUNC64_MERGE_PASSTHRU:
301 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
302 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
303 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
304 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
305 case AArch64ISD::FCVTX_MERGE_PASSTHRU:
306 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
307 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
308 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
309 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
310 case AArch64ISD::FABS_MERGE_PASSTHRU:
311 return true;
312 }
313}
314
315// Returns true if inactive lanes are known to be zeroed by construction.
317 switch (Op.getOpcode()) {
318 default:
319 return false;
320 // We guarantee i1 splat_vectors to zero the other lanes
323 case AArch64ISD::PTRUE:
324 case AArch64ISD::SETCC_MERGE_ZERO:
325 return true;
327 switch (Op.getConstantOperandVal(0)) {
328 default:
329 return false;
330 case Intrinsic::aarch64_sve_ptrue:
331 case Intrinsic::aarch64_sve_pnext:
332 case Intrinsic::aarch64_sve_cmpeq:
333 case Intrinsic::aarch64_sve_cmpne:
334 case Intrinsic::aarch64_sve_cmpge:
335 case Intrinsic::aarch64_sve_cmpgt:
336 case Intrinsic::aarch64_sve_cmphs:
337 case Intrinsic::aarch64_sve_cmphi:
338 case Intrinsic::aarch64_sve_cmpeq_wide:
339 case Intrinsic::aarch64_sve_cmpne_wide:
340 case Intrinsic::aarch64_sve_cmpge_wide:
341 case Intrinsic::aarch64_sve_cmpgt_wide:
342 case Intrinsic::aarch64_sve_cmplt_wide:
343 case Intrinsic::aarch64_sve_cmple_wide:
344 case Intrinsic::aarch64_sve_cmphs_wide:
345 case Intrinsic::aarch64_sve_cmphi_wide:
346 case Intrinsic::aarch64_sve_cmplo_wide:
347 case Intrinsic::aarch64_sve_cmpls_wide:
348 case Intrinsic::aarch64_sve_fcmpeq:
349 case Intrinsic::aarch64_sve_fcmpne:
350 case Intrinsic::aarch64_sve_fcmpge:
351 case Intrinsic::aarch64_sve_fcmpgt:
352 case Intrinsic::aarch64_sve_fcmpuo:
353 case Intrinsic::aarch64_sve_facgt:
354 case Intrinsic::aarch64_sve_facge:
355 case Intrinsic::aarch64_sve_whilege:
356 case Intrinsic::aarch64_sve_whilegt:
357 case Intrinsic::aarch64_sve_whilehi:
358 case Intrinsic::aarch64_sve_whilehs:
359 case Intrinsic::aarch64_sve_whilele:
360 case Intrinsic::aarch64_sve_whilelo:
361 case Intrinsic::aarch64_sve_whilels:
362 case Intrinsic::aarch64_sve_whilelt:
363 case Intrinsic::aarch64_sve_match:
364 case Intrinsic::aarch64_sve_nmatch:
365 case Intrinsic::aarch64_sve_whilege_x2:
366 case Intrinsic::aarch64_sve_whilegt_x2:
367 case Intrinsic::aarch64_sve_whilehi_x2:
368 case Intrinsic::aarch64_sve_whilehs_x2:
369 case Intrinsic::aarch64_sve_whilele_x2:
370 case Intrinsic::aarch64_sve_whilelo_x2:
371 case Intrinsic::aarch64_sve_whilels_x2:
372 case Intrinsic::aarch64_sve_whilelt_x2:
373 return true;
374 }
375 }
376}
377
378static std::tuple<SDValue, SDValue>
380 SDLoc DL(Disc);
381 SDValue AddrDisc;
382 SDValue ConstDisc;
383
384 // If this is a blend, remember the constant and address discriminators.
385 // Otherwise, it's either a constant discriminator, or a non-blended
386 // address discriminator.
387 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
388 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
389 AddrDisc = Disc->getOperand(1);
390 ConstDisc = Disc->getOperand(2);
391 } else {
392 ConstDisc = Disc;
393 }
394
395 // If the constant discriminator (either the blend RHS, or the entire
396 // discriminator value) isn't a 16-bit constant, bail out, and let the
397 // discriminator be computed separately.
398 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
399 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
400 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
401
402 // If there's no address discriminator, use NoRegister, which we'll later
403 // replace with XZR, or directly use a Z variant of the inst. when available.
404 if (!AddrDisc)
405 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
406
407 return std::make_tuple(
408 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
409 AddrDisc);
410}
411
413 const AArch64Subtarget &STI)
414 : TargetLowering(TM, STI), Subtarget(&STI) {
415 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
416 // we have to make something up. Arbitrarily, choose ZeroOrOne.
418 // When comparing vectors the result sets the different elements in the
419 // vector to all-one or all-zero.
421
422 // Set up the register classes.
423 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
424 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
425
426 if (Subtarget->hasLS64()) {
427 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
428 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
430 }
431
432 if (Subtarget->hasFPARMv8()) {
433 addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
434 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
435 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
436 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
437 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
438 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
439 }
440
441 if (Subtarget->hasNEON()) {
442 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
443 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
444
445 addDRType(MVT::v2f32);
446 addDRType(MVT::v8i8);
447 addDRType(MVT::v4i16);
448 addDRType(MVT::v2i32);
449 addDRType(MVT::v1i64);
450 addDRType(MVT::v1f64);
451 addDRType(MVT::v4f16);
452 addDRType(MVT::v4bf16);
453
454 addQRType(MVT::v4f32);
455 addQRType(MVT::v2f64);
456 addQRType(MVT::v16i8);
457 addQRType(MVT::v8i16);
458 addQRType(MVT::v4i32);
459 addQRType(MVT::v2i64);
460 addQRType(MVT::v8f16);
461 addQRType(MVT::v8bf16);
462 }
463
464 if (Subtarget->isSVEorStreamingSVEAvailable()) {
465 // Add legal sve predicate types
466 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
467 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
468 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
469 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
470 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
471
472 // Add sve predicate as counter type
473 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
474
475 // Add legal sve data types
476 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
477 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
478 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
479 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
480
481 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
482 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
483 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
484 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
485 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
486 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
487
488 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
489 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
490 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
491
492 if (Subtarget->useSVEForFixedLengthVectors()) {
495 addRegisterClass(VT, &AArch64::ZPRRegClass);
496
499 addRegisterClass(VT, &AArch64::ZPRRegClass);
500 }
501 }
502
503 // Compute derived properties from the register classes
504 computeRegisterProperties(Subtarget->getRegisterInfo());
505
506 // Provide all sorts of operation actions
534 if (Subtarget->hasFPARMv8()) {
537 }
550
552
556
559
561
562 // Custom lowering hooks are needed for XOR
563 // to fold it into CSINC/CSINV.
566
569
570 // Virtually no operation on f128 is legal, but LLVM can't expand them when
571 // there's a valid register class, so we need custom operations in most cases.
596 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
597 // aren't handled.
598
599 // Lowering for many of the conversions is actually specified by the non-f128
600 // type. The LowerXXX function will be trivial when f128 isn't involved.
625 if (Subtarget->hasFPARMv8()) {
628 }
631 if (Subtarget->hasFPARMv8()) {
634 }
637
642
643 // Variable arguments.
648
649 // Variable-sized objects.
652
653 // Lowering Funnel Shifts to EXTR
658
660
661 // Constant pool entries
663
664 // BlockAddress
666
667 // AArch64 lacks both left-rotate and popcount instructions.
673 }
674
675 // AArch64 doesn't have i32 MULH{S|U}.
678
679 // AArch64 doesn't have {U|S}MUL_LOHI.
684
685 if (Subtarget->hasCSSC()) {
689
691
695
698
703
708 } else {
712
715
718 }
719
725 }
732
733 // Custom lower Add/Sub/Mul with overflow.
746
755
764 if (Subtarget->hasFullFP16()) {
767 } else {
770 }
771
772 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
786 setOperationAction(Op, MVT::f16, Promote);
787 setOperationAction(Op, MVT::v4f16, Expand);
788 setOperationAction(Op, MVT::v8f16, Expand);
789 setOperationAction(Op, MVT::bf16, Promote);
790 setOperationAction(Op, MVT::v4bf16, Expand);
791 setOperationAction(Op, MVT::v8bf16, Expand);
792 }
793
794 // Legalize fcanonicalize to circumvent default expansion
795 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
796 if (Subtarget->hasFullFP16()) {
798 }
799
800 // fpextend from f16 or bf16 to f32 is legal
805 // fpextend from bf16 to f64 needs to be split into two fpextends
808
809 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
810 for (auto Op : {
814 ISD::FADD,
815 ISD::FSUB,
816 ISD::FMUL,
817 ISD::FDIV,
818 ISD::FMA,
851 })
852 setOperationAction(Op, ScalarVT, Promote);
853
854 for (auto Op : {ISD::FNEG, ISD::FABS})
855 setOperationAction(Op, ScalarVT, Legal);
856
857 // Round-to-integer need custom lowering for fp16, as Promote doesn't work
858 // because the result type is integer.
862 setOperationAction(Op, ScalarVT, Custom);
863
864 // promote v4f16 to v4f32 when that is known to be safe.
865 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
866 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
867 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
868 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
869 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
870 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
871 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
872 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
873 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
874 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
875 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
876 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
877 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
878 setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
879
888
889 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
890 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
891 setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
892
913 };
914
915 if (!Subtarget->hasFullFP16()) {
916 LegalizeNarrowFP(MVT::f16);
917 }
918 LegalizeNarrowFP(MVT::bf16);
921
922 // AArch64 has implementations of a lot of rounding-like FP operations.
923 // clang-format off
924 for (auto Op :
936 for (MVT Ty : {MVT::f32, MVT::f64})
938 if (Subtarget->hasFullFP16())
939 setOperationAction(Op, MVT::f16, Legal);
940 }
941 // clang-format on
942
943 // Basic strict FP operations are legal
946 for (MVT Ty : {MVT::f32, MVT::f64})
948 if (Subtarget->hasFullFP16())
949 setOperationAction(Op, MVT::f16, Legal);
950 }
951
953
959
961 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
964 } else {
967 }
970
971 // Generate outline atomics library calls only if LSE was not specified for
972 // subtarget
973 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
999 }
1000
1001 if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
1006
1011
1016
1021
1026 }
1027
1028 if (Subtarget->hasLSE128()) {
1029 // Custom lowering because i128 is not legal. Must be replaced by 2x64
1030 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
1034 }
1035
1036 // 128-bit loads and stores can be done without expanding
1037 setOperationAction(ISD::LOAD, MVT::i128, Custom);
1039
1040 // Aligned 128-bit loads and stores are single-copy atomic according to the
1041 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
1042 if (Subtarget->hasLSE2()) {
1045 }
1046
1047 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
1048 // custom lowering, as there are no un-paired non-temporal stores and
1049 // legalization will break up 256 bit inputs.
1050 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
1051 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
1052 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
1053 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1054 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1055 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1056 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1057 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1058
1059 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1060 // custom lowering, as there are no un-paired non-temporal loads legalization
1061 // will break up 256 bit inputs.
1062 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1063 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1064 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1065 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1066 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1067 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1068 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1069 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1070
1071 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1073
1074 // Issue __sincos_stret if available.
1077
1078 // Make floating-point constants legal for the large code model, so they don't
1079 // become loads from the constant pool.
1080 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1083 }
1084
1085 // AArch64 does not have floating-point extending loads, i1 sign-extending
1086 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1087 for (MVT VT : MVT::fp_valuetypes()) {
1088 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1089 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1090 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1091 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1092 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1093 }
1094 for (MVT VT : MVT::integer_valuetypes())
1095 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1096
1097 for (MVT WideVT : MVT::fp_valuetypes()) {
1098 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1099 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1100 setTruncStoreAction(WideVT, NarrowVT, Expand);
1101 }
1102 }
1103 }
1104
1105 if (Subtarget->hasFPARMv8()) {
1109 }
1110
1111 // Indexed loads and stores are supported.
1112 for (unsigned im = (unsigned)ISD::PRE_INC;
1114 setIndexedLoadAction(im, MVT::i8, Legal);
1115 setIndexedLoadAction(im, MVT::i16, Legal);
1116 setIndexedLoadAction(im, MVT::i32, Legal);
1117 setIndexedLoadAction(im, MVT::i64, Legal);
1118 setIndexedLoadAction(im, MVT::f64, Legal);
1119 setIndexedLoadAction(im, MVT::f32, Legal);
1120 setIndexedLoadAction(im, MVT::f16, Legal);
1121 setIndexedLoadAction(im, MVT::bf16, Legal);
1122 setIndexedStoreAction(im, MVT::i8, Legal);
1123 setIndexedStoreAction(im, MVT::i16, Legal);
1124 setIndexedStoreAction(im, MVT::i32, Legal);
1125 setIndexedStoreAction(im, MVT::i64, Legal);
1126 setIndexedStoreAction(im, MVT::f64, Legal);
1127 setIndexedStoreAction(im, MVT::f32, Legal);
1128 setIndexedStoreAction(im, MVT::f16, Legal);
1129 setIndexedStoreAction(im, MVT::bf16, Legal);
1130 }
1131
1132 // Trap.
1133 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1136
1137 // We combine OR nodes for ccmp operations.
1139 // Try to create BICs for vector ANDs.
1141
1142 // llvm.init.trampoline and llvm.adjust.trampoline
1145
1146 // Vector add and sub nodes may conceal a high-half opportunity.
1147 // Also, try to fold ADD into CSINC/CSINV..
1150
1153
1154 // Try and combine setcc with csel
1156
1158
1166
1168
1170
1172
1176
1179
1181
1183
1185
1187
1191
1193
1197
1198 // In case of strict alignment, avoid an excessive number of byte wide stores.
1201 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1202
1206 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1207
1210 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1211
1214 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1215
1217
1219
1220 EnableExtLdPromotion = true;
1221
1222 // Set required alignment.
1224 // Set preferred alignments.
1225
1226 // Don't align loops on Windows. The SEH unwind info generation needs to
1227 // know the exact length of functions before the alignments have been
1228 // expanded.
1229 if (!Subtarget->isTargetWindows())
1233
1234 // Only change the limit for entries in a jump table if specified by
1235 // the sub target, but not at the command line.
1236 unsigned MaxJT = STI.getMaximumJumpTableSize();
1237 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1239
1241
1243
1245 if (Subtarget->hasSME())
1247
1248 if (Subtarget->isNeonAvailable()) {
1249 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1250 // silliness like this:
1251 // clang-format off
1252 for (auto Op :
1273 setOperationAction(Op, MVT::v1f64, Expand);
1274 // clang-format on
1275
1276 for (auto Op :
1281 setOperationAction(Op, MVT::v1i64, Expand);
1282
1283 // AArch64 doesn't have a direct vector ->f32 conversion instructions for
1284 // elements smaller than i32, so promote the input to i32 first.
1285 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1286 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1287
1288 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1289 // Or, direct i32 -> f16 vector conversion. Set it so custom, so the
1290 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1293 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1295
1296 if (Subtarget->hasFullFP16()) {
1299
1308 } else {
1309 // when AArch64 doesn't have fullfp16 support, promote the input
1310 // to i32 first.
1311 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1312 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1313 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1314 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1315 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1316 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1317 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1318 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1319 }
1320
1321 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1322 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1329 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1334 }
1335
1336 // Custom handling for some quad-vector types to detect MULL.
1337 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1338 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1339 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1340 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1341 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1342 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1343
1344 // Saturates
1345 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1346 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1351 }
1352
1353 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1354 MVT::v4i32}) {
1361 }
1362
1363 // Vector reductions
1364 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1365 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1366 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1371
1373 }
1374 }
1375 if (Subtarget->hasFullFP16())
1377
1378 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1379 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1388 }
1393
1395 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1396 // Likewise, narrowing and extending vector loads/stores aren't handled
1397 // directly.
1400
1401 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1404 } else {
1407 }
1410
1413
1414 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1415 setTruncStoreAction(VT, InnerVT, Expand);
1416 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1417 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1418 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1419 }
1420 }
1421
1422 for (auto Op :
1428 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1430 if (Subtarget->hasFullFP16())
1431 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1433 }
1434
1435 // LRINT and LLRINT.
1436 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1437 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1439 if (Subtarget->hasFullFP16())
1440 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1442 }
1443
1444 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1445
1450
1454
1455 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1456 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1457 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1458 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1459 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1460 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1461 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1462 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1463 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1464 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1465 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1466 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1467 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1468 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1469 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1470 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1471 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1472 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1473
1474 // ADDP custom lowering
1475 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1477 // FADDP custom lowering
1478 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1480
1481 if (Subtarget->hasDotProd()) {
1482 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1484
1485 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
1486 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
1487 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v16i8, Custom);
1488 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1489
1490 if (Subtarget->hasMatMulInt8()) {
1492 MVT::v16i8, Legal);
1494 MVT::v16i8, Custom);
1495
1497 MVT::v8i8, Legal);
1498 }
1499 }
1500
1501 } else /* !isNeonAvailable */ {
1503 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1505
1506 if (VT.is128BitVector() || VT.is64BitVector()) {
1510 Subtarget->isLittleEndian() ? Legal : Expand);
1511 }
1512 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1513 setTruncStoreAction(VT, InnerVT, Expand);
1514 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1515 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1516 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1517 }
1518 }
1519 }
1520
1521 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1525 }
1526
1527 if (Subtarget->hasSME()) {
1529 }
1530
1531 // FIXME: Move lowering for more nodes here if those are common between
1532 // SVE and SME.
1533 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1534 for (auto VT :
1535 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1540 }
1541 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1544 }
1545
1546 if (Subtarget->hasSVE2p1() ||
1547 (Subtarget->hasSME2() && Subtarget->isStreaming()))
1549
1550 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1552
1553 for (auto VT : {MVT::v8f16, MVT::v4f32, MVT::v2f64})
1555 }
1556
1557 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1558 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1602
1608
1617
1622
1626
1627 if (!Subtarget->isLittleEndian())
1629
1630 if (Subtarget->hasSVE2() ||
1631 (Subtarget->hasSME() && Subtarget->isStreaming()))
1632 // For SLI/SRI.
1634 }
1635
1636 // Illegal unpacked integer vector types.
1637 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1640 }
1641
1642 // Type legalize unpacked bitcasts.
1643 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1645
1646 for (auto VT :
1647 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1648 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1650
1651 // Promote predicate as counter load/stores to standard predicates.
1652 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
1653 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
1654
1655 // Predicate as counter legalization actions.
1656 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
1657 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
1658
1659 for (auto VT :
1660 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1668
1672
1673 // There are no legal MVT::nxv16f## based types.
1674 if (VT != MVT::nxv16i1) {
1679 }
1680 }
1681
1682 // NEON doesn't support masked loads/stores, but SME and SVE do.
1683 for (auto VT :
1684 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1685 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1686 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1689 }
1690
1691 // Firstly, exclude all scalable vector extending loads/truncating stores,
1692 // include both integer and floating scalable vector.
1694 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1695 setTruncStoreAction(VT, InnerVT, Expand);
1696 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1697 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1698 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1699 }
1700 }
1701
1702 // Then, selectively enable those which we directly support.
1703 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1704 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1705 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1706 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1707 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1708 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1709 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1710 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1711 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1712 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1713 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1714 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1715 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1716 }
1717
1718 // SVE supports truncating stores of 64 and 128-bit vectors
1719 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1720 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1721 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1722 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1723 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1724
1725 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1726 MVT::nxv4f32, MVT::nxv2f64}) {
1768
1790
1802 }
1803
1804 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1821 }
1822
1823 if (Subtarget->hasSVEB16B16() &&
1824 Subtarget->isNonStreamingSVEorSME2Available()) {
1825 // Note: Use SVE for bfloat16 operations when +sve-b16b16 is available.
1826 for (auto VT : {MVT::v4bf16, MVT::v8bf16, MVT::nxv2bf16, MVT::nxv4bf16,
1827 MVT::nxv8bf16}) {
1836 }
1837 }
1838
1839 for (auto Opcode :
1844 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1845 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1846 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1847 }
1848
1849 if (!Subtarget->hasSVEB16B16() ||
1850 !Subtarget->isNonStreamingSVEorSME2Available()) {
1851 for (MVT VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1852 MVT PromotedVT = VT.changeVectorElementType(MVT::f32);
1853 setOperationPromotedToType(ISD::FADD, VT, PromotedVT);
1854 setOperationPromotedToType(ISD::FMA, VT, PromotedVT);
1859 setOperationPromotedToType(ISD::FSUB, VT, PromotedVT);
1860
1861 if (VT != MVT::nxv2bf16 && Subtarget->hasBF16())
1863 else
1864 setOperationPromotedToType(ISD::FMUL, VT, PromotedVT);
1865 }
1866
1867 if (Subtarget->hasBF16() && Subtarget->isNeonAvailable())
1868 setOperationAction(ISD::FMUL, MVT::v8bf16, Custom);
1869 }
1870
1873
1874 // A number of operations like MULH and integer divides are not supported by
1875 // NEON but are available in SVE.
1876 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1877 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1882 }
1883
1884 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1885 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1886 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1887
1888 // NOTE: Currently this has to happen after computeRegisterProperties rather
1889 // than the preferred option of combining it with the addRegisterClass call.
1890 if (Subtarget->useSVEForFixedLengthVectors()) {
1893 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1894 addTypeForFixedLengthSVE(VT);
1895 }
1898 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1899 addTypeForFixedLengthSVE(VT);
1900 }
1901
1902 // 64bit results can mean a bigger than NEON input.
1903 for (auto VT : {MVT::v8i8, MVT::v4i16})
1906
1907 // 128bit results imply a bigger than NEON input.
1908 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1910 for (auto VT : {MVT::v8f16, MVT::v4f32, MVT::v8bf16})
1912
1913 // These operations are not supported on NEON but SVE can do them.
1915 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1916 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1917 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1918 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1919 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1920 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1921 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1922 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1923 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1924 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1925 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1930
1931 // Int operations with no NEON support.
1932 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1933 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1939 }
1940
1941 // Use SVE for vectors with more than 2 elements.
1942 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1944 }
1945
1947 MVT::nxv2i64);
1949 MVT::nxv2i64);
1951 MVT::nxv4i32);
1953 MVT::nxv4i32);
1955 MVT::nxv8i16);
1957 MVT::nxv8i16);
1959 MVT::nxv16i8);
1961 MVT::nxv16i8);
1962
1964
1965 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1967 }
1968
1969 // Handle partial reduction operations
1970 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1971 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1972 // Other pairs will default to 'Expand'.
1973 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1975 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
1976 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
1977
1978 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
1979
1980 if (Subtarget->hasMatMulInt8()) {
1982 MVT::nxv16i8, Legal);
1984 MVT::nxv16i8, Custom);
1985 }
1986
1987 // Wide add types
1988 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
1989 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv4i32, Legal);
1990 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
1991 setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
1992 }
1993
1994 // Handle floating-point partial reduction
1995 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
1997 MVT::nxv8f16, Legal);
1998 // We can use SVE2p1 fdot to emulate the fixed-length variant.
2000 MVT::v8f16, Custom);
2001 }
2002 }
2003
2004 // Handle non-aliasing elements mask
2005 if (Subtarget->hasSVE2() ||
2006 (Subtarget->hasSME() && Subtarget->isStreaming())) {
2007 // FIXME: Support wider fixed-length types when msve-vector-bits is used.
2008 for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
2011 }
2012 for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
2015 }
2016 }
2017
2018 // Handle operations that are only available in non-streaming SVE mode.
2019 if (Subtarget->isSVEAvailable()) {
2020 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
2021 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
2022 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
2023 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
2024 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
2025 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
2026 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
2029 }
2030
2031 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
2032 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
2033 MVT::v2f32, MVT::v4f32, MVT::v2f64})
2035
2036 // We can lower types that have <vscale x {2|4}> elements to compact.
2037 for (auto VT :
2038 {MVT::nxv4i32, MVT::nxv2i64, MVT::nxv2f32, MVT::nxv4f32, MVT::nxv2f64})
2040
2041 // If we have SVE, we can use SVE logic for legal NEON vectors in the lowest
2042 // bits of the SVE register.
2043 for (auto VT : {MVT::v2i32, MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32,
2044 MVT::v2f64})
2046
2047 // Promote v4i16/f16 to v4i32/f32 as the SVE container for v4i16 is nxv8,
2048 // which is not supported with for compact (with only +sve).
2049 setOperationPromotedToType(ISD::VECTOR_COMPRESS, MVT::v4bf16, MVT::v4i16);
2050 setOperationPromotedToType(ISD::VECTOR_COMPRESS, MVT::v4f16, MVT::v4i16);
2051 setOperationPromotedToType(ISD::VECTOR_COMPRESS, MVT::v4i16, MVT::v4i32);
2052
2053 for (auto VT : {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64,
2054 MVT::nxv2f32, MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16,
2055 MVT::nxv4i32, MVT::nxv4f32}) {
2056 // Use a custom lowering for masked stores that could be a supported
2057 // compressing store. Note: These types still use the normal (Legal)
2058 // lowering for non-compressing masked stores.
2060 }
2061
2062 // Histcnt is SVE2 only
2063 if (Subtarget->hasSVE2()) {
2065 Custom);
2067 Custom);
2068
2069 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2071 // Must be lowered to SVE instructions.
2072 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
2073 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
2074 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
2075 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
2076 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
2077 setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
2078 }
2079 }
2080
2081 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
2082 // Only required for llvm.aarch64.mops.memset.tag
2084 }
2085
2087
2088 if (Subtarget->hasSVE()) {
2093 }
2094
2095 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
2096
2097 IsStrictFPEnabled = true;
2099
2100 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2101 // it, but it's just a wrapper around ldexp.
2102 if (Subtarget->isTargetWindows()) {
2104 if (isOperationExpand(Op, MVT::f32))
2105 setOperationAction(Op, MVT::f32, Promote);
2106 }
2107
2108 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
2109 // isn't legal.
2111 if (isOperationExpand(Op, MVT::f16))
2112 setOperationAction(Op, MVT::f16, Promote);
2113}
2114
// Downcast the generic TargetMachine to the AArch64-specific subclass.
// NOTE(review): the signature line (original line 2115) is elided in this
// listing; presumably this is AArch64TargetLowering::getTM() — confirm
// against the full source file.
2116 return static_cast<const AArch64TargetMachine &>(getTargetMachine());
2117}
2118
// Install operation legality/lowering actions for a vector type VT that has
// just been given a NEON register class (see addDRType/addQRType).
// NOTE(review): this listing elides many original lines (the embedded line
// numbers are non-contiguous), so several action calls referenced by the
// surrounding comments are not visible here.
2119void AArch64TargetLowering::addTypeForNEON(MVT VT) {
2120 assert(VT.isVector() && "VT should be a vector type");
2121
2122 if (VT.isFloatingPoint()) {
// PromoteTo is computed on an elided line (2123) — presumably the
// integer type of the same size; loads/stores of FP vectors are promoted.
2124 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
2125 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
2126 }
2127
2128 // Mark vector float intrinsics as expand.
2129 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
// Body elided (original lines 2130-2146).
2147 }
2148
2149 // But we do support custom-lowering for FCOPYSIGN.
2150 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2151 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2152 VT == MVT::v8f16) &&
2153 Subtarget->hasFullFP16()))
2155
2170
// No extending loads produce VT as a result type (any source type).
2174 for (MVT InnerVT : MVT::all_valuetypes())
2175 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
2176
2177 // CNT supports only B element sizes, then use UADDLP to widen.
2178 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2180
2186
2187 for (unsigned Opcode :
2190 setOperationAction(Opcode, VT, Custom);
2191
2192 if (!VT.isFloatingPoint())
2194
2195 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2196 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2197 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2198 setOperationAction(Opcode, VT, Legal);
2199
2200 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2201 // NEON types.
2202 if (VT.isFloatingPoint() &&
2203 VT.getVectorElementType() != MVT::bf16 &&
2204 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2205 for (unsigned Opcode :
2211 setOperationAction(Opcode, VT, Legal);
2212
2213 // Strict fp extend and trunc are legal
2214 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2216 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2218
2219 // FIXME: We could potentially make use of the vector comparison instructions
2220 // for STRICT_FSETCC and STRICT_FSETCSS, but there's a number of
2221 // complications:
2222 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2223 // so we would need to expand when the condition code doesn't match the
2224 // kind of comparison.
2225 // * Some kinds of comparison require more than one FCMXY instruction so
2226 // would need to be expanded instead.
2227 // * The lowering of the non-strict versions involves target-specific ISD
2228 // nodes so we would likely need to add strict versions of all of them and
2229 // handle them appropriately.
2232
2233 // When little-endian we can use ordinary d and q register loads/stores for
2234 // vector types, but when big-endian we need to use structure load/store which
2235 // only allow post-index addressing.
2236 if (Subtarget->isLittleEndian()) {
2237 for (unsigned im = (unsigned)ISD::PRE_INC;
2238 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
// Indexed load/store action calls elided (original lines 2239-2240).
2241 }
2242 } else {
2245 }
2246
2247 if (Subtarget->hasD128()) {
2250 }
2251
2252 if (VT.isInteger()) {
2253 // Let common code emit inverted variants of compares we do support.
2259 }
2260}
2261
// Returns true when the intrinsic should be expanded by generic legalization
// rather than matched to an SVE "while" instruction (whilelo).
// NOTE(review): the function-name line (original 2262) is elided here;
// presumably this is shouldExpandGetActiveLaneMask — confirm in the full file.
2263 EVT OpVT) const {
2264 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2265 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2266 ResVT.getVectorElementType() != MVT::i1)
2267 return true;
2268
2269 // Only support illegal types if the result is scalable and min elements > 1.
2270 if (ResVT.getVectorMinNumElements() == 1 ||
2271 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2272 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2273 return true;
2274
2275 // 32 & 64 bit operands are supported. We can promote anything < 64 bits,
2276 // but anything larger should be expanded.
2277 if (OpVT.getFixedSizeInBits() > 64)
2278 return true;
2279
2280 return false;
2281}
2282
// Returns true when the operation must be expanded; returns false only for
// predicate types the target can handle with the BRKB + CNTP sequence.
// NOTE(review): the signature line (original 2283) is elided in this listing —
// confirm the function name against the full file.
2284 if (!Subtarget->isSVEorStreamingSVEAvailable())
2285 return true;
2286
2287 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2288 // also support fixed-width predicates.
2289 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2290 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2291 VT != MVT::v4i1 && VT != MVT::v2i1;
2292}
2293
// Returns true when a vector-match operation must be expanded instead of
// using the SVE2 MATCH instruction. MATCH handles only 8/16-bit elements and
// only specific segment search sizes (16 resp. 8 elements per segment).
// NOTE(review): the function-name line (original 2294) is elided here.
2295 unsigned SearchSize) const {
2296 // MATCH is SVE2 and only available in non-streaming mode.
2297 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2298 return true;
2299 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2300 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2301 return SearchSize != 8;
2302 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2303 return SearchSize != 8 && SearchSize != 16;
2304 return true;
2305}
2306
// Configure a fixed-length vector type VT to be lowered via its scalable SVE
// container type: expand everything by default, then selectively mark
// operations Custom/Legal. NOTE(review): this listing elides many original
// lines (non-contiguous embedded numbers), including the declaration of
// `Default` (line 2326's initializer target) and most of the long run of
// setOperationAction calls near the end.
2307void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2308 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2309
2310 // By default everything must be expanded.
2311 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2313
2314 if (VT.isFloatingPoint()) {
2324 }
2325
// Continuation of an elided declaration (line 2326): v1f64 falls back to
// Expand, every other type uses Custom as the "Default" action.
2327 VT == MVT::v1f64 ? Expand : Custom;
2328
2329 // Mark integer truncating stores/extending loads as having custom lowering
2330 if (VT.isInteger()) {
2331 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2332 while (InnerVT != VT) {
2333 setTruncStoreAction(VT, InnerVT, Default);
2334 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2335 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2336 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2337 InnerVT = InnerVT.changeVectorElementType(
2338 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2339 }
2340 }
2341
2342 // Mark floating-point truncating stores/extending loads as having custom
2343 // lowering
2344 if (VT.isFloatingPoint()) {
2345 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2346 while (InnerVT != VT) {
2347 setTruncStoreAction(VT, InnerVT, Custom);
2348 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2349 InnerVT = InnerVT.changeVectorElementType(
2351 }
2352 }
2353
2354 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2355 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2356
2357 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2359 unsigned NumElts = VT.getVectorNumElements();
// Partial-reduction MLA pairs: accumulate from elements 2x/4x/8x narrower.
2360 if (VT.getVectorElementType() == MVT::i64) {
2361 setPartialReduceMLAAction(MLAOps, VT,
2362 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2363 setPartialReduceMLAAction(MLAOps, VT,
2364 MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
2365 setPartialReduceMLAAction(MLAOps, VT,
2366 MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
2367 } else if (VT.getVectorElementType() == MVT::i32) {
2368 setPartialReduceMLAAction(MLAOps, VT,
2369 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2370 setPartialReduceMLAAction(MLAOps, VT,
2371 MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
2372 } else if (VT.getVectorElementType() == MVT::i16) {
2373 setPartialReduceMLAAction(MLAOps, VT,
2374 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
2375 }
2376 if (Subtarget->hasMatMulInt8()) {
2377 if (VT.getVectorElementType() == MVT::i32)
2379 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2380 else if (VT.getVectorElementType() == MVT::i64)
2382 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2383 }
2384
2385 if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) {
2387 MVT::getVectorVT(MVT::f16, NumElts * 2), Custom);
2388 }
2389
2390 // Lower fixed length vector operations to scalable equivalents.
2397 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2435 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2436 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2438 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2457 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2484}
2485
2486void AArch64TargetLowering::addDRType(MVT VT) {
2487 addRegisterClass(VT, &AArch64::FPR64RegClass);
2488 if (Subtarget->isNeonAvailable())
2489 addTypeForNEON(VT);
2490}
2491
2492void AArch64TargetLowering::addQRType(MVT VT) {
2493 addRegisterClass(VT, &AArch64::FPR128RegClass);
2494 if (Subtarget->isNeonAvailable())
2495 addTypeForNEON(VT);
2496}
2497
// Result type for SETCC: scalar compares produce i32; scalable vectors
// produce an i1 predicate vector with the same element count.
// NOTE(review): the function-name line (original 2498) and the fixed-vector
// return path (original 2504) are elided in this listing — presumably this is
// getSetCCResultType; confirm in the full file.
2499 LLVMContext &C, EVT VT) const {
2500 if (!VT.isVector())
2501 return MVT::i32;
2502 if (VT.isScalableVector())
2503 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2505}
2506
2507// isIntImmediate - This method tests to see if the node is a constant
2508// operand. If so Imm will receive the value.
2509static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
// NOTE(review): the line binding `C` (original 2510, presumably
// `if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {`) is elided
// in this listing.
2511 Imm = C->getZExtValue();
2512 return true;
2513 }
2514 return false;
2515}
2516
2517bool isVectorizedBinOp(unsigned Opcode) {
2518 switch (Opcode) {
2519 case AArch64ISD::SQDMULH:
2520 return true;
2521 default:
2522 return false;
2523 }
2524}
2525
2526// isOpcWithIntImmediate - This method tests to see if the node is a specific
2527// opcode and that it has a immediate integer right operand.
2528// If so Imm will receive the value.
2529static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2530 uint64_t &Imm) {
2531 return N->getOpcode() == Opc &&
2532 isIntImmediate(N->getOperand(1).getNode(), Imm);
2533}
2534
// Try to replace the immediate of a logical op (AND/OR/XOR) with one that is
// encodable as an AArch64 bitmask immediate, by choosing non-demanded bits so
// the value becomes a (possibly replicated) shifted mask. Returns true and
// commits the replacement via TLO.CombineTo on success.
// NOTE(review): this listing elides a few original lines, including the
// TargetLoweringOpt &TLO parameter (line 2537), part of the early-exit bimm
// check (line 2545), and the line computing `Enc` (line 2621).
2535static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2536 const APInt &Demanded,
2538 unsigned NewOpc) {
2539 uint64_t OldImm = Imm, NewImm, Enc;
2540 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2541
2542 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2543 // bimm64.
2544 if (Imm == 0 || Imm == Mask ||
2546 return false;
2547
2548 unsigned EltSize = Size;
2549 uint64_t DemandedBits = Demanded.getZExtValue();
2550
2551 // Clear bits that are not demanded.
2552 Imm &= DemandedBits;
2553
2554 while (true) {
2555 // The goal here is to set the non-demanded bits in a way that minimizes
2556 // the number of switching between 0 and 1. In order to achieve this goal,
2557 // we set the non-demanded bits to the value of the preceding demanded bits.
2558 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2559 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2560 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2561 // The final result is 0b11000011.
2562 uint64_t NonDemandedBits = ~DemandedBits;
2563 uint64_t InvertedImm = ~Imm & DemandedBits;
2564 uint64_t RotatedImm =
2565 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2566 NonDemandedBits;
2567 uint64_t Sum = RotatedImm + NonDemandedBits;
2568 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2569 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2570 NewImm = (Imm | Ones) & Mask;
2571
2572 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2573 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2574 // we halve the element size and continue the search.
2575 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2576 break;
2577
2578 // We cannot shrink the element size any further if it is 2-bits.
2579 if (EltSize == 2)
2580 return false;
2581
2582 EltSize /= 2;
2583 Mask >>= EltSize;
2584 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2585
2586 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2587 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2588 return false;
2589
2590 // Merge the upper and lower halves of Imm and DemandedBits.
2591 Imm |= Hi;
2592 DemandedBits |= DemandedBitsHi;
2593 }
2594
2595 ++NumOptimizedImms;
2596
2597 // Replicate the element across the register width.
2598 while (EltSize < Size) {
2599 NewImm |= NewImm << EltSize;
2600 EltSize *= 2;
2601 }
2602
2603 (void)OldImm;
2604 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2605 "demanded bits should never be altered");
2606 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2607
2608 // Create the new constant immediate node.
2609 EVT VT = Op.getValueType();
2610 SDLoc DL(Op);
2611 SDValue New;
2612
2613 // If the new constant immediate is all-zeros or all-ones, let the target
2614 // independent DAG combine optimize this node.
2615 if (NewImm == 0 || NewImm == OrigMask) {
2616 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2617 TLO.DAG.getConstant(NewImm, DL, VT));
2618 // Otherwise, create a machine node so that target independent DAG combine
2619 // doesn't undo this optimization.
2620 } else {
2622 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2623 New = SDValue(
2624 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2625 }
2626
2627 return TLO.CombineTo(Op, New);
2628}
2629
// Target hook: try to shrink the constant operand of a scalar 32/64-bit
// AND/OR/XOR into an encodable logical immediate using only the demanded
// bits; delegates the actual search to optimizeLogicalImm.
// NOTE(review): the function-name line (original 2630) and one early-exit
// condition (original 2637) are elided in this listing — presumably this is
// targetShrinkDemandedConstant; confirm in the full file.
2631 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2632 TargetLoweringOpt &TLO) const {
2633 // Delay this optimization to as late as possible.
2634 if (!TLO.LegalOps)
2635 return false;
2636
2638 return false;
2639
2640 EVT VT = Op.getValueType();
2641 if (VT.isVector())
2642 return false;
2643
2644 unsigned Size = VT.getSizeInBits();
2645
2646 if (Size != 32 && Size != 64)
2647 return false;
2648
2649 // Exit early if we demand all bits.
2650 if (DemandedBits.isAllOnes())
2651 return false;
2652
2653 unsigned NewOpc;
2654 switch (Op.getOpcode()) {
2655 default:
2656 return false;
2657 case ISD::AND:
2658 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2659 break;
2660 case ISD::OR:
2661 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2662 break;
2663 case ISD::XOR:
2664 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2665 break;
2666 }
2667 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2668 if (!C)
2669 return false;
2670 uint64_t Imm = C->getZExtValue();
2671 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2672}
2673
2674/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2675/// Mask are known to be either zero or one and return them Known.
2677 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2678 const SelectionDAG &DAG, unsigned Depth) const {
2679 switch (Op.getOpcode()) {
2680 default:
2681 break;
2682 case AArch64ISD::DUP: {
2683 SDValue SrcOp = Op.getOperand(0);
2684 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2685 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2686 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2687 "Expected DUP implicit truncation");
2688 Known = Known.trunc(Op.getScalarValueSizeInBits());
2689 }
2690 break;
2691 }
2692 case AArch64ISD::CSEL: {
2693 KnownBits Known2;
2694 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2695 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2696 Known = Known.intersectWith(Known2);
2697 break;
2698 }
2699 case AArch64ISD::CSNEG:
2700 case AArch64ISD::CSINC:
2701 case AArch64ISD::CSINV: {
2702 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2703 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2704
2705 // The result is either:
2706 // CSINC: KnownOp0 or KnownOp1 + 1
2707 // CSINV: KnownOp0 or ~KnownOp1
2708 // CSNEG: KnownOp0 or KnownOp1 * -1
2709 if (Op.getOpcode() == AArch64ISD::CSINC)
2710 KnownOp1 = KnownBits::add(
2711 KnownOp1,
2712 KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
2713 else if (Op.getOpcode() == AArch64ISD::CSINV)
2714 std::swap(KnownOp1.Zero, KnownOp1.One);
2715 else if (Op.getOpcode() == AArch64ISD::CSNEG)
2716 KnownOp1 =
2718 Op.getScalarValueSizeInBits())));
2719
2720 Known = KnownOp0.intersectWith(KnownOp1);
2721 break;
2722 }
2723 case AArch64ISD::BICi: {
2724 // Compute the bit cleared value.
2725 APInt Mask =
2726 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2727 .trunc(Known.getBitWidth());
2728 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2729 Known &= KnownBits::makeConstant(Mask);
2730 break;
2731 }
2732 case AArch64ISD::VLSHR: {
2733 KnownBits Known2;
2734 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2735 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2736 Known = KnownBits::lshr(Known, Known2);
2737 break;
2738 }
2739 case AArch64ISD::VASHR: {
2740 KnownBits Known2;
2741 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2742 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2743 Known = KnownBits::ashr(Known, Known2);
2744 break;
2745 }
2746 case AArch64ISD::VSHL: {
2747 KnownBits Known2;
2748 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2749 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2750 Known = KnownBits::shl(Known, Known2);
2751 break;
2752 }
2753 case AArch64ISD::MOVI: {
2755 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2756 break;
2757 }
2758 case AArch64ISD::MOVIshift: {
2760 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)
2761 << Op->getConstantOperandVal(1)));
2762 break;
2763 }
2764 case AArch64ISD::MOVImsl: {
2765 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2767 Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
2768 break;
2769 }
2770 case AArch64ISD::MOVIedit: {
2772 Known.getBitWidth(),
2773 AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
2774 break;
2775 }
2776 case AArch64ISD::MVNIshift: {
2778 APInt(Known.getBitWidth(),
2779 ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
2780 /*isSigned*/ false, /*implicitTrunc*/ true));
2781 break;
2782 }
2783 case AArch64ISD::MVNImsl: {
2784 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2786 APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
2787 /*isSigned*/ false, /*implicitTrunc*/ true));
2788 break;
2789 }
2790 case AArch64ISD::LOADgot:
2791 case AArch64ISD::ADDlow: {
2792 if (!Subtarget->isTargetILP32())
2793 break;
2794 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2795 Known.Zero = APInt::getHighBitsSet(64, 32);
2796 break;
2797 }
2798 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2799 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2800 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2801 break;
2802 }
2804 Intrinsic::ID IntID =
2805 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2806 switch (IntID) {
2807 default: return;
2808 case Intrinsic::aarch64_ldaxr:
2809 case Intrinsic::aarch64_ldxr: {
2810 unsigned BitWidth = Known.getBitWidth();
2811 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2812 unsigned MemBits = VT.getScalarSizeInBits();
2813 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2814 return;
2815 }
2816 }
2817 break;
2818 }
2820 case ISD::INTRINSIC_VOID: {
2821 unsigned IntNo = Op.getConstantOperandVal(0);
2822 switch (IntNo) {
2823 default:
2824 break;
2825 case Intrinsic::aarch64_neon_uaddlv: {
2826 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2827 unsigned BitWidth = Known.getBitWidth();
2828 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2829 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2830 assert(BitWidth >= Bound && "Unexpected width!");
2832 Known.Zero |= Mask;
2833 }
2834 break;
2835 }
2836 case Intrinsic::aarch64_neon_umaxv:
2837 case Intrinsic::aarch64_neon_uminv: {
2838 // Figure out the datatype of the vector operand. The UMINV instruction
2839 // will zero extend the result, so we can mark as known zero all the
2840 // bits larger than the element datatype. 32-bit or larger doesn't need
2841 // this as those are legal types and will be handled by isel directly.
2842 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2843 unsigned BitWidth = Known.getBitWidth();
2844 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2845 assert(BitWidth >= 8 && "Unexpected width!");
2847 Known.Zero |= Mask;
2848 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2849 assert(BitWidth >= 16 && "Unexpected width!");
2851 Known.Zero |= Mask;
2852 }
2853 break;
2854 } break;
2855 }
2856 }
2857 }
2858}
2859
// Target hook: number of known sign bits for AArch64-specific SDNodes.
// NOTE(review): the defining line of this function was elided by extraction;
// the visible signature continuation follows.
2861 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2862 unsigned Depth) const {
2863 EVT VT = Op.getValueType();
2864 unsigned VTBits = VT.getScalarSizeInBits();
2865 unsigned Opcode = Op.getOpcode();
2866 switch (Opcode) {
2867 case AArch64ISD::FCMEQ:
2868 case AArch64ISD::FCMGE:
2869 case AArch64ISD::FCMGT:
2870 // Compares return either 0 or all-ones
2871 return VTBits;
2872 case AArch64ISD::VASHR: {
// Arithmetic shift right propagates the sign bit, so the shifted value has
// ShiftAmt more known sign bits than its input (clamped to the bit width).
2873 unsigned Tmp =
2874 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2875 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2876 }
2877 }
2878
// Conservative default: only the sign bit itself is known.
2879 return 1;
2880}
2881
// Shift amounts are always represented as i64 on AArch64, regardless of the
// type being shifted. (Defining line elided by extraction.)
2883 EVT) const {
2884 return MVT::i64;
2885}
2886
// Target hook: is a misaligned access of type \p VT allowed, and if so is it
// fast? Returns true when the access is legal; optionally reports speed via
// \p Fast. (Defining line elided by extraction.)
2888 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2889 unsigned *Fast) const {
2890
2891 // Allow SVE loads/stores where the alignment >= the size of the element type,
2892 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2893 // for stores that come from IR, only require element-size alignment (even if
2894 // unaligned accesses are disabled). Without this, these will be forced to
2895 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2896 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2897 if (VT.isScalableVector()) {
2898 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2899 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2900 return true;
2901 }
2902
// With +strict-align every other misaligned access is rejected outright.
2903 if (Subtarget->requiresStrictAlign())
2904 return false;
2905
2906 if (Fast) {
2907 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2908 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2909 // See comments in performSTORECombine() for more details about
2910 // these conditions.
2911
2912 // Code that uses clang vector extensions can mark that it
2913 // wants unaligned accesses to be treated as fast by
2914 // underspecifying alignment to be 1 or 2.
2915 Alignment <= 2 ||
2916
2917 // Disregard v2i64. Memcpy lowering produces those and splitting
2918 // them regresses performance on micro-benchmarks and olden/bh.
2919 VT == MVT::v2i64;
2920 }
2921 return true;
2922}
2923
2924// Same as above but handling LLTs instead.
// LLT (GlobalISel) overload of allowsMisalignedMemoryAccesses; mirrors the
// EVT version above but keyed on the low-level type. (Defining line elided
// by extraction.)
2926 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2927 unsigned *Fast) const {
2928 if (Subtarget->requiresStrictAlign())
2929 return false;
2930
2931 if (Fast) {
2932 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2933 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2934 Ty.getSizeInBytes() != 16 ||
2935 // See comments in performSTORECombine() for more details about
2936 // these conditions.
2937
2938 // Code that uses clang vector extensions can mark that it
2939 // wants unaligned accesses to be treated as fast by
2940 // underspecifying alignment to be 1 or 2.
2941 Alignment <= 2 ||
2942
2943 // Disregard v2i64. Memcpy lowering produces those and splitting
2944 // them regresses performance on micro-benchmarks and olden/bh.
2945 Ty == LLT::fixed_vector(2, 64);
2946 }
2947 return true;
2948}
2949
// Factory hook: create the AArch64 FastISel instance used for -O0 instruction
// selection. (Defining line elided by extraction.)
2951 FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo,
2952 const LibcallLoweringInfo *libcallLowering) const {
2953 return AArch64::createFastISel(funcInfo, libInfo, libcallLowering);
2954}
2955
// Custom inserter for the F128CSEL pseudo: f128 values cannot use CSEL, so we
// expand to an explicit diamond of control flow plus a PHI. (Defining line
// elided by extraction.)
2958 MachineBasicBlock *MBB) const {
2959 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2960 // phi node:
2961
2962 // OrigBB:
2963 // [... previous instrs leading to comparison ...]
2964 // b.ne TrueBB
2965 // b EndBB
2966 // TrueBB:
2967 // ; Fallthrough
2968 // EndBB:
2969 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2970
2971 MachineFunction *MF = MBB->getParent();
2972 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2973 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2974 DebugLoc DL = MI.getDebugLoc();
2975 MachineFunction::iterator It = ++MBB->getIterator();
2976
2977 Register DestReg = MI.getOperand(0).getReg();
2978 Register IfTrueReg = MI.getOperand(1).getReg();
2979 Register IfFalseReg = MI.getOperand(2).getReg();
2980 unsigned CondCode = MI.getOperand(3).getImm();
2981 bool NZCVKilled = MI.getOperand(4).isKill();
2982
2983 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2984 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2985 MF->insert(It, TrueBB);
2986 MF->insert(It, EndBB);
2987
2988 // Transfer rest of current basic-block to EndBB
2989 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2990 MBB->end());
// NOTE(review): line 2991 was elided by extraction here — presumably the
// successor/PHI transfer to EndBB; confirm against the upstream source.
2992
2993 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2994 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2995 MBB->addSuccessor(TrueBB);
2996 MBB->addSuccessor(EndBB);
2997
2998 // TrueBB falls through to the end.
2999 TrueBB->addSuccessor(EndBB);
3000
// If the pseudo did not kill NZCV, the flags remain live across the new
// blocks and must be recorded as live-in.
3001 if (!NZCVKilled) {
3002 TrueBB->addLiveIn(AArch64::NZCV);
3003 EndBB->addLiveIn(AArch64::NZCV);
3004 }
3005
3006 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
3007 .addReg(IfTrueReg)
3008 .addMBB(TrueBB)
3009 .addReg(IfFalseReg)
3010 .addMBB(MBB);
3011
3012 MI.eraseFromParent();
3013 return EndBB;
3014}
3015
3023
// Custom inserter for PROBED_STACKALLOC_DYN: expands to a stack-probing loop
// via AArch64InstrInfo::probedStackAlloc. (Defining lines elided by
// extraction.)
3026 MachineBasicBlock *MBB) const {
3027 MachineFunction &MF = *MBB->getParent();
3028 MachineBasicBlock::iterator MBBI = MI.getIterator();
3029 const AArch64InstrInfo &TII =
3030 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
3031 Register TargetReg = MI.getOperand(0).getReg();
// NOTE(review): line 3032 was elided by extraction — it appears to declare
// NextInst (the iterator returned by probedStackAlloc); confirm upstream.
3033 TII.probedStackAlloc(MBBI, TargetReg, false);
3034
3035 MI.eraseFromParent();
3036 return NextInst->getParent();
3037}
3038
// Custom inserter for CHECK_MATCHING_VL_PSEUDO: compares the NEON vector
// length (RDVL) against the streaming vector length (ADDSVL) and traps (BRK
// #1) if they differ. (Defining lines elided by extraction.)
3041 MachineBasicBlock *MBB) const {
3042 MachineFunction *MF = MBB->getParent();
// NOTE(review): line 3043 was elided by extraction — it appears to declare
// MRI (the MachineRegisterInfo reference used below); confirm upstream.
3044
3045 const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass;
3046 const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass;
3047
3048 Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR);
3049 Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src
3050 Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR);
3051 Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst
3052
3053 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3054 DebugLoc DL = MI.getDebugLoc();
3055
3056 // RDVL requires GPR64, ADDSVL requires GPR64sp
3057 // We need to insert COPY instructions, these will later be removed by the
3058 // RegisterCoalescer
3059 BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1);
3060 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp)
3061 .addReg(RegVL_GPR);
3062
// SVL - VL: result is zero exactly when the vector lengths match.
3063 BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp)
3064 .addReg(RegVL_GPRsp)
3065 .addImm(-1);
3066 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR)
3067 .addReg(RegSVL_GPRsp);
3068
3069 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
3070 MachineFunction::iterator It = ++MBB->getIterator();
3071 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
3072 MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
3073 MF->insert(It, TrapBB);
3074 MF->insert(It, PassBB);
3075
3076 // Continue if vector lengths match
3077 BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
3078 .addReg(RegSVL_GPR)
3079 .addMBB(PassBB);
3080
3081 // Transfer rest of current BB to PassBB
3082 PassBB->splice(PassBB->begin(), MBB,
3083 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
// NOTE(review): line 3084 was elided by extraction — likely the
// successor/PHI transfer to PassBB; confirm against the upstream source.
3085
3086 // Trap if vector lengths mismatch
3087 BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
3088
3089 MBB->addSuccessor(TrapBB);
3090 MBB->addSuccessor(PassBB);
3091
3092 MI.eraseFromParent();
3093 return PassBB;
3094}
3095
// Custom inserter for SME tile-load pseudos: rewrites the pseudo into the
// real load \p Opc, resolving the immediate tile number into the physical
// tile register BaseReg + imm.
3097AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
3099 MachineBasicBlock *BB) const {
3100 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3101 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3102
3103 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3104 MIB.add(MI.getOperand(1)); // slice index register
3105 MIB.add(MI.getOperand(2)); // slice index offset
3106 MIB.add(MI.getOperand(3)); // pg
3107 MIB.add(MI.getOperand(4)); // base
3108 MIB.add(MI.getOperand(5)); // offset
3109
3110 MI.eraseFromParent(); // The pseudo is gone now.
3111 return BB;
3112}
3113
// Custom inserter for LDR_ZA_PSEUDO: emits LDR_ZA (fill of the ZA array from
// memory). (Defining lines elided by extraction.)
3116 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3118 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3119
3120 MIB.addReg(AArch64::ZA, RegState::Define);
3121 MIB.add(MI.getOperand(0)); // Vector select register
3122 MIB.add(MI.getOperand(1)); // Vector select offset
3123 MIB.add(MI.getOperand(2)); // Base
3124 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3125
3126 MI.eraseFromParent(); // The pseudo is gone now.
3127 return BB;
3128}
3129
// Custom inserter for ZT0 pseudos (LDR_TX/STR_TX/ZERO_T/MOVT_TIZ): replaces
// the pseudo with \p Opcode, marking operand 0 as a def when \p Op0IsDef.
// (Defining lines elided by extraction.)
3132 unsigned Opcode,
3133 bool Op0IsDef) const {
3134 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3136
3137 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3138 .addReg(MI.getOperand(0).getReg(), getDefRegState(Op0IsDef));
// Remaining operands are forwarded unchanged.
3139 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3140 MIB.add(MI.getOperand(I));
3141
3142 MI.eraseFromParent(); // The pseudo is gone now.
3143 return BB;
3144}
3145
// Custom inserter for SME ZA pseudos: rewrites the pseudo to \p Opc, turning
// an immediate tile index into a physical tile register (BaseReg + imm) when
// the instruction targets a tile, or tying the whole ZA array otherwise.
3147AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3149 MachineBasicBlock *BB) const {
3150 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3151 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3152 unsigned StartIdx = 0;
3153
// BaseReg == ZA means the instruction addresses the whole array rather than
// a numbered tile.
3154 bool HasTile = BaseReg != AArch64::ZA;
3155 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3156 if (HasZPROut) {
3157 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3158 ++StartIdx;
3159 }
3160 if (HasTile) {
3161 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3162 RegState::Define); // Output ZA Tile
3163 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3164 StartIdx++;
3165 } else {
3166 // Avoids all instructions with mnemonic za.<sz>[Reg, Imm,
3167 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3168 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3169 ++StartIdx;
3170 }
// Tie the full ZA array as both def and use.
3171 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3172 }
// Forward the remaining operands unchanged.
3173 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3174 MIB.add(MI.getOperand(I));
3175
3176 MI.eraseFromParent(); // The pseudo is gone now.
3177 return BB;
3178}
3179
// Custom inserter for ZERO_M_PSEUDO: emits ZERO_M and adds implicit defs for
// every ZAD tile selected by the 8-bit mask, so liveness is tracked per tile.
// (Defining lines elided by extraction.)
3182 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3184 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3185 MIB.add(MI.getOperand(0)); // Mask
3186
3187 unsigned Mask = MI.getOperand(0).getImm();
// Each of the 8 mask bits selects one 64-bit tile ZAD0..ZAD7.
3188 for (unsigned I = 0; I < 8; I++) {
3189 if (Mask & (1 << I))
3190 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3191 }
3192
3193 MI.eraseFromParent(); // The pseudo is gone now.
3194 return BB;
3195}
3196
// Custom inserter for InitTPIDR2Obj: initialises the TPIDR2 lazy-save block
// (SME ABI) with the buffer pointer and slice count, or frees the stack slot
// when the object is unused. (Defining lines elided by extraction.)
3199 MachineBasicBlock *BB) const {
3200 MachineFunction *MF = BB->getParent();
3201 MachineFrameInfo &MFI = MF->getFrameInfo();
// NOTE(review): line 3202 was elided by extraction — it appears to declare
// FuncInfo (the AArch64FunctionInfo used below); confirm upstream.
3203 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3204 if (TPIDR2.Uses > 0) {
3205 // Note: This case just needs to do `SVL << 48`. It is not implemented as we
3206 // generally don't support big-endian SVE/SME.
3207 if (!Subtarget->isLittleEndian())
3209 "TPIDR2 block initialization is not supported on big-endian targets");
3210
3211 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3212 // Store buffer pointer and num_za_save_slices.
3213 // Bytes 10-15 are implicitly zeroed.
3214 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
3215 .addReg(MI.getOperand(0).getReg())
3216 .addReg(MI.getOperand(1).getReg())
3217 .addFrameIndex(TPIDR2.FrameIndex)
3218 .addImm(0);
3219 } else
3220 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3221
3222 BB->remove_instr(&MI);
3223 return BB;
3224}
3225
// Custom inserter for AllocateZABuffer: grows the stack by the requested size
// (normally SVL*SVL bytes) to hold the ZA lazy-save buffer, returning the new
// SP in the destination register. (Defining lines elided by extraction.)
3228 MachineBasicBlock *BB) const {
3229 MachineFunction *MF = BB->getParent();
3230 MachineFrameInfo &MFI = MF->getFrameInfo();
3232 // TODO This function grows the stack with a subtraction, which doesn't work
3233 // on Windows. Some refactoring to share the functionality in
3234 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3235 // supports SME
3237 "Lazy ZA save is not yet supported on Windows");
3238
3239 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3240
// Only allocate if the lazy-save object is actually referenced.
3241 if (TPIDR2.Uses > 0) {
3242 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3244
3245 // The SUBXrs below won't always be emitted in a form that accepts SP
3246 // directly
3247 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3248 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3249 .addReg(AArch64::SP);
3250
3251 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3252 auto Size = MI.getOperand(1).getReg();
3253 auto Dest = MI.getOperand(0).getReg();
// Dest = SP - Size*Size (MSUB computes SP - Size*Size in one instruction).
3254 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3255 .addReg(Size)
3256 .addReg(Size)
3257 .addReg(SP);
3258 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3259 AArch64::SP)
3260 .addReg(Dest);
3261
3262 // We have just allocated a variable sized object, tell this to PEI.
3263 MFI.CreateVariableSizedObject(Align(16), nullptr);
3264 }
3265
3266 BB->remove_instr(&MI);
3267 return BB;
3268}
3269
3270// TODO: Find a way to merge this with EmitAllocateZABuffer.
// Custom inserter for AllocateSMESaveBuffer: allocates a variable-sized stack
// buffer for the agnostic-ZA save area, or emits IMPLICIT_DEF when the buffer
// is unused. (Defining lines elided by extraction.)
3273 MachineBasicBlock *BB) const {
3274 MachineFunction *MF = BB->getParent();
3275 MachineFrameInfo &MFI = MF->getFrameInfo();
3278 "Lazy ZA save is not yet supported on Windows");
3279
3280 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3281 if (FuncInfo->isSMESaveBufferUsed()) {
3282 // Allocate a buffer object of the size given by MI.getOperand(1).
3283 auto Size = MI.getOperand(1).getReg();
3284 auto Dest = MI.getOperand(0).getReg();
// SP -= Size, then hand the new SP back in Dest.
3285 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3286 .addReg(AArch64::SP)
3287 .addReg(Size)
3289 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3290 .addReg(AArch64::SP);
3291
3292 // We have just allocated a variable sized object, tell this to PEI.
3293 MFI.CreateVariableSizedObject(Align(16), nullptr);
3294 } else
3295 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3296 MI.getOperand(0).getReg());
3297
3298 BB->remove_instr(&MI);
3299 return BB;
3300}
3301
// Custom inserter for GetSMESaveSize: calls the SME ABI routine
// __arm_sme_state_size() when the save buffer is used, otherwise yields zero
// (XZR). (Defining lines elided by extraction.)
3304 MachineBasicBlock *BB) const {
3305 // If the buffer is used, emit a call to __arm_sme_state_size()
3306 MachineFunction *MF = BB->getParent();
3308 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3309 if (FuncInfo->isSMESaveBufferUsed()) {
3310 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
3311 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
// BL to the libcall; result arrives in X0 per the SME ABI.
3312 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3314 .addReg(AArch64::X0, RegState::ImplicitDefine)
3315 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3316 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3317 MI.getOperand(0).getReg())
3318 .addReg(AArch64::X0);
3319 } else
3320 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3321 MI.getOperand(0).getReg())
3322 .addReg(AArch64::XZR);
3323 BB->remove_instr(&MI);
3324 return BB;
3325}
3326
// Custom inserter for EntryPStateSM: materialises the streaming-mode state at
// function entry, reading SVCR directly with MRS when SME is available, or
// via the __arm_sme_state libcall otherwise. (Defining lines elided by
// extraction.)
3329 MachineBasicBlock *BB) const {
3330 MachineFunction *MF = BB->getParent();
3331 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3332 const DebugLoc &DL = MI.getDebugLoc();
3333 Register ResultReg = MI.getOperand(0).getReg();
3334 if (MF->getRegInfo().use_empty(ResultReg)) {
3335 // Nothing to do. Pseudo erased below.
3336 } else if (Subtarget->hasSME()) {
3337 BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg)
3338 .addImm(AArch64SysReg::SVCR)
3339 .addReg(AArch64::VG, RegState::Implicit);
3340 } else {
3341 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
3342 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3343 BuildMI(*BB, MI, DL, TII->get(AArch64::BL))
3345 .addReg(AArch64::X0, RegState::ImplicitDefine)
3346 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3347 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg)
3348 .addReg(AArch64::X0);
3349 }
3350 MI.eraseFromParent();
3351 return BB;
3352}
3353
3354// Helper function to find the instruction that defined a virtual register.
3355// If unable to find such instruction, returns nullptr.
// Walks back through COPY / SUBREG_TO_REG chains to the instruction that
// actually defines \p Reg. Returns the defining MachineInstr, the COPY from a
// physical register if the chain ends there, or nullptr if \p Reg is itself
// physical. (Defining line elided by extraction.)
3357 Register Reg) {
3358 while (Reg.isVirtual()) {
3359 MachineInstr *DefMI = MRI.getVRegDef(Reg);
3360 assert(DefMI && "Virtual register definition not found");
3361 unsigned Opcode = DefMI->getOpcode();
3362
3363 if (Opcode == AArch64::COPY) {
3364 Reg = DefMI->getOperand(1).getReg();
3365 // Vreg is defined by copying from physreg.
3366 if (Reg.isPhysical())
3367 return DefMI;
3368 continue;
3369 }
3370 if (Opcode == AArch64::SUBREG_TO_REG) {
3371 Reg = DefMI->getOperand(2).getReg();
3372 continue;
3373 }
3374
3375 return DefMI;
3376 }
3377 return nullptr;
3378}
3379
// Normalises a pointer-authentication discriminator operand pair: folds a
// blend (MOVK ..., #imm, #48) or a small immediate constant into the integer
// part, canonicalises XZR to NoRegister, and copies the address part into the
// register class \p AddrDiscRC required by MI. (Defining lines elided by
// extraction.)
3382 MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
3383 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3384 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3385 const DebugLoc &DL = MI.getDebugLoc();
3386
3387 Register AddrDisc = AddrDiscOp.getReg();
3388 int64_t IntDisc = IntDiscOp.getImm();
3389 assert(IntDisc == 0 && "Blend components are already expanded");
3390
3391 const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
3392 if (DiscMI) {
3393 switch (DiscMI->getOpcode()) {
3394 case AArch64::MOVKXi:
3395 // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
3396 // #imm should be an immediate and not a global symbol, for example.
3397 if (DiscMI->getOperand(2).isImm() &&
3398 DiscMI->getOperand(3).getImm() == 48) {
3399 AddrDisc = DiscMI->getOperand(1).getReg();
3400 IntDisc = DiscMI->getOperand(2).getImm();
3401 }
3402 break;
3403 case AArch64::MOVi32imm:
3404 case AArch64::MOVi64imm:
3405 // Small immediate integer constant passed via VReg.
3406 if (DiscMI->getOperand(1).isImm() &&
3407 isUInt<16>(DiscMI->getOperand(1).getImm())) {
3408 AddrDisc = AArch64::NoRegister;
3409 IntDisc = DiscMI->getOperand(1).getImm();
3410 }
3411 break;
3412 }
3413 }
3414
3415 // For uniformity, always use NoRegister, as XZR is not necessarily contained
3416 // in the requested register class.
3417 if (AddrDisc == AArch64::XZR)
3418 AddrDisc = AArch64::NoRegister;
3419
3420 // Make sure AddrDisc operand respects the register class imposed by MI.
3421 if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
3422 Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
3423 BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
3424 AddrDisc = TmpReg;
3425 }
3426
// Write the (possibly folded) components back into the instruction.
3427 AddrDiscOp.setReg(AddrDisc);
3428 IntDiscOp.setImm(IntDisc);
3429}
3430
// Target hook: expand pseudo-instructions flagged usesCustomInserter.
// Dispatches SME matrix pseudos via the SME pseudo map, then handles the
// remaining pseudos by opcode. (Defining line elided by extraction.)
3432 MachineInstr &MI, MachineBasicBlock *BB) const {
3433
3434 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3435 if (SMEOrigInstr != -1) {
3436 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3437 uint64_t SMEMatrixType =
3438 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
// NOTE(review): the case labels of this switch (lines 3440/3442/3444/3446/
// 3448/3450) were elided by extraction — they select the tile base register
// by matrix type; confirm against the upstream source.
3439 switch (SMEMatrixType) {
3441 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3443 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3445 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3447 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3449 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3451 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3452 }
3453 }
3454
3455 switch (MI.getOpcode()) {
3456 default:
3457#ifndef NDEBUG
3458 MI.dump();
3459#endif
3460 llvm_unreachable("Unexpected instruction for custom inserter!");
3461 case AArch64::InitTPIDR2Obj:
3462 return EmitInitTPIDR2Object(MI, BB);
3463 case AArch64::AllocateZABuffer:
3464 return EmitAllocateZABuffer(MI, BB);
3465 case AArch64::AllocateSMESaveBuffer:
3466 return EmitAllocateSMESaveBuffer(MI, BB);
3467 case AArch64::GetSMESaveSize:
3468 return EmitGetSMESaveSize(MI, BB);
3469 case AArch64::EntryPStateSM:
3470 return EmitEntryPStateSM(MI, BB);
3471 case AArch64::F128CSEL:
3472 return EmitF128CSEL(MI, BB);
3473 case TargetOpcode::STATEPOINT:
3474 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3475 // while bl call instruction (where statepoint will be lowered at the end)
3476 // has implicit def. This def is early-clobber as it will be set at
3477 // the moment of the call and earlier than any use is read.
3478 // Add this implicit dead def here as a workaround.
3479 MI.addOperand(*MI.getMF(),
3481 AArch64::LR, /*isDef*/ true,
3482 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3483 /*isUndef*/ false, /*isEarlyClobber*/ true));
3484 [[fallthrough]];
3485 case TargetOpcode::STACKMAP:
3486 case TargetOpcode::PATCHPOINT:
3487 return emitPatchPoint(MI, BB);
3488
3489 case TargetOpcode::PATCHABLE_EVENT_CALL:
3490 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3491 return BB;
3492
3493 case AArch64::CATCHRET:
3494 return EmitLoweredCatchRet(MI, BB);
3495
3496 case AArch64::PROBED_STACKALLOC_DYN:
3497 return EmitDynamicProbedAlloc(MI, BB);
3498
3499 case AArch64::CHECK_MATCHING_VL_PSEUDO:
3500 return EmitCheckMatchingVL(MI, BB);
3501
3502 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3503 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3504 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3505 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3506 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3507 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3508 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3509 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3510 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3511 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3512 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3513 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3514 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3515 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3516 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3517 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3518 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3519 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3520 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3521 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3522 case AArch64::LDR_ZA_PSEUDO:
3523 return EmitFill(MI, BB);
3524 case AArch64::LDR_TX_PSEUDO:
3525 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3526 case AArch64::STR_TX_PSEUDO:
3527 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3528 case AArch64::ZERO_M_PSEUDO:
3529 return EmitZero(MI, BB);
3530 case AArch64::ZERO_T_PSEUDO:
3531 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3532 case AArch64::MOVT_TIZ_PSEUDO:
3533 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3534
3535 case AArch64::PAC:
3536 fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
3537 &AArch64::GPR64noipRegClass);
3538 return BB;
3539 }
3540}
3541
3542//===----------------------------------------------------------------------===//
3543// AArch64 Lowering private implementation.
3544//===----------------------------------------------------------------------===//
3545
3546//===----------------------------------------------------------------------===//
3547// Lowering Code
3548//===----------------------------------------------------------------------===//
3549
3550// Forward declarations of SVE fixed length lowering helpers
3555 SelectionDAG &DAG);
3558 EVT VT);
3559
3560/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3561static bool isZerosVector(const SDNode *N) {
3562 // Look through a bit convert.
3563 while (N->getOpcode() == ISD::BITCAST)
3564 N = N->getOperand(0).getNode();
3565
// NOTE(review): line 3566 was elided by extraction — the condition guarding
// this early return (presumably an all-zeros splat check); confirm upstream.
3567 return true;
3568
// Otherwise, a zero vector can also be a DUP of a zero scalar.
3569 if (N->getOpcode() != AArch64ISD::DUP)
3570 return false;
3571
3572 auto Opnd0 = N->getOperand(0);
3573 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3574}
3575
3576/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3577/// CC
// Maps a DAG integer condition code onto the corresponding AArch64 condition
// code. (Defining line elided by extraction; NOTE(review): the return lines
// for SETGE/SETLT — originals 3590 and 3592 — were also elided.)
3579 SDValue RHS = {}) {
3580 switch (CC) {
3581 default:
3582 llvm_unreachable("Unknown condition code!");
3583 case ISD::SETNE:
3584 return AArch64CC::NE;
3585 case ISD::SETEQ:
3586 return AArch64CC::EQ;
3587 case ISD::SETGT:
3588 return AArch64CC::GT;
3589 case ISD::SETGE:
3591 case ISD::SETLT:
3593 case ISD::SETLE:
3594 return AArch64CC::LE;
3595 case ISD::SETUGT:
3596 return AArch64CC::HI;
3597 case ISD::SETUGE:
3598 return AArch64CC::HS;
3599 case ISD::SETULT:
3600 return AArch64CC::LO;
3601 case ISD::SETULE:
3602 return AArch64CC::LS;
3603 }
3604}
3605
3606/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
// Maps a DAG FP condition code onto one or two AArch64 condition codes; when
// two are produced they are to be OR'ed together. CondCode2 == AL means only
// the first code is needed. (Defining line elided by extraction.)
3608 AArch64CC::CondCode &CondCode,
3609 AArch64CC::CondCode &CondCode2) {
3610 CondCode2 = AArch64CC::AL;
3611 switch (CC) {
3612 default:
3613 llvm_unreachable("Unknown FP condition!");
3614 case ISD::SETEQ:
3615 case ISD::SETOEQ:
3616 CondCode = AArch64CC::EQ;
3617 break;
3618 case ISD::SETGT:
3619 case ISD::SETOGT:
3620 CondCode = AArch64CC::GT;
3621 break;
3622 case ISD::SETGE:
3623 case ISD::SETOGE:
3624 CondCode = AArch64CC::GE;
3625 break;
3626 case ISD::SETOLT:
3627 CondCode = AArch64CC::MI;
3628 break;
3629 case ISD::SETOLE:
3630 CondCode = AArch64CC::LS;
3631 break;
3632 case ISD::SETONE:
// ONE == OLT || OGT, so two condition codes are required.
3633 CondCode = AArch64CC::MI;
3634 CondCode2 = AArch64CC::GT;
3635 break;
3636 case ISD::SETO:
3637 CondCode = AArch64CC::VC;
3638 break;
3639 case ISD::SETUO:
3640 CondCode = AArch64CC::VS;
3641 break;
3642 case ISD::SETUEQ:
// UEQ == OEQ || UO.
3643 CondCode = AArch64CC::EQ;
3644 CondCode2 = AArch64CC::VS;
3645 break;
3646 case ISD::SETUGT:
3647 CondCode = AArch64CC::HI;
3648 break;
3649 case ISD::SETUGE:
3650 CondCode = AArch64CC::PL;
3651 break;
3652 case ISD::SETLT:
3653 case ISD::SETULT:
3654 CondCode = AArch64CC::LT;
3655 break;
3656 case ISD::SETLE:
3657 case ISD::SETULE:
3658 CondCode = AArch64CC::LE;
3659 break;
3660 case ISD::SETNE:
3661 case ISD::SETUNE:
3662 CondCode = AArch64CC::NE;
3663 break;
3664 }
3665}
3666
3667/// Convert a DAG fp condition code to an AArch64 CC.
3668/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3669/// should be AND'ed instead of OR'ed.
// AND-form variant of changeFPCCToAArch64CC: the two produced condition codes
// must be AND'ed rather than OR'ed; only SETONE and SETUEQ differ from the
// OR-form mapping. (Defining line elided by extraction.)
3671 AArch64CC::CondCode &CondCode,
3672 AArch64CC::CondCode &CondCode2) {
3673 CondCode2 = AArch64CC::AL;
3674 switch (CC) {
3675 default:
// All single-code mappings are shared with the OR-form helper.
3676 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3677 assert(CondCode2 == AArch64CC::AL);
3678 break;
3679 case ISD::SETONE:
3680 // (a one b)
3681 // == ((a olt b) || (a ogt b))
3682 // == ((a ord b) && (a une b))
3683 CondCode = AArch64CC::VC;
3684 CondCode2 = AArch64CC::NE;
3685 break;
3686 case ISD::SETUEQ:
3687 // (a ueq b)
3688 // == ((a uno b) || (a oeq b))
3689 // == ((a ule b) && (a uge b))
3690 CondCode = AArch64CC::PL;
3691 CondCode2 = AArch64CC::LE;
3692 break;
3693 }
3694}
3695
3696/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3697/// CC usable with the vector instructions. Fewer operations are available
3698/// without a real NZCV register, so we have to use less efficient combinations
3699/// to get the same effect.
// Vector-compare variant: vector FP compares have no NZCV, so unordered
// predicates are expressed as the inverted ordered compare plus a final
// bitwise NOT, signalled through \p Invert. (Defining line elided by
// extraction.)
3701 AArch64CC::CondCode &CondCode,
3702 AArch64CC::CondCode &CondCode2,
3703 bool &Invert) {
3704 Invert = false;
3705 switch (CC) {
3706 default:
3707 // Mostly the scalar mappings work fine.
3708 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3709 break;
3710 case ISD::SETUO:
3711 Invert = true;
3712 [[fallthrough]];
3713 case ISD::SETO:
// ORD == OLT || OGE (MI || GE); UNO is its inversion.
3714 CondCode = AArch64CC::MI;
3715 CondCode2 = AArch64CC::GE;
3716 break;
3717 case ISD::SETUEQ:
3718 case ISD::SETULT:
3719 case ISD::SETULE:
3720 case ISD::SETUGT:
3721 case ISD::SETUGE:
3722 // All of the compare-mask comparisons are ordered, but we can switch
3723 // between the two by a double inversion. E.g. ULE == !OGT.
3724 Invert = true;
3725 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3726 CondCode, CondCode2);
3727 break;
3728 }
3729}
3730
3731/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
// Wraps an AArch64 condition code as a constant SDValue of CondCodeVT.
// (Defining line elided by extraction.)
3733 // TODO: Should be TargetConstant (need to s/imm/timm in patterns).
3734 return DAG.getConstant(CC, SDLoc(), CondCodeVT);
3735}
3736
// Returns true if \p C fits an AArch64 arithmetic immediate: a 12-bit value,
// optionally shifted left by 12. (Defining line elided by extraction.)
3738 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3739 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3740 LLVM_DEBUG(dbgs() << "Is imm " << C
3741 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3742 return IsLegal;
3744
// Returns true if \p C is usable as a compare immediate; negative values are
// accepted because CMP can become ADDS with the negated immediate. (Defining
// line elided by extraction.)
3746 // Works for negative immediates too, as it can be written as an ADDS
3747 // instruction with a negated immediate.
3748 return isLegalArithImmed(C.abs().getZExtValue());
3749}
3750
// Counts how many MOV-immediate instructions are needed to materialise \p C
// as a 32-bit value. (Defining line elided by extraction; NOTE(review): the
// declaration of Insn — original line 3753 — was also elided.)
3752 uint64_t Imm = C.getZExtValue();
3754 AArch64_IMM::expandMOVImm(Imm, 32, Insn);
3755 return Insn.size();
3756}
3757
// Returns true when (sub 0, op2) can safely feed a CMN for signed compares,
// i.e. when the negation cannot wrap (op2 != INT_MIN). (Defining line elided
// by extraction.)
3759 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3760 if (Op->getFlags().hasNoSignedWrap())
3761 return true;
3762
3763 // We can still figure out if the second operand is safe to use
3764 // in a CMN instruction by checking if it is known to be not the minimum
3765 // signed value. If it is not, then we can safely use CMN.
3766 // Note: We can eventually remove this check and simply rely on
3767 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3768 // consistently sets them appropriately when making said nodes.
3769
3770 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
3771 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3773
3774// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
3775// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3776// can be set differently by this operation. It comes down to whether
3777// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3778// everything is fine. If not then the optimization is wrong. Thus general
3779// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3780//
3781// So, finally, the only LLVM-native comparisons that don't mention C or V
3782// are the ones that aren't unsigned comparisons. They're the only ones we can
3783// safely use CMN for in the absence of information about op2.
// Returns true if Op has the shape (sub 0, x) and the condition code permits
// folding the compare into CMN (see the rationale in the comment above).
// (Defining line elided by extraction.)
3785 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3786 (isIntEqualitySetCC(CC) ||
3787 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3788 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
3789}
3790
// Emits a strict (chained) FP comparison node, promoting f16-without-FullFP16
// and bf16 operands to f32 first; uses the signaling FCMPE variant when
// \p IsSignaling. (Defining line elided by extraction.)
3792 SelectionDAG &DAG, SDValue Chain,
3793 bool IsSignaling) {
3794 EVT VT = LHS.getValueType();
3795 assert(VT != MVT::f128);
3796
3797 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3798
// Extend both operands through the chain so the extensions stay ordered.
3799 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3800 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3801 {Chain, LHS});
3802 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3803 {LHS.getValue(1), RHS});
3804 Chain = RHS.getValue(1);
3805 }
3806 unsigned Opcode =
3807 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3808 return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
3809}
3810
                              const SDLoc &DL, SelectionDAG &DAG) {
  // Emits the flag-setting node (FCMP / SUBS / ADDS / ANDS) for a compare and
  // returns the NZCV flags value.
  EVT VT = LHS.getValueType();
  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();

  if (VT.isFloatingPoint()) {
    assert(VT != MVT::f128);
    // Widen half-precision operands to f32 when there is no native f16
    // compare (bf16 never has one).
    if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
      LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
      RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
    }
    return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
  }

  // The CMP instruction is just an alias for SUBS, and representing it as
  // SUBS means that it's possible to get CSE with subtract operations.
  // A later phase can perform the optimization of setting the destination
  // register to WZR/XZR if it ends up being unused.
  unsigned Opcode = AArch64ISD::SUBS;

  if (isCMN(RHS, CC, DAG)) {
    // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
    Opcode = AArch64ISD::ADDS;
    RHS = RHS.getOperand(1);
  } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
             isIntEqualitySetCC(CC)) {
    // As we are looking for EQ/NE compares, the operands can be commuted ; can
    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
    Opcode = AArch64ISD::ADDS;
    LHS = LHS.getOperand(1);
  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
    if (LHS.getOpcode() == ISD::AND) {
      // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
      // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
      // of the signed comparisons.
      const SDValue ANDSNode =
          DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
                      LHS.getOperand(0), LHS.getOperand(1));
      // Replace all users of (and X, Y) with newly generated (ands X, Y)
      DAG.ReplaceAllUsesWith(LHS, ANDSNode);
      return ANDSNode.getValue(1);
    } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
      // Use result of ANDS
      return LHS.getValue(1);
    }
  }

  // The flags are the second result of the ADDS/SUBS node.
  return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
      .getValue(1);
}
3861
3862/// \defgroup AArch64CCMP CMP;CCMP matching
3863///
3864/// These functions deal with the formation of CMP;CCMP;... sequences.
3865/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3866/// a comparison. They set the NZCV flags to a predefined value if their
3867/// predicate is false. This allows to express arbitrary conjunctions, for
3868/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3869/// expressed as:
3870/// cmp A
3871/// ccmp B, inv(CB), CA
3872/// check for CB flags
3873///
3874/// This naturally lets us implement chains of AND operations with SETCC
3875/// operands. And we can even implement some other situations by transforming
3876/// them:
3877/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3878/// negating the flags used in a CCMP/FCCMP operations.
3879/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3880/// by negating the flags we test for afterwards. i.e.
/// NEG (CMP CCMP CCMP ...) can be implemented.
3882/// - Note that we can only ever negate all previously processed results.
3883/// What we can not implement by flipping the flags to test is a negation
3884/// of two sub-trees (because the negation affects all sub-trees emitted so
3885/// far, so the 2nd sub-tree we emit would also affect the first).
3886/// With those tools we can implement some OR operations:
3887/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3888/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3889/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3890/// elimination rules from earlier to implement the whole thing as a
3891/// CCMP/FCCMP chain.
3892///
3893/// As complete example:
3894/// or (or (setCA (cmp A)) (setCB (cmp B)))
3895/// (and (setCC (cmp C)) (setCD (cmp D)))"
3896/// can be reassociated to:
/// or (and (setCC (cmp C)) (setCD (cmp D)))
/// (or (setCA (cmp A)) (setCB (cmp B)))
3899/// can be transformed to:
3900/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3901/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3902/// which can be implemented as:
3903/// cmp C
3904/// ccmp D, inv(CD), CC
3905/// ccmp A, CA, inv(CD)
3906/// ccmp B, CB, inv(CA)
3907/// check for CB flags
3908///
3909/// A counterexample is "or (and A B) (and C D)" which translates to
3910/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3911/// can only implement 1 of the inner (not) operations, but not both!
3912/// @{
3913
/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
/// \p Predicate is the condition under which the comparison executes; when it
/// is false the flags are set to the fixed NZCV value derived from \p OutCC.
                                         ISD::CondCode CC, SDValue CCOp,
                                         AArch64CC::CondCode OutCC,
                                         const SDLoc &DL, SelectionDAG &DAG) {
  unsigned Opcode = 0;
  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();

  if (LHS.getValueType().isFloatingPoint()) {
    assert(LHS.getValueType() != MVT::f128);
    // No native f16 compare without FullFP16 (and never for bf16): widen.
    if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
        LHS.getValueType() == MVT::bf16) {
      LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
      RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
    }
    Opcode = AArch64ISD::FCCMP;
  } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
    // A small negative immediate (-31..-1) fits CCMN's 5-bit immediate once
    // negated, so compare against the absolute value with CCMN instead.
    APInt Imm = Const->getAPIntValue();
    if (Imm.isNegative() && Imm.sgt(-32)) {
      Opcode = AArch64ISD::CCMN;
      RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
    }
  } else if (isCMN(RHS, CC, DAG)) {
    // (CCMP op1, (sub 0, op2)) can become (CCMN op1, op2).
    Opcode = AArch64ISD::CCMN;
    RHS = RHS.getOperand(1);
  } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
             isIntEqualitySetCC(CC)) {
    // As we are looking for EQ/NE compares, the operands can be commuted ; can
    // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction ?
    Opcode = AArch64ISD::CCMN;
    LHS = LHS.getOperand(1);
  }
  if (Opcode == 0)
    Opcode = AArch64ISD::CCMP;

  SDValue Condition = getCondCode(DAG, Predicate);
  // NZCV is the immediate flag value used when the predicate fails; it is
  // chosen so that the failed compare still satisfies the inverted OutCC.
  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
  return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
}
3956
3957/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3958/// expressed as a conjunction. See \ref AArch64CCMP.
3959/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3960/// changing the conditions on the SETCC tests.
3961/// (this means we can call emitConjunctionRec() with
3962/// Negate==true on this sub-tree)
3963/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3964/// cannot do the negation naturally. We are required to
3965/// emit the subtree first in this case.
3966/// \param PreferFirst Set to true if processing this subtree first may
3967/// result in more efficient code.
3968/// \param WillNegate Is true if are called when the result of this
3969/// subexpression must be negated. This happens when the
3970/// outer expression is an OR. We can use this fact to know
3971/// that we have a double negation (or (or ...) ...) that
3972/// can be implemented for free.
3973static bool canEmitConjunction(SelectionDAG &DAG, const SDValue Val,
3974 bool &CanNegate, bool &MustBeFirst,
3975 bool &PreferFirst, bool WillNegate,
3976 unsigned Depth = 0) {
3977 if (!Val.hasOneUse())
3978 return false;
3979 unsigned Opcode = Val->getOpcode();
3980 if (Opcode == ISD::SETCC) {
3981 EVT VT = Val->getOperand(0).getValueType();
3982 if (VT == MVT::f128)
3983 return false;
3984 CanNegate = true;
3985 MustBeFirst = false;
3986 // Designate this operation as a preferred first operation if the result
3987 // of a SUB operation can be reused.
3988 PreferFirst = DAG.doesNodeExist(ISD::SUB, DAG.getVTList(VT),
3989 {Val->getOperand(0), Val->getOperand(1)});
3990 return true;
3991 }
3992 // Protect against exponential runtime and stack overflow.
3993 if (Depth > 6)
3994 return false;
3995 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3996 bool IsOR = Opcode == ISD::OR;
3997 SDValue O0 = Val->getOperand(0);
3998 SDValue O1 = Val->getOperand(1);
3999 bool CanNegateL;
4000 bool MustBeFirstL;
4001 bool PreferFirstL;
4002 if (!canEmitConjunction(DAG, O0, CanNegateL, MustBeFirstL, PreferFirstL,
4003 IsOR, Depth + 1))
4004 return false;
4005 bool CanNegateR;
4006 bool MustBeFirstR;
4007 bool PreferFirstR;
4008 if (!canEmitConjunction(DAG, O1, CanNegateR, MustBeFirstR, PreferFirstR,
4009 IsOR, Depth + 1))
4010 return false;
4011
4012 if (MustBeFirstL && MustBeFirstR)
4013 return false;
4014
4015 if (IsOR) {
4016 // For an OR expression we need to be able to naturally negate at least
4017 // one side or we cannot do the transformation at all.
4018 if (!CanNegateL && !CanNegateR)
4019 return false;
4020 // If we the result of the OR will be negated and we can naturally negate
4021 // the leafs, then this sub-tree as a whole negates naturally.
4022 CanNegate = WillNegate && CanNegateL && CanNegateR;
4023 // If we cannot naturally negate the whole sub-tree, then this must be
4024 // emitted first.
4025 MustBeFirst = !CanNegate;
4026 } else {
4027 assert(Opcode == ISD::AND && "Must be OR or AND");
4028 // We cannot naturally negate an AND operation.
4029 CanNegate = false;
4030 MustBeFirst = MustBeFirstL || MustBeFirstR;
4031 }
4032 PreferFirst = PreferFirstL || PreferFirstR;
4033 return true;
4034 }
4035 return false;
4036}
4037
/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series compare
/// and conditional compare operations. @returns an NZCV flags producing node
/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
/// transformation was not possible.
/// \p Negate is true if we want this sub-tree being negated just by changing
/// SETCC conditions.
                                   AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
  // We're at a tree leaf, produce a conditional comparison operation.
  unsigned Opcode = Val->getOpcode();
  if (Opcode == ISD::SETCC) {
    SDValue LHS = Val->getOperand(0);
    SDValue RHS = Val->getOperand(1);
    ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
    bool isInteger = LHS.getValueType().isInteger();
    // Negation of a leaf is just inverting its condition code.
    if (Negate)
      CC = getSetCCInverse(CC, LHS.getValueType());
    SDLoc DL(Val);
    // Determine OutCC and handle FP special case.
    if (isInteger) {
      OutCC = changeIntCCToAArch64CC(CC, RHS);
    } else {
      assert(LHS.getValueType().isFloatingPoint());
      AArch64CC::CondCode ExtraCC;
      changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
      // Some floating point conditions can't be tested with a single condition
      // code. Construct an additional comparison in this case.
      if (ExtraCC != AArch64CC::AL) {
        SDValue ExtraCmp;
        if (!CCOp.getNode())
          ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
        else
          ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
                                               ExtraCC, DL, DAG);
        CCOp = ExtraCmp;
        Predicate = ExtraCC;
      }
    }

    // Produce a normal comparison if we are first in the chain
    if (!CCOp)
      return emitComparison(LHS, RHS, CC, DL, DAG);
    // Otherwise produce a ccmp.
    return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
                                     DAG);
  }
  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");

  bool IsOR = Opcode == ISD::OR;

  // Re-analyze both sides; canEmitConjunction() already succeeded on the
  // whole tree, so both queries must succeed here.
  SDValue LHS = Val->getOperand(0);
  bool CanNegateL;
  bool MustBeFirstL;
  bool PreferFirstL;
  bool ValidL = canEmitConjunction(DAG, LHS, CanNegateL, MustBeFirstL,
                                   PreferFirstL, IsOR);
  assert(ValidL && "Valid conjunction/disjunction tree");
  (void)ValidL;

  SDValue RHS = Val->getOperand(1);
  bool CanNegateR;
  bool MustBeFirstR;
  bool PreferFirstR;
  bool ValidR = canEmitConjunction(DAG, RHS, CanNegateR, MustBeFirstR,
                                   PreferFirstR, IsOR);
  assert(ValidR && "Valid conjunction/disjunction tree");
  (void)ValidR;

  bool ShouldFirstL = PreferFirstL && !PreferFirstR && !MustBeFirstR;

  // Swap sub-tree that must or should come first to the right side.
  // (Sub-trees are emitted right-to-left below.)
  if (MustBeFirstL || ShouldFirstL) {
    assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
    std::swap(LHS, RHS);
    std::swap(CanNegateL, CanNegateR);
    std::swap(MustBeFirstL, MustBeFirstR);
  }

  bool NegateR;
  bool NegateAfterR;
  bool NegateL;
  bool NegateAfterAll;
  if (Opcode == ISD::OR) {
    // OR is lowered as NEG(AND(NEG(L), NEG(R))); decide where each negation
    // happens (in the SETCC conditions or after emission).
    // Swap the sub-tree that we can negate naturally to the left.
    if (!CanNegateL) {
      assert(CanNegateR && "at least one side must be negatable");
      assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
      assert(!Negate);
      std::swap(LHS, RHS);
      NegateR = false;
      NegateAfterR = true;
    } else {
      // Negate the left sub-tree if possible, otherwise negate the result.
      NegateR = CanNegateR;
      NegateAfterR = !CanNegateR;
    }
    NegateL = true;
    NegateAfterAll = !Negate;
  } else {
    // An AND chain needs no negations at all.
    assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
    assert(!Negate && "Valid conjunction/disjunction tree");

    NegateL = false;
    NegateR = false;
    NegateAfterR = false;
    NegateAfterAll = false;
  }

  // Emit sub-trees: right first, then left conditioned on the right's flags.
  AArch64CC::CondCode RHSCC;
  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
  if (NegateAfterR)
    RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
  if (NegateAfterAll)
    OutCC = AArch64CC::getInvertedCondCode(OutCC);
  return CmpL;
}
4159
/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
/// In some cases this is even possible with OR operations in the expression.
/// See \ref AArch64CCMP.
/// \see emitConjunctionRec().
/// Returns the NZCV flags node, or SDValue() when \p Val is not a foldable
/// AND/OR/SETCC tree; \p OutCC receives the condition to test on success.
                               AArch64CC::CondCode &OutCC) {
  bool DummyCanNegate;
  bool DummyMustBeFirst;
  bool DummyPreferFirst;
  // Cheap feasibility check before committing to emission.
  if (!canEmitConjunction(DAG, Val, DummyCanNegate, DummyMustBeFirst,
                          DummyPreferFirst, false))
    return SDValue();

  // Root of the chain: no incoming CCOp, unconditional predicate.
  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
}
4175
4176/// @}
4177
/// Returns how profitable it is to fold a comparison's operand's shift and/or
/// extension operations.
/// Score: 2 (extend + small shift), 1 (extend, or legal shift), 0 (nothing
/// foldable). Used to decide whether swapping CMP operands pays off.
  auto isSupportedExtend = [&](SDValue V) {
    if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
      return true;

    // AND with 0xFF/0xFFFF/0xFFFFFFFF is a zero-extend (uxtb/uxth/uxtw) that
    // the compare's extended-register form can absorb.
    if (V.getOpcode() == ISD::AND)
      if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
        uint64_t Mask = MaskCst->getZExtValue();
        return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
      }

    return false;
  };

  // Folding is only possible when the operand dies at the compare.
  if (!Op.hasOneUse())
    return 0;

  if (isSupportedExtend(Op))
    return 1;

  unsigned Opc = Op.getOpcode();
  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
    if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
      uint64_t Shift = ShiftCst->getZExtValue();
      // Extended-register forms allow a left shift of at most 4.
      if (isSupportedExtend(Op.getOperand(0)))
        return (Shift <= 4) ? 2 : 1;
      EVT VT = Op.getValueType();
      // Plain shifted-register form: shift amount must be in range.
      if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
        return 1;
    }

  return 0;
}
4213
// emitComparison() converts comparison with one or negative one to comparison
// with 0. Note that this only works for signed comparisons because of how ANDS
// works.
  // Only works for ANDS and AND.
  if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
    return false;

  // (and X, Y) <  1  ==>  (and X, Y) <= 0, and
  // (and X, Y) >= 1  ==>  (and X, Y) >  0.
  if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
    CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
    return true;
  }

  // (and X, Y) <= -1  ==>  (and X, Y) < 0, and
  // (and X, Y) >  -1  ==>  (and X, Y) >= 0.
  if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
    CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
    return true;
  }

  return false;
}
4234
                              SDValue &AArch64cc, SelectionDAG &DAG,
                              const SDLoc &DL) {
  // Builds the flag-producing compare for (LHS cc RHS), massaging illegal
  // immediates and operand order first; the AArch64 condition to test is
  // returned through AArch64cc.
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    EVT VT = RHS.getValueType();
    APInt C = RHSC->getAPIntValue();
    // shouldBeAdjustedToZero is a special case to better fold with
    // emitComparison().
    if (shouldBeAdjustedToZero(LHS, C, CC)) {
      // Adjust the constant to zero.
      // CC has already been adjusted.
      RHS = DAG.getConstant(0, DL, VT);
    } else if (!isLegalCmpImmed(C)) {
      unsigned NumImmForC = numberOfInstrToLoadImm(C);
      // Constant does not fit, try adjusting it by one?
      // Each case below nudges C by 1 and relaxes/tightens CC accordingly,
      // accepting the new constant if it is encodable or at least cheaper
      // to materialize.
      switch (CC) {
      default:
        break;
      case ISD::SETLT:
      case ISD::SETGE:
        if (!C.isMinSignedValue()) {
          APInt CMinusOne = C - 1;
          if (isLegalCmpImmed(CMinusOne) ||
              (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
            CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
            RHS = DAG.getConstant(CMinusOne, DL, VT);
          }
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE: {
        // C is not 0 because it is a legal immediate.
        assert(!C.isZero() && "C should not be zero here");
        APInt CMinusOne = C - 1;
        if (isLegalCmpImmed(CMinusOne) ||
            (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          RHS = DAG.getConstant(CMinusOne, DL, VT);
        }
        break;
      }
      case ISD::SETLE:
      case ISD::SETGT:
        if (!C.isMaxSignedValue()) {
          APInt CPlusOne = C + 1;
          if (isLegalCmpImmed(CPlusOne) ||
              (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
            CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
            RHS = DAG.getConstant(CPlusOne, DL, VT);
          }
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT: {
        if (!C.isAllOnes()) {
          APInt CPlusOne = C + 1;
          if (isLegalCmpImmed(CPlusOne) ||
              (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
            CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
            RHS = DAG.getConstant(CPlusOne, DL, VT);
          }
        }
        break;
      }
      }
    }
  }

  // Comparisons are canonicalized so that the RHS operand is simpler than the
  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
  // can fold some shift+extend operations on the RHS operand, so swap the
  // operands if that can be done.
  //
  // For example:
  //    lsl     w13, w11, #1
  //    cmp     w13, w12
  // can be turned into:
  //    cmp     w12, w11, lsl #1
  if (!isa<ConstantSDNode>(RHS) || !isLegalCmpImmed(RHS->getAsAPIntVal())) {
    bool LHSIsCMN = isCMN(LHS, CC, DAG);
    bool RHSIsCMN = isCMN(RHS, CC, DAG);
    SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
    SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;

    // CMN itself counts as one unit of folding profit.
    if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
        getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
      std::swap(LHS, RHS);
    }
  }

  SDValue Cmp;
  if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {

    // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
    // For the i8 operand, the largest immediate is 255, so this can be easily
    // encoded in the compare instruction. For the i16 operand, however, the
    // largest immediate cannot be encoded in the compare.
    // Therefore, use a sign extending load and cmn to avoid materializing the
    // -1 constant. For example,
    // movz w1, #65535
    // ldrh w0, [x0, #0]
    // cmp w0, w1
    // >
    // ldrsh w0, [x0, #0]
    // cmn w0, #1
    // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
    // if and only if (sext LHS) == (sext RHS). The checks are in place to
    // ensure both the LHS and RHS are truly zero extended and to make sure the
    // transformation is profitable.
    if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
        cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
        cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
        LHS.getNode()->hasNUsesOfValue(1, 0)) {
      int16_t ValueofRHS = RHS->getAsZExtVal();
      if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
        SDValue SExt =
            DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, LHS.getValueType(), LHS,
                        DAG.getValueType(MVT::i16));
        Cmp = emitComparison(
            SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
            DL, DAG);
      }
    }

    // Equality against 0/1 may be expressible as a CCMP conjunction chain.
    if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
      if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
        if ((CC == ISD::SETNE) ^ RHSC->isZero())
      }
    }
  }

  // Fallback: plain flag-setting compare.
  if (!Cmp) {
    Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
  }
  AArch64cc = getCondCode(DAG, AArch64CC);
  return Cmp;
}
4378
// Lowers an overflow-checking arithmetic node ([SU]ADDO/[SU]SUBO/[SU]MULO)
// to AArch64 flag-setting ops; returns {result value, overflow flags} and
// sets CC to the condition that signals overflow.
static std::pair<SDValue, SDValue>
  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
         "Unsupported value type");
  SDValue Value, Overflow;
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  unsigned Opc = 0;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::SADDO:
    Opc = AArch64ISD::ADDS;
    CC = AArch64CC::VS; // signed overflow
    break;
  case ISD::UADDO:
    Opc = AArch64ISD::ADDS;
    CC = AArch64CC::HS; // carry out means unsigned overflow
    break;
  case ISD::SSUBO:
    Opc = AArch64ISD::SUBS;
    CC = AArch64CC::VS;
    break;
  case ISD::USUBO:
    Opc = AArch64ISD::SUBS;
    CC = AArch64CC::LO; // borrow means unsigned underflow
    break;
  // Multiply needs a little bit extra work.
  case ISD::SMULO:
  case ISD::UMULO: {
    CC = AArch64CC::NE;
    bool IsSigned = Op.getOpcode() == ISD::SMULO;
    if (Op.getValueType() == MVT::i32) {
      // Extend to 64-bits, then perform a 64-bit multiply.
      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);

      // Check that the result fits into a 32-bit integer.
      SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
      if (IsSigned) {
        // cmp xreg, wreg, sxtw
        SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
        Overflow =
            DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
      } else {
        // tst xreg, #0xffffffff00000000
        SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
        Overflow =
            DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
      }
      break;
    }
    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
    // For the 64 bit multiply
    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
    if (IsSigned) {
      // Signed overflow iff the high 64 bits differ from the sign-extension
      // of the low 64 bits.
      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
                                      DAG.getConstant(63, DL, MVT::i64));
      // It is important that LowerBits is last, otherwise the arithmetic
      // shift will not be folded into the compare (SUBS).
      SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
      Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
                     .getValue(1);
    } else {
      // Unsigned overflow iff the high 64 bits are non-zero.
      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
      SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
      Overflow =
          DAG.getNode(AArch64ISD::SUBS, DL, VTs,
                      DAG.getConstant(0, DL, MVT::i64),
                      UpperBits).getValue(1);
    }
    break;
  }
  } // switch (...)

  if (Opc) {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);

    // Emit the AArch64 operation with overflow check.
    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
    Overflow = Value.getValue(1);
  }
  return std::make_pair(Value, Overflow);
}
4468
SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
                                   !Subtarget->isNeonAvailable()))
    return LowerToScalableOp(Op, DAG);

  SDValue Sel = Op.getOperand(0);
  SDValue Other = Op.getOperand(1);
  SDLoc DL(Sel);

  // If the operand is an overflow checking operation, invert the condition
  // code and kill the Not operation. I.e., transform:
  // (xor (overflow_op_bool, 1))
  //   -->
  // (csel 1, 0, invert(cc), overflow_op_bool)
  // ... which later gets transformed to just a cset instruction with an
  // inverted condition code, rather than a cset + eor sequence.
    // Only lower legal XALUO ops.
      return SDValue();

    SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
    SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
    SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
                       CCVal, Overflow);
  }
  // If neither operand is a SELECT_CC, give up.
  if (Sel.getOpcode() != ISD::SELECT_CC)
    std::swap(Sel, Other);
  if (Sel.getOpcode() != ISD::SELECT_CC)
    return Op;

  // The folding we want to perform is:
  // (xor x, (select_cc a, b, cc, 0, -1) )
  //   -->
  // (csel x, (xor x, -1), cc ...)
  //
  // The latter will get matched to a CSINV instruction.

  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
  SDValue LHS = Sel.getOperand(0);
  SDValue RHS = Sel.getOperand(1);
  SDValue TVal = Sel.getOperand(2);
  SDValue FVal = Sel.getOperand(3);

  // FIXME: This could be generalized to non-integer comparisons.
  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
    return Op;

  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

  // The values aren't constants, this isn't the pattern we're looking for.
  if (!CFVal || !CTVal)
    return Op;

  // We can commute the SELECT_CC by inverting the condition.  This
  // might be needed to make this fit into a CSINV pattern.
  if (CTVal->isAllOnes() && CFVal->isZero()) {
    std::swap(TVal, FVal);
    std::swap(CTVal, CFVal);
    CC = ISD::getSetCCInverse(CC, LHS.getValueType());
  }

  // If the constants line up, perform the transform!
  if (CTVal->isZero() && CFVal->isAllOnes()) {
    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);

    FVal = Other;
    TVal = DAG.getNode(ISD::XOR, DL, Other.getValueType(), Other,
                       DAG.getAllOnesConstant(DL, Other.getValueType()));

    return DAG.getNode(AArch64ISD::CSEL, DL, Sel.getValueType(), FVal, TVal,
                       CCVal, Cmp);
  }

  return Op;
}
4552
// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
// sets 'C' bit to 0.
  SDLoc DL(Value);
  EVT VT = Value.getValueType();
  // Invert: 0 - Value borrows (C=0) iff Value != 0.
  // Normal: Value - 1 borrows (C=0) iff Value == 0.
  SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
  SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
  SDValue Cmp =
      DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
  return Cmp.getValue(1);
}
4565
// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
                                bool Invert) {
  // Glue must be the flags result (second value) of a flag-setting node.
  assert(Glue.getResNo() == 1);
  SDLoc DL(Glue);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One = DAG.getConstant(1, DL, VT);
  // Materialize the flag as a 0/1 value via a conditional select.
  SDValue CC = getCondCode(DAG, Cond);
  return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
}
4578
// Value is 1 if 'V' bit of NZCV is 1, else 0
  // Glue must be the flags result (second value) of a flag-setting node.
  assert(Glue.getResNo() == 1);
  SDLoc DL(Glue);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One = DAG.getConstant(1, DL, VT);
  // Select 1 when the overflow condition holds, 0 otherwise.
  return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
}
4588
// This lowering is inefficient, but it will get cleaned up by
// `foldOverflowCheck`
// Lowers a carry-using add/sub (Opcode is ADCS or SBCS) by converting the
// incoming carry value to the C flag, performing the flag-setting op, and
// converting the outgoing flag back to a value.
                               unsigned Opcode, bool IsSigned) {
  EVT VT0 = Op.getValue(0).getValueType();
  EVT VT1 = Op.getValue(1).getValueType();

  if (VT0 != MVT::i32 && VT0 != MVT::i64)
    return SDValue();

  // SBCS consumes an inverted carry (borrow) relative to the generic node.
  bool InvertCarry = Opcode == AArch64ISD::SBCS;
  SDValue OpLHS = Op.getOperand(0);
  SDValue OpRHS = Op.getOperand(1);
  SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);

  SDLoc DL(Op);

  SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
                            OpRHS, OpCarryIn);

  // Signed variants report overflow (V); unsigned variants report carry (C).
  SDValue OutFlag =
      IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
               : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);

  return DAG.getMergeValues({Sum, OutFlag}, DL);
}
4615
// Lowers a scalar integer intrinsic to a Neon node that operates on FP
// registers: bitcasts i32/i64 operands to f32/f64, emits Opcode, and bitcasts
// the result back. When LastOperandIsImm is set, the trailing operand is an
// immediate and is passed through unchanged.
static SDValue lowerIntNeonIntrinsic(SDValue Op, unsigned Opcode,
                                     SelectionDAG &DAG,
                                     bool LastOperandIsImm = false) {
  // Vector forms are selected by patterns elsewhere; only scalars here.
  if (Op.getValueType().isVector())
    return SDValue();

  SDLoc DL(Op);
  const unsigned NumOperands = Op.getNumOperands();
  auto getFloatVT = [](EVT VT) {
    assert((VT == MVT::i32 || VT == MVT::i64) && "Unexpected VT");
    return VT == MVT::i32 ? MVT::f32 : MVT::f64;
  };
  auto bitcastToFloat = [&](SDValue Val) {
    return DAG.getBitcast(getFloatVT(Val.getValueType()), Val);
  };

  // Skip first operand as it is intrinsic ID.
  for (unsigned I = 1; I < NumOperands; ++I) {
    SDValue Val = Op.getOperand(I);
    const bool KeepInt = LastOperandIsImm && (I == NumOperands - 1);
    NewOps.push_back(KeepInt ? Val : bitcastToFloat(Val));
  }
  EVT OrigVT = Op.getValueType();
  SDValue OpNode = DAG.getNode(Opcode, DL, getFloatVT(OrigVT), NewOps);
  return DAG.getBitcast(OrigVT, OpNode);
}
4643
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  SDLoc DL(Op);
  // The actual operation that sets the overflow or carry flag.
  SDValue Value, Overflow;
  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);

  // We use 0 and 1 as false and true values.
  SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
  SDValue FVal = DAG.getConstant(0, DL, MVT::i32);

  // We use an inverted condition, because the conditional select is inverted
  // too. This will allow it to be selected to a single instruction:
  // CSINC Wd, WZR, WZR, invert(cond).
  SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
  Overflow =
      DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);

  // Result 0 is the arithmetic value, result 1 the i32 overflow bit.
  return DAG.getMergeValues({Value, Overflow}, DL);
}
4668
// Prefetch operands are:
// 1: Address to prefetch
// 2: bool isWrite
// 3: int locality (0 = no locality ... 3 = extreme locality)
// 4: bool isDataCache
  SDLoc DL(Op);
  unsigned IsWrite = Op.getConstantOperandVal(2);
  unsigned Locality = Op.getConstantOperandVal(3);
  unsigned IsData = Op.getConstantOperandVal(4);

  // Locality 0 maps to the streaming (non-temporal) prefetch variant.
  bool IsStream = !Locality;
  // When the locality number is set
  if (Locality) {
    // The front-end should have filtered out the out-of-range values
    assert(Locality <= 3 && "Prefetch locality out-of-range");
    // The locality degree is the opposite of the cache speed.
    // Put the number the other way around.
    // The encoding starts at 0 for level 1
    Locality = 3 - Locality;
  }

  // built the mask value encoding the expected behavior.
  // Bit layout matches the PRFM prfop immediate: type, target cache level,
  // and retention policy.
  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
                   (!IsData << 3) |     // IsDataCache bit
                   (Locality << 1) |    // Cache level bits
                   (unsigned)IsStream;  // Stream bit
  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
                     DAG.getTargetConstant(PrfOp, DL, MVT::i32),
                     Op.getOperand(1));
}
4700
// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1)) 0 EQ when Y is
// a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of SUBS
// (AND X Y) Z which produces a better opt with EmitComparison
                               SelectionDAG &DAG, const SDLoc DL) {
  // Only fires for one-use (AND X, C) compared unsigned-less-than a constant.
  if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
    ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    if (LHSConstOp && RHSConst) {
      uint64_t LHSConstValue = LHSConstOp->getZExtValue();
      uint64_t RHSConstant = RHSConst->getZExtValue();
      // A power-of-two RHS means "x & mask < 2^k" is equivalent to testing
      // the masked bits at or above 2^k for zero.
      if (isPowerOf2_64(RHSConstant)) {
        uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
        // Rewrite the operands in place; the caller emits the new SETCC.
        LHS =
            DAG.getNode(ISD::AND, DL, LHS.getValueType(), LHS.getOperand(0),
                        DAG.getConstant(NewMaskValue, DL, LHS.getValueType()));
        RHS = DAG.getConstant(0, DL, RHS.getValueType());
        CC = ISD::SETEQ;
      }
    }
  }
}
4723
4724SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4725 SelectionDAG &DAG) const {
4726 EVT VT = Op.getValueType();
4727 if (VT.isScalableVector()) {
4728 SDValue SrcVal = Op.getOperand(0);
4729
4730 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4731 // Break conversion in two with the first part converting to f32 and the
4732 // second using native f32->VT instructions.
4733 SDLoc DL(Op);
4734 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4735 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4736 }
4737
4738 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4739 }
4740
4741 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4742 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4743
4744 bool IsStrict = Op->isStrictFPOpcode();
4745 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4746 EVT Op0VT = Op0.getValueType();
4747 if (VT == MVT::f64) {
4748 // FP16->FP32 extends are legal for v32 and v4f32.
4749 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4750 return Op;
4751 // Split bf16->f64 extends into two fpextends.
4752 if (Op0VT == MVT::bf16 && IsStrict) {
4753 SDValue Ext1 =
4754 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4755 {Op0, Op.getOperand(0)});
4756 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4757 {Ext1, Ext1.getValue(1)});
4758 }
4759 if (Op0VT == MVT::bf16)
4760 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4761 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4762 return SDValue();
4763 }
4764
4765 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4766 return SDValue();
4767}
4768
// Lower FP_ROUND / STRICT_FP_ROUND. Handles scalable vectors (with a manual
// bf16 round-to-nearest-even emulation when no BF16 hardware is present),
// SVE fixed-length fallbacks, and software bf16 narrowing for scalar/NEON.
SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = SrcVal.getValueType();
  // Operand after the value is the "truncating" flag (1 = round-toward-zero
  // semantics are acceptable, so the rounding-bias step can be skipped).
  bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;

  if (VT.isScalableVector()) {
    // Let common code split the operation.
    if (SrcVT == MVT::nxv8f32)
      return Op;

    if (VT.getScalarType() != MVT::bf16)
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);

    SDLoc DL(Op);
    constexpr EVT I32 = MVT::nxv4i32;
    auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };

    SDValue NaN;
    SDValue Narrow;

    if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
      // With BF16 hardware the predicated conversion handles this directly.
      if (Subtarget->hasBF16())
        return LowerToPredicatedOp(Op, DAG,
                                   AArch64ISD::FP_ROUND_MERGE_PASSTHRU);

      Narrow = getSVESafeBitCast(I32, SrcVal, DAG);

      // Set the quiet bit.
      if (!DAG.isKnownNeverSNaN(SrcVal))
        NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
    } else if (SrcVT == MVT::nxv2f64 &&
               (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
      // Round to float without introducing rounding errors and try again.
      SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
      Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
                           Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));

      if (IsStrict)
        NewOps.push_back(Op.getOperand(0));
      NewOps.push_back(Narrow);
      NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
      // Re-issue the round with the f32 intermediate; it is lowered again.
      return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
    } else
      return SDValue();

    if (!Trunc) {
      // Round-to-nearest-even: add 0x7fff plus bit 16 (the result lsb).
      SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
      Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
      SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
      Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
    }

    // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
    // 0x80000000.
    if (NaN) {
      EVT I1 = I32.changeElementType(*DAG.getContext(), MVT::i1);
      EVT CondVT = VT.changeElementType(*DAG.getContext(), MVT::i1);
      SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
      IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
      Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
    }

    // Now that we have rounded, shift the bits into position.
    Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
    return getSVESafeBitCast(VT, Narrow, DAG);
  }

  if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
    return LowerFixedLengthFPRoundToSVE(Op, DAG);

  // Expand cases where the result type is BF16 but we don't have hardware
  // instructions to lower it.
  if (VT.getScalarType() == MVT::bf16 &&
      !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
        Subtarget->hasBF16())) {
    SDLoc DL(Op);
    SDValue Narrow = SrcVal;
    SDValue NaN;
    EVT I32 = SrcVT.changeElementType(*DAG.getContext(), MVT::i32);
    EVT F32 = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
    if (SrcVT.getScalarType() == MVT::f32) {
      bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
      Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
      if (!NeverSNaN) {
        // Set the quiet bit.
        NaN = DAG.getNode(ISD::OR, DL, I32, Narrow,
                          DAG.getConstant(0x400000, DL, I32));
      }
    } else if (SrcVT.getScalarType() == MVT::f64) {
      // FCVTXN rounds f64 -> f32 with "round to odd", which makes the
      // subsequent f32 -> bf16 rounding produce correct results.
      Narrow = DAG.getNode(AArch64ISD::FCVTXN, DL, F32, Narrow);
      Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
    } else {
      return SDValue();
    }
    if (!Trunc) {
      // Round-to-nearest-even: add 0x7fff plus bit 16 (the result lsb).
      SDValue One = DAG.getConstant(1, DL, I32);
      SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow,
                                DAG.getShiftAmountConstant(16, I32, DL));
      Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, One);
      SDValue RoundingBias =
          DAG.getNode(ISD::ADD, DL, I32, DAG.getConstant(0x7fff, DL, I32), Lsb);
      Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
    }

    // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
    // 0x80000000.
    if (NaN) {
      SDValue IsNaN = DAG.getSetCC(
          DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
          SrcVal, SrcVal, ISD::SETUO);
      Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
    }

    // Now that we have rounded, shift the bits into position.
    Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow,
                         DAG.getShiftAmountConstant(16, I32, DL));
    if (VT.isVector()) {
      EVT I16 = I32.changeVectorElementType(*DAG.getContext(), MVT::i16);
      Narrow = DAG.getNode(ISD::TRUNCATE, DL, I16, Narrow);
      return DAG.getNode(ISD::BITCAST, DL, VT, Narrow);
    }
    // Scalar result: reinterpret as f32 and extract the low 16-bit subreg.
    Narrow = DAG.getNode(ISD::BITCAST, DL, F32, Narrow);
    SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Narrow);
    return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, DL)
                    : Result;
  }

  if (SrcVT != MVT::f128) {
    // Expand cases where the input is a vector bigger than NEON.
      return SDValue();

    // It's legal except when f128 is involved
    return Op;
  }

  return SDValue();
}
4911
// Lower a vector FP -> integer conversion (FP_TO_SINT/UINT and strict
// variants): promote unsupported f16/bf16 sources, use predicated SVE ops for
// scalable types, and widen/narrow through legal intermediate types for NEON.
SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
  // Any additional optimization in this function should be recorded
  // in the cost tables.
  bool IsStrict = Op->isStrictFPOpcode();
  EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
  EVT VT = Op.getValueType();

  assert(!(IsStrict && VT.isScalableVector()) &&
         "Unimplemented SVE support for STRICT_FP_to_INT!");

  // f16 conversions are promoted to f32 when full fp16 is not supported.
  if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
      InVT.getVectorElementType() == MVT::bf16) {
    EVT NewVT = VT.changeElementType(*DAG.getContext(), MVT::f32);
    SDLoc DL(Op);
    if (IsStrict) {
      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {NewVT, MVT::Other},
                                {Op.getOperand(0), Op.getOperand(1)});
      return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
                         {Ext.getValue(1), Ext.getValue(0)});
    }
    return DAG.getNode(
        Op.getOpcode(), DL, Op.getValueType(),
        DAG.getNode(ISD::FP_EXTEND, DL, NewVT, Op.getOperand(0)));
  }

  if (VT.isScalableVector()) {
    // i1 results: convert at a promoted width and compare against zero.
    if (VT.getVectorElementType() == MVT::i1) {
      SDLoc DL(Op);
      EVT CvtVT = getPromotedVTForPredicate(VT);
      SDValue Cvt = DAG.getNode(Op.getOpcode(), DL, CvtVT, Op.getOperand(0));
      SDValue Zero = DAG.getConstant(0, DL, CvtVT);
      return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
    }

    // Let common code split the operation.
    if (InVT == MVT::nxv8f32)
      return Op;

    unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
                          ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
                          : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
    return LowerToPredicatedOp(Op, DAG, Opcode);
  }

  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
      useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
    return LowerFixedLengthFPToIntToSVE(Op, DAG);

  uint64_t VTSize = VT.getFixedSizeInBits();
  uint64_t InVTSize = InVT.getFixedSizeInBits();
  // Narrowing: convert at the source width, then truncate the integer.
  if (VTSize < InVTSize) {
    SDLoc DL(Op);
    if (IsStrict) {
      SDValue Cv = DAG.getNode(Op.getOpcode(), DL, {InVT, MVT::Other},
                               {Op.getOperand(0), Op.getOperand(1)});
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
      return DAG.getMergeValues({Trunc, Cv.getValue(1)}, DL);
    }
    SDValue Cv =
        DAG.getNode(Op.getOpcode(), DL, InVT.changeVectorElementTypeToInteger(),
                    Op.getOperand(0));
    return DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
  }

  // Widening: FP-extend the source to a matching-width FP type, then convert.
  if (VTSize > InVTSize) {
    SDLoc DL(Op);
    MVT ExtVT =
    if (IsStrict) {
      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {ExtVT, MVT::Other},
                                {Op.getOperand(0), Op.getOperand(1)});
      return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
                         {Ext.getValue(1), Ext.getValue(0)});
    }
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, Op.getOperand(0));
    return DAG.getNode(Op.getOpcode(), DL, VT, Ext);
  }

  // Use a scalar operation for conversions between single-element vectors of
  // the same size.
  if (InVT.getVectorNumElements() == 1) {
    SDLoc DL(Op);
    SDValue Extract = DAG.getNode(
        Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, DL, MVT::i64));
    EVT ScalarVT = VT.getScalarType();
    if (IsStrict)
      return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
                         {Op.getOperand(0), Extract});
    return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
  }

  // Type changing conversions are illegal.
  return Op;
}
5012
5013SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
5014 SelectionDAG &DAG) const {
5015 bool IsStrict = Op->isStrictFPOpcode();
5016 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5017
5018 if (SrcVal.getValueType().isVector())
5019 return LowerVectorFP_TO_INT(Op, DAG);
5020
5021 // f16 conversions are promoted to f32 when full fp16 is not supported.
5022 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
5023 SrcVal.getValueType() == MVT::bf16) {
5024 SDLoc DL(Op);
5025 if (IsStrict) {
5026 SDValue Ext =
5027 DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
5028 {Op.getOperand(0), SrcVal});
5029 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
5030 {Ext.getValue(1), Ext.getValue(0)});
5031 }
5032 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
5033 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, SrcVal));
5034 }
5035
5036 if (SrcVal.getValueType() != MVT::f128) {
5037 // It's legal except when f128 is involved
5038 return Op;
5039 }
5040
5041 return SDValue();
5042}
5043
SDValue
AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
                                                SelectionDAG &DAG) const {
  // AArch64 FP-to-int conversions saturate to the destination element size, so
  // we can lower common saturating conversions to simple instructions.
  SDValue SrcVal = Op.getOperand(0);
  EVT SrcVT = SrcVal.getValueType();
  EVT DstVT = Op.getValueType();
  // Operand 1 carries the saturation width as a VT.
  EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();

  uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
  uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
  uint64_t SatWidth = SatVT.getScalarSizeInBits();
  assert(SatWidth <= DstElementWidth &&
         "Saturation width cannot exceed result width");

  // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
  // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
  // types, so this is hard to reach.
  if (DstVT.isScalableVector())
    return SDValue();

  EVT SrcElementVT = SrcVT.getVectorElementType();

  // In the absence of FP16 support, promote f16 to f32 and saturate the result.
  SDLoc DL(Op);
  // SrcVal2 holds the second half when the promoted source must be split.
  SDValue SrcVal2;
  if ((SrcElementVT == MVT::f16 &&
       (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
      SrcElementVT == MVT::bf16) {
    MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
    SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
    // If we are extending to a v8f32, split into two v4f32 to produce legal
    // types.
    if (F32VT.getSizeInBits() > 128) {
      std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
      F32VT = F32VT.getHalfNumVectorElementsVT();
    }
    SrcVT = F32VT;
    SrcElementVT = MVT::f32;
    SrcElementWidth = 32;
  } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
             SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
    return SDValue();

  // Expand to f64 if we are saturating to i64, to help keep the lanes the same
  // width and produce a fcvtzu.
  if (SatWidth == 64 && SrcElementWidth < 64) {
    MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
    SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
    SrcVT = F64VT;
    SrcElementVT = MVT::f64;
    SrcElementWidth = 64;
  }
  // Cases that we can emit directly.
  if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
    SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
                              DAG.getValueType(DstVT.getScalarType()));
    // If the source was split above, convert both halves and concatenate.
    if (SrcVal2) {
      SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
                                 DAG.getValueType(DstVT.getScalarType()));
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
    }
    return Res;
  }

  // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
  // result. This is only valid if the legal cvt is larger than the saturate
  // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
  // (at least until sqxtn is selected).
  if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
    return SDValue();

  EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
  SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
                                  DAG.getValueType(IntVT.getScalarType()));
  SDValue NativeCvt2 =
      SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
                            DAG.getValueType(IntVT.getScalarType()))
              : SDValue();
  SDValue Sat, Sat2;
  if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
    // Signed: clamp into [signed min, signed max] of the saturation width.
    SDValue MinC = DAG.getConstant(
        APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
    SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
    SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
    SDValue MaxC = DAG.getConstant(
        APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
    Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
    Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
  } else {
    // Unsigned: clamp into [0, unsigned max] of the saturation width.
    SDValue MinC = DAG.getConstant(
        APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
    Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
    Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
  }

  if (SrcVal2)
    Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
                      Sat, Sat2);

  return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
}
5148
5149SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
5150 SelectionDAG &DAG) const {
5151 // AArch64 FP-to-int conversions saturate to the destination register size, so
5152 // we can lower common saturating conversions to simple instructions.
5153 SDValue SrcVal = Op.getOperand(0);
5154 EVT SrcVT = SrcVal.getValueType();
5155
5156 if (SrcVT.isVector())
5157 return LowerVectorFP_TO_INT_SAT(Op, DAG);
5158
5159 EVT DstVT = Op.getValueType();
5160 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5161 uint64_t SatWidth = SatVT.getScalarSizeInBits();
5162 uint64_t DstWidth = DstVT.getScalarSizeInBits();
5163 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
5164
5165 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
5166 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
5167 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
5168 SrcVT = MVT::f32;
5169 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
5170 SrcVT != MVT::bf16)
5171 return SDValue();
5172
5173 SDLoc DL(Op);
5174 // Cases that we can emit directly.
5175 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
5176 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
5177 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
5178 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5179 DAG.getValueType(DstVT));
5180
5181 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5182 // result. This is only valid if the legal cvt is larger than the saturate
5183 // width.
5184 if (DstWidth < SatWidth)
5185 return SDValue();
5186
5187 if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
5188 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5189 SDValue CVTf32 =
5190 DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
5191 SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
5192 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
5193 DAG.getValueType(SatVT));
5194 }
5195 SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
5196 return DAG.getBitcast(DstVT, CVTf32);
5197 }
5198
5199 SDValue NativeCvt =
5200 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
5201 SDValue Sat;
5202 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5203 SDValue MinC = DAG.getConstant(
5204 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
5205 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
5206 SDValue MaxC = DAG.getConstant(
5207 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
5208 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
5209 } else {
5210 SDValue MinC = DAG.getConstant(
5211 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
5212 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
5213 }
5214
5215 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5216}
5217
// Lower vector lrint/llrint: round with the current rounding mode (FRINT),
// then convert the rounded value to a saturated integer.
SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
                                                SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDValue Src = Op.getOperand(0);
  SDLoc DL(Op);

  assert(VT.isVector() && "Expected vector type");

  // Same lane count as the result but with the source's FP element type.
  EVT CastVT = VT.changeVectorElementType(
      *DAG.getContext(), Src.getValueType().getVectorElementType());

  // Round the floating-point value into a floating-point register with the
  // current rounding mode.
  SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);

  // Truncate the rounded floating point to an integer.
  return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
}
5237
// Lower a vector integer -> FP conversion ([SU]INT_TO_FP and strict
// variants): i1 sources become selects between FP constants, bf16 results are
// promoted via f32, scalable types use predicated SVE ops, and mismatched
// widths are handled by extending/rounding through legal intermediates.
SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
  // Any additional optimization in this function should be recorded
  // in the cost tables.
  bool IsStrict = Op->isStrictFPOpcode();
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  SDValue In = Op.getOperand(IsStrict ? 1 : 0);
  EVT InVT = In.getValueType();
  unsigned Opc = Op.getOpcode();
  bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;

  assert(!(IsStrict && VT.isScalableVector()) &&
         "Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");

  // NOTE: i1->bf16 does not require promotion to f32.
  if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
    // An i1 lane converts to exactly 0.0 or (-)1.0, so lower as a select
    // between the two FP constants.
    SDValue FalseVal = DAG.getConstantFP(0.0, DL, VT);
    SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, DL, VT)
                               : DAG.getConstantFP(1.0, DL, VT);
    return DAG.getNode(ISD::VSELECT, DL, VT, In, TrueVal, FalseVal);
  }

  // Promote bf16 conversions to f32.
  if (VT.getVectorElementType() == MVT::bf16) {
    EVT F32 = VT.changeElementType(*DAG.getContext(), MVT::f32);
    if (IsStrict) {
      SDValue Val = DAG.getNode(Op.getOpcode(), DL, {F32, MVT::Other},
                                {Op.getOperand(0), In});
      return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
                         {Op.getValueType(), MVT::Other},
                         {Val.getValue(1), Val.getValue(0),
                          DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
    }
    return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
                       DAG.getNode(Op.getOpcode(), DL, F32, In),
                       DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
  }

  if (VT.isScalableVector()) {
    // Let common code split the operation.
    if (VT == MVT::nxv8f32)
      return Op;

    unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
                               : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
    return LowerToPredicatedOp(Op, DAG, Opcode);
  }

  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
      useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
    return LowerFixedLengthIntToFPToSVE(Op, DAG);

  uint64_t VTSize = VT.getFixedSizeInBits();
  uint64_t InVTSize = InVT.getFixedSizeInBits();
  if (VTSize < InVTSize) {
    // AArch64 doesn't have a direct vector instruction to convert
    // fixed point to floating point AND narrow it at the same time.
    // Additional rounding when the target is f32/f64 causes double
    // rounding issues. Conversion to f16 is fine due to narrow width.
    bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
    bool IsTargetf16 = false;
    if (Op.hasOneUse() &&
        Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
      // Some vector types are split during legalization into half, followed by
      // concatenation, followed by rounding to the original vector type. If we
      // end up resolving to f16 type, we shouldn't worry about rounding errors.
      SDNode *U = *Op->user_begin();
      if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
        EVT TmpVT = U->user_begin()->getValueType(0);
        if (TmpVT.getScalarType() == MVT::f16)
          IsTargetf16 = true;
      }
    }

    if (IsTargetf32 && !IsTargetf16) {
      // Scalarize to avoid double rounding; strict ops cannot be unrolled.
      return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
    }

    MVT CastVT =
        InVT.getVectorNumElements());
    if (IsStrict) {
      In = DAG.getNode(Opc, DL, {CastVT, MVT::Other}, {Op.getOperand(0), In});
      return DAG.getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
                         {In.getValue(1), In.getValue(0),
                          DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
    }
    In = DAG.getNode(Opc, DL, CastVT, In);
    return DAG.getNode(ISD::FP_ROUND, DL, VT, In,
                       DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
  }

  if (VTSize > InVTSize) {
    // Widening: sign/zero-extend the integer input, then convert at the
    // wider type (exact, so no double rounding here).
    unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    EVT CastVT = VT.changeVectorElementTypeToInteger();
    In = DAG.getNode(CastOpc, DL, CastVT, In);
    if (IsStrict)
      return DAG.getNode(Opc, DL, {VT, MVT::Other}, {Op.getOperand(0), In});
    return DAG.getNode(Opc, DL, VT, In);
  }

  // Use a scalar operation for conversions between single-element vectors of
  // the same size.
  if (VT.getVectorNumElements() == 1) {
    SDValue Extract =
        DAG.getConstant(0, DL, MVT::i64));
    EVT ScalarVT = VT.getScalarType();
    if (IsStrict)
      return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
                         {Op.getOperand(0), Extract});
    return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
  }

  return Op;
}
5356
// Lower a scalar integer -> FP conversion ([SU]INT_TO_FP and strict
// variants). Vector cases are dispatched to LowerVectorINT_TO_FP; bf16 and
// (without full fp16) f16 results are produced via a wider FP type, with a
// sticky-rounding sequence for i64 -> bf16 to avoid double rounding.
SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  if (Op.getValueType().isVector())
    return LowerVectorINT_TO_FP(Op, DAG);

  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

  bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
                  Op->getOpcode() == ISD::SINT_TO_FP;

  // Convert via a wider FP type, then FP_ROUND down to the requested type.
  auto IntToFpViaPromotion = [&](EVT PromoteVT) {
    SDLoc DL(Op);
    if (IsStrict) {
      SDValue Val = DAG.getNode(Op.getOpcode(), DL, {PromoteVT, MVT::Other},
                                {Op.getOperand(0), SrcVal});
      return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
                         {Op.getValueType(), MVT::Other},
                         {Val.getValue(1), Val.getValue(0),
                          DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
    }
    return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
                       DAG.getNode(Op.getOpcode(), DL, PromoteVT, SrcVal),
                       DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
  };

  if (Op.getValueType() == MVT::bf16) {
    unsigned MaxWidth = IsSigned
                            ? DAG.ComputeMaxSignificantBits(SrcVal)
                            : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
    // bf16 conversions are promoted to f32 when converting from i16.
    if (MaxWidth <= 24) {
      return IntToFpViaPromotion(MVT::f32);
    }

    // bf16 conversions are promoted to f64 when converting from i32.
    if (MaxWidth <= 53) {
      return IntToFpViaPromotion(MVT::f64);
    }

    // We need to be careful about i64 -> bf16.
    // Consider an i32 22216703.
    // This number cannot be represented exactly as an f32 and so a itofp will
    // turn it into 22216704.0 fptrunc to bf16 will turn this into 22282240.0
    // However, the correct bf16 was supposed to be 22151168.0
    // We need to use sticky rounding to get this correct.
    if (SrcVal.getValueType() == MVT::i64) {
      SDLoc DL(Op);
      // This algorithm is equivalent to the following:
      // uint64_t SrcHi = SrcVal & ~0xfffull;
      // uint64_t SrcLo = SrcVal & 0xfffull;
      // uint64_t Highest = SrcVal >> 53;
      // bool HasHighest = Highest != 0;
      // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
      // double Rounded = static_cast<double>(ToRound);
      // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
      // uint64_t HasLo = SrcLo != 0;
      // bool NeedsAdjustment = HasHighest & HasLo;
      // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
      // double Adjusted = std::bit_cast<double>(AdjustedBits);
      // return static_cast<__bf16>(Adjusted);
      //
      // Essentially, what happens is that SrcVal either fits perfectly in a
      // double-precision value or it is too big. If it is sufficiently small,
      // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
      // ensure that u64 -> double has no rounding error by only using the 52
      // MSB of the input. The low order bits will get merged into a sticky bit
      // which will avoid issues incurred by double rounding.

      // Signed conversion is more or less like so:
      // copysign((__bf16)abs(SrcVal), SrcVal)
      SDValue SignBit;
      if (IsSigned) {
        SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
                              DAG.getConstant(1ull << 63, DL, MVT::i64));
        SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
      }
      SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
                                  DAG.getConstant(~0xfffull, DL, MVT::i64));
      SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
                                  DAG.getConstant(0xfffull, DL, MVT::i64));
      SDValue Highest =
          DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
                      DAG.getShiftAmountConstant(53, MVT::i64, DL));
      SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
      SDValue ToRound =
          DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
      SDValue Rounded =
          IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
                                 {Op.getOperand(0), ToRound})
                   : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);

      SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
      if (SignBit) {
        // Re-attach the sign that was stripped before the magnitude convert.
        RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
      }

      SDValue HasHighest = DAG.getSetCC(
          DL,
          getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
          Highest, Zero64, ISD::SETNE);

      SDValue HasLo = DAG.getSetCC(
          DL,
          getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
          SrcLo, Zero64, ISD::SETNE);

      // Sticky bit: OR a 1 into the low result bit when low-order source bits
      // were dropped, so the final FP_ROUND rounds correctly.
      SDValue NeedsAdjustment =
          DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
      NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);

      SDValue AdjustedBits =
          DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
      SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
      return IsStrict
                 ? DAG.getNode(
                       {Op.getValueType(), MVT::Other},
                       {Rounded.getValue(1), Adjusted,
                        DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
                 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
                               DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
    }
  }

  // f16 conversions are promoted to f32 when full fp16 is not supported.
  if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
    return IntToFpViaPromotion(MVT::f32);
  }

  // i128 conversions are libcalls.
  if (SrcVal.getValueType() == MVT::i128)
    return SDValue();

  // Other conversions are legal, unless it's to the completely software-based
  // fp128.
  if (Op.getValueType() != MVT::f128)
    return Op;
  return SDValue();
}
5497
5498static MVT getSVEContainerType(EVT ContentTy);
5499
5500SDValue
5501AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
5502 SelectionDAG &DAG) const {
5503 assert((Subtarget->hasSVE2() ||
5504 (Subtarget->hasSME() && Subtarget->isStreaming())) &&
5505 "Lowering loop_dependence_raw_mask or loop_dependence_war_mask "
5506 "requires SVE or SME");
5507
5508 SDLoc DL(Op);
5509 EVT VT = Op.getValueType();
5510 unsigned LaneOffset = Op.getConstantOperandVal(3);
5511 unsigned NumElements = VT.getVectorMinNumElements();
5512 uint64_t EltSizeInBytes = Op.getConstantOperandVal(2);
5513
5514 // Lane offsets and other element sizes are not supported by whilewr/rw.
5515 if (LaneOffset != 0 || !is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes))
5516 return SDValue();
5517
5518 EVT EltVT = MVT::getIntegerVT(EltSizeInBytes * 8);
5519 EVT PredVT =
5520 getPackedSVEVectorVT(EltVT).changeElementType(*DAG.getContext(), MVT::i1);
5521
5522 // Legal whilewr/rw (lowered by tablegen matcher).
5523 if (PredVT == VT)
5524 return Op;
5525
5526 // Expand if this mask needs splitting (this will produce a whilelo).
5527 if (NumElements > PredVT.getVectorMinNumElements())
5528 return SDValue();
5529
5530 SDValue Mask =
5531 DAG.getNode(Op.getOpcode(), DL, PredVT, to_vector(Op->op_values()));
5532
5533 if (VT.isFixedLengthVector()) {
5534 EVT WidePredVT =
5535 PredVT.changeElementType(*DAG.getContext(), VT.getScalarType());
5536 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, WidePredVT, Mask);
5537 return convertFromScalableVector(DAG, VT, MaskAsInt);
5538 }
5539
5540 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Mask,
5541 DAG.getConstant(0, DL, MVT::i64));
5542}
5543
/// Custom-lower ISD::BITCAST. Covers three situations visible below: bitcasts
/// involving SVE scalable vectors (routed through getSVESafeBitCast), bitcasts
/// producing fixed-length vectors that use SVE, and scalar f16/bf16 results
/// which must be materialised via an FPR32 subregister extract.
SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
                                            SelectionDAG &DAG) const {
  EVT OpVT = Op.getValueType();
  EVT ArgVT = Op.getOperand(0).getValueType();

  // NOTE(review): the guard for this call (upstream: a check that OpVT should
  // be lowered as a fixed-length vector using SVE) appears to be missing in
  // this copy -- verify against upstream before relying on this path.
    return LowerFixedLengthBitcastToSVE(Op, DAG);

  if (OpVT.isScalableVector()) {
    assert(isTypeLegal(OpVT) && "Unexpected result type!");

    // Handle type legalisation first.
    if (!isTypeLegal(ArgVT)) {
      assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
             "Expected int->fp bitcast!");

      // Bitcasting between unpacked vector types of different element counts is
      // not a NOP because the live elements are laid out differently.
      //                01234567
      // e.g. nxv2i32 = XX??XX??
      //      nxv4f16 = X?X?X?X?
      if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
        return SDValue();

      // Widen the illegal integer argument into its legal SVE container type,
      // then perform the cast on the legal type.
      SDValue ExtResult =
          DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
                      Op.getOperand(0));
      return getSVESafeBitCast(OpVT, ExtResult, DAG);
    }

    // Bitcasts between legal types with the same element count are legal.
    if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
      return Op;

    // getSVESafeBitCast does not support casting between unpacked types.
    if (!isPackedVectorType(OpVT, DAG))
      return SDValue();

    return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
  }

  // Beyond vectors, only scalar f16/bf16 results need custom handling.
  if (OpVT != MVT::f16 && OpVT != MVT::bf16)
    return SDValue();

  // Bitcasts between f16 and bf16 are legal.
  if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
    return Op;

  assert(ArgVT == MVT::i16);
  SDLoc DL(Op);

  // i16 -> f16/bf16: widen to i32, move to an FPR32, then extract the h
  // subregister to obtain the 16-bit FP value.
  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
  return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
}
5599
// Returns lane if Op extracts from a two-element vector and lane is constant
// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
static std::optional<uint64_t>
  // NOTE(review): the function-name/parameter line of this signature appears
  // to be missing in this copy -- verify against upstream.
  SDNode *OpNode = Op.getNode();
  if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return std::nullopt;

  EVT VT = OpNode->getOperand(0).getValueType();
  // NOTE(review): the definition of `C` (upstream: a dyn_cast of the lane
  // operand to ConstantSDNode) appears to be missing here -- confirm.
  // Only a constant-lane extract from a fixed two-element vector qualifies.
  if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
    return std::nullopt;

  return C->getZExtValue();
}
5615
                                   bool isSigned) {
  // Returns true when every element of the BUILD_VECTOR is a constant that
  // fits in half of the element width (signed or unsigned as requested), so
  // the vector can be treated as an implicitly extended narrower vector.
  EVT VT = N.getValueType();

  if (N.getOpcode() != ISD::BUILD_VECTOR)
    return false;

  for (const SDValue &Elt : N->op_values()) {
    // NOTE(review): the guard binding `C` for each element (upstream: a
    // dyn_cast of Elt to ConstantSDNode) appears to be missing here -- verify.
      unsigned EltSize = VT.getScalarSizeInBits();
      unsigned HalfSize = EltSize / 2;
      if (isSigned) {
        // Constant must be representable in the signed half-width range.
        if (!isIntN(HalfSize, C->getSExtValue()))
          return false;
      } else {
        // Constant must be representable in the unsigned half-width range.
        if (!isUIntN(HalfSize, C->getZExtValue()))
          return false;
      }
      continue;
    }
    // Non-constant element: not an extended BUILD_VECTOR.
    return false;
  }

  return true;
}
5641
  // Truncate a 128-bit extended operand back to the half-width vector form a
  // MULL instruction consumes.
  // NOTE(review): the signature line of this function appears to be missing in
  // this copy -- verify against upstream.
  EVT VT = N.getValueType();
  assert(VT.is128BitVector() && "Unexpected vector MULL size");
  // NOTE(review): the element-type/element-count arguments of getVectorVT
  // (upstream: half-sized integer element, same element count) appear to be
  // missing here -- confirm.
  EVT HalfVT = EVT::getVectorVT(
      *DAG.getContext(),
  return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
}
5651
  // True if N is a sign/any extend, or a BUILD_VECTOR whose constants all fit
  // in the signed half-width range (see isExtendedBUILD_VECTOR).
  return N.getOpcode() == ISD::SIGN_EXTEND ||
         N.getOpcode() == ISD::ANY_EXTEND ||
         isExtendedBUILD_VECTOR(N, DAG, true);
}
5657
  // True if N is a zero/any extend, or a BUILD_VECTOR whose constants all fit
  // in the unsigned half-width range (see isExtendedBUILD_VECTOR).
  return N.getOpcode() == ISD::ZERO_EXTEND ||
         N.getOpcode() == ISD::ANY_EXTEND ||
         isExtendedBUILD_VECTOR(N, DAG, false);
}
5663
  // True if N is an ADD/SUB whose two single-use operands are both
  // sign-extended (candidates for the SMULL + SMLAL decomposition).
  unsigned Opcode = N.getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    // The one-use checks ensure the rewrite does not duplicate work.
    return N0->hasOneUse() && N1->hasOneUse() &&
           isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
  }
  return false;
}
5674
  // True if N is an ADD/SUB whose two single-use operands are both
  // zero-extended (candidates for the UMULL + UMLAL decomposition).
  unsigned Opcode = N.getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    // The one-use checks ensure the rewrite does not duplicate work.
    return N0->hasOneUse() && N1->hasOneUse() &&
           isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
  }
  return false;
}
5685
5686SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5687 SelectionDAG &DAG) const {
5688 // The rounding mode is in bits 23:22 of the FPSCR.
5689 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
5690 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
5691 // so that the shift + and get folded into a bitfield extract.
5692 SDLoc DL(Op);
5693
5694 SDValue Chain = Op.getOperand(0);
5695 SDValue FPCR_64 =
5696 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other},
5697 {Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL,
5698 MVT::i64)});
5699 Chain = FPCR_64.getValue(1);
5700 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64);
5701 SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32,
5702 DAG.getConstant(1U << 22, DL, MVT::i32));
5703 SDValue RMODE = DAG.getNode(ISD::SRL, DL, MVT::i32, FltRounds,
5704 DAG.getConstant(22, DL, MVT::i32));
5705 SDValue AND = DAG.getNode(ISD::AND, DL, MVT::i32, RMODE,
5706 DAG.getConstant(3, DL, MVT::i32));
5707 return DAG.getMergeValues({AND, Chain}, DL);
5708}
5709
5710SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5711 SelectionDAG &DAG) const {
5712 SDLoc DL(Op);
5713 SDValue Chain = Op->getOperand(0);
5714 SDValue RMValue = Op->getOperand(1);
5715
5716 // The rounding mode is in bits 23:22 of the FPCR.
5717 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5718 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5719 // ((arg - 1) & 3) << 22).
5720 //
5721 // The argument of llvm.set.rounding must be within the segment [0, 3], so
5722 // NearestTiesToAway (4) is not handled here. It is responsibility of the code
5723 // generated llvm.set.rounding to ensure this condition.
5724
5725 // Calculate new value of FPCR[23:22].
5726 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5727 DAG.getConstant(1, DL, MVT::i32));
5728 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5729 DAG.getConstant(0x3, DL, MVT::i32));
5730 RMValue =
5731 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5732 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5733 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5734
5735 // Get current value of FPCR.
5736 SDValue Ops[] = {
5737 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5738 SDValue FPCR =
5739 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5740 Chain = FPCR.getValue(1);
5741 FPCR = FPCR.getValue(0);
5742
5743 // Put new rounding mode into FPSCR[23:22].
5744 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5745 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5746 DAG.getConstant(RMMask, DL, MVT::i64));
5747 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5748 SDValue Ops2[] = {
5749 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5750 FPCR};
5751 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5752}
5753
5754SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5755 SelectionDAG &DAG) const {
5756 SDLoc DL(Op);
5757 SDValue Chain = Op->getOperand(0);
5758
5759 // Get current value of FPCR.
5760 SDValue Ops[] = {
5761 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5762 SDValue FPCR =
5763 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5764 Chain = FPCR.getValue(1);
5765 FPCR = FPCR.getValue(0);
5766
5767 // Truncate FPCR to 32 bits.
5768 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5769
5770 return DAG.getMergeValues({Result, Chain}, DL);
5771}
5772
5773SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5774 SelectionDAG &DAG) const {
5775 SDLoc DL(Op);
5776 SDValue Chain = Op->getOperand(0);
5777 SDValue Mode = Op->getOperand(1);
5778
5779 // Extend the specified value to 64 bits.
5780 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5781
5782 // Set new value of FPCR.
5783 SDValue Ops2[] = {
5784 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5785 FPCR};
5786 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5787}
5788
/// Lower llvm.reset.fpmode: read FPCR, clear the dynamic control bits, and
/// write the masked value back.
SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op->getOperand(0);

  // Get current value of FPCR.
  SDValue Ops[] = {
      Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
  SDValue FPCR =
      DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
  Chain = FPCR.getValue(1);
  FPCR = FPCR.getValue(0);

  // Clear bits that are not reserved.
  // NOTE(review): the mask operand of this AND (upstream: a constant keeping
  // only the reserved FPCR bits) appears to be missing in this copy -- verify
  // against upstream.
  SDValue FPSCRMasked = DAG.getNode(
      ISD::AND, DL, MVT::i64, FPCR,

  // Set new value of FPCR.
  SDValue Ops2[] = {
      Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
      FPSCRMasked};
  return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
}
5813
/// Choose the widening-multiply opcode (AArch64ISD::SMULL/UMULL) for N0 * N1,
/// or return 0 when no mull form applies. N0 and N1 are taken by reference
/// because they may be swapped; IsMLA is set when the multiply should instead
/// be decomposed into a multiply-accumulate pair. The checks below form an
/// ordered preference chain, so their sequence matters.
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
                                 SDLoc DL, bool &IsMLA) {
  bool IsN0SExt = isSignExtended(N0, DAG);
  bool IsN1SExt = isSignExtended(N1, DAG);
  // Both operands explicitly sign-extended: straightforward SMULL.
  if (IsN0SExt && IsN1SExt)
    return AArch64ISD::SMULL;

  bool IsN0ZExt = isZeroExtended(N0, DAG);
  bool IsN1ZExt = isZeroExtended(N1, DAG);

  // Both operands explicitly zero-extended: straightforward UMULL.
  if (IsN0ZExt && IsN1ZExt)
    return AArch64ISD::UMULL;

  // Select UMULL if we can replace the other operand with an extend.
  EVT VT = N0.getValueType();
  unsigned EltSize = VT.getScalarSizeInBits();
  APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
  if (IsN0ZExt || IsN1ZExt) {
    // One side is a zext; the other qualifies if its high half is known zero.
    if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
      return AArch64ISD::UMULL;
  } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
             DAG.MaskedValueIsZero(N1, Mask)) {
    // For v2i64 we look more aggressively at both operands being zero, to avoid
    // scalarization.
    return AArch64ISD::UMULL;
  }

  if (IsN0SExt || IsN1SExt) {
    // One side is a sext; the other qualifies if it has enough sign bits.
    if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
      return AArch64ISD::SMULL;
  } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
             DAG.ComputeNumSignBits(N1) > EltSize / 2) {
    return AArch64ISD::SMULL;
  }

  // The MLA forms below require N1 itself to be an extend.
  if (!IsN1SExt && !IsN1ZExt)
    return 0;

  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
  if (IsN1SExt && isAddSubSExt(N0, DAG)) {
    IsMLA = true;
    return AArch64ISD::SMULL;
  }
  if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
    IsMLA = true;
    return AArch64ISD::UMULL;
  }
  if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
    // Commute so the add/sub side ends up in N0 for the caller's rewrite.
    std::swap(N0, N1);
    IsMLA = true;
    return AArch64ISD::UMULL;
  }
  return 0;
}
5869
/// Custom-lower ISD::MUL for vectors: use SVE predicated MUL where required,
/// otherwise try to select an SMULL/UMULL (optionally in MLA-decomposed form)
/// via selectUmullSmull.
SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  bool OverrideNEON = !Subtarget->isNeonAvailable();
  if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);

  // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
  // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
  assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  bool isMLA = false;
  EVT OVT = VT;
  if (VT.is64BitVector()) {
    // Look through matching low-half EXTRACT_SUBVECTORs on both operands so
    // the full-width multiply can be matched.
    // NOTE(review): part of this condition (upstream: the check that N1 is
    // also an EXTRACT_SUBVECTOR, and the closing of the if) appears to be
    // missing in this copy -- verify against upstream.
    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        isNullConstant(N0.getOperand(1)) &&
        isNullConstant(N1.getOperand(1)) &&
      N0 = N0.getOperand(0);
      N1 = N1.getOperand(0);
      VT = N0.getValueType();
    } else {
      if (VT == MVT::v1i64) {
        if (Subtarget->hasSVE())
          return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
        // Fall through to expand this. It is not legal.
        return SDValue();
      } else
        // Other vector multiplications are legal.
        return Op;
    }
  }

  SDLoc DL(Op);
  unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);

  if (!NewOpc) {
    if (VT.getVectorElementType() == MVT::i64) {
      // If SVE is available then i64 vector multiplications can also be made
      // legal.
      if (Subtarget->hasSVE())
        return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
      // Fall through to expand this. It is not legal.
      return SDValue();
    } else
      // Other vector multiplications are legal.
      return Op;
  }

  // Legalize to a S/UMULL instruction
  SDValue Op0;
  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
  if (!isMLA) {
    Op0 = skipExtensionForVectorMULL(N0, DAG);
    // NOTE(review): the first half of this assert (upstream: checking that
    // Op0's type is a 64-bit vector) appears to be missing here -- verify.
           Op1.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
                       DAG.getNode(NewOpc, DL, VT, Op0, Op1),
                       DAG.getConstant(0, DL, MVT::i64));
  }
  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
  // NOTE(review): lines defining N00/N01 (upstream: the skipped-extension
  // halves of N0) and the first operands of the outer getNode appear to be
  // missing in this copy -- verify against upstream.
  EVT Op1VT = Op1.getValueType();
  return DAG.getNode(
      DAG.getNode(N0.getOpcode(), DL, VT,
                  DAG.getNode(NewOpc, DL, VT,
                              DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
                  DAG.getNode(NewOpc, DL, VT,
                              DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
      DAG.getConstant(0, DL, MVT::i64));
}
5950
5951static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5952 int Pattern) {
5953 if (Pattern == AArch64SVEPredPattern::all)
5954 return DAG.getConstant(1, DL, VT);
5955
5956 // When the number of active elements of a pattern matches the scalable vector
5957 // length, we can upgrade the pattern to ALL and emit a splat instead.
5958 if (unsigned PatNumElts = getNumElementsFromSVEPredPattern(Pattern)) {
5959 const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
5960 unsigned NumElts = VT.getVectorMinNumElements();
5961 unsigned VScale = Subtarget.getSVEVectorSizeInBits() / 128;
5962 if (PatNumElts == (NumElts * VScale))
5963 return DAG.getConstant(1, DL, VT);
5964 }
5965
5966 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5967 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5968}
5969
                                         bool IsSigned, bool IsEqual) {
  // Fold an incrementing-while operation with constant bounds into a PTRUE
  // (or an all-active splat) when the active element count is provably within
  // the vector length.
  // Operand indices shift by one for the intrinsic form, whose operand 0 is
  // the intrinsic ID.
  unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
  unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;

  if (!N->getValueType(0).isScalableVector() ||
      !isa<ConstantSDNode>(N->getOperand(Op1)))
    return SDValue();

  SDLoc DL(N);
  APInt Y = N->getConstantOperandAPInt(Op1);

  // When the second operand is the maximum value, comparisons that include
  // equality can never fail and thus we can return an all active predicate.
  if (IsEqual)
    if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
      return DAG.getConstant(1, DL, N->getValueType(0));

  if (!isa<ConstantSDNode>(N->getOperand(Op0)))
    return SDValue();

  APInt X = N->getConstantOperandAPInt(Op0);

  // Number of active lanes is Y - X; bail out if that subtraction overflows.
  bool Overflow;
  APInt NumActiveElems =
      IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);

  if (Overflow)
    return SDValue();

  if (IsEqual) {
    // Inclusive comparisons activate one extra lane.
    APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
    NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
                              : NumActiveElems.uadd_ov(One, Overflow);
    if (Overflow)
      return SDValue();
  }

  // NOTE(review): the initializers of PredPattern (upstream: a lookup of the
  // SVE predicate pattern for the element count) and of MinSVEVectorSize
  // appear to be partially missing in this copy -- verify against upstream.
  std::optional<unsigned> PredPattern =
  unsigned MinSVEVectorSize = std::max(
  unsigned ElementSize = 128 / N->getValueType(0).getVectorMinNumElements();
  if (PredPattern != std::nullopt &&
      NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
    return getPTrue(DAG, DL, N->getValueType(0), *PredPattern);

  return SDValue();
}
6019
// Returns a safe bitcast between two scalable vector predicates, where
// any newly created lanes from a widening bitcast are defined as zero.
// NOTE(review): the signature line of this function appears to be missing in
// this copy -- verify against upstream.
  SDLoc DL(Op);
  EVT InVT = Op.getValueType();

  assert(InVT.getVectorElementType() == MVT::i1 &&
         VT.getVectorElementType() == MVT::i1 &&
         "Expected a predicate-to-predicate bitcast");
  // NOTE(review): the first half of this assert (upstream: checking VT is a
  // legal scalable vector) appears to be missing here -- verify.
         InVT.isScalableVector() &&
         DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
         "Only expect to cast between legal scalable predicate types!");

  // Return the operand if the cast isn't changing type,
  if (InVT == VT)
    return Op;

  // Look through casts to <vscale x 16 x i1> when their input has more lanes
  // than VT. This will increase the chances of removing casts that introduce
  // new lanes, which have to be explicitly zero'd.
  if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
      Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
      Op.getOperand(1).getValueType().bitsGT(VT))
    Op = Op.getOperand(1);

  SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);

  // We only have to zero the lanes if new lanes are being defined, e.g. when
  // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
  // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
  // we can return here.
  if (InVT.bitsGT(VT))
    return Reinterpret;

  // Check if the other lanes are already known to be zeroed by
  // construction.
  // NOTE(review): the condition guarding this early return (upstream: a query
  // that the op zeroes its inactive lanes) appears to be missing -- verify.
    return Reinterpret;

  // Zero the newly introduced lanes.
  SDValue Mask = DAG.getConstant(1, DL, InVT);
  Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
  return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
}
6065
/// Emit a call to the SME ABI __arm_sme_state routine and extract PSTATE.SM
/// (bit 0 of the first returned i64) as an i64 value.
SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
                                                  SDValue Chain, SDLoc DL,
                                                  EVT VT) const {
  RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
  RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
  SDValue Callee =
      DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
  // The routine returns a pair of i64 values ({X0, X1} per the SME ABI).
  Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
  Type *RetTy = StructType::get(Int64Ty, Int64Ty);
  TargetLowering::CallLoweringInfo CLI(DAG);
  // NOTE(review): the declaration of `Args` (upstream: an empty ArgListTy)
  // appears to be missing in this copy -- verify against upstream.
  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
      DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy, Callee,
      std::move(Args));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  // Mask off everything but the PSTATE.SM bit of the first result.
  SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
  return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
                     Mask);
}
6085
// Lower an SME LDR/STR ZA intrinsic
// Case 1: If the vector number (vecnum) is an immediate in range, it gets
// folded into the instruction
// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
// Case 2: If the vecnum is not an immediate, then it is used to modify the base
// and tile slice registers
// ldr(%tileslice, %ptr, %vecnum)
// ->
// %svl = rdsvl
// %ptr2 = %ptr + %svl * %vecnum
// %tileslice2 = %tileslice + %vecnum
// ldr [%tileslice2, 0], [%ptr2, 0]
// Case 3: If the vecnum is an immediate out of range, then the same is done as
// case 2, but the base and slice registers are modified by the greatest
// multiple of 15 lower than the vecnum and the remainder is folded into the
// instruction. This means that successive loads and stores that are offset from
// each other can share the same base and slice register updates.
// ldr(%tileslice, %ptr, 22)
// ldr(%tileslice, %ptr, 23)
// ->
// %svl = rdsvl
// %ptr2 = %ptr + %svl * 15
// %tileslice2 = %tileslice + 15
// ldr [%tileslice2, 7], [%ptr2, 7]
// ldr [%tileslice2, 8], [%ptr2, 8]
// Case 4: If the vecnum is an add of an immediate, then the non-immediate
// operand and the immediate can be folded into the instruction, like case 2.
// ldr(%tileslice, %ptr, %vecnum + 7)
// ldr(%tileslice, %ptr, %vecnum + 8)
// ->
// %svl = rdsvl
// %ptr2 = %ptr + %svl * %vecnum
// %tileslice2 = %tileslice + %vecnum
// ldr [%tileslice2, 7], [%ptr2, 7]
// ldr [%tileslice2, 8], [%ptr2, 8]
// Case 5: The vecnum being an add of an immediate out of range is also handled,
// in which case the same remainder logic as case 3 is used.
// NOTE(review): the signature line of this function (upstream: a static helper
// taking the intrinsic node, the DAG, and an IsLoad flag) appears to be
// missing in this copy -- verify against upstream.
  SDLoc DL(N);

  SDValue TileSlice = N->getOperand(2);
  SDValue Base = N->getOperand(3);
  SDValue VecNum = N->getOperand(4);
  // Split vecnum into a constant part (ConstAddend) and a variable part
  // (VarAddend); either may be absent.
  int32_t ConstAddend = 0;
  SDValue VarAddend = VecNum;

  // If the vnum is an add of an immediate, we can fold it into the instruction
  if (VecNum.getOpcode() == ISD::ADD &&
      isa<ConstantSDNode>(VecNum.getOperand(1))) {
    ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
    VarAddend = VecNum.getOperand(0);
  } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
    ConstAddend = ImmNode->getSExtValue();
    VarAddend = SDValue();
  }

  // Only the remainder mod 16 can be encoded in the instruction; any excess
  // becomes part of the variable addend applied to base/slice registers.
  int32_t ImmAddend = ConstAddend % 16;
  if (int32_t C = (ConstAddend - ImmAddend)) {
    SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
    VarAddend = VarAddend
                    ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
                    : CVal;
  }

  if (VarAddend) {
    // Get the vector length that will be multiplied by vnum
    auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
                           DAG.getConstant(1, DL, MVT::i32));

    // Multiply SVL and vnum then add it to the base
    SDValue Mul = DAG.getNode(
        ISD::MUL, DL, MVT::i64,
        {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
    Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
    // Just add vnum to the tileslice
    TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
  }

  return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
                     DL, MVT::Other,
                     {/*Chain=*/N.getOperand(0), TileSlice, Base,
                      DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
}
6169
  // Lower a vector-match operation onto the SVE MATCH intrinsic, wrapping
  // fixed-length operands in scalable containers as needed.
  // NOTE(review): the signature line of this function appears to be missing in
  // this copy -- verify against upstream.
  SDLoc DL(Op);
  SDValue ID =
      DAG.getTargetConstant(Intrinsic::aarch64_sve_match, DL, MVT::i64);

  auto Op1 = Op.getOperand(1);
  auto Op2 = Op.getOperand(2);
  auto Mask = Op.getOperand(3);

  EVT Op1VT = Op1.getValueType();
  EVT Op2VT = Op2.getValueType();
  EVT ResVT = Op.getValueType();

  assert((Op1VT.getVectorElementType() == MVT::i8 ||
          Op1VT.getVectorElementType() == MVT::i16) &&
         "Expected 8-bit or 16-bit characters.");

  // Scalable vector type used to wrap operands.
  // A single container is enough for both operands because ultimately the
  // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
  // NOTE(review): the false arm of this ternary (upstream: the scalable
  // container type for the fixed-length Op1VT) appears to be missing -- verify.
  EVT OpContainerVT = Op1VT.isScalableVector()
                          ? Op1VT

  if (Op2VT.is128BitVector()) {
    // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
    Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
    // Further, if the result is scalable, broadcast Op2 to a full SVE register.
    if (ResVT.isScalableVector())
      Op2 = DAG.getNode(AArch64ISD::DUPLANE128, DL, OpContainerVT, Op2,
                        DAG.getTargetConstant(0, DL, MVT::i64));
  } else {
    // If Op2 is not a full 128-bit vector, we always need to broadcast it.
    unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
    MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
    EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
    // View the whole of Op2 as one integer element, splat that across a
    // scalable vector, then reinterpret as the common container type.
    Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
    Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op2IntVT, Op2,
                      DAG.getConstant(0, DL, MVT::i64));
    Op2 = DAG.getSplatVector(Op2PromotedVT, DL, Op2);
    Op2 = DAG.getBitcast(OpContainerVT, Op2);
  }

  // If the result is scalable, we just need to carry out the MATCH.
  if (ResVT.isScalableVector())
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResVT, ID, Mask, Op1, Op2);

  // If the result is fixed, we can still use MATCH but we need to wrap the
  // first operand and the mask in scalable vectors before doing so.

  // Wrap the operands.
  Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
  Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, Op1VT, Mask);
  Mask = convertFixedMaskToScalableVector(Mask, DAG);

  // Carry out the match.
  SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Mask.getValueType(),
                              ID, Mask, Op1, Op2);

  // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
  // (v16i8/v8i8).
  Match = DAG.getNode(ISD::SIGN_EXTEND, DL, OpContainerVT, Match);
  Match = convertFromScalableVector(DAG, Op1VT, Match);
  return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Match);
}
6235
6236SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6237 SelectionDAG &DAG) const {
6238 unsigned IntNo = Op.getConstantOperandVal(1);
6239 SDLoc DL(Op);
6240 switch (IntNo) {
6241 default:
6242 return SDValue(); // Don't custom lower most intrinsics.
6243 case Intrinsic::aarch64_prefetch: {
6244 SDValue Chain = Op.getOperand(0);
6245 SDValue Addr = Op.getOperand(2);
6246
6247 unsigned IsWrite = Op.getConstantOperandVal(3);
6248 unsigned Locality = Op.getConstantOperandVal(4);
6249 unsigned IsStream = Op.getConstantOperandVal(5);
6250 unsigned IsData = Op.getConstantOperandVal(6);
6251 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
6252 (!IsData << 3) | // IsDataCache bit
6253 (Locality << 1) | // Cache level bits
6254 (unsigned)IsStream; // Stream bit
6255
6256 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
6257 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
6258 }
6259 case Intrinsic::aarch64_range_prefetch: {
6260 SDValue Chain = Op.getOperand(0);
6261 SDValue Addr = Op.getOperand(2);
6262
6263 unsigned IsWrite = Op.getConstantOperandVal(3);
6264 unsigned IsStream = Op.getConstantOperandVal(4);
6265 unsigned PrfOp = (IsStream << 2) | IsWrite;
6266
6267 SDValue Metadata = Op.getOperand(5);
6268 return DAG.getNode(AArch64ISD::RANGE_PREFETCH, DL, MVT::Other, Chain,
6269 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr,
6270 Metadata);
6271 }
6272 case Intrinsic::aarch64_sme_str:
6273 case Intrinsic::aarch64_sme_ldr: {
6274 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
6275 }
6276 case Intrinsic::aarch64_sme_za_enable:
6277 return DAG.getNode(
6278 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6279 Op->getOperand(0), // Chain
6280 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6281 case Intrinsic::aarch64_sme_za_disable:
6282 return DAG.getNode(
6283 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6284 Op->getOperand(0), // Chain
6285 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6286 }
6287}
6288
6289SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
6290 SelectionDAG &DAG) const {
6291 unsigned IntNo = Op.getConstantOperandVal(1);
6292 SDLoc DL(Op);
6293 switch (IntNo) {
6294 default:
6295 return SDValue(); // Don't custom lower most intrinsics.
6296 case Intrinsic::aarch64_mops_memset_tag: {
6297 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
6298 SDValue Chain = Node->getChain();
6299 SDValue Dst = Op.getOperand(2);
6300 SDValue Val = Op.getOperand(3);
6301 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
6302 SDValue Size = Op.getOperand(4);
6303 auto Alignment = Node->getMemOperand()->getAlign();
6304 bool IsVol = Node->isVolatile();
6305 auto DstPtrInfo = Node->getPointerInfo();
6306
6307 const auto &SDI =
6308 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
6309 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
6310 Chain, Dst, Val, Size, Alignment, IsVol,
6311 DstPtrInfo, MachinePointerInfo{});
6312
6313 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6314 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6315 // LowerOperationWrapper will complain that the number of results has
6316 // changed.
6317 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6318 }
6319 }
6320}
6321
// Lower an ISD::INTRINSIC_WO_CHAIN node: map AArch64 target intrinsics onto
// generic ISD opcodes or AArch64ISD nodes where that enables better ISel and
// DAG combining. Returning SDValue() (the default case) leaves the intrinsic
// for tablegen-driven instruction selection.
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                       SelectionDAG &DAG) const {
  unsigned IntNo = Op.getConstantOperandVal(0);
  SDLoc DL(Op);
  switch (IntNo) {
  default: return SDValue(); // Don't custom lower most intrinsics.
  case Intrinsic::thread_pointer: {
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
  }
  // SVE whilewr/whilerw become loop-dependence mask nodes. The third operand
  // is the element size in bytes (1/2/4/8 for the _b/_h/_s/_d variants); the
  // fourth operand is zero.
  case Intrinsic::aarch64_sve_whilewr_b:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(1, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_sve_whilewr_h:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(2, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_sve_whilewr_s:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(4, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_sve_whilewr_d:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(8, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_sve_whilerw_b:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(1, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_sve_whilerw_h:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(2, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_sve_whilerw_s:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(4, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_sve_whilerw_d:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(8, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_neon_abs: {
    EVT Ty = Op.getValueType();
    if (Ty == MVT::i64) {
      // Scalar i64 abs executes on the SIMD unit: bitcast through v1i64.
      SDValue Result =
          DAG.getNode(ISD::BITCAST, DL, MVT::v1i64, Op.getOperand(1));
      Result = DAG.getNode(ISD::ABS, DL, MVT::v1i64, Result);
      return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Result);
    } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
      return DAG.getNode(ISD::ABS, DL, Ty, Op.getOperand(1));
    } else {
      report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
    }
  }
  case Intrinsic::aarch64_neon_pmull64: {
    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);

    // Lane number of the extract-from-v2i64 feeding each operand, if any.
    std::optional<uint64_t> LHSLane =
        getConstantLaneNumOfExtractHalfOperand(LHS);
    std::optional<uint64_t> RHSLane =
        getConstantLaneNumOfExtractHalfOperand(RHS);

    assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
    assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");

    // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
    // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
    // which ISel recognizes better. For example, generate a ldr into d*
    // registers as opposed to a GPR load followed by a fmov.
    auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
                                  std::optional<uint64_t> OtherLane,
                                  const SDLoc &DL,
                                  SelectionDAG &DAG) -> SDValue {
      // If the operand is an higher half itself, rewrite it to
      // extract_high_v2i64; this way aarch64_neon_pmull64 could
      // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
      if (NLane == 1)
        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
                           N.getOperand(0), DAG.getConstant(1, DL, MVT::i64));

      // Operand N is not a higher half but the other operand is.
      if (OtherLane == 1) {
        // If this operand is a lower half, rewrite it to
        // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
        // align lanes of two operands. A roundtrip sequence (to move from lane
        // 1 to lane 0) is like this:
        //   mov x8, v0.d[1]
        //   fmov d0, x8
        if (NLane == 0)
          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
                             DAG.getNode(AArch64ISD::DUPLANE64, DL, MVT::v2i64,
                                         N.getOperand(0),
                                         DAG.getConstant(0, DL, MVT::i64)),
                             DAG.getConstant(1, DL, MVT::i64));

        // Otherwise just dup from main to all lanes.
        return DAG.getNode(AArch64ISD::DUP, DL, MVT::v1i64, N);
      }

      // Neither operand is an extract of higher half, so codegen may just use
      // the non-high version of PMULL instruction. Use v1i64 to represent i64.
      assert(N.getValueType() == MVT::i64 &&
             "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
      return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, N);
    };

    LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
    RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);

    return DAG.getNode(AArch64ISD::PMULL, DL, Op.getValueType(), LHS, RHS);
  }
  // Simple min/max intrinsics map directly onto the generic ISD nodes.
  case Intrinsic::aarch64_neon_smax:
    return DAG.getNode(ISD::SMAX, DL, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::aarch64_neon_umax:
    return DAG.getNode(ISD::UMAX, DL, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::aarch64_neon_smin:
    return DAG.getNode(ISD::SMIN, DL, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::aarch64_neon_umin:
    return DAG.getNode(ISD::UMIN, DL, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::aarch64_neon_scalar_sqxtn:
  case Intrinsic::aarch64_neon_scalar_sqxtun:
  case Intrinsic::aarch64_neon_scalar_uqxtn: {
    // Scalar saturating narrows execute on FP registers: for the i32 form,
    // bitcast the i64 input to f64, perform the f32-typed intrinsic, and
    // bitcast the result back to i32.
    assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
    if (Op.getValueType() == MVT::i32)
      return DAG.getNode(ISD::BITCAST, DL, MVT::i32,
                         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
                                     Op.getOperand(0),
                                     DAG.getNode(ISD::BITCAST, DL, MVT::f64,
                                                 Op.getOperand(1))));
    return SDValue();
  }
  // Saturating narrows map onto the generic TRUNCATE_*SAT_* nodes; the
  // shift-right-narrow variants below shift first, then saturate-truncate.
  case Intrinsic::aarch64_neon_sqxtn:
    return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_neon_sqxtun:
    return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_neon_uqxtn:
    return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_neon_sqshrn:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
                         DAG.getNode(AArch64ISD::VASHR, DL,
                                     Op.getOperand(1).getValueType(),
                                     Op.getOperand(1), Op.getOperand(2)));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHRN, DAG,
                                 /*LastOperandIsImm=*/true);
  case Intrinsic::aarch64_neon_sqshrun:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
                         DAG.getNode(AArch64ISD::VASHR, DL,
                                     Op.getOperand(1).getValueType(),
                                     Op.getOperand(1), Op.getOperand(2)));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHRUN, DAG,
                                 /*LastOperandIsImm=*/true);
  case Intrinsic::aarch64_neon_uqshrn:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
                         DAG.getNode(AArch64ISD::VLSHR, DL,
                                     Op.getOperand(1).getValueType(),
                                     Op.getOperand(1), Op.getOperand(2)));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSHRN, DAG,
                                 /*LastOperandIsImm=*/true);
  case Intrinsic::aarch64_neon_sqrshrn:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
                         DAG.getNode(AArch64ISD::SRSHR_I, DL,
                                     Op.getOperand(1).getValueType(),
                                     Op.getOperand(1), Op.getOperand(2)));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHRN, DAG,
                                 /*LastOperandIsImm=*/true);
  case Intrinsic::aarch64_neon_sqrshrun:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
                         DAG.getNode(AArch64ISD::SRSHR_I, DL,
                                     Op.getOperand(1).getValueType(),
                                     Op.getOperand(1), Op.getOperand(2)));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHRUN, DAG,
                                 /*LastOperandIsImm=*/true);
  case Intrinsic::aarch64_neon_uqrshrn:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
                         DAG.getNode(AArch64ISD::URSHR_I, DL,
                                     Op.getOperand(1).getValueType(),
                                     Op.getOperand(1), Op.getOperand(2)));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::UQRSHRN, DAG,
                                 /*LastOperandIsImm=*/true);
  case Intrinsic::aarch64_neon_sqdmulh:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQDMULH, DAG);
  case Intrinsic::aarch64_neon_sqrdmulh:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMULH, DAG);
  case Intrinsic::aarch64_neon_sqrdmlah:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMLAH, DAG);
  case Intrinsic::aarch64_neon_sqrdmlsh:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMLSH, DAG);
  case Intrinsic::aarch64_neon_sqrshl:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHL, DAG);
  case Intrinsic::aarch64_neon_sqshl:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHL, DAG);
  case Intrinsic::aarch64_neon_uqrshl:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::UQRSHL, DAG);
  case Intrinsic::aarch64_neon_uqshl:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSHL, DAG);
  // Vector saturating add/sub map onto the generic saturating nodes; the
  // scalar forms keep target-specific nodes.
  case Intrinsic::aarch64_neon_sqadd:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQADD, DAG);

  case Intrinsic::aarch64_neon_sqsub:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSUB, DAG);

  case Intrinsic::aarch64_neon_uqadd:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::UQADD, DAG);
  case Intrinsic::aarch64_neon_uqsub:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSUB, DAG);
  case Intrinsic::aarch64_neon_sqdmulls_scalar:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQDMULL, DAG);
  case Intrinsic::aarch64_sve_whilelt:
    return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
                                     /*IsEqual=*/false);
  case Intrinsic::aarch64_sve_whilels:
    return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/false,
                                     /*IsEqual=*/true);
  case Intrinsic::aarch64_sve_whilele:
    return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
                                     /*IsEqual=*/true);
  case Intrinsic::aarch64_sve_sunpkhi:
    return DAG.getNode(AArch64ISD::SUNPKHI, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_sunpklo:
    return DAG.getNode(AArch64ISD::SUNPKLO, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_uunpkhi:
    return DAG.getNode(AArch64ISD::UUNPKHI, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_uunpklo:
    return DAG.getNode(AArch64ISD::UUNPKLO, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_clasta_n:
    return DAG.getNode(AArch64ISD::CLASTA_N, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::aarch64_sve_clastb_n:
    return DAG.getNode(AArch64ISD::CLASTB_N, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::aarch64_sve_lasta:
    return DAG.getNode(AArch64ISD::LASTA, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_lastb:
    return DAG.getNode(AArch64ISD::LASTB, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_tbl:
    return DAG.getNode(AArch64ISD::TBL, DL, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::aarch64_sve_trn1:
    return DAG.getNode(AArch64ISD::TRN1, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_trn2:
    return DAG.getNode(AArch64ISD::TRN2, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_uzp1:
    return DAG.getNode(AArch64ISD::UZP1, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_uzp2:
    return DAG.getNode(AArch64ISD::UZP2, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_zip1:
    return DAG.getNode(AArch64ISD::ZIP1, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_zip2:
    return DAG.getNode(AArch64ISD::ZIP2, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_splice:
    return DAG.getNode(AArch64ISD::SPLICE, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::aarch64_sve_ptrue:
    return getPTrue(DAG, DL, Op.getValueType(), Op.getConstantOperandVal(1));
  // NOTE: the *_MERGE_PASSTHRU nodes below take (pred, src[, vt], passthru),
  // i.e. the intrinsic's passthru operand (1) is moved to the end.
  case Intrinsic::aarch64_sve_clz:
    return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sme_cntsd: {
    // cntsd = RDSVL(1) >> 3, i.e. the streaming vector byte count divided
    // into 64-bit units. The shift is exact since the length is a multiple
    // of 8 bytes.
    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
                                DAG.getConstant(1, DL, MVT::i32));
    return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
                       DAG.getConstant(3, DL, MVT::i32), SDNodeFlags::Exact);
  }
  case Intrinsic::aarch64_sve_cnt: {
    SDValue Data = Op.getOperand(3);
    // CTPOP only supports integer operands.
    if (Data.getValueType().isFloatingPoint())
      Data = DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Data);
    return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Data, Op.getOperand(1));
  }
  case Intrinsic::aarch64_sve_dupq_lane:
    return LowerDUPQLane(Op, DAG);
  case Intrinsic::aarch64_sve_convert_from_svbool:
    if (Op.getValueType() == MVT::aarch64svcount)
      return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Op.getOperand(1));
    return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
  case Intrinsic::aarch64_sve_convert_to_svbool:
    if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
      return DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, Op.getOperand(1));
    return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
  case Intrinsic::aarch64_sve_fneg:
    return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintp:
    return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintm:
    return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frinti:
    return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintx:
    return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frint32x:
    return DAG.getNode(AArch64ISD::FRINT32_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frint64x:
    return DAG.getNode(AArch64ISD::FRINT64_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frinta:
    return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintn:
    return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintz:
    return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frint32z:
    return DAG.getNode(AArch64ISD::FTRUNC32_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frint64z:
    return DAG.getNode(AArch64ISD::FTRUNC64_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_ucvtf:
    return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_scvtf:
    return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_fcvtzu:
    return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_fcvtzs:
    return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_fsqrt:
    return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frecpx:
    return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frecpe_x:
    return DAG.getNode(AArch64ISD::FRECPE, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frecps_x:
    return DAG.getNode(AArch64ISD::FRECPS, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_frsqrte_x:
    return DAG.getNode(AArch64ISD::FRSQRTE, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frsqrts_x:
    return DAG.getNode(AArch64ISD::FRSQRTS, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_fabs:
    return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_abs:
    return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_neg:
    return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_insr: {
    SDValue Scalar = Op.getOperand(2);
    EVT ScalarTy = Scalar.getValueType();
    // Sub-i32 scalars are widened; INSR operates on at least 32-bit values.
    if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
      Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);

    return DAG.getNode(AArch64ISD::INSR, DL, Op.getValueType(),
                       Op.getOperand(1), Scalar);
  }
  case Intrinsic::aarch64_sve_rbit:
    return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_revb:
    return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_revh:
    return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_revw:
    return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_revd:
    return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  // Partial-width sign/zero extends: the extra DAG.getValueType operand
  // records the element width being extended from (i8/i16/i32).
  case Intrinsic::aarch64_sve_sxtb:
    return DAG.getNode(
        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(
            *DAG.getContext(), MVT::i8)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_sxth:
    return DAG.getNode(
        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(
            *DAG.getContext(), MVT::i16)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_sxtw:
    return DAG.getNode(
        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(
            *DAG.getContext(), MVT::i32)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_uxtb:
    return DAG.getNode(
        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(
            *DAG.getContext(), MVT::i8)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_uxth:
    return DAG.getNode(
        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(
            *DAG.getContext(), MVT::i16)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_uxtw:
    return DAG.getNode(
        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(
            *DAG.getContext(), MVT::i32)),
        Op.getOperand(1));
  case Intrinsic::localaddress: {
    const auto &MF = DAG.getMachineFunction();
    const auto *RegInfo = Subtarget->getRegisterInfo();
    unsigned Reg = RegInfo->getLocalAddressRegister(MF);
    return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg,
                              Op.getSimpleValueType());
  }

  case Intrinsic::eh_recoverfp: {
    // FIXME: This needs to be implemented to correctly handle highly aligned
    // stack objects. For now we simply return the incoming FP. Refer D53541
    // for more details.
    SDValue FnOp = Op.getOperand(1);
    SDValue IncomingFPOp = Op.getOperand(2);
    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
    if (!Fn)
      report_fatal_error(
          "llvm.eh.recoverfp must take a function as the first argument");
    return IncomingFPOp;
  }
  case Intrinsic::aarch64_neon_vsri:
  case Intrinsic::aarch64_neon_vsli:
  case Intrinsic::aarch64_sve_sri:
  case Intrinsic::aarch64_sve_sli: {
    EVT Ty = Op.getValueType();

    if (!Ty.isVector())
      report_fatal_error("Unexpected type for aarch64_neon_vsli");

    assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());

    bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
                        IntNo == Intrinsic::aarch64_sve_sri;
    unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
    return DAG.getNode(Opcode, DL, Ty, Op.getOperand(1), Op.getOperand(2),
                       Op.getOperand(3));
  }

  case Intrinsic::aarch64_neon_srhadd:
  case Intrinsic::aarch64_neon_urhadd:
  case Intrinsic::aarch64_neon_shadd:
  case Intrinsic::aarch64_neon_uhadd: {
    // Halving adds map onto the generic averaging nodes: "rounding" selects
    // ceil vs floor, and s/u selects the signed vs unsigned form.
    bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
                        IntNo == Intrinsic::aarch64_neon_shadd);
    bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
                          IntNo == Intrinsic::aarch64_neon_urhadd);
    unsigned Opcode = IsSignedAdd
                          ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
                          : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
    return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  }
  case Intrinsic::aarch64_neon_saddlp:
  case Intrinsic::aarch64_neon_uaddlp: {
    unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
                          ? AArch64ISD::UADDLP
                          : AArch64ISD::SADDLP;
    return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1));
  }
  case Intrinsic::aarch64_neon_sdot:
  case Intrinsic::aarch64_neon_udot:
  case Intrinsic::aarch64_sve_sdot:
  case Intrinsic::aarch64_sve_udot: {
    unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
                       IntNo == Intrinsic::aarch64_sve_udot)
                          ? AArch64ISD::UDOT
                          : AArch64ISD::SDOT;
    return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  }
  case Intrinsic::aarch64_neon_usdot:
  case Intrinsic::aarch64_sve_usdot: {
    return DAG.getNode(AArch64ISD::USDOT, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  }
  case Intrinsic::aarch64_neon_saddlv:
  case Intrinsic::aarch64_neon_uaddlv: {
    EVT OpVT = Op.getOperand(1).getValueType();
    EVT ResVT = Op.getValueType();
    assert(
        ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
                                OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
         (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
        "Unexpected aarch64_neon_u/saddlv type");
    (void)OpVT;
    // In order to avoid insert_subvector, use v4i32 rather than v2i32.
    SDValue ADDLV = DAG.getNode(
        IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
                                                : AArch64ISD::SADDLV,
        DL, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
    SDValue EXTRACT_VEC_ELT = DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, DL, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
        ADDLV, DAG.getConstant(0, DL, MVT::i64));
    return EXTRACT_VEC_ELT;
  }
  case Intrinsic::experimental_cttz_elts: {
    SDValue CttzOp = Op.getOperand(1);
    EVT VT = CttzOp.getValueType();
    assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");

    if (VT.isFixedLengthVector()) {
      // We can use SVE instructions to lower this intrinsic by first creating
      // an SVE predicate register mask from the fixed-width vector.
      EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
      SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, CttzOp);
      CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
    }

    SDValue NewCttzElts =
        DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, CttzOp);
    return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
  }
  case Intrinsic::experimental_vector_match: {
    return LowerVectorMatch(Op, DAG);
  }
  case Intrinsic::aarch64_cls:
  case Intrinsic::aarch64_cls64:
    // Count-leading-sign maps onto the generic CTLS node, computed at the
    // operand's width and truncated to the intrinsic's result type.
    SDValue Res = DAG.getNode(ISD::CTLS, DL, Op.getOperand(1).getValueType(),
                              Op.getOperand(1));
    return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Res);
  }
}
6923
6924bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6925 if (VT.getVectorElementType() == MVT::i8 ||
6926 VT.getVectorElementType() == MVT::i16) {
6927 EltTy = MVT::i32;
6928 return true;
6929 }
6930 return false;
6931}
6932
6933bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6934 EVT DataVT) const {
6935 const EVT IndexVT = Extend.getOperand(0).getValueType();
6936 // SVE only supports implicit extension of 32-bit indices.
6937 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6938 return false;
6939
6940 // Indices cannot be smaller than the main data type.
6941 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6942 return false;
6943
6944 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6945 // element container type, which would violate the previous clause.
6946 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6947}
6948
6949/// Helper function to check if a small vector load can be optimized.
6951 const AArch64Subtarget &Subtarget) {
6952 if (!Subtarget.isNeonAvailable())
6953 return false;
6954 if (LD->isVolatile())
6955 return false;
6956
6957 EVT MemVT = LD->getMemoryVT();
6958 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && MemVT != MVT::v2i16)
6959 return false;
6960
6961 Align Alignment = LD->getAlign();
6962 Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue());
6963 if (Subtarget.requiresStrictAlign() && Alignment < RequiredAlignment)
6964 return false;
6965
6966 return true;
6967}
6968
6969bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6970 EVT ExtVT = ExtVal.getValueType();
6971 // Small, illegal vectors can be extended inreg.
6972 if (auto *Load = dyn_cast<LoadSDNode>(ExtVal.getOperand(0))) {
6973 if (ExtVT.isFixedLengthVector() && ExtVT.getStoreSizeInBits() <= 128 &&
6974 isEligibleForSmallVectorLoadOpt(Load, *Subtarget))
6975 return true;
6976 }
6977 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6978 return false;
6979
6980 // It may be worth creating extending masked loads if there are multiple
6981 // masked loads using the same predicate. That way we'll end up creating
6982 // extending masked loads that may then get split by the legaliser. This
6983 // results in just one set of predicate unpacks at the start, instead of
6984 // multiple sets of vector unpacks after each load.
6985 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6986 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6987 // Disable extending masked loads for fixed-width for now, since the code
6988 // quality doesn't look great.
6989 if (!ExtVT.isScalableVector())
6990 return false;
6991
6992 unsigned NumExtMaskedLoads = 0;
6993 for (auto *U : Ld->getMask()->users())
6994 if (isa<MaskedLoadSDNode>(U))
6995 NumExtMaskedLoads++;
6996
6997 if (NumExtMaskedLoads <= 1)
6998 return false;
6999 }
7000 }
7001
7002 EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
7003 return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
7004 PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
7005}
7006
7007unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
7008 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
7009 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
7010 AArch64ISD::GLD1_MERGE_ZERO},
7011 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
7012 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
7013 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
7014 AArch64ISD::GLD1_MERGE_ZERO},
7015 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
7016 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
7017 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
7018 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
7019 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
7020 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
7021 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
7022 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
7023 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
7024 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
7025 };
7026 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
7027 return AddrModes.find(Key)->second;
7028}
7029
7030unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
7031 switch (Opcode) {
7032 default:
7033 llvm_unreachable("unimplemented opcode");
7034 return Opcode;
7035 case AArch64ISD::GLD1_MERGE_ZERO:
7036 return AArch64ISD::GLD1S_MERGE_ZERO;
7037 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
7038 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
7039 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
7040 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
7041 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
7042 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
7043 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
7044 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
7045 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
7046 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
7047 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
7048 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
7049 }
7050}
7051
7052SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
7053 SelectionDAG &DAG) const {
7054 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
7055
7056 SDLoc DL(Op);
7057 SDValue Chain = MGT->getChain();
7058 SDValue PassThru = MGT->getPassThru();
7059 SDValue Mask = MGT->getMask();
7060 SDValue BasePtr = MGT->getBasePtr();
7061 SDValue Index = MGT->getIndex();
7062 SDValue Scale = MGT->getScale();
7063 EVT VT = Op.getValueType();
7064 EVT MemVT = MGT->getMemoryVT();
7065 ISD::LoadExtType ExtType = MGT->getExtensionType();
7066 ISD::MemIndexType IndexType = MGT->getIndexType();
7067
7068 // SVE supports zero (and so undef) passthrough values only, everything else
7069 // must be handled manually by an explicit select on the load's output.
7070 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
7071 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
7072 SDValue Load =
7073 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
7074 MGT->getMemOperand(), IndexType, ExtType);
7075 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
7076 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
7077 }
7078
7079 bool IsScaled = MGT->isIndexScaled();
7080 bool IsSigned = MGT->isIndexSigned();
7081
7082 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
7083 // must be calculated before hand.
7084 uint64_t ScaleVal = Scale->getAsZExtVal();
7085 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
7086 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
7087 EVT IndexVT = Index.getValueType();
7088 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
7089 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
7090 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
7091
7092 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
7093 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
7094 MGT->getMemOperand(), IndexType, ExtType);
7095 }
7096
7097 // Lower fixed length gather to a scalable equivalent.
7098 if (VT.isFixedLengthVector()) {
7099 assert(Subtarget->useSVEForFixedLengthVectors() &&
7100 "Cannot lower when not using SVE for fixed vectors!");
7101
7102 // NOTE: Handle floating-point as if integer then bitcast the result.
7103 EVT DataVT = VT.changeVectorElementTypeToInteger();
7104 MemVT = MemVT.changeVectorElementTypeToInteger();
7105
7106 // Find the smallest integer fixed length vector we can use for the gather.
7107 EVT PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i32);
7108 if (DataVT.getVectorElementType() == MVT::i64 ||
7109 Index.getValueType().getVectorElementType() == MVT::i64 ||
7110 Mask.getValueType().getVectorElementType() == MVT::i64)
7111 PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i64);
7112
7113 // Promote vector operands except for passthrough, which we know is either
7114 // undef or zero, and thus best constructed directly.
7115 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7116 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
7117 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
7118
7119 // A promoted result type forces the need for an extending load.
7120 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
7121 ExtType = ISD::EXTLOAD;
7122
7123 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
7124
7125 // Convert fixed length vector operands to scalable.
7126 MemVT = ContainerVT.changeVectorElementType(*DAG.getContext(),
7127 MemVT.getVectorElementType());
7128 Index = convertToScalableVector(DAG, ContainerVT, Index);
7130 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
7131 : DAG.getConstant(0, DL, ContainerVT);
7132
7133 // Emit equivalent scalable vector gather.
7134 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
7135 SDValue Load =
7136 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
7137 Ops, MGT->getMemOperand(), IndexType, ExtType);
7138
7139 // Extract fixed length data then convert to the required result type.
7140 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
7141 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
7142 if (VT.isFloatingPoint())
7143 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
7144
7145 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7146 }
7147
7148 // Everything else is legal.
7149 return Op;
7150}
7151
7152SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
7153 SelectionDAG &DAG) const {
7154 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
7155
7156 SDLoc DL(Op);
7157 SDValue Chain = MSC->getChain();
7158 SDValue StoreVal = MSC->getValue();
7159 SDValue Mask = MSC->getMask();
7160 SDValue BasePtr = MSC->getBasePtr();
7161 SDValue Index = MSC->getIndex();
7162 SDValue Scale = MSC->getScale();
7163 EVT VT = StoreVal.getValueType();
7164 EVT MemVT = MSC->getMemoryVT();
7165 ISD::MemIndexType IndexType = MSC->getIndexType();
7166 bool Truncating = MSC->isTruncatingStore();
7167
7168 bool IsScaled = MSC->isIndexScaled();
7169 bool IsSigned = MSC->isIndexSigned();
7170
7171 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
7172 // must be calculated before hand.
7173 uint64_t ScaleVal = Scale->getAsZExtVal();
7174 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
7175 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
7176 EVT IndexVT = Index.getValueType();
7177 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
7178 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
7179 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
7180
7181 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
7182 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
7183 MSC->getMemOperand(), IndexType, Truncating);
7184 }
7185
7186 // Lower fixed length scatter to a scalable equivalent.
7187 if (VT.isFixedLengthVector()) {
7188 assert(Subtarget->useSVEForFixedLengthVectors() &&
7189 "Cannot lower when not using SVE for fixed vectors!");
7190
7191 // Once bitcast we treat floating-point scatters as if integer.
7192 if (VT.isFloatingPoint()) {
7194 MemVT = MemVT.changeVectorElementTypeToInteger();
7195 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
7196 }
7197
7198 // Find the smallest integer fixed length vector we can use for the scatter.
7199 EVT PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i32);
7200 if (VT.getVectorElementType() == MVT::i64 ||
7201 Index.getValueType().getVectorElementType() == MVT::i64 ||
7202 Mask.getValueType().getVectorElementType() == MVT::i64)
7203 PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i64);
7204
7205 // Promote vector operands.
7206 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7207 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
7208 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
7209 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
7210
7211 // A promoted value type forces the need for a truncating store.
7212 if (PromotedVT != VT)
7213 Truncating = true;
7214
7215 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
7216
7217 // Convert fixed length vector operands to scalable.
7218 MemVT = ContainerVT.changeVectorElementType(*DAG.getContext(),
7219 MemVT.getVectorElementType());
7220 Index = convertToScalableVector(DAG, ContainerVT, Index);
7222 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
7223
7224 // Emit equivalent scalable vector scatter.
7225 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
7226 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
7227 MSC->getMemOperand(), IndexType, Truncating);
7228 }
7229
7230 // Everything else is legal.
7231 return Op;
7232}
7233
7234SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
7235 SDLoc DL(Op);
7236 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
7237 assert(LoadNode && "Expected custom lowering of a masked load node");
7238 EVT VT = Op->getValueType(0);
7239
7240 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
7241 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
7242
7243 SDValue PassThru = LoadNode->getPassThru();
7244 SDValue Mask = LoadNode->getMask();
7245
7246 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
7247 return Op;
7248
7250 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
7251 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
7252 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
7253 LoadNode->getExtensionType());
7254
7255 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
7256
7257 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7258}
7259
7260// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
7262 EVT VT, EVT MemVT,
7263 SelectionDAG &DAG) {
7264 assert(VT.isVector() && "VT should be a vector type");
7265 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
7266
7267 SDValue Value = ST->getValue();
7268
7269 // It first extend the promoted v4i16 to v8i16, truncate to v8i8, and extract
7270 // the word lane which represent the v4i8 subvector. It optimizes the store
7271 // to:
7272 //
7273 // xtn v0.8b, v0.8h
7274 // str s0, [x0]
7275
7276 SDValue Undef = DAG.getUNDEF(MVT::i16);
7277 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
7278 {Undef, Undef, Undef, Undef});
7279
7280 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
7281 Value, UndefVec);
7282 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
7283
7284 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
7285 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
7286 Trunc, DAG.getConstant(0, DL, MVT::i64));
7287
7288 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
7289 ST->getBasePtr(), ST->getMemOperand());
7290}
7291
7293 SDLoc DL(Op);
7294 SDValue Src = Op.getOperand(0);
7295 MVT DestVT = Op.getSimpleValueType();
7296 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7298
7299 unsigned SrcAS = N->getSrcAddressSpace();
7300 unsigned DestAS = N->getDestAddressSpace();
7301 assert(SrcAS != DestAS &&
7302 "addrspacecast must be between different address spaces");
7303 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
7304 TLI.getTargetMachine().getPointerSize(DestAS) &&
7305 "addrspacecast must be between different ptr sizes");
7306 (void)TLI;
7307
7308 if (SrcAS == ARM64AS::PTR32_SPTR) {
7309 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, Src,
7310 DAG.getTargetConstant(0, DL, DestVT));
7311 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
7312 return DAG.getNode(ISD::ZERO_EXTEND, DL, DestVT, Src,
7313 DAG.getTargetConstant(0, DL, DestVT));
7314 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
7315 (DestAS == ARM64AS::PTR32_UPTR)) {
7316 SDValue Ext = DAG.getAnyExtOrTrunc(Src, DL, DestVT);
7317 SDValue Trunc = DAG.getZeroExtendInReg(Ext, DL, DestVT);
7318 return Trunc;
7319 } else {
7320 return Src;
7321 }
7322}
7323
7324// Lower non-temporal stores that would otherwise be broken by legalization.
7325static SDValue LowerNTStore(StoreSDNode *StoreNode, EVT VT, EVT MemVT,
7326 const SDLoc &DL, SelectionDAG &DAG) {
7327 assert(StoreNode && "Expected a store operation");
7328 assert(StoreNode->isNonTemporal() && "Expected a non-temporal store");
7329
7330 // Currently, STNP lowering can only either keep or increase code size, thus
7331 // we predicate it to not apply when optimizing for code size.
7332 if (DAG.shouldOptForSize())
7333 return SDValue();
7334
7335 // Currently we only support NT stores lowering for little-endian targets.
7336 if (!DAG.getDataLayout().isLittleEndian())
7337 return SDValue();
7338
7339 if (VT.isVector()) {
7340 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
7341 // the custom lowering, as there are no un-paired non-temporal stores and
7342 // legalization will break up 256 bit inputs.
7344 if (VT.isVector() && MemVT.getSizeInBits() == 256u && EC.isKnownEven() &&
7345 (MemVT.getScalarSizeInBits() == 8u ||
7346 MemVT.getScalarSizeInBits() == 16u ||
7347 MemVT.getScalarSizeInBits() == 32u ||
7348 MemVT.getScalarSizeInBits() == 64u)) {
7349 SDValue Lo =
7352 StoreNode->getValue(), DAG.getConstant(0, DL, MVT::i64));
7353 SDValue Hi =
7356 StoreNode->getValue(),
7357 DAG.getConstant(EC.getKnownMinValue() / 2, DL, MVT::i64));
7358 SDValue Result = DAG.getMemIntrinsicNode(
7359 AArch64ISD::STNP, DL, DAG.getVTList(MVT::Other),
7360 {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
7361 DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
7362 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7363 return Result;
7364 }
7365 }
7366 return SDValue();
7367}
7368
7369// Custom lowering for any store, vector or scalar and/or default or with
7370// a truncate operations. Currently only custom lower truncate operation
7371// from vector v4i16 to v4i8 or volatile stores of i128.
7372SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
7373 SelectionDAG &DAG) const {
7374 SDLoc Dl(Op);
7375 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
7376 assert (StoreNode && "Can only custom lower store nodes");
7377
7378 SDValue Value = StoreNode->getValue();
7379
7380 EVT VT = Value.getValueType();
7381 EVT MemVT = StoreNode->getMemoryVT();
7382
7383 if (StoreNode->isNonTemporal()) {
7384 if (auto MaybeSTNP = LowerNTStore(StoreNode, VT, MemVT, Dl, DAG))
7385 return MaybeSTNP;
7386 }
7387
7388 if (VT.isVector()) {
7390 VT,
7391 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
7392 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
7393
7394 unsigned AS = StoreNode->getAddressSpace();
7395 Align Alignment = StoreNode->getAlign();
7396 if (Alignment < MemVT.getStoreSize() &&
7397 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
7398 StoreNode->getMemOperand()->getFlags(),
7399 nullptr)) {
7400 return scalarizeVectorStore(StoreNode, DAG);
7401 }
7402
7403 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
7404 MemVT == MVT::v4i8) {
7405 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
7406 }
7407 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
7408 return LowerStore128(Op, DAG);
7409 } else if (MemVT == MVT::i64x8) {
7410 SDValue Value = StoreNode->getValue();
7411 assert(Value->getValueType(0) == MVT::i64x8);
7412 SDValue Chain = StoreNode->getChain();
7413 SDValue Base = StoreNode->getBasePtr();
7414 EVT PtrVT = Base.getValueType();
7415 for (unsigned i = 0; i < 8; i++) {
7416 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64, Value,
7417 DAG.getConstant(i, Dl, MVT::i32));
7418 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
7419 DAG.getConstant(i * 8, Dl, PtrVT));
7420 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
7421 StoreNode->getBaseAlign());
7422 }
7423 return Chain;
7424 }
7425
7426 return SDValue();
7427}
7428
7429/// Lower atomic or volatile 128-bit stores to a single STP instruction.
7430SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
7431 SelectionDAG &DAG) const {
7432 MemSDNode *StoreNode = cast<MemSDNode>(Op);
7433 assert(StoreNode->getMemoryVT() == MVT::i128);
7434 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
7435
7436 bool IsStoreRelease =
7438 if (StoreNode->isAtomic())
7439 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
7440 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
7443
7444 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
7445 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
7446 ? StoreNode->getOperand(1)
7447 : StoreNode->getOperand(2);
7448 SDLoc DL(Op);
7449 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
7450 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
7451 if (DAG.getDataLayout().isBigEndian())
7452 std::swap(StoreValue.first, StoreValue.second);
7454 Opcode, DL, DAG.getVTList(MVT::Other),
7455 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
7456 StoreNode->getBasePtr()},
7457 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7458 return Result;
7459}
7460
7461/// Helper function to optimize loads of extended small vectors.
7462/// These patterns would otherwise get scalarized into inefficient sequences.
7464 const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
7465 if (!isEligibleForSmallVectorLoadOpt(Load, Subtarget))
7466 return SDValue();
7467
7468 EVT MemVT = Load->getMemoryVT();
7469 EVT ResVT = Load->getValueType(0);
7470 unsigned NumElts = ResVT.getVectorNumElements();
7471 unsigned DstEltBits = ResVT.getScalarSizeInBits();
7472 unsigned SrcEltBits = MemVT.getScalarSizeInBits();
7473
7474 unsigned ExtOpcode;
7475 switch (Load->getExtensionType()) {
7476 case ISD::EXTLOAD:
7477 case ISD::ZEXTLOAD:
7478 ExtOpcode = ISD::ZERO_EXTEND;
7479 break;
7480 case ISD::SEXTLOAD:
7481 ExtOpcode = ISD::SIGN_EXTEND;
7482 break;
7483 case ISD::NON_EXTLOAD:
7484 return SDValue();
7485 }
7486
7487 SDLoc DL(Load);
7488 SDValue Chain = Load->getChain();
7489 SDValue BasePtr = Load->getBasePtr();
7490 const MachinePointerInfo &PtrInfo = Load->getPointerInfo();
7491 Align Alignment = Load->getAlign();
7492
7493 // Load the data as an FP scalar to avoid issues with integer loads.
7494 unsigned LoadBits = MemVT.getStoreSizeInBits();
7495 MVT ScalarLoadType = MVT::getFloatingPointVT(LoadBits);
7496 SDValue ScalarLoad =
7497 DAG.getLoad(ScalarLoadType, DL, Chain, BasePtr, PtrInfo, Alignment);
7498
7499 MVT ScalarToVecTy = MVT::getVectorVT(ScalarLoadType, 128 / LoadBits);
7500 SDValue ScalarToVec =
7501 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarToVecTy, ScalarLoad);
7502 MVT BitcastTy =
7503 MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits), 128 / SrcEltBits);
7504 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, BitcastTy, ScalarToVec);
7505
7506 SDValue Res = Bitcast;
7507 unsigned CurrentEltBits = Res.getValueType().getScalarSizeInBits();
7508 unsigned CurrentNumElts = Res.getValueType().getVectorNumElements();
7509 while (CurrentEltBits < DstEltBits) {
7510 if (Res.getValueSizeInBits() >= 128) {
7511 CurrentNumElts = CurrentNumElts / 2;
7512 MVT ExtractVT =
7513 MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
7514 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Res,
7515 DAG.getConstant(0, DL, MVT::i64));
7516 }
7517 CurrentEltBits = CurrentEltBits * 2;
7518 MVT ExtVT =
7519 MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
7520 Res = DAG.getNode(ExtOpcode, DL, ExtVT, Res);
7521 }
7522
7523 if (CurrentNumElts != NumElts) {
7524 MVT FinalVT = MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), NumElts);
7525 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FinalVT, Res,
7526 DAG.getConstant(0, DL, MVT::i64));
7527 }
7528
7529 return DAG.getMergeValues({Res, ScalarLoad.getValue(1)}, DL);
7530}
7531
7532SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
7533 SelectionDAG &DAG) const {
7534 SDLoc DL(Op);
7535 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
7536 assert(LoadNode && "Expected custom lowering of a load node");
7537
7538 if (SDValue Result = tryLowerSmallVectorExtLoad(LoadNode, DAG))
7539 return Result;
7540
7541 if (LoadNode->getMemoryVT() == MVT::i64x8) {
7543 SDValue Base = LoadNode->getBasePtr();
7544 SDValue Chain = LoadNode->getChain();
7545 EVT PtrVT = Base.getValueType();
7546 for (unsigned i = 0; i < 8; i++) {
7547 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7548 DAG.getConstant(i * 8, DL, PtrVT));
7549 SDValue Part =
7550 DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
7551 LoadNode->getBaseAlign());
7552 Ops.push_back(Part);
7553 Chain = SDValue(Part.getNode(), 1);
7554 }
7555 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7556 return DAG.getMergeValues({Loaded, Chain}, DL);
7557 }
7558
7559 return SDValue();
7560}
7561
7562SDValue AArch64TargetLowering::LowerFixedLengthVectorCompressToSVE(
7563 SDValue Op, SelectionDAG &DAG) const {
7564 SDLoc DL(Op);
7565 EVT VT = Op.getValueType();
7566
7567 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
7568 SDValue Vec = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
7569 SDValue Mask = convertFixedMaskToScalableVector(Op.getOperand(1), DAG);
7570 SDValue Passthru =
7571 convertToScalableVector(DAG, ContainerVT, Op.getOperand(2));
7572
7573 SDValue Result =
7574 DAG.getNode(ISD::VECTOR_COMPRESS, DL, ContainerVT, Vec, Mask, Passthru);
7575 return convertFromScalableVector(DAG, VT, Result);
7576}
7577
7578SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7579 SelectionDAG &DAG) const {
7580 EVT VT = Op.getValueType();
7581 if (!Subtarget->isSVEAvailable())
7582 return SDValue();
7583
7584 if (VT.isFixedLengthVector())
7585 return LowerFixedLengthVectorCompressToSVE(Op, DAG);
7586
7587 SDLoc DL(Op);
7588 SDValue Vec = Op.getOperand(0);
7589 SDValue Mask = Op.getOperand(1);
7590 SDValue Passthru = Op.getOperand(2);
7591 EVT MaskVT = Mask.getValueType();
7592
7593 SDValue Compressed = DAG.getNode(
7595 DAG.getTargetConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask,
7596 Vec);
7597
7598 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
7599 if (Passthru.isUndef() ||
7601 return Compressed;
7602
7603 SDValue CntActive = DAG.getNode(
7604 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7605 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask,
7606 Mask);
7607
7608 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7609 SDValue CompressedMask =
7610 DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT, Zero, CntActive);
7611
7612 return DAG.getNode(ISD::VSELECT, DL, VT, CompressedMask, Compressed,
7613 Passthru);
7614}
7615
7616// Generate SUBS and CSEL for integer abs.
7617SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7618 MVT VT = Op.getSimpleValueType();
7619
7620 if (VT.isVector())
7621 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7622
7623 SDLoc DL(Op);
7624 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, VT);
7625
7626 // Generate SUBS & CSEL.
7627 SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7628 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7629 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7630 getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
7631}
7632
7634 SDValue Chain = Op.getOperand(0);
7635 SDValue Cond = Op.getOperand(1);
7636 SDValue Dest = Op.getOperand(2);
7637
7639 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7640 SDLoc DL(Op);
7641 SDValue CCVal = getCondCode(DAG, CC);
7642 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
7643 Cmp);
7644 }
7645
7646 return SDValue();
7647}
7648
7649// Treat FSHR with constant shifts as legal operation, otherwise it is expanded
7650// FSHL is converted to FSHR before deciding what to do with it
7652 SDValue Shifts = Op.getOperand(2);
7653 // Check if the shift amount is a constant and normalise to [0, SrcBitLen)
7654 // If opcode is FSHL, convert it to FSHR
7655 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7656 SDLoc DL(Op);
7657 MVT VT = Op.getSimpleValueType();
7658 unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
7659
7660 if (Op.getOpcode() == ISD::FSHL) {
7661 if (NewShiftNo == 0)
7662 return Op.getOperand(0);
7663
7664 NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
7665 return DAG.getNode(
7666 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7667 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7668 }
7669
7670 if (Op.getOpcode() == ISD::FSHR) {
7671 if (NewShiftNo == 0)
7672 return Op.getOperand(1);
7673
7674 if (ShiftNo->getZExtValue() == NewShiftNo)
7675 return Op;
7676
7677 // Rewrite using the normalised shift amount.
7678 return DAG.getNode(
7679 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7680 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7681 }
7682 }
7683
7684 return SDValue();
7685}
7686
7688 SDValue X = Op.getOperand(0);
7689 EVT XScalarTy = X.getValueType();
7690 SDValue Exp = Op.getOperand(1);
7691
7692 SDLoc DL(Op);
7693 EVT XVT, ExpVT;
7694 switch (Op.getSimpleValueType().SimpleTy) {
7695 default:
7696 return SDValue();
7697 case MVT::bf16:
7698 case MVT::f16:
7699 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7700 [[fallthrough]];
7701 case MVT::f32:
7702 XVT = MVT::nxv4f32;
7703 ExpVT = MVT::nxv4i32;
7704 break;
7705 case MVT::f64:
7706 XVT = MVT::nxv2f64;
7707 ExpVT = MVT::nxv2i64;
7708 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7709 break;
7710 }
7711
7712 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7713 SDValue VX =
7714 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7715 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7716 DAG.getUNDEF(ExpVT), Exp, Zero);
7717 SDValue VPg = DAG.getConstant(
7718 1, DL, XVT.changeVectorElementType(*DAG.getContext(), MVT::i1));
7719 SDValue FScale = DAG.getNode(
7721 DAG.getTargetConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), VPg,
7722 VX, VExp);
7723 SDValue Final =
7724 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7725 if (X.getValueType() != XScalarTy)
7726 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7727 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7728 return Final;
7729}
7730
7731SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7732 SelectionDAG &DAG) const {
7733 return Op.getOperand(0);
7734}
7735
SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

  // Code and data written into the trampoline buffer:
  // ldr NestReg, .+16
  // ldr x17, .+20
  // br x17
  // .word 0
  // .nest: .qword nest
  // .fptr: .qword fptr
  SDValue OutChains[5];

  const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
  CallingConv::ID CC = Func->getCallingConv();
  unsigned NestReg;

  switch (CC) {
  default:
    NestReg = 0x0f; // X15
    break;
    // NOTE(review): a 'case CallingConv::...:' label appears to be missing
    // immediately above this assignment in this copy of the file, leaving it
    // unreachable — confirm against upstream and AArch64CallingConv.td.
    // Must be kept in sync with AArch64CallingConv.td
    NestReg = 0x04; // X4
    break;
  }

  // Register number encoded into the LDR literal instruction below.
  const char FptrReg = 0x11; // X17

  SDValue Addr = Trmp;

  SDLoc DL(Op);
  // 0x58000080 is an LDR-literal encoding; OR in the destination register.
  OutChains[0] = DAG.getStore(
      Chain, DL, DAG.getConstant(0x58000080u | NestReg, DL, MVT::i32), Addr,
      MachinePointerInfo(TrmpAddr));

  Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
                     DAG.getConstant(4, DL, MVT::i64));
  OutChains[1] = DAG.getStore(
      Chain, DL, DAG.getConstant(0x580000b0u | FptrReg, DL, MVT::i32), Addr,
      MachinePointerInfo(TrmpAddr, 4));

  Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
                     DAG.getConstant(8, DL, MVT::i64));
  // 0xd61f0220 is the encoding of 'br x17'.
  OutChains[2] =
      DAG.getStore(Chain, DL, DAG.getConstant(0xd61f0220u, DL, MVT::i32), Addr,
                   MachinePointerInfo(TrmpAddr, 8));

  // Store the 'nest' value at offset 16 (.nest slot).
  Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
                     DAG.getConstant(16, DL, MVT::i64));
  OutChains[3] =
      DAG.getStore(Chain, DL, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));

  // Store the function pointer at offset 24 (.fptr slot).
  Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
                     DAG.getConstant(24, DL, MVT::i64));
  OutChains[4] =
      DAG.getStore(Chain, DL, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));

  SDValue StoreToken = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);

  // The executable portion of the trampoline is the first 12 bytes.
  SDValue EndOfTrmp = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
                                  DAG.getConstant(12, DL, MVT::i64));

  // Call clear cache on the trampoline instructions.
  return DAG.getNode(ISD::CLEAR_CACHE, DL, MVT::Other, StoreToken, Trmp,
                     EndOfTrmp);
}
7808
/// Custom lowering for FMUL. Non-bf16 types (and bf16 when native BFMUL via
/// SVE-B16B16 is usable) go through the normal predicated lowering; otherwise
/// bf16 FMUL is emulated with widening BFMLAL multiply-accumulates into f32
/// followed by conversion back to bf16.
SDValue AArch64TargetLowering::LowerFMUL(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  if (VT.getScalarType() != MVT::bf16 ||
      (Subtarget->hasSVEB16B16() &&
       Subtarget->isNonStreamingSVEorSME2Available()))
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);

  assert(Subtarget->hasBF16() && "Expected +bf16 for custom FMUL lowering");
  assert((VT == MVT::nxv4bf16 || VT == MVT::nxv8bf16 || VT == MVT::v8bf16) &&
         "Unexpected FMUL VT");

  // Returns a callable that emits INTRINSIC_WO_CHAIN for the given intrinsic.
  auto MakeGetIntrinsic = [&](Intrinsic::ID IID) {
    return [&, IID](EVT VT, auto... Ops) {
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(IID, DL, MVT::i32), Ops...);
    };
  };

  // Converts between fixed, scalable and differently-typed scalable vectors
  // without changing the underlying bits.
  auto Reinterpret = [&](SDValue Value, EVT VT) {
    EVT SrcVT = Value.getValueType();
    if (VT == SrcVT)
      return Value;
    if (SrcVT.isFixedLengthVector())
      return convertToScalableVector(DAG, VT, Value);
    if (VT.isFixedLengthVector())
      return convertFromScalableVector(DAG, VT, Value);
    return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Value);
  };

  bool UseSVEBFMLAL = VT.isScalableVector();
  auto FCVT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvt_bf16f32_v2);
  auto FCVTNT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2);

  // Note: The NEON BFMLAL[BT] reads even/odd lanes like the SVE variant.
  // This does not match BFCVTN[2], so we use SVE to convert back to bf16.
  auto BFMLALB =
      MakeGetIntrinsic(UseSVEBFMLAL ? Intrinsic::aarch64_sve_bfmlalb
                                    : Intrinsic::aarch64_neon_bfmlalb);
  auto BFMLALT =
      MakeGetIntrinsic(UseSVEBFMLAL ? Intrinsic::aarch64_sve_bfmlalt
                                    : Intrinsic::aarch64_neon_bfmlalt);

  EVT AccVT = UseSVEBFMLAL ? MVT::nxv4f32 : MVT::v4f32;
  // Accumulating onto -0.0 preserves the sign of a -0.0 product; +0.0 is
  // usable only when signed zeros can be ignored.
  bool IgnoreZeroSign =
      Op->getFlags().hasNoSignedZeros() || DAG.canIgnoreSignBitOfZero(Op);
  SDValue Zero = DAG.getConstantFP(IgnoreZeroSign ? +0.0F : -0.0F, DL, AccVT);
  SDValue Pg = getPredicateForVector(DAG, DL, AccVT);

  // Lower bf16 FMUL as a pair (VT == [nx]v8bf16) of BFMLAL top/bottom
  // instructions. These result in two f32 vectors, which can be converted back
  // to bf16 with FCVT and FCVTNT.
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  // All SVE intrinsics expect to operate on full bf16 vector types.
  if (UseSVEBFMLAL) {
    LHS = Reinterpret(LHS, MVT::nxv8bf16);
    RHS = Reinterpret(RHS, MVT::nxv8bf16);
  }

  // Even lanes: multiply-accumulate onto zero, then convert f32 -> bf16.
  SDValue BottomF32 = Reinterpret(BFMLALB(AccVT, Zero, LHS, RHS), MVT::nxv4f32);
  SDValue BottomBF16 =
      FCVT(MVT::nxv8bf16, DAG.getPOISON(MVT::nxv8bf16), Pg, BottomF32);
  // Note: nxv4bf16 only uses even lanes.
  if (VT == MVT::nxv4bf16)
    return Reinterpret(BottomBF16, VT);

  // Odd lanes: same dance with the top variants, merging into the result.
  SDValue TopF32 = Reinterpret(BFMLALT(AccVT, Zero, LHS, RHS), MVT::nxv4f32);
  SDValue TopBF16 = FCVTNT(MVT::nxv8bf16, BottomBF16, Pg, TopF32);
  return Reinterpret(TopBF16, VT);
}
7881
7882SDValue AArch64TargetLowering::LowerFMA(SDValue Op, SelectionDAG &DAG) const {
7883 SDValue OpA = Op->getOperand(0);
7884 SDValue OpB = Op->getOperand(1);
7885 SDValue OpC = Op->getOperand(2);
7886 EVT VT = Op.getValueType();
7887 SDLoc DL(Op);
7888
7889 assert(VT.isVector() && "Scalar fma lowering should be handled by patterns");
7890
7891 // Bail early if we're definitely not looking to merge FNEGs into the FMA.
7892 if (VT != MVT::v8f16 && VT != MVT::v4f32 && VT != MVT::v2f64)
7893 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7894
7895 if (OpC.getOpcode() != ISD::FNEG)
7896 return useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())
7897 ? LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED)
7898 : Op; // Fallback to NEON lowering.
7899
7900 // Convert FMA/FNEG nodes to SVE to enable the following patterns:
7901 // fma(a, b, neg(c)) -> fnmls(a, b, c)
7902 // fma(neg(a), b, neg(c)) -> fnmla(a, b, c)
7903 // fma(a, neg(b), neg(c)) -> fnmla(a, b, c)
7904 SDValue Pg = getPredicateForVector(DAG, DL, VT);
7905 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
7906
7907 auto ConvertToScalableFnegMt = [&](SDValue Op) {
7908 if (Op.getOpcode() == ISD::FNEG)
7909 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7910 return convertToScalableVector(DAG, ContainerVT, Op);
7911 };
7912
7913 OpA = ConvertToScalableFnegMt(OpA);
7914 OpB = ConvertToScalableFnegMt(OpB);
7915 OpC = ConvertToScalableFnegMt(OpC);
7916
7917 SDValue ScalableRes =
7918 DAG.getNode(AArch64ISD::FMA_PRED, DL, ContainerVT, Pg, OpA, OpB, OpC);
7919 return convertFromScalableVector(DAG, VT, ScalableRes);
7920}
7921
7923 SelectionDAG &DAG) const {
7924 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7925 LLVM_DEBUG(Op.dump());
7926
7927 switch (Op.getOpcode()) {
7928 default:
7929 llvm_unreachable("unimplemented operand");
7930 return SDValue();
7933 return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
7934 case ISD::BITCAST:
7935 return LowerBITCAST(Op, DAG);
7936 case ISD::GlobalAddress:
7937 return LowerGlobalAddress(Op, DAG);
7939 return LowerGlobalTLSAddress(Op, DAG);
7941 return LowerPtrAuthGlobalAddress(Op, DAG);
7943 return LowerADJUST_TRAMPOLINE(Op, DAG);
7945 return LowerINIT_TRAMPOLINE(Op, DAG);
7946 case ISD::SETCC:
7947 case ISD::STRICT_FSETCC:
7949 return LowerSETCC(Op, DAG);
7950 case ISD::SETCCCARRY:
7951 return LowerSETCCCARRY(Op, DAG);
7952 case ISD::BRCOND:
7953 return LowerBRCOND(Op, DAG);
7954 case ISD::BR_CC:
7955 return LowerBR_CC(Op, DAG);
7956 case ISD::SELECT:
7957 return LowerSELECT(Op, DAG);
7958 case ISD::SELECT_CC:
7959 return LowerSELECT_CC(Op, DAG);
7960 case ISD::JumpTable:
7961 return LowerJumpTable(Op, DAG);
7962 case ISD::BR_JT:
7963 return LowerBR_JT(Op, DAG);
7964 case ISD::BRIND:
7965 return LowerBRIND(Op, DAG);
7966 case ISD::ConstantPool:
7967 return LowerConstantPool(Op, DAG);
7968 case ISD::BlockAddress:
7969 return LowerBlockAddress(Op, DAG);
7970 case ISD::VASTART:
7971 return LowerVASTART(Op, DAG);
7972 case ISD::VACOPY:
7973 return LowerVACOPY(Op, DAG);
7974 case ISD::VAARG:
7975 return LowerVAARG(Op, DAG);
7976 case ISD::UADDO_CARRY:
7977 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7978 case ISD::USUBO_CARRY:
7979 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7980 case ISD::SADDO_CARRY:
7981 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7982 case ISD::SSUBO_CARRY:
7983 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7984 case ISD::SADDO:
7985 case ISD::UADDO:
7986 case ISD::SSUBO:
7987 case ISD::USUBO:
7988 case ISD::SMULO:
7989 case ISD::UMULO:
7990 return LowerXALUO(Op, DAG);
7991 case ISD::FADD:
7992 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7993 case ISD::FSUB:
7994 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7995 case ISD::FMUL:
7996 return LowerFMUL(Op, DAG);
7997 case ISD::FMA:
7998 return LowerFMA(Op, DAG);
7999 case ISD::FDIV:
8000 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
8001 case ISD::FNEG:
8002 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
8003 case ISD::FCEIL:
8004 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
8005 case ISD::FFLOOR:
8006 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
8007 case ISD::FNEARBYINT:
8008 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
8009 case ISD::FRINT:
8010 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
8011 case ISD::FROUND:
8012 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
8013 case ISD::FROUNDEVEN:
8014 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
8015 case ISD::FTRUNC:
8016 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
8017 case ISD::FSQRT:
8018 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
8019 case ISD::FABS:
8020 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
8021 case ISD::FP_ROUND:
8023 return LowerFP_ROUND(Op, DAG);
8024 case ISD::FP_EXTEND:
8026 return LowerFP_EXTEND(Op, DAG);
8027 case ISD::FRAMEADDR:
8028 return LowerFRAMEADDR(Op, DAG);
8029 case ISD::SPONENTRY:
8030 return LowerSPONENTRY(Op, DAG);
8031 case ISD::RETURNADDR:
8032 return LowerRETURNADDR(Op, DAG);
8034 return LowerADDROFRETURNADDR(Op, DAG);
8036 return LowerCONCAT_VECTORS(Op, DAG);
8038 return LowerINSERT_VECTOR_ELT(Op, DAG);
8040 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
8041 case ISD::BUILD_VECTOR:
8042 return LowerBUILD_VECTOR(Op, DAG);
8045 return LowerEXTEND_VECTOR_INREG(Op, DAG);
8047 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
8049 return LowerVECTOR_SHUFFLE(Op, DAG);
8050 case ISD::SPLAT_VECTOR:
8051 return LowerSPLAT_VECTOR(Op, DAG);
8053 return LowerEXTRACT_SUBVECTOR(Op, DAG);
8055 return LowerINSERT_SUBVECTOR(Op, DAG);
8056 case ISD::SDIV:
8057 case ISD::UDIV:
8058 return LowerDIV(Op, DAG);
8059 case ISD::SMIN:
8060 case ISD::UMIN:
8061 case ISD::SMAX:
8062 case ISD::UMAX:
8063 return LowerMinMax(Op, DAG);
8064 case ISD::SRA:
8065 case ISD::SRL:
8066 case ISD::SHL:
8067 return LowerVectorSRA_SRL_SHL(Op, DAG);
8068 case ISD::SHL_PARTS:
8069 case ISD::SRL_PARTS:
8070 case ISD::SRA_PARTS:
8071 return LowerShiftParts(Op, DAG);
8072 case ISD::CTPOP:
8073 case ISD::PARITY:
8074 return LowerCTPOP_PARITY(Op, DAG);
8075 case ISD::FCOPYSIGN:
8076 return LowerFCOPYSIGN(Op, DAG);
8077 case ISD::OR:
8078 return LowerVectorOR(Op, DAG);
8079 case ISD::XOR:
8080 return LowerXOR(Op, DAG);
8081 case ISD::PREFETCH:
8082 return LowerPREFETCH(Op, DAG);
8083 case ISD::SINT_TO_FP:
8084 case ISD::UINT_TO_FP:
8087 return LowerINT_TO_FP(Op, DAG);
8088 case ISD::FP_TO_SINT:
8089 case ISD::FP_TO_UINT:
8092 return LowerFP_TO_INT(Op, DAG);
8095 return LowerFP_TO_INT_SAT(Op, DAG);
8096 case ISD::GET_ROUNDING:
8097 return LowerGET_ROUNDING(Op, DAG);
8098 case ISD::SET_ROUNDING:
8099 return LowerSET_ROUNDING(Op, DAG);
8100 case ISD::GET_FPMODE:
8101 return LowerGET_FPMODE(Op, DAG);
8102 case ISD::SET_FPMODE:
8103 return LowerSET_FPMODE(Op, DAG);
8104 case ISD::RESET_FPMODE:
8105 return LowerRESET_FPMODE(Op, DAG);
8106 case ISD::MUL:
8107 return LowerMUL(Op, DAG);
8108 case ISD::MULHS:
8109 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
8110 case ISD::MULHU:
8111 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
8113 return LowerINTRINSIC_W_CHAIN(Op, DAG);
8115 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
8117 return LowerINTRINSIC_VOID(Op, DAG);
8118 case ISD::ATOMIC_STORE:
8119 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
8120 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
8121 return LowerStore128(Op, DAG);
8122 }
8123 return SDValue();
8124 case ISD::STORE:
8125 return LowerSTORE(Op, DAG);
8126 case ISD::MSTORE:
8127 return LowerMSTORE(Op, DAG);
8128 case ISD::MGATHER:
8129 return LowerMGATHER(Op, DAG);
8130 case ISD::MSCATTER:
8131 return LowerMSCATTER(Op, DAG);
8133 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
8134 case ISD::VECREDUCE_ADD:
8135 case ISD::VECREDUCE_AND:
8136 case ISD::VECREDUCE_OR:
8137 case ISD::VECREDUCE_XOR:
8147 return LowerVECREDUCE(Op, DAG);
8148 case ISD::VECREDUCE_MUL:
8150 return LowerVECREDUCE_MUL(Op, DAG);
8152 return LowerATOMIC_LOAD_AND(Op, DAG);
8154 return LowerDYNAMIC_STACKALLOC(Op, DAG);
8155 case ISD::VSCALE:
8156 return LowerVSCALE(Op, DAG);
8158 return LowerVECTOR_COMPRESS(Op, DAG);
8159 case ISD::ANY_EXTEND:
8160 case ISD::SIGN_EXTEND:
8161 case ISD::ZERO_EXTEND:
8162 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
8163 case ISD::ADDRSPACECAST:
8164 return LowerADDRSPACECAST(Op, DAG);
8166 // Only custom lower when ExtraVT has a legal byte based element type.
8167 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
8168 EVT ExtraEltVT = ExtraVT.getVectorElementType();
8169 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
8170 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
8171 return SDValue();
8172
8173 return LowerToPredicatedOp(Op, DAG,
8174 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
8175 }
8176 case ISD::TRUNCATE:
8177 return LowerTRUNCATE(Op, DAG);
8178 case ISD::MLOAD:
8179 return LowerMLOAD(Op, DAG);
8180 case ISD::LOAD:
8181 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
8182 !Subtarget->isNeonAvailable()))
8183 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
8184 return LowerLOAD(Op, DAG);
8185 case ISD::ADD:
8186 case ISD::AND:
8187 case ISD::SUB:
8188 return LowerToScalableOp(Op, DAG);
8189 case ISD::FMAXIMUM:
8190 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
8191 case ISD::FMAXNUM:
8192 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
8193 case ISD::FMINIMUM:
8194 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
8195 case ISD::FMINNUM:
8196 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
8197 case ISD::VSELECT:
8198 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
8199 case ISD::ABS:
8200 return LowerABS(Op, DAG);
8201 case ISD::ABDS:
8202 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
8203 case ISD::ABDU:
8204 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
8205 case ISD::AVGFLOORS:
8206 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
8207 case ISD::AVGFLOORU:
8208 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
8209 case ISD::AVGCEILS:
8210 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
8211 case ISD::AVGCEILU:
8212 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
8213 case ISD::BITREVERSE:
8214 return LowerBitreverse(Op, DAG);
8215 case ISD::BSWAP:
8216 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
8217 case ISD::CTLZ:
8218 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
8219 case ISD::CTTZ:
8220 return LowerCTTZ(Op, DAG);
8223 return LowerVECTOR_SPLICE(Op, DAG);
8225 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
8227 return LowerVECTOR_INTERLEAVE(Op, DAG);
8229 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
8230 case ISD::LRINT:
8231 case ISD::LLRINT:
8232 if (Op.getValueType().isVector())
8233 return LowerVectorXRINT(Op, DAG);
8234 [[fallthrough]];
8235 case ISD::LROUND:
8236 case ISD::LLROUND: {
8237 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
8238 Op.getOperand(0).getValueType() == MVT::bf16) &&
8239 "Expected custom lowering of rounding operations only for f16");
8240 SDLoc DL(Op);
8241 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
8242 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
8243 }
8244 case ISD::STRICT_LROUND:
8246 case ISD::STRICT_LRINT:
8247 case ISD::STRICT_LLRINT: {
8248 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
8249 Op.getOperand(1).getValueType() == MVT::bf16) &&
8250 "Expected custom lowering of rounding operations only for f16");
8251 SDLoc DL(Op);
8252 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
8253 {Op.getOperand(0), Op.getOperand(1)});
8254 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
8255 {Ext.getValue(1), Ext.getValue(0)});
8256 }
8257 case ISD::WRITE_REGISTER: {
8258 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
8259 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
8260 SDLoc DL(Op);
8261
8262 SDValue Chain = Op.getOperand(0);
8263 SDValue SysRegName = Op.getOperand(1);
8264 std::pair<SDValue, SDValue> Pair =
8265 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
8266
8267 // chain = MSRR(chain, sysregname, lo, hi)
8268 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
8269 SysRegName, Pair.first, Pair.second);
8270
8271 return Result;
8272 }
8273 case ISD::FSHL:
8274 case ISD::FSHR:
8275 return LowerFunnelShift(Op, DAG);
8276 case ISD::FLDEXP:
8277 return LowerFLDEXP(Op, DAG);
8279 return LowerVECTOR_HISTOGRAM(Op, DAG);
8284 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
8285 }
8286}
8287
8289 return !Subtarget->useSVEForFixedLengthVectors();
8290}
8291
8293 EVT VT, bool OverrideNEON) const {
8294 if (!VT.isFixedLengthVector() || !VT.isSimple())
8295 return false;
8296
8297 // Don't use SVE for vectors we cannot scalarize if required.
8298 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
8299 // Fixed length predicates should be promoted to i8.
8300 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
8301 case MVT::i1:
8302 default:
8303 return false;
8304 case MVT::i8:
8305 case MVT::i16:
8306 case MVT::i32:
8307 case MVT::i64:
8308 case MVT::f16:
8309 case MVT::f32:
8310 case MVT::f64:
8311 break;
8312 }
8313
8314 // NEON-sized vectors can be emulated using SVE instructions.
8315 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
8316 return Subtarget->isSVEorStreamingSVEAvailable();
8317
8318 // Ensure NEON MVTs only belong to a single register class.
8319 if (VT.getFixedSizeInBits() <= 128)
8320 return false;
8321
8322 // Ensure wider than NEON code generation is enabled.
8323 if (!Subtarget->useSVEForFixedLengthVectors())
8324 return false;
8325
8326 // Don't use SVE for types that don't fit.
8327 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
8328 return false;
8329
8330 // TODO: Perhaps an artificial restriction, but worth having whilst getting
8331 // the base fixed length SVE support in place.
8332 if (!VT.isPow2VectorType())
8333 return false;
8334
8335 return true;
8336}
8337
8338//===----------------------------------------------------------------------===//
8339// Calling Convention Implementation
8340//===----------------------------------------------------------------------===//
8341
8342static unsigned getIntrinsicID(const SDNode *N) {
8343 unsigned Opcode = N->getOpcode();
8344 switch (Opcode) {
8345 default:
8348 unsigned IID = N->getConstantOperandVal(0);
8349 if (IID < Intrinsic::num_intrinsics)
8350 return IID;
8352 }
8353 }
8354}
8355
8357 SDValue N1) const {
8358 if (!N0.hasOneUse())
8359 return false;
8360
8361 unsigned IID = getIntrinsicID(N1.getNode());
8362 // Avoid reassociating expressions that can be lowered to smlal/umlal.
8363 if (IID == Intrinsic::aarch64_neon_umull ||
8364 N1.getOpcode() == AArch64ISD::UMULL ||
8365 IID == Intrinsic::aarch64_neon_smull ||
8366 N1.getOpcode() == AArch64ISD::SMULL)
8367 return N0.getOpcode() != ISD::ADD;
8368
8369 return true;
8370}
8371
8372/// Selects the correct CCAssignFn for a given CallingConvention value.
8374 bool IsVarArg) const {
8375 switch (CC) {
8376 default:
8377 reportFatalUsageError("unsupported calling convention");
8378 case CallingConv::GHC:
8379 return CC_AArch64_GHC;
8381 // The VarArg implementation makes assumptions about register
8382 // argument passing that do not hold for preserve_none, so we
8383 // instead fall back to C argument passing.
8384 // The non-vararg case is handled in the CC function itself.
8385 if (!IsVarArg)
8387 [[fallthrough]];
8388 case CallingConv::C:
8389 case CallingConv::Fast:
8393 case CallingConv::Swift:
8395 case CallingConv::Tail:
8396 case CallingConv::GRAAL:
8397 if (Subtarget->isTargetWindows()) {
8398 if (IsVarArg) {
8399 if (Subtarget->isWindowsArm64EC())
8402 }
8403 return CC_AArch64_Win64PCS;
8404 }
8405 if (!Subtarget->isTargetDarwin())
8406 return CC_AArch64_AAPCS;
8407 if (!IsVarArg)
8408 return CC_AArch64_DarwinPCS;
8409 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
8411 case CallingConv::Win64:
8412 if (IsVarArg) {
8413 if (Subtarget->isWindowsArm64EC())
8416 }
8417 return CC_AArch64_Win64PCS;
8419 if (Subtarget->isWindowsArm64EC())
8427 return CC_AArch64_AAPCS;
8432 }
8433}
8434
8435CCAssignFn *
8437 switch (CC) {
8438 default:
8439 return RetCC_AArch64_AAPCS;
8443 if (Subtarget->isWindowsArm64EC())
8445 return RetCC_AArch64_AAPCS;
8446 }
8447}
8448
8449static bool isPassedInFPR(EVT VT) {
8450 return VT.isFixedLengthVector() ||
8451 (VT.isFloatingPoint() && !VT.isScalableVector());
8452}
8453
8455 AArch64FunctionInfo &FuncInfo,
8456 SelectionDAG &DAG) {
8457 if (!FuncInfo.hasZT0SpillSlotIndex())
8458 FuncInfo.setZT0SpillSlotIndex(MFI.CreateSpillStackObject(64, Align(16)));
8459
8460 return DAG.getFrameIndex(
8461 FuncInfo.getZT0SpillSlotIndex(),
8463}
8464
8465// Emit a call to __arm_sme_save or __arm_sme_restore.
8467 SelectionDAG &DAG,
8469 SDValue Chain, bool IsSave) {
8472 FuncInfo->setSMESaveBufferUsed();
8474 Args.emplace_back(
8475 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
8477
8478 RTLIB::Libcall LC =
8479 IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
8480 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
8481 SDValue Callee =
8482 DAG.getExternalSymbol(LCImpl, TLI.getPointerTy(DAG.getDataLayout()));
8483 auto *RetTy = Type::getVoidTy(*DAG.getContext());
8485 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8486 DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy, Callee,
8487 std::move(Args));
8488 return TLI.LowerCallTo(CLI).second;
8489}
8490
8492 const AArch64TargetLowering &TLI,
8493 const AArch64RegisterInfo &TRI,
8494 AArch64FunctionInfo &FuncInfo,
8495 SelectionDAG &DAG) {
8496 // Conditionally restore the lazy save using a pseudo node.
8497 RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
8498 TPIDR2Object &TPIDR2 = FuncInfo.getTPIDR2Obj();
8499
8500 RTLIB::LibcallImpl LibcallImpl = DAG.getLibcalls().getLibcallImpl(LC);
8501 SDValue RegMask = DAG.getRegisterMask(TRI.getCallPreservedMask(
8502 DAG.getMachineFunction(),
8503 DAG.getLibcalls().getLibcallImplCallingConv(LibcallImpl)));
8504 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8505 LibcallImpl, TLI.getPointerTy(DAG.getDataLayout()));
8506 SDValue TPIDR2_EL0 = DAG.getNode(
8507 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain,
8508 DAG.getTargetConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8509 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8510 // RESTORE_ZA pseudo.
8511 SDValue Glue;
8512 SDValue TPIDR2Block = DAG.getFrameIndex(
8513 TPIDR2.FrameIndex,
8515 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, TPIDR2Block, Glue);
8516 Chain =
8517 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8518 {Chain, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8519 RestoreRoutine, RegMask, Chain.getValue(1)});
8520 // Finally reset the TPIDR2_EL0 register to 0.
8521 Chain = DAG.getNode(
8522 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8523 DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8524 DAG.getConstant(0, DL, MVT::i64));
8525 TPIDR2.Uses++;
8526 return Chain;
8527}
8528
8529SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
8530 SelectionDAG &DAG) const {
8531 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8532 SDValue Glue = Chain.getValue(1);
8533
8534 MachineFunction &MF = DAG.getMachineFunction();
8535 auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>();
8536 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
8537 const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
8538
8539 SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs();
8540
8541 // The following conditions are true on entry to an exception handler:
8542 // - PSTATE.SM is 0.
8543 // - PSTATE.ZA is 0.
8544 // - TPIDR2_EL0 is null.
8545 // See:
8546 // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
8547 //
8548 // Therefore, if the function that contains this exception handler is a
8549 // streaming[-compatible] function, we must re-enable streaming mode.
8550 //
8551 // These mode changes are usually optimized away in catch blocks as they
8552 // occur before the __cxa_begin_catch (which is a non-streaming function),
8553 // but are necessary in some cases (such as for cleanups).
8554 //
8555 // Additionally, if the function has ZA or ZT0 state, we must restore it.
8556
8557 // [COND_]SMSTART SM
8558 if (SMEFnAttrs.hasStreamingInterfaceOrBody())
8559 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
8560 /*Glue*/ Glue, AArch64SME::Always);
8561 else if (SMEFnAttrs.hasStreamingCompatibleInterface())
8562 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
8564
8565 if (getTM().useNewSMEABILowering())
8566 return Chain;
8567
8568 if (SMEFnAttrs.hasAgnosticZAInterface()) {
8569 // Restore full ZA
8570 Chain = emitSMEStateSaveRestore(*this, DAG, &FuncInfo, DL, Chain,
8571 /*IsSave=*/false);
8572 } else if (SMEFnAttrs.hasZAState() || SMEFnAttrs.hasZT0State()) {
8573 // SMSTART ZA
8574 Chain = DAG.getNode(
8575 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
8576 DAG.getTargetConstant(int32_t(AArch64SVCR::SVCRZA), DL, MVT::i32));
8577
8578 // Restore ZT0
8579 if (SMEFnAttrs.hasZT0State()) {
8580 SDValue ZT0FrameIndex =
8581 getZT0FrameIndex(MF.getFrameInfo(), FuncInfo, DAG);
8582 Chain =
8583 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8584 {Chain, DAG.getConstant(0, DL, MVT::i32), ZT0FrameIndex});
8585 }
8586
8587 // Restore ZA
8588 if (SMEFnAttrs.hasZAState())
8589 Chain = emitRestoreZALazySave(Chain, DL, *this, TRI, FuncInfo, DAG);
8590 }
8591
8592 return Chain;
8593}
8594
8595SDValue AArch64TargetLowering::LowerFormalArguments(
8596 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
8597 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
8598 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
8599 MachineFunction &MF = DAG.getMachineFunction();
8600 const Function &F = MF.getFunction();
8601 MachineFrameInfo &MFI = MF.getFrameInfo();
8602 bool IsWin64 =
8603 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8604 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
8605 (isVarArg && Subtarget->isWindowsArm64EC());
8606 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8607
8609 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
8611 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
8612 FuncInfo->setIsSVECC(true);
8613
8614 // Assign locations to all of the incoming arguments.
8616 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
8617
8618 // At this point, Ins[].VT may already be promoted to i32. To correctly
8619 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
8620 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
8621 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
8622 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
8623 // LocVT.
8624 unsigned NumArgs = Ins.size();
8625 Function::const_arg_iterator CurOrigArg = F.arg_begin();
8626 unsigned CurArgIdx = 0;
8627 bool UseVarArgCC = false;
8628 if (IsWin64)
8629 UseVarArgCC = isVarArg;
8630
8631 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
8632
8633 for (unsigned i = 0; i != NumArgs; ++i) {
8634 MVT ValVT = Ins[i].VT;
8635 if (Ins[i].isOrigArg()) {
8636 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
8637 CurArgIdx = Ins[i].getOrigArgIndex();
8638
8639 // Get type of the original argument.
8640 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
8641 /*AllowUnknown*/ true);
8642 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
8643 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8644 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8645 ValVT = MVT::i8;
8646 else if (ActualMVT == MVT::i16)
8647 ValVT = MVT::i16;
8648 }
8649 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags,
8650 Ins[i].OrigTy, CCInfo);
8651 assert(!Res && "Call operand has unhandled type");
8652 (void)Res;
8653 }
8654
8655 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
8656 bool IsLocallyStreaming =
8657 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
8658 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8659 SDValue Glue = Chain.getValue(1);
8660
8661 unsigned ExtraArgLocs = 0;
8662 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
8663 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8664
8665 if (Ins[i].Flags.isByVal()) {
8666 // Byval is used for HFAs in the PCS, but the system should work in a
8667 // non-compliant manner for larger structs.
8668 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8669 int Size = Ins[i].Flags.getByValSize();
8670 unsigned NumRegs = (Size + 7) / 8;
8671
8672 // FIXME: This works on big-endian for composite byvals, which are the common
8673 // case. It should also work for fundamental types too.
8674 unsigned FrameIdx =
8675 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
8676 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
8677 InVals.push_back(FrameIdxN);
8678
8679 continue;
8680 }
8681
8682 if (Ins[i].Flags.isSwiftAsync())
8683 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
8684
8685 SDValue ArgValue;
8686 if (VA.isRegLoc()) {
8687 // Arguments stored in registers.
8688 EVT RegVT = VA.getLocVT();
8689 const TargetRegisterClass *RC;
8690
8691 if (RegVT == MVT::i32)
8692 RC = &AArch64::GPR32RegClass;
8693 else if (RegVT == MVT::i64)
8694 RC = &AArch64::GPR64RegClass;
8695 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
8696 RC = &AArch64::FPR16RegClass;
8697 else if (RegVT == MVT::f32)
8698 RC = &AArch64::FPR32RegClass;
8699 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
8700 RC = &AArch64::FPR64RegClass;
8701 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
8702 RC = &AArch64::FPR128RegClass;
8703 else if (RegVT.isScalableVector() &&
8704 RegVT.getVectorElementType() == MVT::i1) {
8705 FuncInfo->setIsSVECC(true);
8706 RC = &AArch64::PPRRegClass;
8707 } else if (RegVT == MVT::aarch64svcount) {
8708 FuncInfo->setIsSVECC(true);
8709 RC = &AArch64::PPRRegClass;
8710 } else if (RegVT.isScalableVector()) {
8711 FuncInfo->setIsSVECC(true);
8712 RC = &AArch64::ZPRRegClass;
8713 } else
8714 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
8715
8716 // Transform the arguments in physical registers into virtual ones.
8717 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
8718
8719 if (IsLocallyStreaming) {
8720 // LocallyStreamingFunctions must insert the SMSTART in the correct
8721 // position, so we use Glue to ensure no instructions can be scheduled
8722 // between the chain of:
8723 // t0: ch,glue = EntryNode
8724 // t1: res,ch,glue = CopyFromReg
8725 // ...
8726 // tn: res,ch,glue = CopyFromReg t(n-1), ..
8727 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
8728 // ^^^^^^
8729 // This will be the new Chain/Root node.
8730 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
8731 Glue = ArgValue.getValue(2);
8732 if (isPassedInFPR(ArgValue.getValueType())) {
8733 ArgValue =
8734 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8735 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
8736 {ArgValue, Glue});
8737 Glue = ArgValue.getValue(1);
8738 }
8739 } else
8740 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
8741
8742 // If this is an 8, 16 or 32-bit value, it is really passed promoted
8743 // to 64 bits. Insert an assert[sz]ext to capture this, then
8744 // truncate to the right size.
8745 switch (VA.getLocInfo()) {
8746 default:
8747 llvm_unreachable("Unknown loc info!");
8748 case CCValAssign::Full:
8749 break;
8751 assert(
8752 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8753 "Indirect arguments should be scalable on most subtargets");
8754 break;
8755 case CCValAssign::BCvt:
8756 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
8757 break;
8758 case CCValAssign::AExt:
8759 case CCValAssign::SExt:
8760 case CCValAssign::ZExt:
8761 break;
8763 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
8764 DAG.getConstant(32, DL, RegVT));
8765 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
8766 break;
8767 }
8768 } else { // VA.isRegLoc()
8769 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8770 unsigned ArgOffset = VA.getLocMemOffset();
8771 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8772 ? VA.getLocVT().getSizeInBits()
8773 : VA.getValVT().getSizeInBits()) / 8;
8774
8775 uint32_t BEAlign = 0;
8776 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8777 !Ins[i].Flags.isInConsecutiveRegs())
8778 BEAlign = 8 - ArgSize;
8779
8780 SDValue FIN;
8781 MachinePointerInfo PtrInfo;
8782 if (StackViaX4) {
8783 // In both the ARM64EC varargs convention and the thunk convention,
8784 // arguments on the stack are accessed relative to x4, not sp. In
8785 // the thunk convention, there's an additional offset of 32 bytes
8786 // to account for the shadow store.
8787 unsigned ObjOffset = ArgOffset + BEAlign;
8788 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8789 ObjOffset += 32;
8790 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8791 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8792 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8793 DAG.getConstant(ObjOffset, DL, MVT::i64));
8795 } else {
8796 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8797
8798 // Create load nodes to retrieve arguments from the stack.
8799 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8800 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8801 }
8802
8803 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
8805 MVT MemVT = VA.getValVT();
8806
8807 switch (VA.getLocInfo()) {
8808 default:
8809 break;
8810 case CCValAssign::Trunc:
8811 case CCValAssign::BCvt:
8812 MemVT = VA.getLocVT();
8813 break;
8815 assert(
8816 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8817 "Indirect arguments should be scalable on most subtargets");
8818 MemVT = VA.getLocVT();
8819 break;
8820 case CCValAssign::SExt:
8821 ExtType = ISD::SEXTLOAD;
8822 break;
8823 case CCValAssign::ZExt:
8824 ExtType = ISD::ZEXTLOAD;
8825 break;
8826 case CCValAssign::AExt:
8827 ExtType = ISD::EXTLOAD;
8828 break;
8829 }
8830
8831 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8832 MemVT);
8833 }
8834
8835 if (VA.getLocInfo() == CCValAssign::Indirect) {
8836 assert((VA.getValVT().isScalableVT() ||
8837 Subtarget->isWindowsArm64EC()) &&
8838 "Indirect arguments should be scalable on most subtargets");
8839
8840 TypeSize PartSize = VA.getValVT().getStoreSize();
8841 unsigned NumParts = 1;
8842 if (Ins[i].Flags.isInConsecutiveRegs()) {
8843 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8844 ++NumParts;
8845 }
8846
8847 MVT PartLoad = VA.getValVT();
8848 SDValue Ptr = ArgValue;
8849
8850 // Ensure we generate all loads for each tuple part, whilst updating the
8851 // pointer after each load correctly using vscale.
8852 while (NumParts > 0) {
8853 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8854 InVals.push_back(ArgValue);
8855 NumParts--;
8856 if (NumParts > 0) {
8857 SDValue BytesIncrement =
8858 DAG.getTypeSize(DL, Ptr.getValueType(), PartSize);
8859 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8860 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8861 ExtraArgLocs++;
8862 i++;
8863 }
8864 }
8865 } else {
8866 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8867 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8868 ArgValue, DAG.getValueType(MVT::i32));
8869
8870 // i1 arguments are zero-extended to i8 by the caller. Emit a
8871 // hint to reflect this.
8872 if (Ins[i].isOrigArg()) {
8873 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8874 if (OrigArg->getType()->isIntegerTy(1)) {
8875 if (!Ins[i].Flags.isZExt()) {
8876 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8877 ArgValue.getValueType(), ArgValue);
8878 }
8879 }
8880 }
8881
8882 InVals.push_back(ArgValue);
8883 }
8884 }
8885 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8886
8887 if (Attrs.hasStreamingCompatibleInterface()) {
8888 SDValue EntryPStateSM =
8889 DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
8890 DAG.getVTList(MVT::i64, MVT::Other), {Chain});
8891
8892 // Copy the value to a virtual register, and save that in FuncInfo.
8893 Register EntryPStateSMReg =
8894 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8895 Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
8896 EntryPStateSM);
8897 FuncInfo->setPStateSMReg(EntryPStateSMReg);
8898 }
8899
8900 // Insert the SMSTART if this is a locally streaming function and
8901 // make sure it is Glued to the last CopyFromReg value.
8902 if (IsLocallyStreaming) {
8903 if (Attrs.hasStreamingCompatibleInterface())
8904 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8906 else
8907 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8909
8910 // Ensure that the SMSTART happens after the CopyWithChain such that its
8911 // chain result is used.
8912 for (unsigned I=0; I<InVals.size(); ++I) {
8915 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8916 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8917 InVals[I].getValueType());
8918 }
8919 }
8920
8921 // varargs
8922 if (isVarArg) {
8924 if (!Subtarget->isTargetDarwin() || IsWin64) {
8925 // The AAPCS variadic function ABI is identical to the non-variadic
8926 // one. As a result there may be more arguments in registers and we
8927 // should save them for future reference.
8928 // Win64 variadic functions also pass arguments in registers, but all
8929 // float arguments are passed in integer registers.
8930 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8931 }
8932
8933 // This will point to the next argument passed via stack.
8934 unsigned VarArgsOffset = CCInfo.getStackSize();
8935 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8936 VarArgsOffset =
8937 alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8938 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8939 FuncInfo->setVarArgsStackIndex(
8940 MFI.CreateFixedObject(4, VarArgsOffset, true));
8941 }
8942
8943 if (MFI.hasMustTailInVarArgFunc()) {
8944 SmallVector<MVT, 2> RegParmTypes;
8945 RegParmTypes.push_back(MVT::i64);
8946 RegParmTypes.push_back(MVT::f128);
8947 // Compute the set of forwarded registers. The rest are scratch.
8948 SmallVectorImpl<ForwardedRegister> &Forwards =
8949 FuncInfo->getForwardedMustTailRegParms();
8950 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8952
8953 // Conservatively forward X8, since it might be used for aggregate return.
8954 if (!CCInfo.isAllocated(AArch64::X8)) {
8955 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8956 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8957 }
8958 }
8959 }
8960
8961 // On Windows, InReg pointers must be returned, so record the pointer in a
8962 // virtual register at the start of the function so it can be returned in the
8963 // epilogue.
8964 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8965 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8966 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8967 Ins[I].Flags.isInReg()) &&
8968 Ins[I].Flags.isSRet()) {
8969 assert(!FuncInfo->getSRetReturnReg());
8970
8971 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8972 Register Reg =
8974 FuncInfo->setSRetReturnReg(Reg);
8975
8976 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8977 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8978 break;
8979 }
8980 }
8981 }
8982
8983 unsigned StackArgSize = CCInfo.getStackSize();
8984 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8985 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8986 // This is a non-standard ABI so by fiat I say we're allowed to make full
8987 // use of the stack area to be popped, which must be aligned to 16 bytes in
8988 // any case:
8989 StackArgSize = alignTo(StackArgSize, 16);
8990
8991 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8992 // a multiple of 16.
8993 FuncInfo->setArgumentStackToRestore(StackArgSize);
8994
8995 // This realignment carries over to the available bytes below. Our own
8996 // callers will guarantee the space is free by giving an aligned value to
8997 // CALLSEQ_START.
8998 }
8999 // Even if we're not expected to free up the space, it's useful to know how
9000 // much is there while considering tail calls (because we can reuse it).
9001 FuncInfo->setBytesInStackArgArea(StackArgSize);
9002
9003 if (Subtarget->hasCustomCallingConv())
9004 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
9005
9006 if (getTM().useNewSMEABILowering()) {
9007 if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
9008 SDValue Size;
9009 if (Attrs.hasZAState()) {
9010 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9011 DAG.getConstant(1, DL, MVT::i32));
9012 Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
9013 } else if (Attrs.hasAgnosticZAInterface()) {
9014 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
9015 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
9016
9017 SDValue Callee =
9018 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
9019 auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
9020 TargetLowering::CallLoweringInfo CLI(DAG);
9021 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
9022 DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy, Callee,
9023 {});
9024 std::tie(Size, Chain) = LowerCallTo(CLI);
9025 }
9026 if (Size) {
9027 SDValue Buffer = DAG.getNode(
9028 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
9029 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
9030 Chain = Buffer.getValue(1);
9031
9032 Register BufferPtr =
9033 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
9034 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
9035 Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
9036 DAG.getVTList(MVT::Other), Chain);
9037 FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
9038 MFI.CreateVariableSizedObject(Align(16), nullptr);
9039 }
9040 }
9041 } else {
9042 // Old SME ABI lowering (deprecated):
9043 // Create a 16 Byte TPIDR2 object. The dynamic buffer
9044 // will be expanded and stored in the static object later using a
9045 // pseudonode.
9046 if (Attrs.hasZAState()) {
9047 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9048 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
9049 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9050 DAG.getConstant(1, DL, MVT::i32));
9051 SDValue Buffer;
9052 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
9053 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
9054 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
9055 } else {
9056 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
9057 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
9058 DAG.getVTList(MVT::i64, MVT::Other),
9059 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
9060 MFI.CreateVariableSizedObject(Align(16), nullptr);
9061 }
9062 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9063 DAG.getConstant(1, DL, MVT::i32));
9064 Chain = DAG.getNode(
9065 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
9066 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
9067 /*Num save slices*/ NumZaSaveSlices});
9068 } else if (Attrs.hasAgnosticZAInterface()) {
9069 // Call __arm_sme_state_size().
9070 SDValue BufferSize =
9071 DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL,
9072 DAG.getVTList(MVT::i64, MVT::Other), Chain);
9073 Chain = BufferSize.getValue(1);
9074 SDValue Buffer;
9075 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
9076 Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
9077 DAG.getVTList(MVT::i64, MVT::Other),
9078 {Chain, BufferSize});
9079 } else {
9080 // Allocate space dynamically.
9081 Buffer = DAG.getNode(
9082 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
9083 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
9084 MFI.CreateVariableSizedObject(Align(16), nullptr);
9085 }
9086 // Copy the value to a virtual register, and save that in FuncInfo.
9087 Register BufferPtr =
9088 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
9089 FuncInfo->setSMESaveBufferAddr(BufferPtr);
9090 Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer);
9091 }
9092 }
9093
9094 if (CallConv == CallingConv::PreserveNone) {
9095 for (const ISD::InputArg &I : Ins) {
9096 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
9097 I.Flags.isSwiftAsync()) {
9098 MachineFunction &MF = DAG.getMachineFunction();
9099 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9100 MF.getFunction(),
9101 "Swift attributes can't be used with preserve_none",
9102 DL.getDebugLoc()));
9103 break;
9104 }
9105 }
9106 }
9107
9108 return Chain;
9109}
9110
// Spill any argument registers left unallocated by the named (fixed)
// arguments of a variadic function into stack save areas so va_arg can later
// walk them. Records the save-area frame indices and sizes in
// AArch64FunctionInfo. GPRs are always considered; FPRs only off-Win64,
// since Win64 passes all varargs in integer registers.
// NOTE(review): several local declarations (MemOps, GPRArgRegs, FPRArgRegs)
// appear elided in this view of the file — confirm against upstream source.
void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
                                                SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue &Chain) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  Function &F = MF.getFunction();
  bool IsWin64 =
      Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());


  unsigned NumGPRArgRegs = GPRArgRegs.size();
  if (Subtarget->isWindowsArm64EC()) {
    // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
    // functions.
    NumGPRArgRegs = 4;
  }
  // Index of the first GPR arg register not consumed by named arguments.
  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);

  // Each GPR save slot is 8 bytes.
  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
  int GPRIdx = 0;
  if (GPRSaveSize != 0) {
    if (IsWin64) {
      // Win64: save area is a fixed object at a negative offset (just below
      // the incoming stack pointer).
      GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
      if (GPRSaveSize & 15)
        // The extra size here, if triggered, will always be 8.
        MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
    } else
      GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);

    SDValue FIN;
    if (Subtarget->isWindowsArm64EC()) {
      // With the Arm64EC ABI, we reserve the save area as usual, but we
      // compute its address relative to x4. For a normal AArch64->AArch64
      // call, x4 == sp on entry, but calls from an entry thunk can pass in a
      // different address.
      Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
      FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
                        DAG.getConstant(GPRSaveSize, DL, MVT::i64));
    } else {
      FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
    }

    // Store each remaining GPR to its 8-byte slot, bumping the pointer as we
    // go.
    for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
      Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
      SDValue Store =
          DAG.getStore(Val.getValue(1), DL, Val, FIN,
                           MF, GPRIdx, (i - FirstVariadicGPR) * 8)
                     : MachinePointerInfo::getStack(MF, i * 8));
      MemOps.push_back(Store);
      FIN =
          DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
    }
  }
  FuncInfo->setVarArgsGPRIndex(GPRIdx);
  FuncInfo->setVarArgsGPRSize(GPRSaveSize);

  if (Subtarget->hasFPARMv8() && !IsWin64) {
    const unsigned NumFPRArgRegs = FPRArgRegs.size();
    unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);

    // Each FPR save slot is 16 bytes (a full q-register).
    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
    int FPRIdx = 0;
    if (FPRSaveSize != 0) {
      FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);

      SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);

      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
        Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);

        SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
                                     MachinePointerInfo::getStack(MF, i * 16));
        MemOps.push_back(Store);
        FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
                          DAG.getConstant(16, DL, PtrVT));
      }
    }
    FuncInfo->setVarArgsFPRIndex(FPRIdx);
    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
  }

  // Token-factor all spill stores together so later loads are ordered after
  // them.
  if (!MemOps.empty()) {
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
  }
}
9206
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
/// \p RVLocs describes where each result value lives; \p InVals receives the
/// lowered values in order. When \p isThisReturn is set, the first result is
/// taken directly from \p ThisVal instead of a physreg copy. When
/// \p RequiresSMChange is set, FPR results get a COALESCER_BARRIER.
SDValue AArch64TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
    SDValue ThisVal, bool RequiresSMChange) const {
  // Cache of physregs already copied, so each is copied out at most once.
  DenseMap<unsigned, SDValue> CopiedRegs;
  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    // Pass 'this' value directly from the argument to return value, to avoid
    // reg unit interference
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    // Avoid copying a physreg twice since RegAllocFast is incompetent and only
    // allows one use of a physreg per block.
    SDValue Val = CopiedRegs.lookup(VA.getLocReg());
    if (!Val) {
      Val =
          DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
      Chain = Val.getValue(1);
      InGlue = Val.getValue(2);
      CopiedRegs[VA.getLocReg()] = Val;
    }

    // Convert the value from its location type to the value type callers
    // expect.
    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
      break;
      // Shift the value out of the upper 32 bits before zext/trunc below.
      Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
                        DAG.getConstant(32, DL, VA.getLocVT()));
      [[fallthrough]];
    case CCValAssign::AExt:
      [[fallthrough]];
    case CCValAssign::ZExt:
      Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
      break;
    }

    // When the call toggles streaming mode, guard FPR-returned values so the
    // coalescer does not move them across the mode switch.
    if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
      Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
                        DAG.getVTList(Val.getValueType(), MVT::Glue), Val);

    InVals.push_back(Val);
  }

  return Chain;
}
9267
/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
  // Fast only guarantees TCO when GuaranteedTailCallOpt is in effect.
  return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
}
9273
/// Return true if we might ever do TCO for calls with this calling convention.
  switch (CC) {
  // These conventions may be tail-call optimized, subject to the further
  // eligibility checks in isEligibleForTailCallOptimization.
  case CallingConv::C:
  case CallingConv::Swift:
  case CallingConv::Tail:
  case CallingConv::Fast:
    return true;
  default:
    // Anything else is never tail-called.
    return false;
  }
}
9291
/// Return true if the call convention supports varargs
/// Currently only those that pass varargs like the C
/// calling convention does are eligible
/// Calling conventions listed in this function must also
/// be properly handled in AArch64Subtarget::isCallingConvWin64
  switch (CC) {
  case CallingConv::C:
    // SVE vector call is only partially supported, but it should
    // support named arguments being passed. Any arguments being passed
    // as varargs, are still unsupported.
    return true;
  default:
    // Other conventions do not follow the C varargs model.
    return false;
  }
}
9310
9312 const AArch64Subtarget *Subtarget,
9314 CCState &CCInfo) {
9315 const SelectionDAG &DAG = CLI.DAG;
9316 CallingConv::ID CalleeCC = CLI.CallConv;
9317 bool IsVarArg = CLI.IsVarArg;
9318 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9319 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
9320
9321 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
9322 // for the shadow store.
9323 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
9324 CCInfo.AllocateStack(32, Align(16));
9325
9326 unsigned NumArgs = Outs.size();
9327 for (unsigned i = 0; i != NumArgs; ++i) {
9328 MVT ArgVT = Outs[i].VT;
9329 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
9330
9331 bool UseVarArgCC = false;
9332 if (IsVarArg) {
9333 // On Windows, the fixed arguments in a vararg call are passed in GPRs
9334 // too, so use the vararg CC to force them to integer registers.
9335 if (IsCalleeWin64) {
9336 UseVarArgCC = true;
9337 } else {
9338 UseVarArgCC = ArgFlags.isVarArg();
9339 }
9340 }
9341
9342 if (!UseVarArgCC) {
9343 // Get type of the original argument.
9344 EVT ActualVT =
9345 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
9346 /*AllowUnknown*/ true);
9347 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
9348 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
9349 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
9350 ArgVT = MVT::i8;
9351 else if (ActualMVT == MVT::i16)
9352 ArgVT = MVT::i16;
9353 }
9354
9355 // FIXME: CCAssignFnForCall should be called once, for the call and not per
9356 // argument. This logic should exactly mirror LowerFormalArguments.
9357 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
9358 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
9359 Outs[i].OrigTy, CCInfo);
9360 assert(!Res && "Call operand has unhandled type");
9361 (void)Res;
9362 }
9363}
9364
9365static SMECallAttrs
9368 if (CLI.CB)
9369 return SMECallAttrs(*CLI.CB, &RTLCI);
9370 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9371 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), RTLCI));
9373}
9374
// Decide whether the call described by CLI may be lowered as a tail call.
// Checks calling-convention compatibility, SME streaming/ZA state, byval and
// inreg parameters, weak symbols, callee-saved register masks, and stack
// argument area reuse.
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
    const CallLoweringInfo &CLI) const {
  CallingConv::ID CalleeCC = CLI.CallConv;
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  SDValue Callee = CLI.Callee;
  bool IsVarArg = CLI.IsVarArg;
  const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  const SelectionDAG &DAG = CLI.DAG;
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  // SME Streaming functions are not eligible for TCO as they may require
  // the streaming mode or ZA/ZT0 to be restored after returning from the call.
  SMECallAttrs CallAttrs =
      getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);
  if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
      CallAttrs.requiresPreservingAllZAState() ||
      CallAttrs.requiresPreservingZT0() ||
      CallAttrs.caller().hasStreamingBody() || CallAttrs.caller().isNewZA() ||
      CallAttrs.caller().isNewZT0())
    return false;

  // Functions using the C or Fast calling convention that have an SVE signature
  // preserve more registers and should assume the SVE_VectorCall CC.
  // The check for matching callee-saved regs will determine whether it is
  // eligible for TCO.
  if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
      MF.getInfo<AArch64FunctionInfo>()->isSVECC())

  bool CCMatch = CallerCC == CalleeCC;

  // When using the Windows calling convention on a non-windows OS, we want
  // to back up and restore X18 in such functions; we can't do a tail call
  // from those functions.
  if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
      CalleeCC != CallingConv::Win64)
    return false;

  // Byval parameters hand the function a pointer directly into the stack area
  // we want to reuse during a tail call. Working around this *is* possible (see
  // X86) but less efficient and uglier in LowerCall.
  for (Function::const_arg_iterator i = CallerF.arg_begin(),
                                    e = CallerF.arg_end();
       i != e; ++i) {
    if (i->hasByValAttr())
      return false;

    // On Windows, "inreg" attributes signify non-aggregate indirect returns.
    // In this case, it is necessary to save X0/X1 in the callee and return it
    // in X0. Tail call opt may interfere with this, so we disable tail call
    // opt when the caller has an "inreg" attribute -- except if the callee
    // also has that attribute on the same argument, and the same value is
    // passed.
    if (i->hasInRegAttr()) {
      unsigned ArgIdx = i - CallerF.arg_begin();
      if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
        return false;
      AttributeSet Attrs = CLI.CB->getParamAttributes(ArgIdx);
      if (!Attrs.hasAttribute(Attribute::InReg) ||
          !Attrs.hasAttribute(Attribute::StructRet) || !i->hasStructRetAttr() ||
          CLI.CB->getArgOperand(ArgIdx) != i) {
        return false;
      }
    }
  }

  // Under guaranteed TCO the only remaining requirement is matching CCs.
  if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
    return CCMatch;

  // Externally-defined functions with weak linkage should not be
  // tail-called on AArch64 when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    const Triple &TT = getTargetMachine().getTargetTriple();
    if (GV->hasExternalWeakLinkage() &&
        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
      return false;
  }

  // Now we search for cases where we can use a tail call without changing the
  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
  // concept.

  // I want anyone implementing a new calling convention to think long and hard
  // about this assert.
  if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
    report_fatal_error("Unsupported variadic calling convention");

  LLVMContext &C = *DAG.getContext();
  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  CCAssignFnForCall(CalleeCC, IsVarArg),
                                  CCAssignFnForCall(CallerCC, IsVarArg)))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (Subtarget->hasCustomCallingConv()) {
      TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
      TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
    }
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // Nothing more to check if the callee is taking no arguments
  if (Outs.empty())
    return true;

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);

  analyzeCallOperands(*this, Subtarget, CLI, CCInfo);

  if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
    // When we are musttail, additional checks have been done and we can safely ignore this check
    // At least two cases here: if caller is fastcc then we can't have any
    // memory arguments (we'd be expected to clean up the stack afterwards). If
    // caller is C then we could potentially use its argument area.

    // FIXME: for now we take the most conservative of these in both cases:
    // disallow all variadic memory operands.
    for (const CCValAssign &ArgLoc : ArgLocs)
      if (!ArgLoc.isRegLoc())
        return false;
  }

  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  // If any of the arguments is passed indirectly, it must be SVE, so the
  // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
  // allocate space on the stack. That is why we determine this explicitly here
  // the call cannot be a tailcall.
  if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
        assert((A.getLocInfo() != CCValAssign::Indirect ||
                A.getValVT().isScalableVector() ||
                Subtarget->isWindowsArm64EC()) &&
               "Expected value to be scalable");
        return A.getLocInfo() == CCValAssign::Indirect;
      }))
    return false;

  // If the stack arguments for this call do not fit into our own save area then
  // the call cannot be made tail.
  if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
    return false;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
    return false;

  return true;
}
9541
9542SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
9543 SelectionDAG &DAG,
9544 MachineFrameInfo &MFI,
9545 int ClobberedFI) const {
9546 SmallVector<SDValue, 8> ArgChains;
9547 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
9548 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
9549
9550 // Include the original chain at the beginning of the list. When this is
9551 // used by target LowerCall hooks, this helps legalize find the
9552 // CALLSEQ_BEGIN node.
9553 ArgChains.push_back(Chain);
9554
9555 // Add a chain value for each stack argument corresponding
9556 for (SDNode *U : DAG.getEntryNode().getNode()->users())
9557 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
9558 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
9559 if (FI->getIndex() < 0) {
9560 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
9561 int64_t InLastByte = InFirstByte;
9562 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
9563
9564 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
9565 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
9566 ArgChains.push_back(SDValue(L, 1));
9567 }
9568
9569 // Build a tokenfactor for all the chains.
9570 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
9571}
9572
9573bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
9574 bool TailCallOpt) const {
9575 return (CallCC == CallingConv::Fast && TailCallOpt) ||
9576 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
9577}
9578
9579// Check if the value is zero-extended from i1 to i8
9580static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
9581 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
9582 if (SizeInBits < 8)
9583 return false;
9584
9585 APInt RequiredZero(SizeInBits, 0xFE);
9586 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
9587 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
9588 return ZExtBool;
9589}
9590
// Post-isel fixups for SME streaming-mode-change pseudos and for
// ADDXri/SUBXri that materialise scalable-frame addresses.
void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                          SDNode *Node) const {
  // Live-in physreg copies that are glued to SMSTART are applied as
  // implicit-def's in the InstrEmitter. Here we remove them, allowing the
  // register allocator to pass call args in callee saved regs, without extra
  // copies to avoid these fake clobbers of actually-preserved GPRs.
  if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
      MI.getOpcode() == AArch64::MSRpstatePseudo) {
    // Walk operands backwards so removeOperand doesn't shift unvisited
    // indices; operand 0 (the SVCR immediate) is never removed.
    for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
      if (MachineOperand &MO = MI.getOperand(I);
          MO.isReg() && MO.isImplicit() && MO.isDef() &&
          (AArch64::GPR32RegClass.contains(MO.getReg()) ||
           AArch64::GPR64RegClass.contains(MO.getReg())))
        MI.removeOperand(I);

    // The SVE vector length can change when entering/leaving streaming mode.
    // FPMR is set to 0 when entering/leaving streaming mode.
    if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
        MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
      // Model VG as both used and redefined, and FPMR as clobbered.
      MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
                                              /*IsImplicit=*/true));
      MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
                                              /*IsImplicit=*/true));
      MI.addOperand(MachineOperand::CreateReg(AArch64::FPMR, /*IsDef=*/true,
                                              /*IsImplicit=*/true));
    }
  }

  // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
  // have nothing to do with VG, were it not that they are used to materialise a
  // frame-address. If they contain a frame-index to a scalable vector, this
  // will likely require an ADDVL instruction to materialise the address, thus
  // reading VG.
  const MachineFunction &MF = *MI.getMF();
  if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
      (MI.getOpcode() == AArch64::ADDXri ||
       MI.getOpcode() == AArch64::SUBXri)) {
    const MachineOperand &MO = MI.getOperand(1);
    if (MO.isFI() && MF.getFrameInfo().hasScalableStackID(MO.getIndex()))
      MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
                                              /*IsImplicit=*/true));
  }
}
9634
9636 SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
9637 unsigned Condition, bool InsertVectorLengthCheck) const {
9640 FuncInfo->setHasStreamingModeChanges(true);
9641
9642 auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
9643 SmallVector<SDValue, 2> Ops = {Chain};
9644 if (InGlue)
9645 Ops.push_back(InGlue);
9646 return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
9647 DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9648 };
9649
9650 if (InsertVectorLengthCheck && Enable) {
9651 // Non-streaming -> Streaming
9652 // Insert vector length check before smstart
9653 SDValue CheckVL = GetCheckVL(Chain, InGlue);
9654 Chain = CheckVL.getValue(0);
9655 InGlue = CheckVL.getValue(1);
9656 }
9657
9658 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9659 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
9660 SDValue MSROp =
9661 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
9662 SmallVector<SDValue> Ops = {Chain, MSROp};
9663 unsigned Opcode;
9664 if (Condition != AArch64SME::Always) {
9665 Register PStateReg = FuncInfo->getPStateSMReg();
9666 assert(PStateReg.isValid() && "PStateSM Register is invalid");
9667 SDValue PStateSM =
9668 DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
9669 // Use chain and glue from the CopyFromReg.
9670 Ops[0] = PStateSM.getValue(1);
9671 InGlue = PStateSM.getValue(2);
9672 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
9673 Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
9674 Ops.push_back(ConditionOp);
9675 Ops.push_back(PStateSM);
9676 } else {
9677 Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
9678 }
9679 Ops.push_back(RegMask);
9680
9681 if (InGlue)
9682 Ops.push_back(InGlue);
9683
9684 SDValue SMChange =
9685 DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9686
9687 if (!InsertVectorLengthCheck || Enable)
9688 return SMChange;
9689
9690 // Streaming -> Non-streaming
9691 // Insert vector length check after smstop since we cannot read VL
9692 // in streaming mode
9693 return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
9694}
9695
  // A caller with a fixed (non-streaming-compatible) interface, or with a
  // streaming body, has a statically-known streaming mode at the call site,
  // so the mode switch is unconditional.
  if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
      CallAttrs.caller().hasStreamingBody())
    return AArch64SME::Always;
  // Streaming-compatible caller: condition the switch on the callee's
  // declared interface.
  if (CallAttrs.callee().hasNonStreamingInterface())
  if (CallAttrs.callee().hasStreamingInterface())

  llvm_unreachable("Unsupported attributes");
}
9708
/// Check whether a stack argument requires lowering in a tail call.
/// Returns false only when the outgoing value is provably already sitting in
/// the exact immutable caller stack slot (same offset, same size) that the
/// callee will read, so the store can be elided.
                                         const CCValAssign &VA, SDValue Arg,
                                         ISD::ArgFlagsTy Flags, int CallOffset) {
  // FIXME: We should be able to handle this case, but it's not clear how to.
  if (Flags.isZExt() || Flags.isSExt())
    return true;

  for (;;) {
    // Look through nodes that don't alter the bits of the incoming value.
    unsigned Op = Arg.getOpcode();
    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
        Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
      Arg = Arg.getOperand(0);
      continue;
    }
    break;
  }

  // If the argument is a load from the same immutable stack slot, we can reuse
  // it.
  if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
    if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
      const MachineFrameInfo &MFI = MF.getFrameInfo();
      int FI = FINode->getIndex();
      // A mutable slot may have been overwritten since the load.
      if (!MFI.isImmutableObjectIndex(FI))
        return true;
      // Must be the exact slot the outgoing argument would occupy.
      if (CallOffset != MFI.getObjectOffset(FI))
        return true;
      // And the loaded width must match the argument's value type.
      uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
      if (SizeInBits != VA.getValVT().getSizeInBits())
        return true;
      return false;
    }
  }

  // Conservatively assume the store is needed.
  return true;
}
9747
9748/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
9749/// and add input and output parameter nodes.
///
/// Beyond generic argument marshalling this handles the AArch64 specifics
/// visible below: tail/sibling-call eligibility and stack-slot reuse,
/// Arm64EC vararg conventions (x4/x5), pointer-authenticated calls,
/// objc.arc.attachedcall markers, and SME state transitions around the call
/// (streaming-mode changes, lazy ZA save, ZT0 spill/restore).
9750SDValue
9751AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
9752 SmallVectorImpl<SDValue> &InVals) const {
9753 SelectionDAG &DAG = CLI.DAG;
9754 SDLoc &DL = CLI.DL;
9755 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9756 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9757 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9758 SDValue Chain = CLI.Chain;
9759 SDValue Callee = CLI.Callee;
9760 bool &IsTailCall = CLI.IsTailCall;
9761 CallingConv::ID &CallConv = CLI.CallConv;
9762 bool IsVarArg = CLI.IsVarArg;
9763 const CallBase *CB = CLI.CB;
9764
9765 MachineFunction &MF = DAG.getMachineFunction();
9766 MachineFunction::CallSiteInfo CSInfo;
9767 bool IsThisReturn = false;
9768
9769 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9770 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9771 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
9772 bool IsSibCall = false;
9773 bool GuardWithBTI = false;
9774
 // A returns_twice callee (e.g. setjmp) can land back here via an indirect
 // jump, so the call site may need to be a valid BTI target.
9775 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
9776 !Subtarget->noBTIAtReturnTwice()) {
9777 GuardWithBTI = FuncInfo->branchTargetEnforcement();
9778 }
9779
9780 // Analyze operands of the call, assigning locations to each operand.
9782 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
9783
9784 if (IsVarArg) {
9785 unsigned NumArgs = Outs.size();
9786
9787 for (unsigned i = 0; i != NumArgs; ++i) {
9788 if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
9789 report_fatal_error("Passing SVE types to variadic functions is "
9790 "currently not supported");
9791 }
9792 }
9793
9794 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9795
9796 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9797 // Assign locations to each value returned by this call.
9799 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
9800 *DAG.getContext());
9801 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
9802
9803 // Set type id for call site info.
9804 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
9805 CSInfo = MachineFunction::CallSiteInfo(*CB);
9806
9807 // Check callee args/returns for SVE registers and set calling convention
9808 // accordingly.
9809 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
9810 auto HasSVERegLoc = [](CCValAssign &Loc) {
9811 if (!Loc.isRegLoc())
9812 return false;
9813 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
9814 AArch64::PPRRegClass.contains(Loc.getLocReg());
9815 };
9816 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
9818 }
9819
9820 // Determine whether we need any streaming mode changes.
9821 SMECallAttrs CallAttrs =
9823
 // Under the new SME ABI lowering, ZA bookkeeping is done by a later pass;
 // here we only mark the call sequence with the kind of ZA handling needed.
9824 std::optional<unsigned> ZAMarkerNode;
9825 bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
9826
9827 if (UseNewSMEABILowering) {
9828 if (CallAttrs.requiresLazySave() ||
9829 CallAttrs.requiresPreservingAllZAState())
9830 ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
9831 else if (CallAttrs.requiresPreservingZT0())
9832 ZAMarkerNode = AArch64ISD::REQUIRES_ZT0_SAVE;
9833 else if (CallAttrs.caller().hasZAState() ||
9834 CallAttrs.caller().hasZT0State())
9835 ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
9836 }
9837
9838 if (IsTailCall) {
9839 // Check if it's really possible to do a tail call.
9840 IsTailCall = isEligibleForTailCallOptimization(CLI);
9841
9842 // A sibling call is one where we're under the usual C ABI and not planning
9843 // to change that but can still do a tail call:
9844 if (!ZAMarkerNode && !TailCallOpt && IsTailCall &&
9845 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
9846 IsSibCall = true;
9847
9848 if (IsTailCall)
9849 ++NumTailCalls;
9850 }
9851
9852 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9853 report_fatal_error("failed to perform tail call elimination on a call "
9854 "site marked musttail");
9855
9856 // Get a count of how many bytes are to be pushed on the stack.
9857 unsigned NumBytes = CCInfo.getStackSize();
9858
9859 if (IsSibCall) {
9860 // Since we're not changing the ABI to make this a tail call, the memory
9861 // operands are already available in the caller's incoming argument space.
9862 NumBytes = 0;
9863 }
9864
9865 // FPDiff is the byte offset of the call's argument area from the callee's.
9866 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9867 // by this amount for a tail call. In a sibling call it must be 0 because the
9868 // caller will deallocate the entire stack and the callee still expects its
9869 // arguments to begin at SP+0. Completely unused for non-tail calls.
9870 int FPDiff = 0;
9871
9872 if (IsTailCall && !IsSibCall) {
9873 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9874
9875 // Since callee will pop argument stack as a tail call, we must keep the
9876 // popped size 16-byte aligned.
9877 NumBytes = alignTo(NumBytes, 16);
9878
9879 // FPDiff will be negative if this tail call requires more space than we
9880 // would automatically have in our incoming argument space. Positive if we
9881 // can actually shrink the stack.
9882 FPDiff = NumReusableBytes - NumBytes;
9883
9884 // Update the required reserved area if this is the tail call requiring the
9885 // most argument stack space.
9886 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9887 FuncInfo->setTailCallReservedStack(-FPDiff);
9888
9889 // The stack pointer must be 16-byte aligned at all times it's used for a
9890 // memory operation, which in practice means at *all* times and in
9891 // particular across call boundaries. Therefore our own arguments started at
9892 // a 16-byte aligned SP and the delta applied for the tail call should
9893 // satisfy the same constraint.
9894 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9895 }
9896
 // Helper used by the SME optimization remarks below to name the call site.
9897 auto DescribeCallsite =
9898 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9899 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9900 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9901 R << ore::NV("Callee", ES->getSymbol());
9902 else if (CLI.CB && CLI.CB->getCalledFunction())
9903 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9904 else
9905 R << "unknown callee";
9906 R << "'";
9907 return R;
9908 };
9909
9910 bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
9911 bool RequiresSaveAllZA =
9912 !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
9913 if (RequiresLazySave) {
 // Arm the lazy save: point TPIDR2_EL0 at the TPIDR2 block so the callee
 // (or a signal handler) can spill ZA if it needs the register.
9914 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9915 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9916 TPIDR2.FrameIndex,
9918 Chain = DAG.getNode(
9919 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9920 DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9921 TPIDR2ObjAddr);
9922 OptimizationRemarkEmitter ORE(&MF.getFunction());
9923 ORE.emit([&]() {
9924 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9925 CLI.CB)
9926 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9927 &MF.getFunction());
9928 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9929 });
9930 } else if (RequiresSaveAllZA) {
9931 assert(!CallAttrs.callee().hasSharedZAInterface() &&
9932 "Cannot share state that may not exist");
9933 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9934 /*IsSave=*/true);
9935 }
9936
9937 bool RequiresSMChange = CallAttrs.requiresSMChange();
9938 if (RequiresSMChange) {
9939 OptimizationRemarkEmitter ORE(&MF.getFunction());
9940 ORE.emit([&]() {
9941 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9942 CLI.CB)
9943 : OptimizationRemarkAnalysis("sme", "SMETransition",
9944 &MF.getFunction());
9945 DescribeCallsite(R) << " requires a streaming mode transition";
9946 return R;
9947 });
9948 }
9949
9950 SDValue ZTFrameIdx;
9951 MachineFrameInfo &MFI = MF.getFrameInfo();
9952 bool ShouldPreserveZT0 =
9953 !UseNewSMEABILowering && CallAttrs.requiresPreservingZT0();
9954
9955 // If the caller has ZT0 state which will not be preserved by the callee,
9956 // spill ZT0 before the call.
9957 if (ShouldPreserveZT0) {
9958 ZTFrameIdx = getZT0FrameIndex(MFI, *FuncInfo, DAG);
9959
9960 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9961 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9962 }
9963
9964 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
9965 // PSTATE.ZA before the call if there is no lazy-save active.
9966 bool DisableZA =
9967 !UseNewSMEABILowering && CallAttrs.requiresDisablingZABeforeCall();
9968 assert((!DisableZA || !RequiresLazySave) &&
9969 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9970
9971 if (DisableZA)
9972 Chain = DAG.getNode(
9973 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
9974 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
9975
9976 // Adjust the stack pointer for the new arguments... and mark ZA uses.
9977 // These operations are automatically eliminated by the prolog/epilog pass
9978 assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START");
9979 if (!IsSibCall) {
9980 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9981 if (ZAMarkerNode) {
9982 // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to, simply
9983 // using a chain can result in incorrect scheduling. The markers refer to
9984 // the position just before the CALLSEQ_START (though occur after as
9985 // CALLSEQ_START lacks in-glue).
9986 Chain =
9987 DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other, MVT::Glue),
9988 {Chain, Chain.getValue(1)});
9989 }
9990 }
9991
9992 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9994
9996 SmallSet<unsigned, 8> RegsUsed;
9997 SmallVector<SDValue, 8> MemOpChains;
9998 auto PtrVT = getPointerTy(DAG.getDataLayout());
9999
 // musttail varargs: forward the registers the caller saved for us.
10000 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
10001 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
10002 for (const auto &F : Forwards) {
10003 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
10004 RegsToPass.emplace_back(F.PReg, Val);
10005 }
10006 }
10007
10008 // Walk the register/memloc assignments, inserting copies/loads.
 // ExtraArgLocs counts OutVals consumed by the extra parts of multi-part
 // indirect (consecutive-register) arguments, keeping the ArgLocs index in
 // sync with the Outs/OutVals index i.
10009 unsigned ExtraArgLocs = 0;
10010 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
10011 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
10012 SDValue Arg = OutVals[i];
10013 ISD::ArgFlagsTy Flags = Outs[i].Flags;
10014
10015 // Promote the value if needed.
10016 switch (VA.getLocInfo()) {
10017 default:
10018 llvm_unreachable("Unknown loc info!");
10019 case CCValAssign::Full:
10020 break;
10021 case CCValAssign::SExt:
10022 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
10023 break;
10024 case CCValAssign::ZExt:
10025 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
10026 break;
10027 case CCValAssign::AExt:
10028 if (Outs[i].ArgVT == MVT::i1) {
10029 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
10030 //
10031 // Check if we actually have to do this, because the value may
10032 // already be zero-extended.
10033 //
10034 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
10035 // and rely on DAGCombiner to fold this, because the following
10036 // (anyext i32) is combined with (zext i8) in DAG.getNode:
10037 //
10038 // (ext (zext x)) -> (zext x)
10039 //
10040 // This will give us (zext i32), which we cannot remove, so
10041 // try to check this beforehand.
10042 if (!checkZExtBool(Arg, DAG)) {
10043 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
10044 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
10045 }
10046 }
10047 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
10048 break;
10050 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
10051 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
10052 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
10053 DAG.getConstant(32, DL, VA.getLocVT()));
10054 break;
10055 case CCValAssign::BCvt:
10056 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
10057 break;
10058 case CCValAssign::Trunc:
10059 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10060 break;
10061 case CCValAssign::FPExt:
10062 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
10063 break;
 // Indirect: materialize the value in a stack slot and pass its address.
10065 bool isScalable = VA.getValVT().isScalableVT();
10066 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
10067 "Indirect arguments should be scalable on most subtargets");
10068
10069 TypeSize StoreSize = VA.getValVT().getStoreSize();
10070 TypeSize PartSize = StoreSize;
10071 unsigned NumParts = 1;
10072 if (Outs[i].Flags.isInConsecutiveRegs()) {
10073 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
10074 ++NumParts;
10075 StoreSize *= NumParts;
10076 }
10077
10078 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
10079 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
10080 MachineFrameInfo &MFI = MF.getFrameInfo();
10081 int FI =
10082 MFI.CreateStackObject(StoreSize.getKnownMinValue(), Alignment, false);
10083 if (isScalable) {
10084 bool IsPred = VA.getValVT() == MVT::aarch64svcount ||
10085 VA.getValVT().getVectorElementType() == MVT::i1;
10088 }
10089
10090 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
10091 SDValue Ptr = DAG.getFrameIndex(
10093 SDValue SpillSlot = Ptr;
10094
10095 // Ensure we generate all stores for each tuple part, whilst updating the
10096 // pointer after each store correctly using vscale.
10097 while (NumParts) {
10098 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
10099 MemOpChains.push_back(Store);
10100
10101 NumParts--;
10102 if (NumParts > 0) {
10103 SDValue BytesIncrement =
10104 DAG.getTypeSize(DL, Ptr.getValueType(), PartSize);
10105 MPI = MachinePointerInfo(MPI.getAddrSpace());
10106 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
10107 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
10108 ExtraArgLocs++;
10109 i++;
10110 }
10111 }
10112
10113 Arg = SpillSlot;
10114 break;
10115 }
10116
10117 if (VA.isRegLoc()) {
 // 'returned' on the first i64 argument: remember it so the call can use
 // an X0-preserving regmask and forward OutVals[0] as the call result.
10118 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
10119 Outs[0].VT == MVT::i64) {
10120 assert(VA.getLocVT() == MVT::i64 &&
10121 "unexpected calling convention register assignment");
10122 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
10123 "unexpected use of 'returned'");
10124 IsThisReturn = true;
10125 }
10126 if (RegsUsed.count(VA.getLocReg())) {
10127 // If this register has already been used then we're trying to pack
10128 // parts of an [N x i32] into an X-register. The extension type will
10129 // take care of putting the two halves in the right place but we have to
10130 // combine them.
10131 SDValue &Bits =
10132 llvm::find_if(RegsToPass,
10133 [=](const std::pair<unsigned, SDValue> &Elt) {
10134 return Elt.first == VA.getLocReg();
10135 })
10136 ->second;
10137 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10138 // Call site info is used for function's parameter entry value
10139 // tracking. For now we track only simple cases when parameter
10140 // is transferred through whole register.
10142 [&VA](MachineFunction::ArgRegPair ArgReg) {
10143 return ArgReg.Reg == VA.getLocReg();
10144 });
10145 } else {
10146 // Add an extra level of indirection for streaming mode changes by
10147 // using a pseudo copy node that cannot be rematerialised between a
10148 // smstart/smstop and the call by the simple register coalescer.
10149 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
10150 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10151 DAG.getVTList(Arg.getValueType(), MVT::Glue), Arg);
10152 RegsToPass.emplace_back(VA.getLocReg(), Arg);
10153 RegsUsed.insert(VA.getLocReg());
10154 const TargetOptions &Options = DAG.getTarget().Options;
10155 if (Options.EmitCallSiteInfo)
10156 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
10157 }
10158 } else {
10159 assert(VA.isMemLoc());
10160
10161 SDValue DstAddr;
10162 MachinePointerInfo DstInfo;
10163
10164 // FIXME: This works on big-endian for composite byvals, which are the
10165 // common case. It should also work for fundamental types too.
10166 uint32_t BEAlign = 0;
10167 unsigned OpSize;
10168 if (VA.getLocInfo() == CCValAssign::Indirect ||
10170 OpSize = VA.getLocVT().getFixedSizeInBits();
10171 else
10172 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
10173 : VA.getValVT().getSizeInBits();
10174 OpSize = (OpSize + 7) / 8;
10175 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
10176 !Flags.isInConsecutiveRegs()) {
10177 if (OpSize < 8)
10178 BEAlign = 8 - OpSize;
10179 }
10180 unsigned LocMemOffset = VA.getLocMemOffset();
10181 int32_t Offset = LocMemOffset + BEAlign;
10182
10183 if (IsTailCall) {
10184 // When the frame pointer is perfectly aligned for the tail call and the
10185 // same stack argument is passed down intact, we can reuse it.
10186 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
10187 continue;
10188
10189 Offset = Offset + FPDiff;
10190 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
10191
10192 DstAddr = DAG.getFrameIndex(FI, PtrVT);
10193 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
10194
10195 // Make sure any stack arguments overlapping with where we're storing
10196 // are loaded before this eventual operation. Otherwise they'll be
10197 // clobbered.
10198 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
10199 } else {
10200 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
10201
10202 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
10203 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
10204 }
10205
10206 if (Outs[i].Flags.isByVal()) {
10207 SDValue SizeNode =
10208 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
10209 SDValue Cpy = DAG.getMemcpy(
10210 Chain, DL, DstAddr, Arg, SizeNode,
10211 Outs[i].Flags.getNonZeroByValAlign(),
10212 /*isVol = */ false, /*AlwaysInline = */ false,
10213 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
10214
10215 MemOpChains.push_back(Cpy);
10216 } else {
10217 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
10218 // promoted to a legal register type i32, we should truncate Arg back to
10219 // i1/i8/i16.
10220 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
10221 VA.getValVT() == MVT::i16)
10222 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
10223
10224 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
10225 MemOpChains.push_back(Store);
10226 }
10227 }
10228 }
10229
10230 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
10231 !(CLI.CB && CLI.CB->isMustTailCall())) {
10232 SDValue ParamPtr = StackPtr;
10233 if (IsTailCall) {
10234 // Create a dummy object at the top of the stack that can be used to get
10235 // the SP after the epilogue
10236 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
10237 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
10238 }
10239
10240 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
10241 // describing the argument list. x4 contains the address of the
10242 // first stack parameter. x5 contains the size in bytes of all parameters
10243 // passed on the stack.
10244 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
10245 RegsToPass.emplace_back(AArch64::X5,
10246 DAG.getConstant(NumBytes, DL, MVT::i64));
10247 }
10248
10249 if (!MemOpChains.empty())
10250 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
10251
 // Emit the conditional SMSTART/SMSTOP for the callee's streaming interface
 // before the outgoing register copies below.
10252 SDValue InGlue;
10253 if (RequiresSMChange) {
10254 bool InsertVectorLengthCheck =
10256 Chain = changeStreamingMode(
10257 DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
10258 getSMToggleCondition(CallAttrs), InsertVectorLengthCheck);
10259 InGlue = Chain.getValue(1);
10260 }
10261
10262 // Build a sequence of copy-to-reg nodes chained together with token chain
10263 // and flag operands which copy the outgoing args into the appropriate regs.
10264 for (auto &RegToPass : RegsToPass) {
10265 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
10266 RegToPass.second, InGlue);
10267 InGlue = Chain.getValue(1);
10268 }
10269
10270 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
10271 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
10272 // node so that legalize doesn't hack it.
10273 const GlobalValue *CalledGlobal = nullptr;
10274 unsigned OpFlags = 0;
10275 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
10276 CalledGlobal = G->getGlobal();
10277 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
10279 if (OpFlags & AArch64II::MO_GOT) {
10280 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
10281 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
10282 } else {
10283 const GlobalValue *GV = G->getGlobal();
10284 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
10285 }
10286 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
10287 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
10288 Subtarget->isTargetMachO()) ||
10290 const char *Sym = S->getSymbol();
10291 if (UseGot) {
10293 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
10294 } else {
10295 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
10296 }
10297 }
10298
10299 // We don't usually want to end the call-sequence here because we would tidy
10300 // the frame up *after* the call, however in the ABI-changing tail-call case
10301 // we've carefully laid out the parameters so that when sp is reset they'll be
10302 // in the correct location.
10303 if (IsTailCall && !IsSibCall) {
10304 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
10305 InGlue = Chain.getValue(1);
10306 }
10307
10308 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
10309
10310 std::vector<SDValue> Ops;
10311 Ops.push_back(Chain);
10312 Ops.push_back(Callee);
10313
10314 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
10315 // be expanded to the call, directly followed by a special marker sequence and
10316 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
10317 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
10318 assert(!IsTailCall &&
10319 "tail calls cannot be marked with clang.arc.attachedcall");
10320 Opc = AArch64ISD::CALL_RVMARKER;
10321
10322 // Add a target global address for the retainRV/claimRV runtime function
10323 // just before the call target.
10324 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
10325 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
10326 Ops.insert(Ops.begin() + 1, GA);
10327
10328 // We may or may not need to emit both the marker and the retain/claim call.
10329 // Tell the pseudo expansion using an additional boolean op.
10330 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CLI.CB);
10331 SDValue DoEmitMarker =
10332 DAG.getTargetConstant(ShouldEmitMarker, DL, MVT::i32);
10333 Ops.insert(Ops.begin() + 2, DoEmitMarker);
10334 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10335 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
10336 } else if (GuardWithBTI) {
10337 Opc = AArch64ISD::CALL_BTI;
10338 }
10339
10340 if (IsTailCall) {
10341 // Each tail call may have to adjust the stack by a different amount, so
10342 // this information must travel along with the operation for eventual
10343 // consumption by emitEpilogue.
10344 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
10345 }
10346
 // Pointer-authenticated call: append key and discriminator operands.
10347 if (CLI.PAI) {
10348 const uint64_t Key = CLI.PAI->Key;
10350 "Invalid auth call key");
10351
10352 // Split the discriminator into address/integer components.
10353 SDValue AddrDisc, IntDisc;
10354 std::tie(IntDisc, AddrDisc) =
10355 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
10356
10357 if (Opc == AArch64ISD::CALL_RVMARKER)
10358 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
10359 else
10360 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
10361 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
10362 Ops.push_back(IntDisc);
10363 Ops.push_back(AddrDisc);
10364 }
10365
10366 // Add argument registers to the end of the list so that they are known live
10367 // into the call.
10368 for (auto &RegToPass : RegsToPass)
10369 Ops.push_back(DAG.getRegister(RegToPass.first,
10370 RegToPass.second.getValueType()));
10371
10372 // Add a register mask operand representing the call-preserved registers.
10373 const uint32_t *Mask;
10374 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10375 if (IsThisReturn) {
10376 // For 'this' returns, use the X0-preserving mask if applicable
10377 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
10378 if (!Mask) {
10379 IsThisReturn = false;
10380 Mask = TRI->getCallPreservedMask(MF, CallConv);
10381 }
10382 } else
10383 Mask = TRI->getCallPreservedMask(MF, CallConv);
10384
10385 if (Subtarget->hasCustomCallingConv())
10386 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
10387
10388 if (TRI->isAnyArgRegReserved(MF))
10389 TRI->emitReservedArgRegCallError(MF);
10390
10391 assert(Mask && "Missing call preserved mask for calling convention");
10392 Ops.push_back(DAG.getRegisterMask(Mask));
10393
10394 if (InGlue.getNode())
10395 Ops.push_back(InGlue);
10396
10397 if (CLI.DeactivationSymbol)
10398 Ops.push_back(DAG.getDeactivationSymbol(CLI.DeactivationSymbol));
10399
10400 // If we're doing a tail call, use a TC_RETURN here rather than an
10401 // actual call instruction.
10402 if (IsTailCall) {
10404 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
10405 if (IsCFICall)
10406 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10407
10408 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
10409 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
10410 if (CalledGlobal &&
10411 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10412 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
10413 return Ret;
10414 }
10415
10416 // Returns a chain and a flag for retval copy to use.
10417 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
10418 if (IsCFICall)
10419 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10420
10421 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
10422 InGlue = Chain.getValue(1);
10423 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
10424 if (CalledGlobal &&
10425 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10426 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
10427
10428 uint64_t CalleePopBytes =
10429 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
10430
10431 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
10432 InGlue = Chain.getValue(1);
10433
10434 // Handle result values, copying them out of physregs into vregs that we
10435 // return.
10436 SDValue Result = LowerCallResult(
10437 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
10438 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
10439
 // When results were copied out, LowerCallResult's last value is the glue
 // needed by the SMSTART/SMSTOP below.
10440 if (!Ins.empty())
10441 InGlue = Result.getValue(Result->getNumValues() - 1);
10442
 // Switch back to the caller's streaming mode after the call.
10443 if (RequiresSMChange) {
10445 DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
10446 getSMToggleCondition(CallAttrs));
10447 }
10448
10449 if (!UseNewSMEABILowering &&
10450 (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall()))
10451 // Unconditionally resume ZA.
10452 Result = DAG.getNode(
10453 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
10454 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
10455
10456 if (ShouldPreserveZT0)
10457 Result =
10458 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
10459 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
10460
10461 if (RequiresLazySave) {
10462 Result = emitRestoreZALazySave(Result, DL, *this, *TRI, *FuncInfo, DAG);
10463 } else if (RequiresSaveAllZA) {
10464 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
10465 /*IsSave=*/false);
10466 }
10467
10468 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
10469 RequiresSaveAllZA) {
10470 for (unsigned I = 0; I < InVals.size(); ++I) {
10471 // The smstart/smstop is chained as part of the call, but when the
10472 // resulting chain is discarded (which happens when the call is not part
10473 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
10474 // smstart/smstop is chained to the result value. We can do that by doing
10475 // a vreg -> vreg copy.
10478 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
10479 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
10480 InVals[I].getValueType());
10481 }
10482 }
10483
 // preserve_none forbids Swift special-register arguments; diagnose rather
 // than silently miscompile.
10484 if (CallConv == CallingConv::PreserveNone) {
10485 for (const ISD::OutputArg &O : Outs) {
10486 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
10487 O.Flags.isSwiftAsync()) {
10488 MachineFunction &MF = DAG.getMachineFunction();
10489 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10490 MF.getFunction(),
10491 "Swift attributes can't be used with preserve_none",
10492 DL.getDebugLoc()));
10493 break;
10494 }
10495 }
10496 }
10497
10498 return Result;
10499}
10500
/// Query (without emitting any code) whether every value in Outs can be
/// assigned a return location by the convention's return CC; callers use a
/// negative answer to demote the return to an sret argument.
10501bool AArch64TargetLowering::CanLowerReturn(
10502 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
10504 const Type *RetTy) const {
10505 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10507 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
10508 return CCInfo.CheckReturn(Outs, RetCC);
10509}
10510
/// Lower an IR 'ret': run the return-value calling convention over the
/// outgoing values, copy each value into its assigned physical return
/// register (threading chain/glue through the copies), and emit the final
/// RET_GLUE node — or TC_RETURN for ARM64EC entry thunks.
SDValue
AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                   bool isVarArg,
                                   const SmallVectorImpl<SDValue> &OutVals,
                                   const SDLoc &DL, SelectionDAG &DAG) const {
  auto &MF = DAG.getMachineFunction();
  auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  // Assign every outgoing value a location; returns are register-only
  // (asserted below for each location).
  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC);

  // Copy the result values into the output registers.
  SDValue Glue;
  SmallSet<unsigned, 4> RegsUsed;
  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue Arg = OutVals[realRVLocIdx];

    // Adjust the value to the type/encoding its register location expects.
    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      if (Outs[i].ArgVT == MVT::i1) {
        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
        // value. This is strictly redundant on Darwin (which uses "zeroext
        // i1"), but will be optimised out before ISel.
        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      }
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
    case CCValAssign::ZExt:
      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
      break;
      // Value lives in the upper 32 bits of a 64-bit location: extend, then
      // shift into place (see the assert's "32 -> 64 upper bits").
      assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
      Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
                        DAG.getConstant(32, DL, VA.getLocVT()));
      break;
    }

    // If two values were assigned the same physical register, OR the new
    // bits into the value already staged for that register.
    if (RegsUsed.count(VA.getLocReg())) {
      SDValue &Bits =
          llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
            return Elt.first == VA.getLocReg();
          })->second;
      Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
    } else {
      RetVals.emplace_back(VA.getLocReg(), Arg);
      RegsUsed.insert(VA.getLocReg());
    }
  }

  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();

  // Emit SMSTOP before returning from a locally streaming function
  SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
  if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
    if (FuncAttrs.hasStreamingCompatibleInterface())
      Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
                                  /*Glue*/ SDValue(),
    else
      Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
                                  /*Glue*/ SDValue(), AArch64SME::Always);
    Glue = Chain.getValue(1);
  }

  SmallVector<SDValue, 4> RetOps(1, Chain);
  for (auto &RetVal : RetVals) {
    // NOTE(review): COALESCER_BARRIER appears to pin FPR return values of a
    // locally streaming function across the SMSTOP above — confirm intent.
    if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
        isPassedInFPR(RetVal.second.getValueType()))
      RetVal.second =
          DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
                      DAG.getVTList(RetVal.second.getValueType(), MVT::Glue),
                      RetVal.second);
    Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
    Glue = Chain.getValue(1);
    RetOps.push_back(
        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
  }

  // Windows AArch64 ABIs require that for returning structs by value we copy
  // the sret argument into X0 for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into X0.
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
    SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,

    // ARM64EC x64 thunks return the sret pointer in X8, not X0.
    unsigned RetValReg = AArch64::X0;
    if (CallConv == CallingConv::ARM64EC_Thunk_X64)
      RetValReg = AArch64::X8;
    Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
    Glue = Chain.getValue(1);

    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
  }

  // Expose any callee-saved registers preserved "via copy" as explicit
  // return operands so the register allocator keeps them live.
  const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
  if (I) {
    for (; *I; ++I) {
      if (AArch64::GPR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else if (AArch64::FPR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the glue if we have it.
  if (Glue.getNode())
    RetOps.push_back(Glue);

  if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
    // ARM64EC entry thunks use a special return sequence: instead of a regular
    // "ret" instruction, they need to explicitly call the emulator.
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDValue Arm64ECRetDest =
        DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
    Arm64ECRetDest =
        getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
    Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
                                 MachinePointerInfo());
    RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
    RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
    return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
  }

  return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
}
10656
10657//===----------------------------------------------------------------------===//
10658// Other Lowering Code
10659//===----------------------------------------------------------------------===//
10660
10661SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
10662 SelectionDAG &DAG,
10663 unsigned Flag) const {
10664 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
10665 N->getOffset(), Flag);
10666}
10667
10668SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
10669 SelectionDAG &DAG,
10670 unsigned Flag) const {
10671 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
10672}
10673
10674SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
10675 SelectionDAG &DAG,
10676 unsigned Flag) const {
10677 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
10678 N->getOffset(), Flag);
10679}
10680
10681SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
10682 SelectionDAG &DAG,
10683 unsigned Flag) const {
10684 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
10685}
10686
10687SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
10688 SelectionDAG &DAG,
10689 unsigned Flag) const {
10690 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
10691}
10692
10693// (loadGOT sym)
10694template <class NodeTy>
10695SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
10696 unsigned Flags) const {
10697 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
10698 SDLoc DL(N);
10699 EVT Ty = getPointerTy(DAG.getDataLayout());
10700 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
10701 // FIXME: Once remat is capable of dealing with instructions with register
10702 // operands, expand this into two nodes instead of using a wrapper node.
10703 if (DAG.getMachineFunction()
10704 .getInfo<AArch64FunctionInfo>()
10705 ->hasELFSignedGOT())
10706 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
10707 0);
10708 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
10709}
10710
10711// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
10712template <class NodeTy>
10713SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
10714 unsigned Flags) const {
10715 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
10716 SDLoc DL(N);
10717 EVT Ty = getPointerTy(DAG.getDataLayout());
10718 const unsigned char MO_NC = AArch64II::MO_NC;
10719 return DAG.getNode(
10720 AArch64ISD::WrapperLarge, DL, Ty,
10721 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
10722 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
10723 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
10724 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
10725}
10726
10727// (addlow (adrp %hi(sym)) %lo(sym))
template <class NodeTy>
SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
                                       unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  // Small code model: ADRP materializes the symbol's 4KiB page address and
  // ADDlow adds the low page-offset bits.
  SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
  SDValue Lo = getTargetNode(N, Ty, DAG,
  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
  return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
}
10740
10741// (adr sym)
10742template <class NodeTy>
10743SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
10744 unsigned Flags) const {
10745 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
10746 SDLoc DL(N);
10747 EVT Ty = getPointerTy(DAG.getDataLayout());
10748 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
10749 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
10750}
10751
/// Lower a global address: either a GOT-indirect load or a direct
/// materialisation, depending on how the subtarget classifies the reference
/// and on the code model.
SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
                                                  SelectionDAG &DAG) const {
  GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GN->getGlobal();
  // Classification decides between GOT-indirect and direct addressing.
  unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());

  if (OpFlags != AArch64II::MO_NO_FLAG)
           "unexpected offset in global node");

  // This also catches the large code model case for Darwin, and tiny code
  // model with got relocations.
  if ((OpFlags & AArch64II::MO_GOT) != 0) {
    return getGOT(GN, DAG, OpFlags);
  }

    Result = getAddrLarge(GN, DAG, OpFlags);
  } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
    Result = getAddrTiny(GN, DAG, OpFlags);
  } else {
    Result = getAddr(GN, DAG, OpFlags);
  }
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(GN);
  Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
  return Result;
}
10784
10785/// Convert a TLS address reference into the correct sequence of loads
10786/// and calls to compute the variable's address (for Darwin, currently) and
10787/// return an SDValue containing the final node.
10788
10789/// Darwin only has one TLS scheme which must be capable of dealing with the
10790/// fully general situation, in the worst case. This means:
10791/// + "extern __thread" declaration.
10792/// + Defined in a possibly unknown dynamic library.
10793///
10794/// The general system is that each __thread variable has a [3 x i64] descriptor
10795/// which contains information used by the runtime to calculate the address. The
10796/// only part of this the compiler needs to know about is the first xword, which
10797/// contains a function pointer that must be called with the address of the
10798/// entire descriptor in "x0".
10799///
10800/// Since this descriptor may be in a different unit, in general even the
10801/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10802/// is:
10803/// adrp x0, _var@TLVPPAGE
10804/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10805/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10806/// ; the function pointer
10807/// blr x1 ; Uses descriptor address in x0
10808/// ; Address of _var is now in x0.
10809///
10810/// If the address of _var's descriptor *is* known to the linker, then it can
10811/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10812/// a slight efficiency gain.
SDValue
AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
                                                   SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");

  SDLoc DL(Op);
  MVT PtrVT = getPointerTy(DAG.getDataLayout());
  MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

  // Load the address of the variable's TLV descriptor from the GOT.
  SDValue TLVPAddr =
      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      PtrMemVT, DL, Chain, DescAddr,
      Align(PtrMemVT.getSizeInBits() / 8),
  Chain = FuncTLVGet.getValue(1);

  // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
  FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);

  // The call below may need stack adjustment around it.
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
  // silly).
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *Mask = TRI->getTLSCallPreservedMask();
  if (Subtarget->hasCustomCallingConv())
    TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);

  // Finally, we can make the call. This is just a degenerate version of a
  // normal AArch64 call node: x0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());

  unsigned Opcode = AArch64ISD::CALL;
  Ops.push_back(Chain);
  Ops.push_back(FuncTLVGet);

  // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
  if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
    Opcode = AArch64ISD::AUTH_CALL;
    Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
    Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
    Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
  }

  Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
  Ops.push_back(DAG.getRegisterMask(Mask));
  Ops.push_back(Chain.getValue(1));
  Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
  // The resolver returns the variable's address in X0.
  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
}
10876
10877/// Convert a thread-local variable reference into a sequence of instructions to
10878/// compute the variable's address for the local exec TLS model of ELF targets.
10879/// The sequence depends on the maximum TLS area size.
SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
                                                    SDValue ThreadBase,
                                                    const SDLoc &DL,
                                                    SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue TPOff, Addr;

  // TLSSize (bits of reachable TLS area) selects how many instructions are
  // needed to add the variable's tprel offset to the thread pointer.
  switch (DAG.getTarget().Options.TLSSize) {
  default:
    llvm_unreachable("Unexpected TLS size");

  case 12: {
    // Offset fits in one 12-bit immediate add.
    // mrs x0, TPIDR_EL0
    // add x0, x0, :tprel_lo12:a
    GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
                                      Var,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
  }

  case 24: {
    // Offset needs a hi12 + lo12 pair of adds.
    // mrs x0, TPIDR_EL0
    // add x0, x0, :tprel_hi12:a
    // add x0, x0, :tprel_lo12_nc:a
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
    Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
                                      HiVar,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
                                      LoVar,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
  }

  case 32: {
    // Build the 32-bit offset with MOVZ/MOVK, then add the thread base.
    // mrs x1, TPIDR_EL0
    // movz x0, #:tprel_g1:a
    // movk x0, #:tprel_g0_nc:a
    // add x0, x1, x0
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
                                       DAG.getTargetConstant(16, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
  }

  case 48: {
    // Build a 48-bit offset with MOVZ + two MOVKs, then add the thread base.
    // mrs x1, TPIDR_EL0
    // movz x0, #:tprel_g2:a
    // movk x0, #:tprel_g1_nc:a
    // movk x0, #:tprel_g0_nc:a
    // add x0, x1, x0
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
    SDValue MiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
                                       DAG.getTargetConstant(32, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
                                       DAG.getTargetConstant(16, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
  }
  }
}
10967
10968/// When accessing thread-local variables under either the general-dynamic or
10969/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10970/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10971/// is a function pointer to carry out the resolution.
10972///
10973/// The sequence is:
10974/// adrp x0, :tlsdesc:var
10975/// ldr x1, [x0, #:tlsdesc_lo12:var]
10976/// add x0, x0, #:tlsdesc_lo12:var
10977/// .tlsdesccall var
10978/// blr x1
10979/// (TPIDR_EL0 offset now in x0)
10980///
10981/// The above sequence must be produced unscheduled, to enable the linker to
10982/// optimize/relax this sequence.
10983/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10984/// above sequence, and expanded really late in the compilation flow, to ensure
10985/// the sequence is produced as per above.
10986SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10987 const SDLoc &DL,
10988 SelectionDAG &DAG) const {
10989 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10990 auto &MF = DAG.getMachineFunction();
10991 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10992
10993 SDValue Glue;
10994 SDValue Chain = DAG.getEntryNode();
10995 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10996
10997 SMECallAttrs TLSCallAttrs(FuncInfo->getSMEFnAttrs(), {}, SMEAttrs::Normal);
10998 bool RequiresSMChange = TLSCallAttrs.requiresSMChange();
10999
11000 auto ChainAndGlue = [](SDValue Chain) -> std::pair<SDValue, SDValue> {
11001 return {Chain, Chain.getValue(1)};
11002 };
11003
11004 if (RequiresSMChange)
11005 std::tie(Chain, Glue) =
11006 ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/false, Chain, Glue,
11007 getSMToggleCondition(TLSCallAttrs)));
11008
11009 unsigned Opcode =
11010 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
11011 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
11012 : AArch64ISD::TLSDESC_CALLSEQ;
11013 SDValue Ops[] = {Chain, SymAddr, Glue};
11014 std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
11015 Opcode, DL, NodeTys, Glue ? ArrayRef(Ops) : ArrayRef(Ops).drop_back()));
11016
11017 if (TLSCallAttrs.requiresLazySave())
11018 std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
11019 AArch64ISD::REQUIRES_ZA_SAVE, DL, NodeTys, {Chain, Chain.getValue(1)}));
11020
11021 if (RequiresSMChange)
11022 std::tie(Chain, Glue) =
11023 ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
11024 getSMToggleCondition(TLSCallAttrs)));
11025
11026 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
11027}
11028
SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetELF() && "This function expects an ELF target");

  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  AArch64FunctionInfo *MFI =
      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

  if (Model == TLSModel::LocalDynamic)
  }

      Model != TLSModel::LocalExec)
    report_fatal_error("ELF TLS only supported in small memory model or "
                       "in local exec TLS model");
  // Different choices can be made for the maximum size of the TLS area for a
  // module. For the small address model, the default TLS size is 16MiB and the
  // maximum TLS size is 4GiB.
  // FIXME: add tiny and large code model support for TLS access models other
  // than local exec. We currently generate the same code as small for tiny,
  // which may be larger than needed.

  SDValue TPOff;
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);
  const GlobalValue *GV = GA->getGlobal();

  // All models compute an offset that is finally added to TPIDR_EL0.
  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);

  if (Model == TLSModel::LocalExec) {
    return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
  } else if (Model == TLSModel::InitialExec) {
    // Initial-exec: the TP offset is a load-time constant read via the GOT.
    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
  } else if (Model == TLSModel::LocalDynamic) {
    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
    // the beginning of the module's TLS region, followed by a DTPREL offset
    // calculation.

    // These accesses will need deduplicating if there's more than one.

    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,

    // Now we can calculate the offset from TPIDR_EL0 to this module's
    // thread-local area.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);

    // Now use :dtprel_whatever: operations to calculate this variable's offset
    // in its thread-storage area.
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0,

    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
  } else if (Model == TLSModel::GeneralDynamic) {
    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue SymAddr =
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);

    // Finally we can make a call to calculate the offset from tpidr_el0.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
  } else
    llvm_unreachable("Unsupported ELF TLS access model");

  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}
11117
SDValue
AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  // On Windows/AArch64, X18 holds the TEB pointer.
  SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x58 from the TEB.
  SDValue TLSArray =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
  Chain = TLSArray.getValue(1);

  // Load the TLS index from the C runtime;
  // This does the same as getAddr(), but without having a GlobalAddressSDNode.
  // This also does the same as LOADgot, but using a generic i32 load,
  // while LOADgot only loads i64.
  SDValue TLSIndexHi =
      DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
  SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
      "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
  SDValue TLSIndex =
      DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
  TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
  Chain = TLSIndex.getValue(1);

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
  // offset into the TLSArray.
  TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                             DAG.getConstant(3, DL, PtrVT));
  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
                            MachinePointerInfo());
  Chain = TLS.getValue(1);

  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GA->getGlobal();
  SDValue TGAHi = DAG.getTargetGlobalAddress(
      GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
  SDValue TGALo = DAG.getTargetGlobalAddress(
      GV, DL, PtrVT, 0,

  // Add the offset from the start of the .tls section (section base).
  SDValue Addr =
      SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
                                 DAG.getTargetConstant(0, DL, MVT::i32)),
              0);
  Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
  return Addr;
}
11176
11177SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
11178 SelectionDAG &DAG) const {
11179 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
11180 if (DAG.getTarget().useEmulatedTLS())
11181 return LowerToTLSEmulatedModel(GA, DAG);
11182
11183 if (Subtarget->isTargetDarwin())
11184 return LowerDarwinGlobalTLSAddress(Op, DAG);
11185 if (Subtarget->isTargetELF())
11186 return LowerELFGlobalTLSAddress(Op, DAG);
11187 if (Subtarget->isTargetWindows())
11188 return LowerWindowsGlobalTLSAddress(Op, DAG);
11189
11190 llvm_unreachable("Unexpected platform trying to use TLS");
11191}
11192
11193//===----------------------------------------------------------------------===//
11194// PtrAuthGlobalAddress lowering
11195//
11196// We have 3 lowering alternatives to choose from:
11197// - MOVaddrPAC: similar to MOVaddr, with added PAC.
11198// If the GV doesn't need a GOT load (i.e., is locally defined)
11199// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
11200//
11201// - LOADgotPAC: similar to LOADgot, with added PAC.
11202// If the GV needs a GOT load, materialize the pointer using the usual
11203// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
11204// section is assumed to be read-only (for example, via relro mechanism). See
11205// LowerMOVaddrPAC.
11206//
11207// - LOADauthptrstatic: similar to LOADgot, but use a
11208// special stub slot instead of a GOT slot.
11209// Load a signed pointer for symbol 'sym' from a stub slot named
11210// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
11211// resolving. This usually lowers to adrp+ldr, but also emits an entry into
11212// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
11213//
11214// All 3 are pseudos that are expand late to longer sequences: this lets us
11215// provide integrity guarantees on the to-be-signed intermediate values.
11216//
11217// LOADauthptrstatic is undesirable because it requires a large section filled
11218// with often similarly-signed pointers, making it a good harvesting target.
11219// Thus, it's only used for ptrauth references to extern_weak to avoid null
11220// checks.
11221
    SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
    SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
  // Lower a ptrauth reference to an extern_weak symbol through a
  // LOADauthptrstatic stub slot that the dynamic linker fills with the
  // signed pointer (see the lowering-alternatives comment above).
  const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
  assert(TGN->getGlobal()->hasExternalWeakLinkage());

  // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
  // offset alone as a pointer if the symbol wasn't available, which would
  // probably break null checks in users. Ptrauth complicates things further:
  // error out.
  if (TGN->getOffset() != 0)
        "unsupported non-zero offset in weak ptrauth global reference");

  // Address-diversified weak references are not representable in a static
  // stub slot either.
  if (!isNullConstant(AddrDiscriminator))
    report_fatal_error("unsupported weak addr-div ptrauth global");

  SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
  return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
                                    {TGA, Key, Discriminator}),
                 0);
}
11244
SDValue
AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
  // Operands of a PtrAuthGlobalAddress node: the raw pointer, the PAC key,
  // the address discriminator, and the constant integer discriminator.
  SDValue Ptr = Op.getOperand(0);
  uint64_t KeyC = Op.getConstantOperandVal(1);
  SDValue AddrDiscriminator = Op.getOperand(2);
  uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  if (KeyC > AArch64PACKey::LAST)
    report_fatal_error("key in ptrauth global out of range [0, " +
                       Twine((int)AArch64PACKey::LAST) + "]");

  // Blend only works if the integer discriminator is 16-bit wide.
  if (!isUInt<16>(DiscriminatorC))
        "constant discriminator in ptrauth global out of range [0, 0xffff]");

  // Choosing between 3 lowering alternatives is target-specific.
  if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
    report_fatal_error("ptrauth global lowering only supported on MachO/ELF");

  // Peel a constant offset off the pointer so the GV can absorb it below.
  int64_t PtrOffsetC = 0;
  if (Ptr.getOpcode() == ISD::ADD) {
    PtrOffsetC = Ptr.getConstantOperandVal(1);
    Ptr = Ptr.getOperand(0);
  }
  const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
  const GlobalValue *PtrGV = PtrN->getGlobal();

  // Classify the reference to determine whether it needs a GOT load.
  const unsigned OpFlags =
      Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
  const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
  assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
         "unsupported non-GOT op flags on ptrauth global reference");

  // Fold any offset into the GV; our pseudos expect it there.
  PtrOffsetC += PtrN->getOffset();
  SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
                                            /*TargetFlags=*/0);
  assert(PtrN->getTargetFlags() == 0 &&
         "unsupported target flags on ptrauth global");

  SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
  SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
  // XZR stands in for "no address discriminator".
  SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
                                   ? AddrDiscriminator
                                   : DAG.getRegister(AArch64::XZR, MVT::i64);

  // No GOT load needed -> MOVaddrPAC
  if (!NeedsGOTLoad) {
    assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
    return SDValue(
        DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
                           {TPtr, Key, TAddrDiscriminator, Discriminator}),
        0);
  }

  // GOT load -> LOADgotPAC
  // Note that we disallow extern_weak refs to avoid null checks later.
  if (!PtrGV->hasExternalWeakLinkage())
    return SDValue(
        DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
                           {TPtr, Key, TAddrDiscriminator, Discriminator}),
        0);

  // extern_weak ref -> LOADauthptrstatic
      TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
      DAG);
}
11318
11319// Looks through \param Val to determine the bit that can be used to
11320// check the sign of the value. It returns the unextended value and
11321// the sign bit position.
11322std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
11323 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
11324 return {Val.getOperand(0),
11325 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
11326 1};
11327
11328 if (Val.getOpcode() == ISD::SIGN_EXTEND)
11329 return {Val.getOperand(0),
11330 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
11331
11332 return {Val, Val.getValueSizeInBits() - 1};
11333}
11334
11335 // Op is an SDValue that is being compared to 0. If the comparison is a bit
11336 // test, optimize it to a TBZ or TBNZ.
// Returns the TBZ/TBNZ branch node on success, or an empty SDValue when no
// single-bit-test form applies.
// NOTE(review): the first line of this function's signature is missing from
// this excerpt (original line 11337); the visible parameters are the branch
// destination, the TBZ/TBNZ opcode to emit, and the DAG — confirm against
// upstream before editing.
11338 SDValue Dest, unsigned Opcode,
11339 SelectionDAG &DAG) {
// Only (and x, mask) feeding the zero-compare can become a bit test.
11340 if (Op.getOpcode() != ISD::AND)
11341 return SDValue();
11342
11343 // See if we can use a TBZ to fold in an AND as well.
11344 // TBZ has a smaller branch displacement than CBZ. If the offset is
11345 // out of bounds, a late MI-layer pass rewrites branches.
11346 // 403.gcc is an example that hits this case.
// (and x, 2^k) == 0 tests exactly bit k, which TBZ/TBNZ encode directly.
11347 if (isa<ConstantSDNode>(Op.getOperand(1)) &&
11348 isPowerOf2_64(Op.getConstantOperandVal(1))) {
11349 SDValue Test = Op.getOperand(0);
11350 uint64_t Mask = Op.getConstantOperandVal(1);
11351 return DAG.getNode(Opcode, DL, MVT::Other, Chain, Test,
11352 DAG.getConstant(Log2_64(Mask), DL, MVT::i64), Dest);
11353 }
11354
// (and (shl 1, n), y) == 0  ->  test bit 0 of (srl y, n): a variable-index
// bit test becomes a shift plus a fixed bit-0 TBZ/TBNZ.
11355 if (Op.getOperand(0).getOpcode() == ISD::SHL) {
11356 auto Op00 = Op.getOperand(0).getOperand(0);
11357 if (isa<ConstantSDNode>(Op00) && Op00->getAsZExtVal() == 1) {
11358 auto Shr = DAG.getNode(ISD::SRL, DL, Op00.getValueType(),
11359 Op.getOperand(1), Op.getOperand(0).getOperand(1));
11360 return DAG.getNode(Opcode, DL, MVT::Other, Chain, Shr,
11361 DAG.getConstant(0, DL, MVT::i64), Dest);
11362 }
11363 }
11364
// No bit-test pattern matched; let the caller fall back to CBZ/CBNZ or CMP.
11365 return SDValue();
11366 }
11367
// Lower ISD::BR_CC (conditional branch on a comparison) for AArch64.
// Handles, in order: f128 via libcall softening, overflow-intrinsic results,
// integer compares (folding to CBZ/CBNZ/TBZ/TBNZ/CB where profitable), and
// finally scalar FP compares which may need one or two BRCONDs.
11368 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
11369 SDValue Chain = Op.getOperand(0);
11370 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
11371 SDValue LHS = Op.getOperand(2);
11372 SDValue RHS = Op.getOperand(3);
11373 SDValue Dest = Op.getOperand(4);
11374 SDLoc DL(Op);
11375
11376 MachineFunction &MF = DAG.getMachineFunction();
11377 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
11378 // will not be produced, as they are conditional branch instructions that do
11379 // not set flags.
11380 bool ProduceNonFlagSettingCondBr =
11381 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
11382
11383 // Handle f128 first, since lowering it will result in comparing the return
11384 // value of a libcall against zero, which is just what the rest of LowerBR_CC
11385 // is expecting to deal with.
11386 if (LHS.getValueType() == MVT::f128) {
11387 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
11388
11389 // If softenSetCCOperands returned a scalar, we need to compare the result
11390 // against zero to select between true and false values.
11391 if (!RHS.getNode()) {
11392 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11393 CC = ISD::SETNE;
11394 }
11395 }
11396
11397 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
11398 // instruction.
// NOTE(review): the head of this condition (original line 11399) is missing
// from this excerpt — confirm against upstream.
11400 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
11401 // Only lower legal XALUO ops.
11402 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
11403 return SDValue();
11404
11405 // The actual operation with overflow check.
// NOTE(review): the declaration preceding this line (original line 11406,
// presumably the OFCC condition-code variable) is missing from this excerpt.
11407 SDValue Value, Overflow;
11408 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
11409
// Branch on "no overflow" by inverting the overflow condition for SETNE.
11410 if (CC == ISD::SETNE)
11411 OFCC = getInvertedCondCode(OFCC);
11412 SDValue CCVal = getCondCode(DAG, OFCC);
11413
11414 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11415 Overflow);
11416 }
11417
11418 if (LHS.getValueType().isInteger()) {
11419 assert((LHS.getValueType() == RHS.getValueType()) &&
11420 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11421
11422 // If the RHS of the comparison is zero, we can potentially fold this
11423 // to a specialized branch.
11424 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11425 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
11426 if (CC == ISD::SETEQ) {
// x == 0: try a single-bit test first, otherwise compare-and-branch-zero.
11427 if (SDValue Result =
11428 optimizeBitTest(DL, LHS, Chain, Dest, AArch64ISD::TBZ, DAG))
11429 return Result;
11430
11431 return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
11432 } else if (CC == ISD::SETNE) {
11433 if (SDValue Result =
11434 optimizeBitTest(DL, LHS, Chain, Dest, AArch64ISD::TBNZ, DAG))
11435 return Result;
11436
11437 return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
11438 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
11439 // Don't combine AND since emitComparison converts the AND to an ANDS
11440 // (a.k.a. TST) and the test in the test bit and branch instruction
11441 // becomes redundant. This would also increase register pressure.
// x < 0 is exactly "sign bit set": branch with TBNZ on the sign bit.
11442 uint64_t SignBitPos;
11443 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11444 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, LHS,
11445 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11446 }
11447 }
// x > -1 is "sign bit clear": branch with TBZ on the sign bit.
11448 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
11449 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
11450 // Don't combine AND since emitComparison converts the AND to an ANDS
11451 // (a.k.a. TST) and the test in the test bit and branch instruction
11452 // becomes redundant. This would also increase register pressure.
11453 uint64_t SignBitPos;
11454 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11455 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, LHS,
11456 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11457 }
11458
11459 // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
11460 // larger branch displacement but do prefer CB over cmp + br.
// NOTE(review): parts of this condition and the Cond initializer (original
// lines 11462 and 11465) are missing from this excerpt — confirm upstream.
11461 if (Subtarget->hasCMPBR() &&
11463 ProduceNonFlagSettingCondBr) {
11464 SDValue Cond =
11466 return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS,
11467 Dest);
11468 }
11469
// Generic integer path: materialize a compare and a conditional branch.
11470 SDValue CCVal;
11471 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11472 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11473 Cmp);
11474 }
11475
11476 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
11477 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11478
11479 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11480 // clean. Some of them require two branches to implement.
11481 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11482 AArch64CC::CondCode CC1, CC2;
11483 changeFPCCToAArch64CC(CC, CC1, CC2);
11484 SDValue CC1Val = getCondCode(DAG, CC1);
11485 SDValue BR1 =
11486 DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp);
// A second condition code means the FP predicate needs two branches; chain
// the second BRCOND off the first one's output chain.
11487 if (CC2 != AArch64CC::AL) {
11488 SDValue CC2Val = getCondCode(DAG, CC2);
11489 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val,
11490 Cmp);
11491 }
11492
11493 return BR1;
11494 }
11495
// Lower ISD::FCOPYSIGN by selecting bits between the two inputs: all bits of
// the magnitude operand except the sign bit, which comes from the sign
// operand. Implemented with a NEON BSP (bitwise select) or an SVE FCOPYSIGN
// depending on what the subtarget provides.
11496 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
11497 SelectionDAG &DAG) const {
// Bail out if neither NEON nor SVE-for-fixed-vectors is available.
11498 if (!Subtarget->isNeonAvailable() &&
11499 !Subtarget->useSVEForFixedLengthVectors())
11500 return SDValue();
11501
11502 EVT VT = Op.getValueType();
11503 EVT IntVT = VT.changeTypeToInteger();
11504 SDLoc DL(Op);
11505
11506 SDValue In1 = Op.getOperand(0);
11507 SDValue In2 = Op.getOperand(1);
11508 EVT SrcVT = In2.getValueType();
11509
// FCOPYSIGN allows the sign operand to have a different FP type; bring it
// to the result type first.
11510 if (!SrcVT.bitsEq(VT))
11511 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
11512
// NOTE(review): the right-hand side of this assignment (original line
// 11515) is missing from this excerpt — confirm against upstream.
11513 if (VT.isScalableVector())
11514 IntVT =
11516
// Fixed-length vectors handled via SVE: widen to a scalable container,
// recurse, and narrow back.
11517 if (VT.isFixedLengthVector() &&
11518 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
11519 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
11520
11521 In1 = convertToScalableVector(DAG, ContainerVT, In1);
11522 In2 = convertToScalableVector(DAG, ContainerVT, In2);
11523
11524 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
11525 return convertFromScalableVector(DAG, VT, Res);
11526 }
11527
11528 // With SVE, but without Neon, extend the scalars to scalable vectors and use
11529 // a SVE FCOPYSIGN.
11530 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
11531 Subtarget->isSVEorStreamingSVEAvailable()) {
11532 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
11533 return SDValue();
11534 EVT SVT = getPackedSVEVectorVT(VT);
11535
// Insert both scalars into lane 0, do a vector FCOPYSIGN, extract lane 0.
11536 SDValue Ins1 =
11537 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In1,
11538 DAG.getConstant(0, DL, MVT::i64));
11539 SDValue Ins2 =
11540 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In2,
11541 DAG.getConstant(0, DL, MVT::i64));
11542 SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
11543 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS,
11544 DAG.getConstant(0, DL, MVT::i64));
11545 }
11546
// Bitcast helper that routes scalable vectors through the SVE-safe path.
11547 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
11548 if (VT.isScalableVector())
11549 return getSVESafeBitCast(VT, Op, DAG);
11550
11551 return DAG.getBitcast(VT, Op);
11552 };
11553
// Move both inputs into integer vector registers: scalars are placed in a
// vector via a subreg insert (Idx), vectors are simply bitcast.
11554 SDValue VecVal1, VecVal2;
11555 EVT VecVT;
11556 auto SetVecVal = [&](int Idx = -1) {
11557 if (!VT.isVector()) {
11558 VecVal1 =
11559 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
11560 VecVal2 =
11561 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
11562 } else {
11563 VecVal1 = BitCast(VecVT, In1, DAG);
11564 VecVal2 = BitCast(VecVT, In2, DAG);
11565 }
11566 };
11567 if (VT.isVector()) {
11568 VecVT = IntVT;
11569 SetVecVal();
11570 } else if (VT == MVT::f64) {
11571 VecVT = MVT::v2i64;
11572 SetVecVal(AArch64::dsub);
11573 } else if (VT == MVT::f32) {
11574 VecVT = MVT::v4i32;
11575 SetVecVal(AArch64::ssub);
11576 } else if (VT == MVT::f16 || VT == MVT::bf16) {
11577 VecVT = MVT::v8i16;
11578 SetVecVal(AArch64::hsub);
11579 } else {
11580 llvm_unreachable("Invalid type for copysign!");
11581 }
11582
// Mask with all bits set except the per-element sign bit.
11583 unsigned BitWidth = In1.getScalarValueSizeInBits();
11584 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
11585
11586 // We want to materialize a mask with every bit but the high bit set, but the
11587 // AdvSIMD immediate moves cannot materialize that in a single instruction for
11588 // 64-bit elements. Instead, materialize all bits set and then negate that.
11589 if (VT == MVT::f64 || VT == MVT::v2f64) {
11590 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
11591 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
11592 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
11593 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
11594 }
11595
// BSP selects VecVal1 bits where the mask is set (magnitude) and VecVal2
// bits where it is clear (the sign bit).
11596 SDValue BSP =
11597 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
// For scalar results, pull the value back out of the vector register.
11598 if (VT == MVT::f16 || VT == MVT::bf16)
11599 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
11600 if (VT == MVT::f32)
11601 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
11602 if (VT == MVT::f64)
11603 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
11604
11605 return BitCast(VT, BSP, DAG);
11606 }
11607
// Lower ISD::CTPOP and ISD::PARITY. Scalar popcounts are routed through
// SIMD/SVE CNT instructions (AArch64 has no GPR popcount); parity is the
// popcount masked to its low bit.
11608 SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
11609 SelectionDAG &DAG) const {
// NOTE(review): the head of this condition (original line 11610, presumably
// the NoImplicitFloat attribute query) is missing from this excerpt —
// confirm against upstream. The intent is to avoid FP/SIMD registers when
// the function forbids implicit floating point.
11611 Attribute::NoImplicitFloat))
11612 return SDValue();
11613
11614 EVT VT = Op.getValueType();
// Scalable (or SVE-preferred fixed) vectors use the predicated CNT form.
11615 if (VT.isScalableVector() ||
11616 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
11617 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
11618
11619 bool IsParity = Op.getOpcode() == ISD::PARITY;
11620 SDValue Val = Op.getOperand(0);
11621 SDLoc DL(Op);
11622
11623 // for i32, general parity function using EORs is more efficient compared to
11624 // using floating point
11625 if (VT == MVT::i32 && IsParity)
11626 return SDValue();
11627
// SVE path: move the scalar into lane 0 of a scalable vector, CNT it there,
// and extract the result.
11628 if (Subtarget->isSVEorStreamingSVEAvailable()) {
11629 if (VT == MVT::i32 || VT == MVT::i64) {
11630 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
11631 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
11632 DAG.getUNDEF(ContainerVT), Val,
11633 DAG.getVectorIdxConstant(0, DL));
11634 Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
11635 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
11636 DAG.getVectorIdxConstant(0, DL));
// Parity is the low bit of the popcount.
11637 if (IsParity)
11638 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11639 return Val;
11640 }
11641
// i128: popcount both 64-bit halves as a v2i64 and sum them.
11642 if (VT == MVT::i128) {
11643 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
11644 Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
11645 Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
11646 Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
11647 Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
11648 Val = DAG.getZExtOrTrunc(Val, DL, VT);
11649 if (IsParity)
11650 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11651 return Val;
11652 }
11653 }
11654
11655 if (!Subtarget->isNeonAvailable())
11656 return SDValue();
11657
11658 // If there is no CNT instruction available, GPR popcount can
11659 // be more efficiently lowered to the following sequence that uses
11660 // AdvSIMD registers/instructions as long as the copies to/from
11661 // the AdvSIMD registers are cheap.
11662 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
11663 // CNT V0.8B, V0.8B // 8xbyte pop-counts
11664 // ADDV B0, V0.8B // sum 8xbyte pop-counts
11665 // FMOV X0, D0 // copy result back to integer reg
11666 if (VT == MVT::i32 || VT == MVT::i64) {
// i32 is zero-extended so the upper lanes contribute nothing to the count.
11667 if (VT == MVT::i32)
11668 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
11669 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
11670
11671 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
11672 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
11673 AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
11674 VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
11675 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
11676 DAG.getConstant(0, DL, MVT::i64));
11677 if (IsParity)
11678 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11679 return AddV;
11680 } else if (VT == MVT::i128) {
11681 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
11682
11683 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
11684 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
11685 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
11686 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
11687 DAG.getConstant(0, DL, MVT::i64));
11688 AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
11689 if (IsParity)
11690 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11691 return AddV;
11692 }
11693
11694 assert(!IsParity && "ISD::PARITY of vector types not supported");
11695
11696 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
11697 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
11698 "Unexpected type for custom ctpop lowering");
11699
// Vector path: CNT works on bytes, so count per-byte first, then widen.
11700 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
11701 Val = DAG.getBitcast(VT8Bit, Val);
11702 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
11703
// With dot-product, UDOT against an all-ones vector sums groups of four
// byte-counts in one instruction.
11704 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
11705 VT.getVectorNumElements() >= 2) {
11706 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
11707 SDValue Zeros = DAG.getConstant(0, DL, DT);
11708 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
11709
11710 if (VT == MVT::v2i64) {
11711 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11712 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
11713 } else if (VT == MVT::v2i32) {
11714 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11715 } else if (VT == MVT::v4i32) {
11716 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11717 } else {
11718 llvm_unreachable("Unexpected type for custom ctpop lowering");
11719 }
11720
11721 return Val;
11722 }
11723
11724 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
11725 unsigned EltSize = 8;
11726 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
11727 while (EltSize != VT.getScalarSizeInBits()) {
11728 EltSize *= 2;
11729 NumElts /= 2;
11730 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
11731 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
11732 }
11733
11734 return Val;
11735 }
11736
// Lower ISD::CTTZ as cttz(x) = ctlz(bitreverse(x)): reverse the bits so the
// trailing zeros become leading zeros, which the target can count directly.
11737 SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
11738 EVT VT = Op.getValueType();
// NOTE(review): the middle line of this assert (original line 11740) is
// missing from this excerpt — confirm against upstream.
11739 assert(VT.isScalableVector() ||
11741 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
11742
11743 SDLoc DL(Op);
11744 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
11745 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
11746 }
11747
11748SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
11749 SelectionDAG &DAG) const {
11750
11751 EVT VT = Op.getValueType();
11752 SDLoc DL(Op);
11753 unsigned Opcode = Op.getOpcode();
11754 ISD::CondCode CC;
11755 switch (Opcode) {
11756 default:
11757 llvm_unreachable("Wrong instruction");
11758 case ISD::SMAX:
11759 CC = ISD::SETGT;
11760 break;
11761 case ISD::SMIN:
11762 CC = ISD::SETLT;
11763 break;
11764 case ISD::UMAX:
11765 CC = ISD::SETUGT;
11766 break;
11767 case ISD::UMIN:
11768 CC = ISD::SETULT;
11769 break;
11770 }
11771
11772 // Note: This lowering only overrides NEON for v1i64 and v2i64, where we
11773 // prefer using SVE if available.
11774 if (VT.isScalableVector() ||
11775 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) {
11776 switch (Opcode) {
11777 default:
11778 llvm_unreachable("Wrong instruction");
11779 case ISD::SMAX:
11780 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
11781 case ISD::SMIN:
11782 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
11783 case ISD::UMAX:
11784 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
11785 case ISD::UMIN:
11786 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
11787 }
11788 }
11789
11790 SDValue Op0 = Op.getOperand(0);
11791 SDValue Op1 = Op.getOperand(1);
11792 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
11793 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
11794}
11795
// Lower ISD::BITREVERSE for NEON vectors. Elements wider than a byte are
// handled by byte-reversing within each element (REV32/REV64) and then
// bit-reversing each byte (RBIT via the v8i8/v16i8 BITREVERSE).
11796 SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
11797 SelectionDAG &DAG) const {
11798 EVT VT = Op.getValueType();
11799
// NOTE(review): the middle line of this condition (original line 11801) is
// missing from this excerpt — confirm against upstream.
11800 if (VT.isScalableVector() ||
11802 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
11803 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
11804
11805 SDLoc DL(Op);
11806 SDValue REVB;
11807 MVT VST;
11808
// Pick the byte-vector type and the element-wise byte reversal for each
// supported element width.
11809 switch (VT.getSimpleVT().SimpleTy) {
11810 default:
11811 llvm_unreachable("Invalid type for bitreverse!");
11812
11813 case MVT::v2i32: {
11814 VST = MVT::v8i8;
11815 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11816
11817 break;
11818 }
11819
11820 case MVT::v4i32: {
11821 VST = MVT::v16i8;
11822 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11823
11824 break;
11825 }
11826
11827 case MVT::v1i64: {
11828 VST = MVT::v8i8;
11829 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11830
11831 break;
11832 }
11833
11834 case MVT::v2i64: {
11835 VST = MVT::v16i8;
11836 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11837
11838 break;
11839 }
11840 }
11841
// Byte reversal composed with per-byte bit reversal yields a full
// per-element bit reversal; NVCAST restores the original vector type.
11842 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
11843 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
11844 }
11845
11846// Check whether the continuous comparison sequence.
11847static bool
11848isOrXorChain(SDValue N, unsigned &Num,
11849 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
11850 if (Num == MaxXors)
11851 return false;
11852
11853 // Skip the one-use zext
11854 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
11855 N = N->getOperand(0);
11856
11857 // The leaf node must be XOR
11858 if (N->getOpcode() == ISD::XOR) {
11859 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
11860 Num++;
11861 return true;
11862 }
11863
11864 // All the non-leaf nodes must be OR.
11865 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
11866 return false;
11867
11868 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
11869 isOrXorChain(N->getOperand(1), Num, WorkList))
11870 return true;
11871 return false;
11872}
11873
11874 // Transform chains of ORs and XORs, which usually outlined by memcmp/bmp.
// Rewrites setcc(or(xor(a0,a1), xor(b0,b1), ...), 0, eq/ne) into a chain of
// per-pair setcc nodes combined with AND (for eq) or OR (for ne), so the
// backend can fold them into CMP/CCMP sequences.
// NOTE(review): this function's signature line and the WorkList declaration
// (original lines 11875 and 11880) are missing from this excerpt — confirm
// against upstream before editing.
11876 SDValue LHS = N->getOperand(0);
11877 SDValue RHS = N->getOperand(1);
11878 SDLoc DL(N);
11879 EVT VT = N->getValueType(0);
11881
11882 // Only handle integer compares.
11883 if (N->getOpcode() != ISD::SETCC)
11884 return SDValue();
11885
11886 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11887 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
11888 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
11889 unsigned NumXors = 0;
11890 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
11891 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
11892 isOrXorChain(LHS, NumXors, WorkList)) {
// Compare the first XOR pair, then fold each further pair in with the
// logic op matching the condition (AND for eq, OR for ne).
11893 SDValue XOR0, XOR1;
11894 std::tie(XOR0, XOR1) = WorkList[0];
11895 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11896 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11897 for (unsigned I = 1; I < WorkList.size(); I++) {
11898 std::tie(XOR0, XOR1) = WorkList[I];
11899 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11900 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11901 }
11902
11903 // Exit early by inverting the condition, which help reduce indentations.
11904 return Cmp;
11905 }
11906
11907 return SDValue();
11908 }
11909
// Lower ISD::SETCC / STRICT_FSETCC(S) for scalars. Integer compares become
// CMP + CSEL/CSINC; FP compares may need one or two CSELs depending on how
// the LLVM predicate maps onto AArch64 condition codes.
11910 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11911
11912 if (Op.getValueType().isVector())
11913 return LowerVSETCC(Op, DAG);
11914
// Strict FP setcc carries a chain as operand 0, shifting the operand index.
11915 bool IsStrict = Op->isStrictFPOpcode();
11916 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11917 unsigned OpNo = IsStrict ? 1 : 0;
11918 SDValue Chain;
11919 if (IsStrict)
11920 Chain = Op.getOperand(0);
11921 SDValue LHS = Op.getOperand(OpNo + 0);
11922 SDValue RHS = Op.getOperand(OpNo + 1);
11923 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11924 SDLoc DL(Op);
11925
11926 // We chose ZeroOrOneBooleanContents, so use zero and one.
11927 EVT VT = Op.getValueType();
11928 SDValue TVal = DAG.getConstant(1, DL, VT);
11929 SDValue FVal = DAG.getConstant(0, DL, VT);
11930
11931 // Handle f128 first, since one possible outcome is a normal integer
11932 // comparison which gets picked up by the next if statement.
11933 if (LHS.getValueType() == MVT::f128) {
11934 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS, Chain,
11935 IsSignaling);
11936
11937 // If softenSetCCOperands returned a scalar, use it.
11938 if (!RHS.getNode()) {
11939 assert(LHS.getValueType() == Op.getValueType() &&
11940 "Unexpected setcc expansion!");
11941 return IsStrict ? DAG.getMergeValues({LHS, Chain}, DL) : LHS;
11942 }
11943 }
11944
11945 if (LHS.getValueType().isInteger()) {
// With CSSC, (x != 0) is umin(x, 1): clamps any nonzero value to 1.
11946 if (Subtarget->hasCSSC() && CC == ISD::SETNE && isNullConstant(RHS)) {
11947 SDValue One = DAG.getConstant(1, DL, LHS.getValueType());
11948 SDValue UMin = DAG.getNode(ISD::UMIN, DL, LHS.getValueType(), LHS, One);
11949 SDValue Res = DAG.getZExtOrTrunc(UMin, DL, VT);
11950 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11951 }
11952 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11953
// NOTE(review): the start of the Cmp assignment (original line 11955,
// presumably the getAArch64Cmp call) is missing from this excerpt.
11954 SDValue CCVal;
11956 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, DL);
11957
11958 // Note that we inverted the condition above, so we reverse the order of
11959 // the true and false operands here. This will allow the setcc to be
11960 // matched to a single CSINC instruction.
11961 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CCVal, Cmp);
11962 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11963 }
11964
11965 // Now we know we're dealing with FP values.
11966 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11967 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11968
11969 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11970 // and do the comparison.
11971 SDValue Cmp;
11972 if (IsStrict)
11973 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
11974 else
11975 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11976
11977 AArch64CC::CondCode CC1, CC2;
11978 changeFPCCToAArch64CC(CC, CC1, CC2);
11979 SDValue Res;
// Single-condition predicates: invert and swap so a CSINC can be matched.
11980 if (CC2 == AArch64CC::AL) {
11981 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11982 CC2);
11983 SDValue CC1Val = getCondCode(DAG, CC1);
11984
11985 // Note that we inverted the condition above, so we reverse the order of
11986 // the true and false operands here. This will allow the setcc to be
11987 // matched to a single CSINC instruction.
11988 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CC1Val, Cmp);
11989 } else {
11990 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11991 // totally clean. Some of them require two CSELs to implement. As is in
11992 // this case, we emit the first CSEL and then emit a second using the output
11993 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11994
11995 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11996 SDValue CC1Val = getCondCode(DAG, CC1);
11997 SDValue CS1 =
11998 DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11999
12000 SDValue CC2Val = getCondCode(DAG, CC2);
12001 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
12002 }
12003 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res;
12004 }
12005
// Lower ISD::SETCCCARRY (compare with borrow-in) using SBCS: subtract with
// inverted carry sets the flags, then a CSEL materializes the boolean.
12006 SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
12007 SelectionDAG &DAG) const {
12008
12009 SDValue LHS = Op.getOperand(0);
12010 SDValue RHS = Op.getOperand(1);
12011 EVT VT = LHS.getValueType();
// Only GPR-sized integer compares are handled here.
12012 if (VT != MVT::i32 && VT != MVT::i64)
12013 return SDValue();
12014
12015 SDLoc DL(Op);
12016 SDValue Carry = Op.getOperand(2);
12017 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
12018 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
12019 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
12020 LHS, RHS, InvCarry);
12021
12022 EVT OpVT = Op.getValueType();
12023 SDValue TVal = DAG.getConstant(1, DL, OpVT);
12024 SDValue FVal = DAG.getConstant(0, DL, OpVT);
12025
// NOTE(review): the line computing the inverted condition (original line
// 12027, presumably defining CondInv via getSetCCInverse) is missing from
// this excerpt — confirm against upstream.
12026 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
12028 SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv));
12029 // Inputs are swapped because the condition is inverted. This will allow
12030 // matching with a single CSINC instruction.
12031 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
12032 Cmp.getValue(1));
12033 }
12034
12035 /// Emit vector comparison for floating-point values, producing a mask.
/// Returns an empty SDValue for condition codes that have no direct FCM*
/// encoding (or that would be NaN-unsafe without the NoNans guarantee).
// NOTE(review): the first line of this function's signature (original line
// 12036) is missing from this excerpt; the visible parameters are the
// AArch64 condition code, a no-NaNs flag, the result type, the debug
// location, and the DAG — confirm against upstream before editing.
12037 AArch64CC::CondCode CC, bool NoNans, EVT VT,
12038 const SDLoc &DL, SelectionDAG &DAG) {
12039 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
12040 "function only supposed to emit natural comparisons");
12041
12042 switch (CC) {
12043 default:
12044 return SDValue();
// NE is implemented as NOT(FCMEQ).
12045 case AArch64CC::NE: {
12046 SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
12047 // Use vector semantics for the inversion to potentially save a copy between
12048 // SIMD and regular registers.
12049 if (!LHS.getValueType().isVector()) {
12050 EVT VecVT =
12051 EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
12052 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12053 SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
12054 DAG.getUNDEF(VecVT), Fcmeq, Zero);
12055 SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
12056 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
12057 }
12058 return DAG.getNOT(DL, Fcmeq, VT);
12059 }
12060 case AArch64CC::EQ:
12061 return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
12062 case AArch64CC::GE:
12063 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
12064 case AArch64CC::GT:
12065 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
12066 case AArch64CC::LE:
12067 if (!NoNans)
12068 return SDValue();
12069 // If we ignore NaNs then we can use to the LS implementation.
12070 [[fallthrough]];
// LS/MI are emitted by swapping the operands of FCMGE/FCMGT.
12071 case AArch64CC::LS:
12072 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
12073 case AArch64CC::LT:
12074 if (!NoNans)
12075 return SDValue();
12076 // If we ignore NaNs then we can use to the MI implementation.
12077 [[fallthrough]];
12078 case AArch64CC::MI:
12079 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
12080 }
12081 }
12082
12083 /// For SELECT_CC, when the true/false values are (-1, 0) and the compared
12084 /// values are scalars, try to emit a mask generating vector instruction.
/// Returns an empty SDValue when the pattern does not apply.
// NOTE(review): the first line of this function's signature (original line
// 12085) is missing from this excerpt; the visible parameters are the
// false value, the condition code, a no-NaNs flag, the debug location, and
// the DAG — confirm against upstream before editing.
12086 SDValue FVal, ISD::CondCode CC, bool NoNaNs,
12087 const SDLoc &DL, SelectionDAG &DAG) {
12088 assert(!LHS.getValueType().isVector());
12089 assert(!RHS.getValueType().isVector());
12090
// Only all-ones/zero selections can be expressed as a compare mask.
12091 auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
12092 auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
12093 if (!CTVal || !CFVal)
12094 return {};
12095 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
12096 !(CTVal->isZero() && CFVal->isAllOnes()))
12097 return {};
12098
// (0, -1) selection is the inverse of (-1, 0): invert the predicate.
12099 if (CTVal->isZero())
12100 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12101
12102 EVT VT = TVal.getValueType();
12103 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
12104 return {};
12105
// SETO/SETUO (ordered/unordered) can be simplified when one operand is
// known not to be NaN: only the other operand's NaN-ness matters.
12106 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
12107 bool OneNaN = false;
12108 if (LHS == RHS) {
12109 OneNaN = true;
12110 } else if (DAG.isKnownNeverNaN(RHS)) {
12111 OneNaN = true;
12112 RHS = LHS;
12113 } else if (DAG.isKnownNeverNaN(LHS)) {
12114 OneNaN = true;
12115 LHS = RHS;
12116 }
12117 if (OneNaN)
12118 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
12119 }
12120
// NOTE(review): the declaration of CC1/CC2 (original lines 12121-12122) is
// missing from this excerpt — confirm against upstream.
12123 bool ShouldInvert = false;
12124 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
12125 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
12126 SDValue Cmp2;
12127 if (CC2 != AArch64CC::AL) {
12128 Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
12129 if (!Cmp2)
12130 return {};
12131 }
// Single comparison with no inversion needed: done.
12132 if (!Cmp2 && !ShouldInvert)
12133 return Cmp;
12134
// Otherwise combine/invert in a 128-bit vector to stay in SIMD registers.
12135 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
12136 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12137 Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT), Cmp,
12138 Zero);
12139 if (Cmp2) {
12140 Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT),
12141 Cmp2, Zero);
12142 Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
12143 }
12144 if (ShouldInvert)
12145 Cmp = DAG.getNOT(DL, Cmp, VecVT);
12146 Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
12147 return Cmp;
12148 }
12149
// Lower a scalar SELECT_CC: choose between TVal and FVal according to the
// comparison (LHS CC RHS).  Integer selects become CSEL/CSINV/CSINC/CSNEG on
// the flags of a compare; FP selects emit an FCMP plus one or two CSELs
// (some FP condition codes need two).  Users/Flags let the FP path pick an
// all-ones/all-zeros NEON compare when every user only wants a lane mask.
// NOTE(review): the full parameter list (CC, LHS, RHS, TVal, FVal, Users,
// Flags, ...) is elided in this excerpt — confirm against the declaration.
SDValue AArch64TargetLowering::LowerSELECT_CC(
    const SDLoc &DL, SelectionDAG &DAG) const {
  // Handle f128 first, because it will result in a comparison of some RTLIB
  // call result against zero.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, DL, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Also handle f16, for which we need to do a f32 comparison.
  if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
      LHS.getValueType() == MVT::bf16) {
    LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
    RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
  }

  // Next, handle integers.
  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
    ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);

    // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
    // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
    // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
    // Both require less instructions than compare and conditional select.
    if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
        RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
        LHS.getValueType() == RHS.getValueType()) {
      EVT VT = LHS.getValueType();
      // SRA by bitwidth-1 yields all-ones for negative lhs, all-zeros
      // otherwise; AND (or its complement for SETGT) then implements
      // smin/smax against zero without a compare.
      SDValue Shift =
          DAG.getNode(ISD::SRA, DL, VT, LHS,
                      DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));

      if (CC == ISD::SETGT)
        Shift = DAG.getNOT(DL, Shift, VT);

      return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
    }

    // Check for sign bit test patterns that can use TST optimization.
    // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval)
    // -> TST %operand, sign_bit; CSEL
    // (SELECT_CC setlt, sign_extend, 0, tval, fval)
    // -> TST %operand, sign_bit; CSEL
    if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() &&
        (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG ||
         LHS.getOpcode() == ISD::SIGN_EXTEND)) {

      uint64_t SignBitPos;
      std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
      EVT TestVT = LHS.getValueType();
      SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT);
      // ANDS with just the sign bit sets NZCV; the CSEL keys off NE, i.e.
      // "sign bit set" == "value is negative".
      SDValue TST =
          DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32),
                      LHS, SignBitConst);

      SDValue Flags = TST.getValue(1);
      return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal,
                         DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags);
    }

    // Canonicalise absolute difference patterns:
    // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
    // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
    //
    // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
    // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
    // The second forms can be matched into subs+cneg.
    // NOTE: Drop poison generating flags from the negated operand to avoid
    // inadvertently propagating poison after the canonicalisation.
    if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
      if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
          FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) {
        FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
      } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
                 FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) {
        TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
      }
    }

    unsigned Opcode = AArch64ISD::CSEL;

    // If both the TVal and the FVal are constants, see if we can swap them in
    // order to for a CSINV or CSINC out of them.
    if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    } else if (TVal.getOpcode() == ISD::XOR) {
      // If TVal is a NOT we want to swap TVal and FVal so that we can match
      // with a CSINV rather than a CSEL.
      if (isAllOnesConstant(TVal.getOperand(1))) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }
    } else if (TVal.getOpcode() == ISD::SUB) {
      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
      // that we can match with a CSNEG rather than a CSEL.
      if (isNullConstant(TVal.getOperand(0))) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }
    } else if (CTVal && CFVal) {
      const int64_t TrueVal = CTVal->getSExtValue();
      const int64_t FalseVal = CFVal->getSExtValue();
      bool Swap = false;

      // If both TVal and FVal are constants, see if FVal is the
      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
      // instead of a CSEL in that case.
      if (TrueVal == ~FalseVal) {
        Opcode = AArch64ISD::CSINV;
      } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
                 TrueVal == -FalseVal) {
        // Guard against negating INT64_MIN, which has no positive
        // counterpart.
        Opcode = AArch64ISD::CSNEG;
      } else if (TVal.getValueType() == MVT::i32) {
        // If our operands are only 32-bit wide, make sure we use 32-bit
        // arithmetic for the check whether we can use CSINC. This ensures that
        // the addition in the check will wrap around properly in case there is
        // an overflow (which would not be the case if we do the check with
        // 64-bit arithmetic).
        const uint32_t TrueVal32 = CTVal->getZExtValue();
        const uint32_t FalseVal32 = CFVal->getZExtValue();

        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
          Opcode = AArch64ISD::CSINC;

          if (TrueVal32 > FalseVal32) {
            Swap = true;
          }
        }
      } else {
        // 64-bit check whether we can use CSINC.
        const uint64_t TrueVal64 = TrueVal;
        const uint64_t FalseVal64 = FalseVal;

        if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
          Opcode = AArch64ISD::CSINC;

          if (TrueVal > FalseVal) {
            Swap = true;
          }
        }
      }

      // Swap TVal and FVal if necessary.
      if (Swap) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      if (Opcode != AArch64ISD::CSEL) {
        // Drop FVal since we can get its value by simply inverting/negating
        // TVal.
        FVal = TVal;
      }
    }

    // Avoid materializing a constant when possible by reusing a known value in
    // a register. However, don't perform this optimization if the known value
    // is one, zero or negative one in the case of a CSEL. We can always
    // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
    // FVal, respectively.
    // NOTE(review): AArch64CC below is the integer condition translated to an
    // AArch64 condition code; its declaration is elided in this excerpt —
    // confirm it is kept in sync with CC after the swaps above.
    ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
    if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
        !RHSVal->isZero() && !RHSVal->isAllOnes()) {
      // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
      // "a != C ? x : a" to avoid materializing C.
      if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
        TVal = LHS;
      else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
        FVal = LHS;
    } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
      assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
      // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
      // avoid materializing C.
      if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
        Opcode = AArch64ISD::CSINV;
        TVal = LHS;
        FVal = DAG.getConstant(0, DL, FVal.getValueType());
      }
    }

    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
    EVT VT = TVal.getValueType();
    return DAG.getNode(Opcode, DL, VT, TVal, FVal, CCVal, Cmp);
  }

  // Now we know we're dealing with FP values.
  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
         LHS.getValueType() == MVT::f64);
  assert(LHS.getValueType() == RHS.getValueType());
  EVT VT = TVal.getValueType();

  // If the purpose of the comparison is to select between all ones
  // or all zeros, try to use a vector comparison because the operands are
  // already stored in SIMD registers.
  // NOTE(review): additional accepted user opcodes in this switch are elided
  // in this excerpt.
  if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
        switch (U->getOpcode()) {
        default:
          return false;
        case AArch64ISD::DUP:
          return true;
        }
      })) {
    bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
    SDValue VectorCmp =
        emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
    if (VectorCmp)
      return VectorCmp;
  }

  SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two CSELs to implement.
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);

  if (Flags.hasNoSignedZeros()) {
    // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
    // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
    // Only valid with nsz: 0.0 and -0.0 compare equal but are distinct values.
    ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
    if (RHSVal && RHSVal->isZero()) {
      ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
      ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);

      if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
          CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
        TVal = LHS;
      else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
               CFVal && CFVal->isZero() &&
               FVal.getValueType() == LHS.getValueType())
        FVal = LHS;
    }
  }

  // Emit first, and possibly only, CSEL.
  SDValue CC1Val = getCondCode(DAG, CC1);
  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);

  // If we need a second CSEL, emit it, using the output of the first as the
  // RHS. We're effectively OR'ing the two CC's together.
  if (CC2 != AArch64CC::AL) {
    SDValue CC2Val = getCondCode(DAG, CC2);
    return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
  }

  // Otherwise, return the output of the first CSEL.
  return CS1;
}
12427
// Custom-lower VECTOR_SPLICE for scalable vectors.  Splices with a suitable
// constant index become an SVE SPLICE under a reversed PTRUE predicate;
// small left-splices are left to be matched as EXT.  Returns an empty
// SDValue to fall back to default expansion.
SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  EVT Ty = Op.getValueType();
  // Only constant splice indices can be lowered here.
  if (!isa<ConstantSDNode>(Op.getOperand(2)))
    return SDValue();
  auto Idx = Op.getConstantOperandAPInt(2);
  int64_t IdxVal = Idx.getSExtValue();
  assert(Ty.isScalableVector() &&
         "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");

  // We can use the splice instruction for certain index values where we are
  // able to efficiently generate the correct predicate. The index will be
  // inverted and used directly as the input to the ptrue instruction, i.e.
  // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
  // splice predicate. However, we can only do this if we can guarantee that
  // there are enough elements in the vector, hence we check the index <= min
  // number of elements.
  std::optional<unsigned> PredPattern;
  if (Ty.isScalableVector() && Op.getOpcode() == ISD::VECTOR_SPLICE_RIGHT &&
      (PredPattern = getSVEPredPatternFromNumElements(IdxVal)) !=
          std::nullopt) {
    SDLoc DL(Op);

    // Create a predicate where all but the last -IdxVal elements are false.
    EVT PredVT = Ty.changeVectorElementType(*DAG.getContext(), MVT::i1);
    SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
    Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);

    // Now splice the two inputs together using the predicate.
    return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
                       Op.getOperand(1));
  }

  // We can select to an EXT instruction when indexing the first 256 bytes.
  // NOTE(review): BlockSize is declared just above this check (line elided in
  // this excerpt); it appears to be a size in bits given the /8 conversion —
  // confirm.
  if (Op.getOpcode() == ISD::VECTOR_SPLICE_LEFT &&
      (IdxVal * BlockSize / 8) < 256)
    return Op;

  return SDValue();
}
12469
12470SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
12471 SelectionDAG &DAG) const {
12472 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
12473 SDValue LHS = Op.getOperand(0);
12474 SDValue RHS = Op.getOperand(1);
12475 SDValue TVal = Op.getOperand(2);
12476 SDValue FVal = Op.getOperand(3);
12477 SDNodeFlags Flags = Op->getFlags();
12478 SDLoc DL(Op);
12479 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
12480}
12481
// Lower ISD::SELECT.  svcount selects are bitcast to nxv16i1; scalable and
// SVE-backed fixed vectors become VSELECT with a splatted predicate;
// overflow-intrinsic conditions fold directly into a CSEL on the overflow
// flags; everything else is lowered like a SELECT_CC.
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue CCVal = Op->getOperand(0);
  SDValue TVal = Op->getOperand(1);
  SDValue FVal = Op->getOperand(2);
  SDLoc DL(Op);

  EVT Ty = Op.getValueType();
  if (Ty == MVT::aarch64svcount) {
    // svcount has no native select: round-trip through the nxv16i1 view.
    TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
    FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
    SDValue Sel =
        DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
    return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
  }

  if (Ty.isScalableVector()) {
    // Splat the scalar condition into an i1 predicate vector and VSELECT.
    MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
    SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
    return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
  }

  if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
    // FIXME: Ideally this would be the same as above using i1 types, however
    // for the moment we can't deal with fixed i1 vector types properly, so
    // instead extend the predicate to a result type sized integer vector.
    MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
    MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
    SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
    SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
    return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
  // instruction.
  if (ISD::isOverflowIntrOpRes(CCVal)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
      return SDValue();

    // NOTE(review): OFCC is declared just above (line elided in this
    // excerpt); it receives the AArch64 condition for the overflow op.
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
    // Deliberately shadows the outer CCVal within this scope.
    SDValue CCVal = getCondCode(DAG, OFCC);

    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
                       CCVal, Overflow);
  }

  // Lower it the same way as we would lower a SELECT_CC node.
  ISD::CondCode CC;
  SDValue LHS, RHS;
  if (CCVal.getOpcode() == ISD::SETCC) {
    LHS = CCVal.getOperand(0);
    RHS = CCVal.getOperand(1);
    CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
  } else {
    // Non-SETCC condition: treat any non-zero value as true.
    LHS = CCVal;
    RHS = DAG.getConstant(0, DL, CCVal.getValueType());
    CC = ISD::SETNE;
  }

  // If we are lowering a f16 and we do not have fullf16, convert to a f32 in
  // order to use FCSELSrrr
  if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
    TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
                                     DAG.getUNDEF(MVT::f32), TVal);
    FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
                                     DAG.getUNDEF(MVT::f32), FVal);
  }

  SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
                               Op->getFlags(), DL, DAG);

  // Undo the f32 widening performed above by extracting the h subregister.
  if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
    return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
  }

  return Res;
}
12562
// Lower a jump-table address.  The addressing form depends on the code
// model: literal (large, non-MachO), adr (tiny) or adrp+add (default).
SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
                                              SelectionDAG &DAG) const {
  // Jump table entries as PC relative offsets. No additional tweaking
  // is necessary here. Just get the address of the jump table.
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // NOTE(review): the head of this condition (declaring CM, presumably
  // getTargetMachine().getCodeModel(), and testing CodeModel::Large) is
  // elided in this excerpt — confirm against the full source.
      !Subtarget->isTargetMachO())
    return getAddrLarge(JT, DAG);
  if (CM == CodeModel::Tiny)
    return getAddrTiny(JT, DAG);
  return getAddr(JT, DAG);
}
12577
// Lower BR_JT (branch through jump table).  With jump-table hardening the
// dispatch is kept as a BR_JumpTable pseudo (expanded late, entry index
// pinned in X16); otherwise the target address is computed with
// JumpTableDest32 and branched to indirectly.
SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
                                          SelectionDAG &DAG) const {
  // Jump table entries as PC relative offsets. No additional tweaking
  // is necessary here. Just get the address of the jump table.
  SDLoc DL(Op);
  SDValue JT = Op.getOperand(1);
  SDValue Entry = Op.getOperand(2);
  int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();

  // Record entry size (4 bytes) for the jump-table emission.
  auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
  AFI->setJumpTableEntryInfo(JTI, 4, nullptr);

  // With aarch64-jump-table-hardening, we only expand the jump table dispatch
  // sequence later, to guarantee the integrity of the intermediate values.
  // NOTE(review): the head of this condition (checking the function attribute,
  // and the declaration of CM) is elided in this excerpt.
      "aarch64-jump-table-hardening")) {
    if (Subtarget->isTargetMachO()) {
      if (CM != CodeModel::Small && CM != CodeModel::Large)
        report_fatal_error("Unsupported code-model for hardened jump-table");
    } else {
      // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
      assert(Subtarget->isTargetELF() &&
             "jump table hardening only supported on MachO/ELF");
      if (CM != CodeModel::Small)
        report_fatal_error("Unsupported code-model for hardened jump-table");
    }

    // Pin the entry index in X16 so the late expansion controls every
    // intermediate value of the dispatch sequence.
    SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
                                       Entry, SDValue());
    SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
                                   DAG.getTargetJumpTable(JTI, MVT::i32),
                                   X16Copy.getValue(0), X16Copy.getValue(1));
    return SDValue(B, 0);
  }

  // Default path: compute the destination with JumpTableDest32 and emit an
  // indirect branch (plus jump-table debug info on the chain).
  SDNode *Dest =
      DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
                         Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
  SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
  return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
}
12620
// Lower BRIND.  Only custom-lowered when pointer authentication of
// blockaddresses is enabled, in which case the indirect branch becomes an
// authenticating BRA; jump-table BRINDs and the non-ptrauth case fall back
// to default selection.
SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Dest = Op.getOperand(1);

  // BR_JT is lowered to BRIND, but the later lowering is specific to indirectbr
  // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
  if (Dest->isMachineOpcode() &&
      Dest->getMachineOpcode() == AArch64::JumpTableDest32)
    return SDValue();

  // No discriminator means blockaddress signing is disabled: use the
  // default lowering.
  const MachineFunction &MF = DAG.getMachineFunction();
  std::optional<uint16_t> BADisc =
      Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(MF.getFunction());
  if (!BADisc)
    return SDValue();

  SDLoc DL(Op);

  SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
  // Constant discriminator only; XZR means no address diversity.
  SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);

  // NOTE(review): Key is declared just above (line elided in this excerpt);
  // presumably the ptrauth key as a target constant.
  SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
                                   {Dest, Key, Disc, AddrDisc, Chain});
  return SDValue(BrA, 0);
}
12647
// Lower a constant-pool address, choosing the addressing form by code
// model: GOT (large MachO), literal (large non-MachO), adr (tiny) or
// adrp+add (default).
SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
                                                 SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  // NOTE(review): CM is declared just above (line elided in this excerpt);
  // presumably getTargetMachine().getCodeModel().
  if (CM == CodeModel::Large) {
    // Use the GOT for the large code model on iOS.
    if (Subtarget->isTargetMachO()) {
      return getGOT(CP, DAG);
    }
    return getAddrLarge(CP, DAG);
  } else if (CM == CodeModel::Tiny) {
    return getAddrTiny(CP, DAG);
  }
  return getAddr(CP, DAG);
}
12664
// Lower a blockaddress.  With ptrauth-enabled blockaddress discrimination
// the address is materialized and signed via MOVaddrPAC (result in X16);
// otherwise the usual code-model-dependent addressing forms are used.
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
  BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
  const BlockAddress *BA = BAN->getBlockAddress();

  if (std::optional<uint16_t> BADisc =
          Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
              *BA->getFunction())) {
    SDLoc DL(Op);

    // This isn't cheap, but BRIND is rare.
    SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));

    SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);

    // Constant discriminator only; XZR means no address diversity.
    SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);

    // NOTE(review): Key is declared just above (line elided in this
    // excerpt). MOVaddrPAC leaves the signed pointer in X16.
    SDNode *MOV =
        DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
                           {TargetBA, Key, AddrDisc, Disc});
    return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
                              SDValue(MOV, 1));
  }

  // NOTE(review): CM is declared just above (line elided in this excerpt);
  // presumably getTargetMachine().getCodeModel().
  if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
    return getAddrLarge(BAN, DAG);
  } else if (CM == CodeModel::Tiny) {
    return getAddrTiny(BAN, DAG);
  }
  return getAddr(BAN, DAG);
}
12699
// Darwin va_start: the va_list is a single pointer, so just store the
// address of the first variadic stack slot through the va_list pointer.
SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
                                                   SelectionDAG &DAG) const {
  AArch64FunctionInfo *FuncInfo =
      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

  SDLoc DL(Op);
  // NOTE(review): the second getFrameIndex argument (the pointer VT) is on a
  // line elided in this excerpt.
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
  // Truncate/extend to the in-memory pointer type (differs under ILP32).
  FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}
12713
// Win64 va_start: the va_list is a single pointer to the variadic save
// area.  Arm64EC computes that address relative to the incoming X4;
// otherwise it is a frame index (GPR save area if any args were saved
// there, else the stack area).
SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  SDLoc DL(Op);
  SDValue FR;
  if (Subtarget->isWindowsArm64EC()) {
    // With the Arm64EC ABI, we compute the address of the varargs save area
    // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
    // but calls from an entry thunk can pass in a different address.
    Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
    SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
    uint64_t StackOffset;
    // The GPR save area sits below x4, hence the negative offset.
    if (FuncInfo->getVarArgsGPRSize() > 0)
      StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
    else
      StackOffset = FuncInfo->getVarArgsStackOffset();
    FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
                     DAG.getConstant(StackOffset, DL, MVT::i64));
  } else {
    // NOTE(review): the second getFrameIndex argument (the pointer VT) is on
    // a line elided in this excerpt.
    FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
                               ? FuncInfo->getVarArgsGPRIndex()
                               : FuncInfo->getVarArgsStackIndex(),
  }
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}
12744
// AAPCS va_start: initialize the five-field va_list struct
// {__stack, __gr_top, __vr_top, __gr_offs, __vr_offs} in place.
SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // The layout of the va_list struct is specified in the AArch64 Procedure Call
  // Standard, section B.3.
  MachineFunction &MF = DAG.getMachineFunction();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
  auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  SDValue Chain = Op.getOperand(0);
  SDValue VAList = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  // NOTE(review): MemOps (the store-chain vector) is declared here on a line
  // elided in this excerpt.

  // void *__stack at offset 0
  unsigned Offset = 0;
  SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
  Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
  MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
                                MachinePointerInfo(SV), Align(PtrSize)));

  // void *__gr_top at offset 8 (4 on ILP32)
  Offset += PtrSize;
  int GPRSize = FuncInfo->getVarArgsGPRSize();
  if (GPRSize > 0) {
    SDValue GRTop, GRTopAddr;

    GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                            DAG.getConstant(Offset, DL, PtrVT));

    // __gr_top points one past the end of the GPR save area.
    GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
    GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
                        DAG.getSignedConstant(GPRSize, DL, PtrVT));
    GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);

    MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
                                  MachinePointerInfo(SV, Offset),
                                  Align(PtrSize)));
  }

  // void *__vr_top at offset 16 (8 on ILP32)
  Offset += PtrSize;
  int FPRSize = FuncInfo->getVarArgsFPRSize();
  if (FPRSize > 0) {
    SDValue VRTop, VRTopAddr;
    VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                            DAG.getConstant(Offset, DL, PtrVT));

    // __vr_top points one past the end of the FP/SIMD save area.
    VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
    VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
                        DAG.getSignedConstant(FPRSize, DL, PtrVT));
    VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);

    MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
                                  MachinePointerInfo(SV, Offset),
                                  Align(PtrSize)));
  }

  // int __gr_offs at offset 24 (12 on ILP32); negative size of the unread
  // portion of the GPR save area, per the PCS.
  Offset += PtrSize;
  SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                                   DAG.getConstant(Offset, DL, PtrVT));
  MemOps.push_back(
      DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
                   GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));

  // int __vr_offs at offset 28 (16 on ILP32)
  Offset += 4;
  SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                                   DAG.getConstant(Offset, DL, PtrVT));
  MemOps.push_back(
      DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
                   VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));

  // Tie all field stores together on the chain.
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
12823
12824SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
12825 SelectionDAG &DAG) const {
12826 MachineFunction &MF = DAG.getMachineFunction();
12827 Function &F = MF.getFunction();
12828
12829 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
12830 return LowerWin64_VASTART(Op, DAG);
12831 else if (Subtarget->isTargetDarwin())
12832 return LowerDarwin_VASTART(Op, DAG);
12833 else
12834 return LowerAAPCS_VASTART(Op, DAG);
12835}
12836
12837SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
12838 SelectionDAG &DAG) const {
12839 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
12840 // pointer.
12841 SDLoc DL(Op);
12842 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12843 unsigned VaListSize =
12844 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
12845 ? PtrSize
12846 : Subtarget->isTargetILP32() ? 20 : 32;
12847 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
12848 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12849
12850 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
12851 DAG.getConstant(VaListSize, DL, MVT::i32),
12852 Align(PtrSize), false, false, /*CI=*/nullptr,
12853 std::nullopt, MachinePointerInfo(DestSV),
12854 MachinePointerInfo(SrcSV));
12855}
12856
12857SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
12858 assert(Subtarget->isTargetDarwin() &&
12859 "automatic va_arg instruction only works on Darwin");
12860
12861 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12862 EVT VT = Op.getValueType();
12863 SDLoc DL(Op);
12864 SDValue Chain = Op.getOperand(0);
12865 SDValue Addr = Op.getOperand(1);
12866 MaybeAlign Align(Op.getConstantOperandVal(3));
12867 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
12868 auto PtrVT = getPointerTy(DAG.getDataLayout());
12869 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12870 SDValue VAList =
12871 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
12872 Chain = VAList.getValue(1);
12873 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
12874
12875 if (VT.isScalableVector())
12876 report_fatal_error("Passing SVE types to variadic functions is "
12877 "currently not supported");
12878
12879 if (Align && *Align > MinSlotSize) {
12880 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12881 DAG.getConstant(Align->value() - 1, DL, PtrVT));
12882 VAList =
12883 DAG.getNode(ISD::AND, DL, PtrVT, VAList,
12884 DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT));
12885 }
12886
12887 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
12888 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
12889
12890 // Scalar integer and FP values smaller than 64 bits are implicitly extended
12891 // up to 64 bits. At the very least, we have to increase the striding of the
12892 // vaargs list to match this, and for FP values we need to introduce
12893 // FP_ROUND nodes as well.
12894 if (VT.isInteger() && !VT.isVector())
12895 ArgSize = std::max(ArgSize, MinSlotSize);
12896 bool NeedFPTrunc = false;
12897 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
12898 ArgSize = 8;
12899 NeedFPTrunc = true;
12900 }
12901
12902 // Increment the pointer, VAList, to the next vaarg
12903 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12904 DAG.getConstant(ArgSize, DL, PtrVT));
12905 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
12906
12907 // Store the incremented VAList to the legalized pointer
12908 SDValue APStore =
12909 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
12910
12911 // Load the actual argument out of the pointer VAList
12912 if (NeedFPTrunc) {
12913 // Load the value as an f64.
12914 SDValue WideFP =
12915 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
12916 // Round the value down to an f32.
12917 SDValue NarrowFP =
12918 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
12919 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
12920 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
12921 // Merge the rounded value with the chain output of the load.
12922 return DAG.getMergeValues(Ops, DL);
12923 }
12924
12925 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
12926}
12927
12928SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
12929 SelectionDAG &DAG) const {
12930 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12931 MFI.setFrameAddressIsTaken(true);
12932
12933 EVT VT = Op.getValueType();
12934 SDLoc DL(Op);
12935 unsigned Depth = Op.getConstantOperandVal(0);
12936 SDValue FrameAddr =
12937 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
12938 while (Depth--)
12939 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
12940 MachinePointerInfo());
12941
12942 if (Subtarget->isTargetILP32())
12943 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
12944 DAG.getValueType(VT));
12945
12946 return FrameAddr;
12947}
12948
12949SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
12950 SelectionDAG &DAG) const {
12951 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12952
12953 EVT VT = getPointerTy(DAG.getDataLayout());
12954 int FI = MFI.CreateFixedObject(4, 0, false);
12955 return DAG.getFrameIndex(FI, VT);
12956}
12957
12958#define GET_REGISTER_MATCHER
12959#include "AArch64GenAsmMatcher.inc"
12960
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
// Resolve a named register (for llvm.read_register etc.).  Named X
// registers are only usable when they are actually reserved; otherwise an
// invalid Register is returned.
Register AArch64TargetLowering::
getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
  // NOTE(review): Reg is initialized just above from RegName (line elided in
  // this excerpt; presumably via MatchRegisterName).
  if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
    const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
    unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
    // Reject X registers that are neither globally reserved nor reserved in
    // this function — reading them would race with the register allocator.
    if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
        !MRI->isReservedReg(MF, Reg))
      Reg = Register();
  }
  return Reg;
}
12975
/// Lower ISD::ADDROFRETURNADDR: compute the address of the frame-record slot
/// holding the saved return address, i.e. FP plus a fixed displacement.
SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                     SelectionDAG &DAG) const {

  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  SDValue FrameAddr =
      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);

  // NOTE(review): 'Offset' is the constant displacement of the saved LR
  // within the frame record (defined just above) — confirm against the full
  // source.
  return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
}
12989
/// Lower ISD::RETURNADDR. Depth 0 reads LR directly; deeper frames are
/// reached via the saved-FP chain and a load at a fixed offset from it.
/// Either way, any pointer-authentication signature is stripped from the
/// result (XPACI / XPACLRI) before it is returned.
SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
                                               SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned Depth = Op.getConstantOperandVal(0);
  SDValue ReturnAddress;
  if (Depth) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    // NOTE(review): 'Offset' is the displacement of the saved return address
    // within the frame record (defined just above) — confirm against the
    // full source.
    ReturnAddress = DAG.getLoad(
        VT, DL, DAG.getEntryNode(),
        DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
  } else {
    // Return LR, which contains the return address. Mark it an implicit
    // live-in.
    Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
    ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
  }

  // The XPACLRI instruction assembles to a hint-space instruction before
  // Armv8.3-A therefore this instruction can be safely used for any pre
  // Armv8.3-A architectures. On Armv8.3-A and onwards XPACI is available so use
  // that instead.
  SDNode *St;
  if (Subtarget->hasPAuth()) {
    St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
  } else {
    // XPACLRI operates on LR therefore we must move the operand accordingly.
    SDValue Chain =
        DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
    St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
  }
  return SDValue(St, 0);
}
13028
13029/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which returns two
13030/// i32 values and take a 2 x i32 value to shift plus a shift amount.
13031SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
13032 SelectionDAG &DAG) const {
13033 SDValue Lo, Hi;
13034 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
13035 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
13036}
13037
                                                  const GlobalAddressSDNode *GA) const {
  // Deliberately decline to fold global-address offsets during initial
  // selection. Offsets are folded in the DAG combine rather than here so that
  // we can intelligently choose an offset based on the uses.
  return false;
}
13044
                                         bool OptForSize) const {
  // Returns true when 'Imm' can be materialized cheaply enough (FMOV
  // immediate, or a short MOVZ/MOVN/ORR+fmov sequence) that keeping it out of
  // the constant pool is a win.
  bool IsLegal = false;
  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
  // 16-bit case when target has full fp16 support.
  // We encode bf16 bit patterns as if they were fp16. This results in very
  // strange looking assembly but should populate the register with appropriate
  // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
  // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
  // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
  // FIXME: We should be able to handle f128 as well with a clever lowering.
  const APInt ImmInt = Imm.bitcastToAPInt();
  if (VT == MVT::f64)
    IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
  else if (VT == MVT::f32)
    IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
  else if (VT == MVT::f16 || VT == MVT::bf16)
    IsLegal =
        (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
        Imm.isPosZero();

  // If we can not materialize in immediate field for fmov, check if the
  // value can be encoded as the immediate operand of a logical instruction.
  // The immediate value will be created with either MOVZ, MOVN, or ORR.
  // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
  // generate that fmov.
  if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
    // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
    // however the mov+fmov sequence is always better because of the reduced
    // cache pressure. The timings are still the same if you consider
    // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
    // movw+movk is fused). So we limit up to 2 instructions at most.
    // NOTE(review): 'Insn' is populated just above by the MOV-immediate
    // expansion helper — confirm against the full source.
    assert(Insn.size() <= 4 &&
           "Should be able to build any value with at most 4 moves");
    unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
    IsLegal = Insn.size() <= Limit;
  }

  LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
                    << " imm value: "; Imm.dump(););
  return IsLegal;
}
13089
13090//===----------------------------------------------------------------------===//
13091// AArch64 Optimization Hooks
13092//===----------------------------------------------------------------------===//
13093
/// Emit a hardware estimate node (FRECPE/FRSQRTE style, per 'Opcode') for
/// 'Operand' when its type is supported by NEON or SVE, and compute how many
/// Newton-Raphson refinement steps are needed in 'ExtraSteps'. Returns an
/// empty SDValue when no estimate is available for the type.
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
                           SDValue Operand, SelectionDAG &DAG,
                           int &ExtraSteps) {
  EVT VT = Operand.getValueType();
  if ((ST->hasNEON() &&
       (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
        VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
        VT == MVT::v4f32)) ||
      (ST->hasSVE() &&
       (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
      // NOTE(review): this inner scope is opened by a guard just above —
      // presumably only computing ExtraSteps when the caller left it
      // unspecified; confirm against the full source.
      // For the reciprocal estimates, convergence is quadratic, so the number
      // of digits is doubled after each iteration. In ARMv8, the accuracy of
      // the initial estimate is 2^-8. Thus the number of extra steps to refine
      // the result for float (23 mantissa bits) is 2 and for double (52
      // mantissa bits) is 3.
      constexpr unsigned AccurateBits = 8;
      unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
      ExtraSteps = DesiredBits <= AccurateBits
                       ? 0
                       : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
    }

    return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
  }

  return SDValue();
}
13122
13123SDValue
13124AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
13125 const DenormalMode &Mode) const {
13126 SDLoc DL(Op);
13127 EVT VT = Op.getValueType();
13128 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
13129 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
13130 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
13131}
13132
/// Result to use when the sqrt input test fires: the input value itself is
/// returned unchanged (for the zero/denormal case the estimate path is
/// skipped and the input already serves as the answer).
SDValue
AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
                                                   SelectionDAG &DAG) const {
  return Op;
}
13138
/// Produce an estimated square root (or reciprocal square root when
/// 'Reciprocal' is true) using FRSQRTE plus 'ExtraSteps' FRSQRTS refinement
/// iterations. Returns an empty SDValue when estimation is not enabled or
/// the type is unsupported.
SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
                                               SelectionDAG &DAG, int Enabled,
                                               int &ExtraSteps,
                                               bool &UseOneConst,
                                               bool Reciprocal) const {
  // NOTE(review): the enabling condition starts just above — confirm against
  // the full source.
      (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
    if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
                                       DAG, ExtraSteps)) {
      SDLoc DL(Operand);
      EVT VT = Operand.getValueType();

      // Ensure nodes can be recognized by isAssociativeAndCommutative.
      SDNodeFlags Flags =

      // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
      // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
      for (int i = ExtraSteps; i > 0; --i) {
        SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
                                   Flags);
        Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
      }
      // For sqrt (rather than rsqrt), multiply back by the input:
      // sqrt(x) == x * rsqrt(x).
      if (!Reciprocal)
        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);

      // Refinement already performed here; tell the caller none remain.
      ExtraSteps = 0;
      return Estimate;
    }

  return SDValue();
}
13172
/// Produce an estimated reciprocal using FRECPE plus 'ExtraSteps' FRECPS
/// refinement iterations. Returns an empty SDValue when estimation is not
/// enabled or the type is unsupported.
SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
                                                SelectionDAG &DAG, int Enabled,
                                                int &ExtraSteps) const {
  // NOTE(review): an enabling guard (and the 'Flags' initialization below)
  // are defined on lines just above — confirm against the full source.
  if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
                                     DAG, ExtraSteps)) {
    SDLoc DL(Operand);
    EVT VT = Operand.getValueType();

    // Newton reciprocal iteration: E * (2 - X * E)
    // AArch64 reciprocal iteration instruction: (2 - M * N)
    for (int i = ExtraSteps; i > 0; --i) {
      SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
                                 Estimate, Flags);
      Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
    }

    // Refinement already performed here; tell the caller none remain.
    ExtraSteps = 0;
    return Estimate;
  }

  return SDValue();
}
13198
13199//===----------------------------------------------------------------------===//
13200// AArch64 Inline Assembly Support
13201//===----------------------------------------------------------------------===//
13202
13203// Table of Constraints
13204// TODO: This is the current set of constraints supported by ARM for the
13205// compiler, not all of them may make sense.
13206//
13207// r - A general register
13208// w - An FP/SIMD register of some size in the range v0-v31
13209// x - An FP/SIMD register of some size in the range v0-v15
13210// I - Constant that can be used with an ADD instruction
13211// J - Constant that can be used with a SUB instruction
13212// K - Constant that can be used with a 32-bit logical instruction
13213// L - Constant that can be used with a 64-bit logical instruction
13214// M - Constant that can be used as a 32-bit MOV immediate
13215// N - Constant that can be used as a 64-bit MOV immediate
13216// Q - A memory reference with base register and no offset
13217// S - A symbolic address
13218// Y - Floating point constant zero
13219// Z - Integer constant zero
13220//
13221// Note that general register operands will be output using their 64-bit x
13222// register name, whatever the size of the variable, unless the asm operand
13223// is prefixed by the %w modifier. Floating-point and SIMD register operands
13224// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
13225// %q modifier.
13226const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
13227 // At this point, we have to lower this constraint to something else, so we
13228 // lower it to an "r" or "w". However, by doing this we will force the result
13229 // to be in register, while the X constraint is much more permissive.
13230 //
13231 // Although we are correct (we are free to emit anything, without
13232 // constraints), we might break use cases that would expect us to be more
13233 // efficient and emit something else.
13234 if (!Subtarget->hasFPARMv8())
13235 return "r";
13236
13237 if (ConstraintVT.isFloatingPoint())
13238 return "w";
13239
13240 if (ConstraintVT.isVector() &&
13241 (ConstraintVT.getSizeInBits() == 64 ||
13242 ConstraintVT.getSizeInBits() == 128))
13243 return "w";
13244
13245 return "r";
13246}
13247
13249
// Returns a {Reg, RegisterClass} tuple if the constraint is
// a specific predicate register.
//
// For some constraint like "{pn3}" the default path in
// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
// suitable register class for this register is "PPRorPNR", after which it
// determines that nxv16i1 is an appropriate type for the constraint, which is
// not what we want. The code here pre-empts this by matching the register
// explicitly.
//
// Accepted forms: "{pN}" (predicate), "{pnN}" (predicate-as-counter) and
// "{zN}" (SVE vector), with N in [0, 31]; anything else yields nullopt.
static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
  if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
      (Constraint[1] != 'p' && Constraint[1] != 'z'))
    return std::nullopt;

  bool IsPredicate = Constraint[1] == 'p';
  // Strip the surrounding braces and the leading 'p'/'z'.
  Constraint = Constraint.substr(2, Constraint.size() - 3);
  bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
  if (IsPredicateAsCount)
    Constraint = Constraint.drop_front(1);

  unsigned V;
  if (Constraint.getAsInteger(10, V) || V > 31)
    return std::nullopt;

  if (IsPredicateAsCount)
    return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
  if (IsPredicate)
    return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
  return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
}
13281
// Map an "Up<x>" inline-asm constraint string onto the PredicateConstraint
// enum (SVE predicate register class selectors); nullopt if unrecognized.
static std::optional<PredicateConstraint>
      .Case("Uph", PredicateConstraint::Uph)
      .Default(std::nullopt);
}
13290
// Pick the register class implied by a predicate constraint for type VT:
// PNR* classes hold svcount values, PPR* classes hold scalable i1 vectors.
// Returns null for any other type.
static const TargetRegisterClass *
  if (VT != MVT::aarch64svcount &&
      (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
    return nullptr;

  switch (Constraint) {
    // "Uph": restricted to the upper predicate registers p8-p15.
    return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
                                     : &AArch64::PPR_p8to15RegClass;
    // "Upl": restricted to the low predicate registers p0-p7.
    return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
                                     : &AArch64::PPR_3bRegClass;
    // "Upa": any predicate register.
    return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
                                     : &AArch64::PPRRegClass;
  }

  llvm_unreachable("Missing PredicateConstraint!");
}
13311
13313
// Map a "Uc<x>" inline-asm constraint string onto the ReducedGprConstraint
// enum (restricted GPR ranges used for SME matrix indices); nullopt if
// unrecognized.
static std::optional<ReducedGprConstraint>
      .Case("Uci", ReducedGprConstraint::Uci)
      .Default(std::nullopt);
}
13321
// Pick the restricted GPR class implied by a reduced-GPR constraint for a
// scalar integer of at most 64 bits; null for any other type.
static const TargetRegisterClass *
  if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
    return nullptr;

  switch (Constraint) {
    // "Uci": w8-w11, the matrix-index registers in the low group.
    return &AArch64::MatrixIndexGPR32_8_11RegClass;
    // "Ucj": w12-w15.
    return &AArch64::MatrixIndexGPR32_12_15RegClass;
  }

  llvm_unreachable("Missing ReducedGprConstraint!");
}
13336
// The set of cc code supported is from
// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
// Maps a "{@cc<cond>}" flag-output constraint string to the matching
// AArch64 condition code; unknown strings fall through to the switch's
// default. Note the GCC aliases: "cc" (carry clear) is LO and both "cs" and
// "hs" are HS.
      .Case("{@cchi}", AArch64CC::HI)
      .Case("{@cccs}", AArch64CC::HS)
      .Case("{@cclo}", AArch64CC::LO)
      .Case("{@ccls}", AArch64CC::LS)
      .Case("{@cccc}", AArch64CC::LO) // alias of "lo"
      .Case("{@cceq}", AArch64CC::EQ)
      .Case("{@ccgt}", AArch64CC::GT)
      .Case("{@ccge}", AArch64CC::GE)
      .Case("{@cclt}", AArch64CC::LT)
      .Case("{@ccle}", AArch64CC::LE)
      .Case("{@cchs}", AArch64CC::HS) // alias of "cs"
      .Case("{@ccne}", AArch64CC::NE)
      .Case("{@ccvc}", AArch64CC::VC)
      .Case("{@ccpl}", AArch64CC::PL)
      .Case("{@ccvs}", AArch64CC::VS)
      .Case("{@ccmi}", AArch64CC::MI)
  return Cond;
}
13360
/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
/// WZR, invert(<cond>)'.
                        SelectionDAG &DAG) {
  // CSINC with both sources zero yields 0 when the inverted condition holds
  // and 0+1 otherwise, i.e. exactly 1 when CC is satisfied by NZCV.
  return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
                     DAG.getConstant(0, DL, MVT::i32),
                     DAG.getConstant(0, DL, MVT::i32),
                     getCondCode(DAG, getInvertedCondCode(CC)), NZCV);
}
13370
// Lower @cc flag output via getSETCC.
/// Lower an "={@cc<cond>}" flag output operand: read NZCV after the asm,
/// turn the requested condition into a 0/1 value with CSET, and resize that
/// to the operand's scalar integer type. Returns an empty SDValue when the
/// constraint is not a flag-output constraint.
SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
    SDValue &Chain, SDValue &Glue, const SDLoc &DL,
    const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
  AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
  if (Cond == AArch64CC::Invalid)
    return SDValue();
  // The output variable should be a scalar integer.
  if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
      OpInfo.ConstraintVT.getSizeInBits() < 8)
    report_fatal_error("Flag output operand is of invalid type");

  // Get NZCV register. Only update chain when copyfrom is glued.
  if (Glue.getNode()) {
    Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
    Chain = Glue.getValue(1);
  } else
    Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
  // Extract CC code.
  SDValue CC = getSETCC(Cond, Glue, DL, DAG);

  // NOTE(review): 'Result' is declared on a line just above — confirm
  // against the full source.

  // Truncate or ZERO_EXTEND based on value types.
  if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
    Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
  else
    Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);

  return Result;
}
13402
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    // FP/SIMD register classes of various widths.
    case 'x':
    case 'w':
    case 'y':
      return C_RegisterClass;
    // An address with a single base register. Due to the way we
    // currently handle addresses it is the same as 'r'.
    case 'Q':
      return C_Memory;
    // Immediate-only constraints (validated in
    // LowerAsmOperandForConstraint).
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'Y':
    case 'Z':
      return C_Immediate;
    // 'z' (zero register) and 'S' need custom operand lowering.
    case 'z':
    case 'S': // A symbol or label reference with a constant offset
      return C_Other;
    }
  } else if (parsePredicateConstraint(Constraint))
    return C_RegisterClass;
  else if (parseReducedGprConstraint(Constraint))
    return C_RegisterClass;
  else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
    return C_Other;
  return TargetLowering::getConstraintType(Constraint);
}
13440
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
AArch64TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  // NOTE(review): 'weight' is declared on a line just above — confirm
  // against the full source.
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    break;
  // FP/SIMD register constraints only fit FP or vector operands well.
  case 'x':
  case 'w':
  case 'y':
    if (type->isFloatingPointTy() || type->isVectorTy())
      weight = CW_Register;
    break;
  // 'z' is the zero register, i.e. a constant operand.
  case 'z':
    weight = CW_Constant;
    break;
  // Multi-character "U..." constraints (predicate / reduced-GPR classes).
  case 'U':
    if (parsePredicateConstraint(constraint) ||
        parseReducedGprConstraint(constraint))
      weight = CW_Register;
    break;
  }
  return weight;
}
13476
/// Resolve an inline-asm register constraint to a (register, register class)
/// pair. Handles the single-letter classes ('r', 'w', 'x', 'y'), SVE/SME
/// register names, predicate and reduced-GPR constraints, the condition-code
/// pseudo-register, ZA/ZT0, and the "{vN}" aliases, before deferring to the
/// generic TargetLowering implementation.
std::pair<unsigned, const TargetRegisterClass *>
AArch64TargetLowering::getRegForInlineAsmConstraint(
    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'r':
      // Scalable values cannot live in GPRs.
      if (VT.isScalableVector())
        return std::make_pair(0U, nullptr);
      // With +ls64, 512-bit operands use the x8-tuple class.
      if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
        return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
      if (VT.getFixedSizeInBits() == 64)
        return std::make_pair(0U, &AArch64::GPR64commonRegClass);
      return std::make_pair(0U, &AArch64::GPR32commonRegClass);
    case 'w': {
      if (!Subtarget->hasFPARMv8())
        break;
      if (VT.isScalableVector()) {
        if (VT.getVectorElementType() != MVT::i1)
          return std::make_pair(0U, &AArch64::ZPRRegClass);
        return std::make_pair(0U, nullptr);
      }
      if (VT == MVT::Other)
        break;
      // Pick the FPR class matching the operand's width.
      uint64_t VTSize = VT.getFixedSizeInBits();
      if (VTSize == 16)
        return std::make_pair(0U, &AArch64::FPR16RegClass);
      if (VTSize == 32)
        return std::make_pair(0U, &AArch64::FPR32RegClass);
      if (VTSize == 64)
        return std::make_pair(0U, &AArch64::FPR64RegClass);
      if (VTSize == 128)
        return std::make_pair(0U, &AArch64::FPR128RegClass);
      break;
    }
    // The instructions that this constraint is designed for can
    // only take 128-bit registers so just use that regclass.
    case 'x':
      if (!Subtarget->hasFPARMv8())
        break;
      if (VT.isScalableVector())
        return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
      if (VT.getSizeInBits() == 128)
        return std::make_pair(0U, &AArch64::FPR128_loRegClass);
      break;
    case 'y':
      if (!Subtarget->hasFPARMv8())
        break;
      if (VT.isScalableVector())
        return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
      break;
    }
  } else {
    if (const auto P = parseSVERegAsConstraint(Constraint)) {
      // SME functions that are not in streaming mode, should
      // still observe clobbers of Z-registers by clobbering
      // the lower 128bits of those registers.
      if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
          !Subtarget->isSVEorStreamingSVEAvailable())
        return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
                              &AArch64::FPR128RegClass);
      return *P;
    }
    if (const auto PC = parsePredicateConstraint(Constraint))
      if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
        return std::make_pair(0U, RegClass);

    if (const auto RGC = parseReducedGprConstraint(Constraint))
      if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
        return std::make_pair(0U, RegClass);
  }
  // "{cc}" (and the flag-output forms, second half of this condition on the
  // line just above the return — confirm) map to the NZCV pseudo-register.
  if (StringRef("{cc}").equals_insensitive(Constraint) ||
    return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);

  if (Constraint == "{za}") {
    return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
  }

  if (Constraint == "{zt0}") {
    return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass *> Res;

  // Not found as a standard register?
  if (!Res.second) {
    unsigned Size = Constraint.size();
    if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
      int RegNo;
      bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
      if (!Failed && RegNo >= 0 && RegNo <= 31) {
        // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
        // By default we'll emit v0-v31 for this unless there's a modifier where
        // we'll emit the correct register as well.
        if (VT != MVT::Other && VT.getSizeInBits() == 64) {
          Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
          Res.second = &AArch64::FPR64RegClass;
        } else {
          Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
          Res.second = &AArch64::FPR128RegClass;
        }
      }
    }
  }

  // Without FP/SIMD support, reject anything that is not a plain GPR.
  if (Res.second && !Subtarget->hasFPARMv8() &&
      !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
      !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
    return std::make_pair(0U, nullptr);

  return Res;
}
13593
13595 llvm::Type *Ty,
13596 bool AllowUnknown) const {
13597 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
13598 return EVT(MVT::i64x8);
13599
13600 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
13601}
13602
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void AArch64TargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  SDValue Result;

  // Currently only support length 1 constraints.
  if (Constraint.size() != 1)
    return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default:
    break;

  // This set of constraints deal with valid constants for various instructions.
  // Validate and return a target constant for them if we can.
  case 'z': {
    // 'z' maps to xzr or wzr so it needs an input of 0.
    if (!isNullConstant(Op))
      return;

    if (Op.getValueType() == MVT::i64)
      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
    else
      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
    break;
  }
  case 'S':
    // Use the generic code path for "s". In GCC's aarch64 port, "S" is
    // supported for PIC while "s" isn't, making "s" less useful. We implement
    // "S" but not "s".
    // NOTE(review): the statement handling 'S' is on a line just above this
    // break — confirm against the full source.
    break;

  case 'I':
  case 'J':
  case 'K':
  case 'L':
  case 'M':
  case 'N':
    // NOTE(review): 'C' is the ConstantSDNode cast of Op, produced on a line
    // just above — confirm against the full source.
    if (!C)
      return;

    // Grab the value and do some validation.
    uint64_t CVal = C->getZExtValue();
    switch (ConstraintLetter) {
    // The I constraint applies only to simple ADD or SUB immediate operands:
    // i.e. 0 to 4095 with optional shift by 12
    // The J constraint applies only to ADD or SUB immediates that would be
    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
    // instruction [or vice versa], in other words -1 to -4095 with optional
    // left shift by 12.
    case 'I':
      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
        break;
      return;
    case 'J': {
      uint64_t NVal = -C->getSExtValue();
      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
        CVal = C->getSExtValue();
        break;
      }
      return;
    }
    // The K and L constraints apply *only* to logical immediates, including
    // what used to be the MOVI alias for ORR (though the MOVI alias has now
    // been removed and MOV should be used). So these constraints have to
    // distinguish between bit patterns that are valid 32-bit or 64-bit
    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
    // versa.
    case 'K':
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      return;
    case 'L':
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      return;
    // The M and N constraints are a superset of K and L respectively, for use
    // with the MOV (immediate) alias. As well as the logical immediates they
    // also match 32 or 64-bit immediates that can be loaded either using a
    // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
    // (M) or 64-bit 0x1234000000000000 (N) etc.
    // As a note some of this code is liberally stolen from the asm parser.
    case 'M': {
      if (!isUInt<32>(CVal))
        return;
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      // A single MOVZ with the value in either halfword.
      if ((CVal & 0xFFFF) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      // A single MOVN: the complement fits one halfword.
      uint64_t NCVal = ~(uint32_t)CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      return;
    }
    case 'N': {
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      // A single MOVZ with the value in any of the four halfwords.
      if ((CVal & 0xFFFFULL) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF00000000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF000000000000ULL) == CVal)
        break;
      // A single MOVN: the complement fits one halfword.
      uint64_t NCVal = ~CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
        break;
      return;
    }
    default:
      return;
    }

    // All assembler immediates are 64-bit integers.
    Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
    break;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }

  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
13745
13746//===----------------------------------------------------------------------===//
13747// AArch64 Advanced SIMD Support
13748//===----------------------------------------------------------------------===//
13749
/// WidenVector - Given a value in the V64 register class, produce the
/// equivalent value in the V128 register class.
///
/// The widening inserts the 64-bit value into the low half of an undef
/// double-length vector of the same element type.
  EVT VT = V64Reg.getValueType();
  unsigned NarrowSize = VT.getVectorNumElements();
  MVT EltTy = VT.getVectorElementType().getSimpleVT();
  // Same element type, twice the element count.
  MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
  SDLoc DL(V64Reg);

  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
                     V64Reg, DAG.getConstant(0, DL, MVT::i64));
}
13762
13763/// getExtFactor - Determine the adjustment factor for the position when
13764/// generating an "extract from vector registers" instruction.
13765static unsigned getExtFactor(SDValue &V) {
13766 EVT EltType = V.getValueType().getVectorElementType();
13767 return EltType.getSizeInBits() / 8;
13768}
13769
// Check if a vector is built from one vector via extracted elements of
// another together with an AND mask, ensuring that all elements fit
// within range. This can be reconstructed using AND and NEON's TBL1.
//
// Returns the AND+TBL1 replacement DAG, or an empty SDValue when the
// BUILD_VECTOR does not match the pattern.
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  assert(!VT.isScalableVector() &&
         "Scalable vectors cannot be used with ISD::BUILD_VECTOR");

  // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
  // directly to TBL1.
  if (VT != MVT::v16i8 && VT != MVT::v8i8)
    return SDValue();

  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 8 || NumElts == 16) &&
         "Need to have exactly 8 or 16 elements in vector.");

  // The single vector the data elements come from, the vector the TBL
  // indices come from, and the per-element AND immediates (if any).
  SDValue SourceVec;
  SDValue MaskSourceVec;
  SmallVector<SDValue, 16> AndMaskConstants;

  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // All data elements must come from the same source vector.
    SDValue OperandSourceVec = V.getOperand(0);
    if (!SourceVec)
      SourceVec = OperandSourceVec;
    else if (SourceVec != OperandSourceVec)
      return SDValue();

    // This only looks at shuffles with elements that are
    // a) truncated by a constant AND mask extracted from a mask vector, or
    // b) extracted directly from a mask vector.
    SDValue MaskSource = V.getOperand(1);
    if (MaskSource.getOpcode() == ISD::AND) {
      if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
        return SDValue();

      AndMaskConstants.push_back(MaskSource.getOperand(1));
      MaskSource = MaskSource->getOperand(0);
    } else if (!AndMaskConstants.empty()) {
      // Either all or no operands should have an AND mask.
      return SDValue();
    }

    // An ANY_EXTEND may be inserted between the AND and the source vector
    // extraction. We don't care about that, so we can just skip it.
    if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
      MaskSource = MaskSource.getOperand(0);

    if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // The mask element feeding lane i must itself come from lane i.
    SDValue MaskIdx = MaskSource.getOperand(1);
    if (!isa<ConstantSDNode>(MaskIdx) ||
        !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
      return SDValue();

    // We only apply this if all elements come from the same vector with the
    // same vector type.
    if (!MaskSourceVec) {
      MaskSourceVec = MaskSource->getOperand(0);
      if (MaskSourceVec.getValueType() != VT)
        return SDValue();
    } else if (MaskSourceVec != MaskSource->getOperand(0)) {
      return SDValue();
    }
  }

  // We need a v16i8 for TBL, so we extend the source with a placeholder vector
  // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
  // insert, we know that the index in the mask must be smaller than the number
  // of elements in the source, or we would have an out-of-bounds access.
  if (NumElts == 8)
    SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, SourceVec,
                            DAG.getUNDEF(VT));

  // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
  if (!AndMaskConstants.empty())
    MaskSourceVec = DAG.getNode(ISD::AND, DL, VT, MaskSourceVec,
                                DAG.getBuildVector(VT, DL, AndMaskConstants));

  // NOTE(review): the first getNode arguments (intrinsic node kind, DL, VT)
  // are on a line just above — confirm against the full source.
  return DAG.getNode(
      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
      SourceVec, MaskSourceVec);
}
13861
13862// Gather data to see if the operation can be modelled as a
13863// shuffle in combination with VEXTs.
13865 SelectionDAG &DAG) const {
13866 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13867 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
13868 SDLoc DL(Op);
13869 EVT VT = Op.getValueType();
13870 assert(!VT.isScalableVector() &&
13871 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13872 unsigned NumElts = VT.getVectorNumElements();
13873
13874 struct ShuffleSourceInfo {
13875 SDValue Vec;
13876 unsigned MinElt;
13877 unsigned MaxElt;
13878
13879 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
13880 // be compatible with the shuffle we intend to construct. As a result
13881 // ShuffleVec will be some sliding window into the original Vec.
13882 SDValue ShuffleVec;
13883
13884 // Code should guarantee that element i in Vec starts at element "WindowBase
13885 // + i * WindowScale in ShuffleVec".
13886 int WindowBase;
13887 int WindowScale;
13888
13889 ShuffleSourceInfo(SDValue Vec)
13890 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
13891 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
13892
13893 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
13894 };
13895
13896 // First gather all vectors used as an immediate source for this BUILD_VECTOR
13897 // node.
13899 for (unsigned i = 0; i < NumElts; ++i) {
13900 SDValue V = Op.getOperand(i);
13901 if (V.isUndef())
13902 continue;
13903 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13904 !isa<ConstantSDNode>(V.getOperand(1)) ||
13905 V.getOperand(0).getValueType().isScalableVector()) {
13906 LLVM_DEBUG(
13907 dbgs() << "Reshuffle failed: "
13908 "a shuffle can only come from building a vector from "
13909 "various elements of other fixed-width vectors, provided "
13910 "their indices are constant\n");
13911 return SDValue();
13912 }
13913
13914 // Add this element source to the list if it's not already there.
13915 SDValue SourceVec = V.getOperand(0);
13916 auto Source = find(Sources, SourceVec);
13917 if (Source == Sources.end())
13918 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
13919
13920 // Update the minimum and maximum lane number seen.
13921 unsigned EltNo = V.getConstantOperandVal(1);
13922 Source->MinElt = std::min(Source->MinElt, EltNo);
13923 Source->MaxElt = std::max(Source->MaxElt, EltNo);
13924 }
13925
13926 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
13927 // better than moving to/from gpr registers for larger vectors.
13928 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
13929 // Construct a mask for the tbl. We may need to adjust the index for types
13930 // larger than i8.
13932 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
13933 for (unsigned I = 0; I < NumElts; ++I) {
13934 SDValue V = Op.getOperand(I);
13935 if (V.isUndef()) {
13936 for (unsigned OF = 0; OF < OutputFactor; OF++)
13937 Mask.push_back(-1);
13938 continue;
13939 }
13940 // Set the Mask lanes adjusted for the size of the input and output
13941 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
13942 // output element, adjusted in their positions per input and output types.
13943 unsigned Lane = V.getConstantOperandVal(1);
13944 for (unsigned S = 0; S < Sources.size(); S++) {
13945 if (V.getOperand(0) == Sources[S].Vec) {
13946 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
13947 unsigned InputBase = 16 * S + Lane * InputSize / 8;
13948 for (unsigned OF = 0; OF < OutputFactor; OF++)
13949 Mask.push_back(InputBase + OF);
13950 break;
13951 }
13952 }
13953 }
13954
13955 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
13956 // v16i8, and the TBLMask
13957 SmallVector<SDValue, 16> TBLOperands;
13958 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
13959 ? Intrinsic::aarch64_neon_tbl3
13960 : Intrinsic::aarch64_neon_tbl4,
13961 DL, MVT::i32));
13962 for (unsigned i = 0; i < Sources.size(); i++) {
13963 SDValue Src = Sources[i].Vec;
13964 EVT SrcVT = Src.getValueType();
13965 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
13966 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
13967 "Expected a legally typed vector");
13968 if (SrcVT.is64BitVector())
13969 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Src,
13970 DAG.getUNDEF(MVT::v8i8));
13971 TBLOperands.push_back(Src);
13972 }
13973
13975 for (unsigned i = 0; i < Mask.size(); i++)
13976 TBLMask.push_back(DAG.getConstant(Mask[i], DL, MVT::i32));
13977 assert((Mask.size() == 8 || Mask.size() == 16) &&
13978 "Expected a v8i8 or v16i8 Mask");
13979 TBLOperands.push_back(DAG.getBuildVector(
13980 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, TBLMask));
13981
13982 SDValue Shuffle =
13984 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
13985 return DAG.getBitcast(VT, Shuffle);
13986 }
13987
13988 if (Sources.size() > 2) {
13989 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
13990 << "sensible when at most two source vectors are "
13991 << "involved\n");
13992 return SDValue();
13993 }
13994
13995 // Find out the smallest element size among result and two sources, and use
13996 // it as element size to build the shuffle_vector.
13997 EVT SmallestEltTy = VT.getVectorElementType();
13998 for (auto &Source : Sources) {
13999 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
14000 if (SrcEltTy.bitsLT(SmallestEltTy)) {
14001 SmallestEltTy = SrcEltTy;
14002 }
14003 }
14004 unsigned ResMultiplier =
14005 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
14006 uint64_t VTSize = VT.getFixedSizeInBits();
14007 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
14008 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
14009
14010 // If the source vector is too wide or too narrow, we may nevertheless be able
14011 // to construct a compatible shuffle either by concatenating it with UNDEF or
14012 // extracting a suitable range of elements.
14013 for (auto &Src : Sources) {
14014 EVT SrcVT = Src.ShuffleVec.getValueType();
14015
14016 TypeSize SrcVTSize = SrcVT.getSizeInBits();
14017 if (SrcVTSize == TypeSize::getFixed(VTSize))
14018 continue;
14019
14020 // This stage of the search produces a source with the same element type as
14021 // the original, but with a total width matching the BUILD_VECTOR output.
14022 EVT EltVT = SrcVT.getVectorElementType();
14023 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
14024 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
14025
14026 if (SrcVTSize.getFixedValue() < VTSize) {
14027 assert(2 * SrcVTSize == VTSize);
14028 // We can pad out the smaller vector for free, so if it's part of a
14029 // shuffle...
14030 Src.ShuffleVec =
14031 DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Src.ShuffleVec,
14032 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
14033 continue;
14034 }
14035
14036 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
14037 LLVM_DEBUG(
14038 dbgs() << "Reshuffle failed: result vector too small to extract\n");
14039 return SDValue();
14040 }
14041
14042 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
14043 LLVM_DEBUG(
14044 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
14045 return SDValue();
14046 }
14047
14048 if (Src.MinElt >= NumSrcElts) {
14049 // The extraction can just take the second half
14050 Src.ShuffleVec =
14051 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14052 DAG.getConstant(NumSrcElts, DL, MVT::i64));
14053 Src.WindowBase = -NumSrcElts;
14054 } else if (Src.MaxElt < NumSrcElts) {
14055 // The extraction can just take the first half
14056 Src.ShuffleVec =
14057 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14058 DAG.getConstant(0, DL, MVT::i64));
14059 } else {
14060 // An actual VEXT is needed
14061 SDValue VEXTSrc1 =
14062 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14063 DAG.getConstant(0, DL, MVT::i64));
14064 SDValue VEXTSrc2 =
14065 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14066 DAG.getConstant(NumSrcElts, DL, MVT::i64));
14067 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
14068
14069 if (!SrcVT.is64BitVector()) {
14070 LLVM_DEBUG(
14071 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
14072 "for SVE vectors.");
14073 return SDValue();
14074 }
14075
14076 Src.ShuffleVec =
14077 DAG.getNode(AArch64ISD::EXT, DL, DestVT, VEXTSrc1, VEXTSrc2,
14078 DAG.getConstant(Imm, DL, MVT::i32));
14079 Src.WindowBase = -Src.MinElt;
14080 }
14081 }
14082
14083 // Another possible incompatibility occurs from the vector element types. We
14084 // can fix this by bitcasting the source vectors to the same type we intend
14085 // for the shuffle.
14086 for (auto &Src : Sources) {
14087 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
14088 if (SrcEltTy == SmallestEltTy)
14089 continue;
14090 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
14091 if (DAG.getDataLayout().isBigEndian()) {
14092 Src.ShuffleVec =
14093 DAG.getNode(AArch64ISD::NVCAST, DL, ShuffleVT, Src.ShuffleVec);
14094 } else {
14095 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Src.ShuffleVec);
14096 }
14097 Src.WindowScale =
14098 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
14099 Src.WindowBase *= Src.WindowScale;
14100 }
14101
14102 // Final check before we try to actually produce a shuffle.
14103 LLVM_DEBUG({
14104 for (auto Src : Sources)
14105 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
14106 });
14107
14108 // The stars all align, our next step is to produce the mask for the shuffle.
14109 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
14110 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
14111 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
14112 SDValue Entry = Op.getOperand(i);
14113 if (Entry.isUndef())
14114 continue;
14115
14116 auto Src = find(Sources, Entry.getOperand(0));
14117 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
14118
14119 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
14120 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
14121 // segment.
14122 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
14123 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
14124 VT.getScalarSizeInBits());
14125 int LanesDefined = BitsDefined / BitsPerShuffleLane;
14126
14127 // This source is expected to fill ResMultiplier lanes of the final shuffle,
14128 // starting at the appropriate offset.
14129 int *LaneMask = &Mask[i * ResMultiplier];
14130
14131 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
14132 ExtractBase += NumElts * (Src - Sources.begin());
14133 for (int j = 0; j < LanesDefined; ++j)
14134 LaneMask[j] = ExtractBase + j;
14135 }
14136
14137 // Final check before we try to produce nonsense...
14138 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
14139 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
14140 return SDValue();
14141 }
14142
14143 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
14144 for (unsigned i = 0; i < Sources.size(); ++i)
14145 ShuffleOps[i] = Sources[i].ShuffleVec;
14146
14147 SDValue Shuffle =
14148 DAG.getVectorShuffle(ShuffleVT, DL, ShuffleOps[0], ShuffleOps[1], Mask);
14149 SDValue V;
14150 if (DAG.getDataLayout().isBigEndian()) {
14151 V = DAG.getNode(AArch64ISD::NVCAST, DL, VT, Shuffle);
14152 } else {
14153 V = DAG.getNode(ISD::BITCAST, DL, VT, Shuffle);
14154 }
14155
14156 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
14157 dbgs() << "Reshuffle, creating node: "; V.dump(););
14158
14159 return V;
14160}
14161
14162// check if an EXT instruction can handle the shuffle mask when the
14163// vector sources of the shuffle are the same.
14164static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
14165 unsigned NumElts = VT.getVectorNumElements();
14166
14167 // Assume that the first shuffle index is not UNDEF. Fail if it is.
14168 if (M[0] < 0)
14169 return false;
14170
14171 Imm = M[0];
14172
14173 // If this is a VEXT shuffle, the immediate value is the index of the first
14174 // element. The other shuffle indices must be the successive elements after
14175 // the first one.
14176 unsigned ExpectedElt = Imm;
14177 for (unsigned i = 1; i < NumElts; ++i) {
14178 // Increment the expected index. If it wraps around, just follow it
14179 // back to index zero and keep going.
14180 ++ExpectedElt;
14181 if (ExpectedElt == NumElts)
14182 ExpectedElt = 0;
14183
14184 if (M[i] < 0)
14185 continue; // ignore UNDEF indices
14186 if (ExpectedElt != static_cast<unsigned>(M[i]))
14187 return false;
14188 }
14189
14190 return true;
14191}
14192
14193// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
14194// v4i32s. This is really a truncate, which we can construct out of (legal)
14195// concats and truncate nodes.
14197 if (V.getValueType() != MVT::v16i8)
14198 return SDValue();
14199 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
14200
14201 for (unsigned X = 0; X < 4; X++) {
14202 // Check the first item in each group is an extract from lane 0 of a v4i32
14203 // or v4i16.
14204 SDValue BaseExt = V.getOperand(X * 4);
14205 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14206 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
14207 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
14208 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
14209 BaseExt.getConstantOperandVal(1) != 0)
14210 return SDValue();
14211 SDValue Base = BaseExt.getOperand(0);
14212 // And check the other items are extracts from the same vector.
14213 for (unsigned Y = 1; Y < 4; Y++) {
14214 SDValue Ext = V.getOperand(X * 4 + Y);
14215 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14216 Ext.getOperand(0) != Base ||
14218 Ext.getConstantOperandVal(1) != Y)
14219 return SDValue();
14220 }
14221 }
14222
14223 // Turn the buildvector into a series of truncates and concates, which will
14224 // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
14225 // concat together to produce 2 v8i16. These are both truncated and concat
14226 // together.
14227 SDLoc DL(V);
14228 SDValue Trunc[4] = {
14229 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
14230 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
14231 for (SDValue &V : Trunc)
14232 if (V.getValueType() == MVT::v4i32)
14233 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
14234 SDValue Concat0 =
14235 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
14236 SDValue Concat1 =
14237 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
14238 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
14239 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
14240 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
14241}
14242
14243/// Check if a vector shuffle corresponds to a DUP instructions with a larger
14244/// element width than the vector lane type. If that is the case the function
14245/// returns true and writes the value of the DUP instruction lane operand into
14246/// DupLaneOp
14247static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
14248 unsigned &DupLaneOp) {
14249 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
14250 "Only possible block sizes for wide DUP are: 16, 32, 64");
14251
14252 if (BlockSize <= VT.getScalarSizeInBits())
14253 return false;
14254 if (BlockSize % VT.getScalarSizeInBits() != 0)
14255 return false;
14256 if (VT.getSizeInBits() % BlockSize != 0)
14257 return false;
14258
14259 size_t SingleVecNumElements = VT.getVectorNumElements();
14260 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
14261 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
14262
14263 // We are looking for masks like
14264 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
14265 // might be replaced by 'undefined'. BlockIndices will eventually contain
14266 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
14267 // for the above examples)
14268 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
14269 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
14270 for (size_t I = 0; I < NumEltsPerBlock; I++) {
14271 int Elt = M[BlockIndex * NumEltsPerBlock + I];
14272 if (Elt < 0)
14273 continue;
14274 // For now we don't support shuffles that use the second operand
14275 if ((unsigned)Elt >= SingleVecNumElements)
14276 return false;
14277 if (BlockElts[I] < 0)
14278 BlockElts[I] = Elt;
14279 else if (BlockElts[I] != Elt)
14280 return false;
14281 }
14282
14283 // We found a candidate block (possibly with some undefs). It must be a
14284 // sequence of consecutive integers starting with a value divisible by
14285 // NumEltsPerBlock with some values possibly replaced by undef-s.
14286
14287 // Find first non-undef element
14288 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
14289 assert(FirstRealEltIter != BlockElts.end() &&
14290 "Shuffle with all-undefs must have been caught by previous cases, "
14291 "e.g. isSplat()");
14292 if (FirstRealEltIter == BlockElts.end()) {
14293 DupLaneOp = 0;
14294 return true;
14295 }
14296
14297 // Index of FirstRealElt in BlockElts
14298 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
14299
14300 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
14301 return false;
14302 // BlockElts[0] must have the following value if it isn't undef:
14303 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
14304
14305 // Check the first element
14306 if (Elt0 % NumEltsPerBlock != 0)
14307 return false;
14308 // Check that the sequence indeed consists of consecutive integers (modulo
14309 // undefs)
14310 for (size_t I = 0; I < NumEltsPerBlock; I++)
14311 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
14312 return false;
14313
14314 DupLaneOp = Elt0 / NumEltsPerBlock;
14315 return true;
14316}
14317
14318// check if an EXT instruction can handle the shuffle mask when the
14319// vector sources of the shuffle are different.
14320static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
14321 unsigned &Imm) {
14322 // Look for the first non-undef element.
14323 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
14324
14325 // Benefit from APInt to handle overflow when calculating expected element.
14326 unsigned NumElts = VT.getVectorNumElements();
14327 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
14328 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
14329 /*implicitTrunc=*/true);
14330 // The following shuffle indices must be the successive elements after the
14331 // first real element.
14332 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
14333 return Elt != ExpectedElt++ && Elt >= 0;
14334 });
14335 if (FoundWrongElt)
14336 return false;
14337
14338 // The index of an EXT is the first element if it is not UNDEF.
14339 // Watch out for the beginning UNDEFs. The EXT index should be the expected
14340 // value of the first element. E.g.
14341 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
14342 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
14343 // ExpectedElt is the last mask index plus 1.
14344 Imm = ExpectedElt.getZExtValue();
14345
14346 // There are two difference cases requiring to reverse input vectors.
14347 // For example, for vector <4 x i32> we have the following cases,
14348 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
14349 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
14350 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
14351 // to reverse two input vectors.
14352 if (Imm < NumElts)
14353 ReverseEXT = true;
14354 else
14355 Imm -= NumElts;
14356
14357 return true;
14358}
14359
14360/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
14361/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14362/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
14363static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14364 unsigned NumElts = VT.getVectorNumElements();
14365 if (NumElts % 2 != 0)
14366 return false;
14367 WhichResult = (M[0] == 0 ? 0 : 1);
14368 unsigned Idx = WhichResult * NumElts / 2;
14369 for (unsigned i = 0; i != NumElts; i += 2) {
14370 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
14371 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
14372 return false;
14373 Idx += 1;
14374 }
14375
14376 return true;
14377}
14378
14379/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
14380/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14381/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
14382static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14383 unsigned Half = VT.getVectorNumElements() / 2;
14384 WhichResult = (M[0] == 0 ? 0 : 1);
14385 for (unsigned j = 0; j != 2; ++j) {
14386 unsigned Idx = WhichResult;
14387 for (unsigned i = 0; i != Half; ++i) {
14388 int MIdx = M[i + j * Half];
14389 if (MIdx >= 0 && (unsigned)MIdx != Idx)
14390 return false;
14391 Idx += 2;
14392 }
14393 }
14394
14395 return true;
14396}
14397
14398/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
14399/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14400/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
14401static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14402 unsigned NumElts = VT.getVectorNumElements();
14403 if (NumElts % 2 != 0)
14404 return false;
14405 WhichResult = (M[0] == 0 ? 0 : 1);
14406 for (unsigned i = 0; i < NumElts; i += 2) {
14407 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
14408 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
14409 return false;
14410 }
14411 return true;
14412}
14413
14414static bool isINSMask(ArrayRef<int> M, int NumInputElements,
14415 bool &DstIsLeft, int &Anomaly) {
14416 if (M.size() != static_cast<size_t>(NumInputElements))
14417 return false;
14418
14419 int NumLHSMatch = 0, NumRHSMatch = 0;
14420 int LastLHSMismatch = -1, LastRHSMismatch = -1;
14421
14422 for (int i = 0; i < NumInputElements; ++i) {
14423 if (M[i] == -1) {
14424 ++NumLHSMatch;
14425 ++NumRHSMatch;
14426 continue;
14427 }
14428
14429 if (M[i] == i)
14430 ++NumLHSMatch;
14431 else
14432 LastLHSMismatch = i;
14433
14434 if (M[i] == i + NumInputElements)
14435 ++NumRHSMatch;
14436 else
14437 LastRHSMismatch = i;
14438 }
14439
14440 if (NumLHSMatch == NumInputElements - 1) {
14441 DstIsLeft = true;
14442 Anomaly = LastLHSMismatch;
14443 return true;
14444 } else if (NumRHSMatch == NumInputElements - 1) {
14445 DstIsLeft = false;
14446 Anomaly = LastRHSMismatch;
14447 return true;
14448 }
14449
14450 return false;
14451}
14452
14453static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
14454 if (VT.getSizeInBits() != 128)
14455 return false;
14456
14457 unsigned NumElts = VT.getVectorNumElements();
14458
14459 for (int I = 0, E = NumElts / 2; I != E; I++) {
14460 if (Mask[I] != I)
14461 return false;
14462 }
14463
14464 int Offset = NumElts / 2;
14465 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
14466 if (Mask[I] != I + SplitLHS * Offset)
14467 return false;
14468 }
14469
14470 return true;
14471}
14472
14474 SDLoc DL(Op);
14475 EVT VT = Op.getValueType();
14476 SDValue V0 = Op.getOperand(0);
14477 SDValue V1 = Op.getOperand(1);
14478 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14479
14482 return SDValue();
14483
14484 bool SplitV0 = V0.getValueSizeInBits() == 128;
14485
14486 if (!isConcatMask(Mask, VT, SplitV0))
14487 return SDValue();
14488
14489 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14490 if (SplitV0) {
14491 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
14492 DAG.getConstant(0, DL, MVT::i64));
14493 }
14494 if (V1.getValueSizeInBits() == 128) {
14495 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
14496 DAG.getConstant(0, DL, MVT::i64));
14497 }
14498 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
14499}
14500
14501/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
14502/// the specified operations to build the shuffle. ID is the perfect-shuffle
14503//ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
14504//table entry and LHS/RHS are the immediate inputs for this stage of the
14505//shuffle.
14507 unsigned PFEntry, SDValue LHS,
14508 SDValue RHS, SelectionDAG &DAG,
14509 const SDLoc &DL) {
14510 unsigned OpNum = (PFEntry >> 26) & 0x0F;
14511 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
14512 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
14513
14514 enum {
14515 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
14516 OP_VREV,
14517 OP_VDUP0,
14518 OP_VDUP1,
14519 OP_VDUP2,
14520 OP_VDUP3,
14521 OP_VEXT1,
14522 OP_VEXT2,
14523 OP_VEXT3,
14524 OP_VUZPL, // VUZP, left result
14525 OP_VUZPR, // VUZP, right result
14526 OP_VZIPL, // VZIP, left result
14527 OP_VZIPR, // VZIP, right result
14528 OP_VTRNL, // VTRN, left result
14529 OP_VTRNR, // VTRN, right result
14530 OP_MOVLANE // Move lane. RHSID is the lane to move into
14531 };
14532
14533 if (OpNum == OP_COPY) {
14534 if (LHSID == (1 * 9 + 2) * 9 + 3)
14535 return LHS;
14536 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
14537 return RHS;
14538 }
14539
14540 if (OpNum == OP_MOVLANE) {
14541 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
14542 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
14543 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
14544 Elt = 3 - Elt;
14545 while (Elt > 0) {
14546 ID /= 9;
14547 Elt--;
14548 }
14549 return (ID % 9 == 8) ? -1 : ID % 9;
14550 };
14551
14552 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
14553 // get the lane to move from the PFID, which is always from the
14554 // original vectors (V1 or V2).
14556 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
14557 EVT VT = OpLHS.getValueType();
14558 assert(RHSID < 8 && "Expected a lane index for RHSID!");
14559 unsigned ExtLane = 0;
14560 SDValue Input;
14561
14562 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
14563 // convert into a higher type.
14564 if (RHSID & 0x4) {
14565 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
14566 if (MaskElt == -1)
14567 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
14568 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14569 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
14570 Input = MaskElt < 2 ? V1 : V2;
14571 if (VT.getScalarSizeInBits() == 16) {
14572 Input = DAG.getBitcast(MVT::v2f32, Input);
14573 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
14574 } else {
14575 assert(VT.getScalarSizeInBits() == 32 &&
14576 "Expected 16 or 32 bit shuffle elements");
14577 Input = DAG.getBitcast(MVT::v2f64, Input);
14578 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
14579 }
14580 } else {
14581 int MaskElt = getPFIDLane(ID, RHSID);
14582 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14583 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
14584 Input = MaskElt < 4 ? V1 : V2;
14585 // Be careful about creating illegal types. Use f16 instead of i16.
14586 if (VT == MVT::v4i16) {
14587 Input = DAG.getBitcast(MVT::v4f16, Input);
14588 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
14589 }
14590 }
14592 Input.getValueType().getVectorElementType(),
14593 Input, DAG.getVectorIdxConstant(ExtLane, DL));
14594 SDValue Ins =
14595 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Input.getValueType(), OpLHS,
14596 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, DL));
14597 return DAG.getBitcast(VT, Ins);
14598 }
14599
14600 SDValue OpLHS, OpRHS;
14601 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
14602 RHS, DAG, DL);
14603 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
14604 RHS, DAG, DL);
14605 EVT VT = OpLHS.getValueType();
14606
14607 switch (OpNum) {
14608 default:
14609 llvm_unreachable("Unknown shuffle opcode!");
14610 case OP_VREV:
14611 // VREV divides the vector in half and swaps within the half.
14612 if (VT.getVectorElementType() == MVT::i32 ||
14613 VT.getVectorElementType() == MVT::f32)
14614 return DAG.getNode(AArch64ISD::REV64, DL, VT, OpLHS);
14615 // vrev <4 x i16> -> REV32
14616 if (VT.getVectorElementType() == MVT::i16 ||
14617 VT.getVectorElementType() == MVT::f16 ||
14618 VT.getVectorElementType() == MVT::bf16)
14619 return DAG.getNode(AArch64ISD::REV32, DL, VT, OpLHS);
14620 // vrev <4 x i8> -> REV16
14621 assert(VT.getVectorElementType() == MVT::i8);
14622 return DAG.getNode(AArch64ISD::REV16, DL, VT, OpLHS);
14623 case OP_VDUP0:
14624 case OP_VDUP1:
14625 case OP_VDUP2:
14626 case OP_VDUP3: {
14627 EVT EltTy = VT.getVectorElementType();
14628 unsigned Opcode;
14629 if (EltTy == MVT::i8)
14630 Opcode = AArch64ISD::DUPLANE8;
14631 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
14632 Opcode = AArch64ISD::DUPLANE16;
14633 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
14634 Opcode = AArch64ISD::DUPLANE32;
14635 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
14636 Opcode = AArch64ISD::DUPLANE64;
14637 else
14638 llvm_unreachable("Invalid vector element type?");
14639
14640 if (VT.getSizeInBits() == 64)
14641 OpLHS = WidenVector(OpLHS, DAG);
14642 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, DL, MVT::i64);
14643 return DAG.getNode(Opcode, DL, VT, OpLHS, Lane);
14644 }
14645 case OP_VEXT1:
14646 case OP_VEXT2:
14647 case OP_VEXT3: {
14648 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
14649 return DAG.getNode(AArch64ISD::EXT, DL, VT, OpLHS, OpRHS,
14650 DAG.getConstant(Imm, DL, MVT::i32));
14651 }
14652 case OP_VUZPL:
14653 return DAG.getNode(AArch64ISD::UZP1, DL, VT, OpLHS, OpRHS);
14654 case OP_VUZPR:
14655 return DAG.getNode(AArch64ISD::UZP2, DL, VT, OpLHS, OpRHS);
14656 case OP_VZIPL:
14657 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, OpLHS, OpRHS);
14658 case OP_VZIPR:
14659 return DAG.getNode(AArch64ISD::ZIP2, DL, VT, OpLHS, OpRHS);
14660 case OP_VTRNL:
14661 return DAG.getNode(AArch64ISD::TRN1, DL, VT, OpLHS, OpRHS);
14662 case OP_VTRNR:
14663 return DAG.getNode(AArch64ISD::TRN2, DL, VT, OpLHS, OpRHS);
14664 }
14665}
14666
// Lower a VECTOR_SHUFFLE to a NEON TBL/TBL2 table lookup by materialising the
// shuffle mask as a constant vector of byte indices into the (concatenated)
// source registers.
// NOTE(review): the first line of this signature (original line 14667,
// presumably "static SDValue GenerateTBL(SDValue Op, ArrayRef<int>
// ShuffleMask,") is missing from this extract — confirm against upstream.
14668 SelectionDAG &DAG) {
14669 // Check to see if we can use the TBL instruction.
14670 SDValue V1 = Op.getOperand(0);
14671 SDValue V2 = Op.getOperand(1);
14672 SDLoc DL(Op);
14673
14674 EVT EltVT = Op.getValueType().getVectorElementType();
14675 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
14676
// Prefer a defined first source: if V1 is undef or all-zero, swap the inputs
// and remember to rotate the generated byte indices below.
14677 bool Swap = false;
14678 if (V1.isUndef() || isZerosVector(V1.getNode())) {
14679 std::swap(V1, V2);
14680 Swap = true;
14681 }
14682
14683 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
14684 // out of range values with 0s. We do need to make sure that any out-of-range
14685 // values are really out-of-range for a v16i8 vector.
14686 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
14687 MVT IndexVT = MVT::v8i8;
14688 unsigned IndexLen = 8;
14689 if (Op.getValueSizeInBits() == 128) {
14690 IndexVT = MVT::v16i8;
14691 IndexLen = 16;
14692 }
14693
// Expand every mask element into BytesPerElt byte indices; index 255 is
// guaranteed out of range for tbl1 and therefore reads as zero.
// NOTE(review): the declaration of TBLMask (original line 14694, presumably a
// SmallVector<SDValue, 16>) is missing from this extract.
14695 for (int Val : ShuffleMask) {
14696 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
14697 unsigned Offset = Byte + Val * BytesPerElt;
14698 if (Swap)
14699 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
14700 if (IsUndefOrZero && Offset >= IndexLen)
14701 Offset = 255;
14702 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
14703 }
14704 }
14705
14706 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
14707 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
14708
14709 SDValue Shuffle;
14710 if (IsUndefOrZero) {
// Single-table lookup; a 64-bit table is widened to 128 bits by duplicating
// itself so the intrinsic always sees a v16i8 table register.
14711 if (IndexLen == 8)
14712 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
14713 Shuffle = DAG.getNode(
14714 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14715 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
14716 V1Cst,
14717 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14718 } else {
// Two live sources: for 64-bit vectors both halves fit in one 128-bit table
// (tbl1); for 128-bit vectors fall back to the tbl2 intrinsic.
14719 if (IndexLen == 8) {
14720 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
14721 Shuffle = DAG.getNode(
14722 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14723 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
14724 V1Cst,
14725 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14726 } else {
14727 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
14728 // cannot currently represent the register constraints on the input
14729 // table registers.
14730 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
14731 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
14732 // IndexLen));
14733 Shuffle = DAG.getNode(
14734 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14735 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32),
14736 V1Cst, V2Cst,
14737 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14738 }
14739 }
// The lookup was done in a byte vector; cast back to the requested type.
14740 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
14741 }
14742
14743static unsigned getDUPLANEOp(EVT EltType) {
14744 if (EltType == MVT::i8)
14745 return AArch64ISD::DUPLANE8;
14746 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
14747 return AArch64ISD::DUPLANE16;
14748 if (EltType == MVT::i32 || EltType == MVT::f32)
14749 return AArch64ISD::DUPLANE32;
14750 if (EltType == MVT::i64 || EltType == MVT::f64)
14751 return AArch64ISD::DUPLANE64;
14752
14753 llvm_unreachable("Invalid vector element type?");
14754}
14755
// Build a DUPLANE node that splats lane \p Lane of \p V, first trying to fold
// away extract_subvector/bitcast/concat wrappers so the duplicate reads
// directly from a full 128-bit source register.
14756 static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
14757 unsigned Opcode, SelectionDAG &DAG) {
14758 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
14759 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
14760 // Match: dup (bitcast (extract_subv X, C)), LaneC
14761 if (BitCast.getOpcode() != ISD::BITCAST ||
// NOTE(review): the second condition of this if (original line 14762,
// presumably checking that BitCast's operand is ISD::EXTRACT_SUBVECTOR) is
// missing from this extract.
14763 return false;
14764
14765 // The extract index must align in the destination type. That may not
14766 // happen if the bitcast is from narrow to wide type.
14767 SDValue Extract = BitCast.getOperand(0);
14768 unsigned ExtIdx = Extract.getConstantOperandVal(1);
14769 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
14770 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
14771 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
14772 if (ExtIdxInBits % CastedEltBitWidth != 0)
14773 return false;
14774
14775 // Can't handle cases where vector size is not 128-bit
14776 if (!Extract.getOperand(0).getValueType().is128BitVector())
14777 return false;
14778
14779 // Update the lane value by offsetting with the scaled extract index.
14780 LaneC += ExtIdxInBits / CastedEltBitWidth;
14781
14782 // Determine the casted vector type of the wide vector input.
14783 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
14784 // Examples:
14785 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
14786 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
14787 unsigned SrcVecNumElts =
14788 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
// NOTE(review): the first half of this CastVT assignment (original line
// 14789, presumably "CastVT = MVT::getVectorVT(<casted elt type>,") is
// missing from this extract.
14790 SrcVecNumElts);
14791 return true;
14792 };
14793 MVT CastVT;
14794 if (getScaledOffsetDup(V, Lane, CastVT)) {
// Read straight from the wide 128-bit source with a rescaled lane index.
14795 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
14796 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14797 V.getOperand(0).getValueType().is128BitVector()) {
14798 // The lane is incremented by the index of the extract.
14799 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
14800 Lane += V.getConstantOperandVal(1);
14801 V = V.getOperand(0);
14802 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
14803 // The lane is decremented if we are splatting from the 2nd operand.
14804 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
14805 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
14806 Lane -= Idx * VT.getVectorNumElements() / 2;
14807 V = WidenVector(V.getOperand(Idx), DAG);
14808 } else if (VT.getSizeInBits() == 64) {
14809 // Widen the operand to 128-bit register with undef.
14810 V = WidenVector(V, DAG);
14811 }
14812 return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
14813 }
14814
14815 // Try to widen element type to get a new mask value for a better permutation
14816 // sequence, so that we can use NEON shuffle instructions, such as zip1/2,
14817 // UZP1/2, TRN1/2, REV, INS, etc.
14818 // For example:
14819 // shufflevector <4 x i32> %a, <4 x i32> %b,
14820 // <4 x i32> <i32 6, i32 7, i32 2, i32 3>
14821 // is equivalent to:
14822 // shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
14823 // Finally, we can get:
14824 // mov v0.d[0], v1.d[1]
// NOTE(review): the function signature (original line 14825, presumably
// "static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {")
// is missing from this extract.
14826 SDLoc DL(Op);
14827 EVT VT = Op.getValueType();
14828 EVT ScalarVT = VT.getVectorElementType();
14829 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
14830 SDValue V0 = Op.getOperand(0);
14831 SDValue V1 = Op.getOperand(1);
14832 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14833
14834 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
14835 // We need to make sure the wider element type is legal. Thus, ElementSize
14836 // should be not larger than 32 bits, and i1 type should also be excluded.
14837 if (ElementSize > 32 || ElementSize == 1)
14838 return SDValue();
14839
// widenShuffleMaskElts succeeds only when adjacent mask entries pair up into
// moves of double-width elements; then redo the shuffle at 2x element size.
14840 SmallVector<int, 8> NewMask;
14841 if (widenShuffleMaskElts(Mask, NewMask)) {
14842 MVT NewEltVT = VT.isFloatingPoint()
14843 ? MVT::getFloatingPointVT(ElementSize * 2)
14844 : MVT::getIntegerVT(ElementSize * 2);
14845 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14846 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14847 V0 = DAG.getBitcast(NewVT, V0);
14848 V1 = DAG.getBitcast(NewVT, V1);
14849 return DAG.getBitcast(VT,
14850 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
14851 }
14852 }
14853
14854 return SDValue();
14855 }
14856
14857 // Try to fold shuffle (tbl2, tbl2) into a single tbl4.
// The two tbl2 masks are merged into one 16-entry tbl4 mask; entries taken
// from the second tbl2 are rebased by +32 to index its tables (registers 3-4
// of the tbl4).
// NOTE(review): the first line of this signature (original line 14858,
// presumably "static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,")
// is missing from this extract.
14859 ArrayRef<int> ShuffleMask,
14860 SelectionDAG &DAG) {
14861 SDValue Tbl1 = Op->getOperand(0);
14862 SDValue Tbl2 = Op->getOperand(1);
14863 SDLoc DL(Op);
14864 SDValue Tbl2ID =
14865 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i64);
14866
// Both shuffle operands must themselves be aarch64.neon.tbl2 intrinsic calls.
// NOTE(review): the condition line for Tbl2's opcode (original line 14870,
// presumably "Tbl2.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||") is missing
// from this extract.
14867 EVT VT = Op.getValueType();
14868 if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14869 Tbl1.getOperand(0) != Tbl2ID ||
14871 Tbl2.getOperand(0) != Tbl2ID)
14872 return SDValue();
14873
14874 if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
14875 return SDValue();
14876
// Operand 3 of each tbl2 call is its constant index vector.
14877 SDValue Mask1 = Tbl1.getOperand(3);
14878 SDValue Mask2 = Tbl2.getOperand(3);
14879 if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
14880 Mask2.getOpcode() != ISD::BUILD_VECTOR)
14881 return SDValue();
14882
14883 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
14884 for (unsigned I = 0; I < 16; I++) {
14885 if (ShuffleMask[I] < 16)
14886 TBLMaskParts[I] = Mask1.getOperand(ShuffleMask[I])
14887 else {
// Entries reading Tbl2's result are shifted by 32 bytes so they index the
// second pair of table registers in the combined tbl4.
14888 auto *C = dyn_cast<ConstantSDNode>(Mask2.getOperand(ShuffleMask[I] - 16));
14889 if (!C)
14890 return SDValue();
14891 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, DL, MVT::i32);
14892 }
14893 }
14894
14895 SDValue TBLMask = DAG.getBuildVector(VT, DL, TBLMaskParts);
14896 SDValue ID =
14897 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, DL, MVT::i64);
14898
14899 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
14900 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
14901 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
14902 }
14903
14904SDValue
14905AArch64TargetLowering::LowerEXTEND_VECTOR_INREG(SDValue Op,
14906 SelectionDAG &DAG) const {
14907 SDLoc DL(Op);
14908 EVT VT = Op.getValueType();
14909 assert(VT.isScalableVector() && "Unexpected result type!");
14910
14911 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG;
14912 unsigned UnpackOpcode = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14913
14914 // Repeatedly unpack Val until the result is of the desired type.
14915 SDValue Val = Op.getOperand(0);
14916 switch (Val.getSimpleValueType().SimpleTy) {
14917 default:
14918 return SDValue();
14919 case MVT::nxv16i8:
14920 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv8i16, Val);
14921 if (VT == MVT::nxv8i16)
14922 break;
14923 [[fallthrough]];
14924 case MVT::nxv8i16:
14925 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv4i32, Val);
14926 if (VT == MVT::nxv4i32)
14927 break;
14928 [[fallthrough]];
14929 case MVT::nxv4i32:
14930 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv2i64, Val);
14931 assert(VT == MVT::nxv2i64 && "Unexpected result type!");
14932 break;
14933 }
14934
14935 return Val;
14936}
14937
14938// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
14939// but we don't have an appropriate instruction,
14940// so custom-lower it as ZIP1-with-zeros.
14941SDValue
14942AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
14943 SelectionDAG &DAG) const {
14944 SDLoc DL(Op);
14945 EVT VT = Op.getValueType();
14946
14947 if (VT.isScalableVector())
14948 return LowerEXTEND_VECTOR_INREG(Op, DAG);
14949
14950 SDValue SrcOp = Op.getOperand(0);
14951 EVT SrcVT = SrcOp.getValueType();
14952 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
14953 "Unexpected extension factor.");
14954 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
14955 // FIXME: support multi-step zipping?
14956 if (Scale != 2)
14957 return SDValue();
14958 SDValue Zeros = DAG.getConstant(0, DL, SrcVT);
14959 return DAG.getBitcast(VT,
14960 DAG.getNode(AArch64ISD::ZIP1, DL, SrcVT, SrcOp, Zeros));
14961}
14962
// Main NEON shuffle lowering: match the mask against the fixed shuffle
// instructions (DUP, REV, EXT, ZIP, UZP, TRN, INS), then the perfect-shuffle
// table and BSL select, and finally fall back to a generic TBL lookup.
14963 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
14964 SelectionDAG &DAG) const {
14965 SDLoc DL(Op);
14966 EVT VT = Op.getValueType();
14967
14968 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
14969
14970 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14971 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
14972
14973 // Convert shuffles that are directly supported on NEON to target-specific
14974 // DAG nodes, instead of keeping them as shuffles and matching them again
14975 // during code selection. This is more efficient and avoids the possibility
14976 // of inconsistencies between legalization and selection.
14977 ArrayRef<int> ShuffleMask = SVN->getMask();
14978
14979 SDValue V1 = Op.getOperand(0);
14980 SDValue V2 = Op.getOperand(1);
14981
14982 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
14983 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
14984 "Unexpected VECTOR_SHUFFLE mask size!");
14985
14986 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
14987 return Res;
14988
14989 if (SVN->isSplat()) {
14990 int Lane = SVN->getSplatIndex();
14991 // If this is undef splat, generate it via "just" vdup, if possible.
14992 if (Lane == -1)
14993 Lane = 0;
14994
14995 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
14996 return DAG.getNode(AArch64ISD::DUP, DL, V1.getValueType(),
14997 V1.getOperand(0));
14998 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
14999 // constant. If so, we can just reference the lane's definition directly.
15000 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
// NOTE(review): the second condition of this if (original line 15001,
// presumably checking the referenced lane operand is not a constant) is
// missing from this extract.
15002 return DAG.getNode(AArch64ISD::DUP, DL, VT, V1.getOperand(Lane));
15003
15004 // Otherwise, duplicate from the lane of the input vector.
15005 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
15006 return constructDup(V1, Lane, DL, VT, Opcode, DAG);
15007 }
15008
15009 // Check if the mask matches a DUP for a wider element
15010 for (unsigned LaneSize : {64U, 32U, 16U}) {
15011 unsigned Lane = 0;
15012 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
15013 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
15014 : LaneSize == 32 ? AArch64ISD::DUPLANE32
15015 : AArch64ISD::DUPLANE16;
15016 // Cast V1 to an integer vector with required lane size
15017 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
15018 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
15019 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
15020 V1 = DAG.getBitcast(NewVecTy, V1);
15021 // Construct the DUP instruction
15022 V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
15023 // Cast back to the original type
15024 return DAG.getBitcast(VT, V1);
15025 }
15026 }
15027
// Element-reversal patterns map onto REV64/REV32/REV16.
15028 unsigned NumElts = VT.getVectorNumElements();
15029 unsigned EltSize = VT.getScalarSizeInBits();
15030 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
15031 return DAG.getNode(AArch64ISD::REV64, DL, V1.getValueType(), V1);
15032 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
15033 return DAG.getNode(AArch64ISD::REV32, DL, V1.getValueType(), V1);
15034 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
15035 return DAG.getNode(AArch64ISD::REV16, DL, V1.getValueType(), V1);
15036
// A full vector reverse of small elements is REV64 plus an 8-byte EXT that
// swaps the two halves.
15037 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
15038 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
15039 SDValue Rev = DAG.getNode(AArch64ISD::REV64, DL, VT, V1);
15040 return DAG.getNode(AArch64ISD::EXT, DL, VT, Rev, Rev,
15041 DAG.getConstant(8, DL, MVT::i32));
15042 }
15043
15044 bool ReverseEXT = false;
15045 unsigned Imm;
15046 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
15047 if (ReverseEXT)
15048 std::swap(V1, V2);
15049 Imm *= getExtFactor(V1);
15050 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V2,
15051 DAG.getConstant(Imm, DL, MVT::i32));
15052 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
15053 Imm *= getExtFactor(V1);
15054 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V1,
15055 DAG.getConstant(Imm, DL, MVT::i32));
15056 }
15057
// Two-operand interleave/deinterleave/transpose patterns.
15058 unsigned WhichResult;
15059 unsigned OperandOrder;
15060 if (isZIPMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
15061 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
15062 return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
15063 OperandOrder == 0 ? V2 : V1);
15064 }
15065 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
15066 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
15067 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
15068 }
15069 if (isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
15070 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
15071 return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
15072 OperandOrder == 0 ? V2 : V1);
15073 }
15074
// Same patterns with an undef second operand: feed V1 to both inputs.
15075 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
15076 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
15077 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
15078 }
15079 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
15080 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
15081 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
15082 }
15083 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
15084 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
15085 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
15086 }
15087
// NOTE(review): the guard producing Concat (original line 15088, presumably
// an "if (SDValue Concat = ...)" trying to form a concat_vectors) is missing
// from this extract.
15089 return Concat;
15090
// Single-anomaly masks become an INS (insert one lane into an otherwise
// identity-shuffled vector).
15091 bool DstIsLeft;
15092 int Anomaly;
15093 int NumInputElements = V1.getValueType().getVectorNumElements();
15094 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
15095 SDValue DstVec = DstIsLeft ? V1 : V2;
15096 SDValue DstLaneV = DAG.getConstant(Anomaly, DL, MVT::i64);
15097
15098 SDValue SrcVec = V1;
15099 int SrcLane = ShuffleMask[Anomaly];
15100 if (SrcLane >= NumInputElements) {
15101 SrcVec = V2;
15102 SrcLane -= NumElts;
15103 }
15104 SDValue SrcLaneV = DAG.getConstant(SrcLane, DL, MVT::i64);
15105
15106 EVT ScalarVT = VT.getVectorElementType();
15107
15108 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
15109 ScalarVT = MVT::i32;
15110
15111 return DAG.getNode(
15112 ISD::INSERT_VECTOR_ELT, DL, VT, DstVec,
15113 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SrcVec, SrcLaneV),
15114 DstLaneV);
15115 }
15116
15117 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
15118 return NewSD;
15119
15120 // If the shuffle is not directly supported and it has 4 elements, use
15121 // the PerfectShuffle-generated table to synthesize it from other shuffles.
15122 if (NumElts == 4) {
15123 unsigned PFIndexes[4];
15124 for (unsigned i = 0; i != 4; ++i) {
15125 if (ShuffleMask[i] < 0)
15126 PFIndexes[i] = 8;
15127 else
15128 PFIndexes[i] = ShuffleMask[i];
15129 }
15130
15131 // Compute the index in the perfect shuffle table.
15132 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
15133 PFIndexes[2] * 9 + PFIndexes[3];
15134 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
15135 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
15136 DL);
15137 }
15138
15139 // Check for a "select shuffle", generating a BSL to pick between lanes in
15140 // V1/V2.
15141 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
15142 assert(VT.getScalarSizeInBits() <= 32 &&
15143 "Expected larger vector element sizes to be handled already");
15144 SmallVector<SDValue> MaskElts;
15145 for (int M : ShuffleMask)
15146 MaskElts.push_back(DAG.getConstant(
15147 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, MVT::i32));
15148 EVT IVT = VT.changeVectorElementTypeToInteger();
15149 SDValue MaskConst = DAG.getBuildVector(IVT, DL, MaskElts);
15150 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, DL, IVT, MaskConst,
15151 DAG.getBitcast(IVT, V1),
15152 DAG.getBitcast(IVT, V2)));
15153 }
15154
15155 // Fall back to generating a TBL
15156 return GenerateTBL(Op, ShuffleMask, DAG);
15157 }
15158
15159SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
15160 SelectionDAG &DAG) const {
15161 EVT VT = Op.getValueType();
15162
15163 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15164 return LowerToScalableOp(Op, DAG);
15165
15166 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
15167 "Unexpected vector type!");
15168
15169 // We can handle the constant cases during isel.
15170 if (isa<ConstantSDNode>(Op.getOperand(0)))
15171 return Op;
15172
15173 // There isn't a natural way to handle the general i1 case, so we use some
15174 // trickery with whilelo.
15175 SDLoc DL(Op);
15176 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
15177 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
15178 DAG.getValueType(MVT::i1));
15179 SDValue ID =
15180 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
15181 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
15182 if (VT == MVT::nxv1i1)
15183 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
15184 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
15185 Zero, SplatVal),
15186 Zero);
15187 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
15188}
15189
// Lower the SVE DUPQ (duplicate 128-bit quadword lane) intrinsic: use
// DUPLANE128 when the lane index is a small constant, otherwise synthesise
// the result with a TBL over a computed index vector.
15190 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
15191 SelectionDAG &DAG) const {
15192 SDLoc DL(Op);
15193
15194 EVT VT = Op.getValueType();
15195 if (!isTypeLegal(VT) || !VT.isScalableVector())
15196 return SDValue();
15197
15198 // Current lowering only supports the SVE-ACLE types.
// NOTE(review): the guard condition itself (original line 15199, presumably
// checking the vector's known minimum size against the 128-bit SVE block
// size) is missing from this extract.
15200 return SDValue();
15201
15202 // The DUPQ operation is independent of element type so normalise to i64s.
15203 SDValue Idx128 = Op.getOperand(2);
15204
15205 // DUPQ can be used when idx is in range.
15206 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
15207 if (CIdx && (CIdx->getZExtValue() <= 3)) {
15208 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
15209 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
15210 }
15211
15212 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
15213
15214 // The ACLE says this must produce the same result as:
15215 // svtbl(data, svadd_x(svptrue_b64(),
15216 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
15217 // index * 2))
15218 SDValue One = DAG.getConstant(1, DL, MVT::i64);
15219 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
15220
15221 // create the vector 0,1,0,1,...
15222 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
15223 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
15224
15225 // create the vector idx64,idx64+1,idx64,idx64+1,...
// Idx128 + Idx128 doubles the 128-bit lane index into a 64-bit element index.
15226 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
15227 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
15228 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
15229
15230 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
15231 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
15232 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
15233 }
15234
15235
15236static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
15237 APInt &UndefBits) {
15238 EVT VT = BVN->getValueType(0);
15239 APInt SplatBits, SplatUndef;
15240 unsigned SplatBitSize;
15241 bool HasAnyUndefs;
15242 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
15243 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
15244
15245 for (unsigned i = 0; i < NumSplats; ++i) {
15246 CnstBits <<= SplatBitSize;
15247 UndefBits <<= SplatBitSize;
15248 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
15249 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
15250 }
15251
15252 return true;
15253 }
15254
15255 return false;
15256}
15257
15258 // Try 64-bit splatted SIMD immediate.
// Returns an NVCAST-wrapped MOVI-style node when the replicated 64-bit value
// is encodable, otherwise an empty SDValue.
15259 static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15260 const APInt &Bits) {
// The pattern must repeat identically in both 64-bit halves to be a splat.
15261 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15262 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15263 EVT VT = Op.getValueType();
15264 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
15265
// NOTE(review): the encodability check and re-encoding of Value (original
// lines 15266-15267, presumably AArch64_AM::isAdvSIMDModImmType10 /
// encodeAdvSIMDModImmType10) are missing from this extract.
15268
15269 SDLoc DL(Op);
15270 SDValue Mov =
15271 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15272 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15273 }
15274 }
15275
15276 return SDValue();
15277 }
15278
15279 // Try 32-bit splatted SIMD immediate.
// Matches an 8-bit immediate left-shifted by 0/8/16/24 within each 32-bit
// lane (MOVI/MVNI shifted forms). When LHS is provided the node is the fused
// ORR/BIC-with-immediate variant applied to *LHS.
15280 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15281 const APInt &Bits,
15282 const SDValue *LHS = nullptr) {
15283 EVT VT = Op.getValueType();
15284 if (VT.isFixedLengthVector() &&
// NOTE(review): the second condition of this early-out (original line 15285)
// is missing from this extract — it presumably bails out for
// streaming-compatible / non-NEON configurations; confirm upstream.
15286 return SDValue();
15287
15288 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15289 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15290 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
15291 bool isAdvSIMDModImm = false;
15292 uint64_t Shift;
15293
// Each branch selects a shift amount; the corresponding re-encoding of Value
// (original lines 15295/15299/15303/15307, presumably
// AArch64_AM::encodeAdvSIMDModImmTypeN calls) is missing from this extract.
15294 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
15296 Shift = 0;
15297 }
15298 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
15300 Shift = 8;
15301 }
15302 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
15304 Shift = 16;
15305 }
15306 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
15308 Shift = 24;
15309 }
15310
15311 if (isAdvSIMDModImm) {
15312 SDLoc DL(Op);
15313 SDValue Mov;
15314
15315 if (LHS)
15316 Mov = DAG.getNode(NewOp, DL, MovTy,
15317 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
15318 DAG.getConstant(Value, DL, MVT::i32),
15319 DAG.getConstant(Shift, DL, MVT::i32));
15320 else
15321 Mov =
15322 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
15323 DAG.getConstant(Shift, DL, MVT::i32));
15324
15325 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15326 }
15327 }
15328
15329 return SDValue();
15330 }
15331
15332 // Try 16-bit splatted SIMD immediate.
// Matches an 8-bit immediate shifted by 0 or 8 within each 16-bit lane.
// When LHS is provided the node is the fused ORR/BIC-with-immediate variant
// applied to *LHS.
15333 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15334 const APInt &Bits,
15335 const SDValue *LHS = nullptr) {
15336 EVT VT = Op.getValueType();
15337 if (VT.isFixedLengthVector() &&
// NOTE(review): the second condition of this early-out (original line 15338)
// is missing from this extract — presumably the same streaming-compatible /
// non-NEON bail-out as in tryAdvSIMDModImm32; confirm upstream.
15339 return SDValue();
15340
15341 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15342 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15343 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
15344 bool isAdvSIMDModImm = false;
15345 uint64_t Shift;
15346
// The re-encoding of Value in each branch (original lines 15348/15352,
// presumably AArch64_AM::encodeAdvSIMDModImmType5/6) is missing from this
// extract.
15347 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
15349 Shift = 0;
15350 }
15351 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
15353 Shift = 8;
15354 }
15355
15356 if (isAdvSIMDModImm) {
15357 SDLoc DL(Op);
15358 SDValue Mov;
15359
15360 if (LHS)
15361 Mov = DAG.getNode(NewOp, DL, MovTy,
15362 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
15363 DAG.getConstant(Value, DL, MVT::i32),
15364 DAG.getConstant(Shift, DL, MVT::i32));
15365 else
15366 Mov =
15367 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
15368 DAG.getConstant(Shift, DL, MVT::i32));
15369
15370 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15371 }
15372 }
15373
15374 return SDValue();
15375 }
15376
15377 // Try 32-bit splatted SIMD immediate with shifted ones.
// This is the MOVI/MVNI "MSL" (shifting-ones) form for 32-bit lanes.
// NOTE(review): the signature line (original 15378, presumably
// "static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,") is
// missing from this extract.
15379 SelectionDAG &DAG, const APInt &Bits) {
15380 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15381 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15382 EVT VT = Op.getValueType();
15383 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
15384 bool isAdvSIMDModImm = false;
15385 uint64_t Shift;
15386
// The re-encoding of Value in each branch (original lines 15388/15392,
// presumably AArch64_AM::encodeAdvSIMDModImmType7/8) is missing from this
// extract. The shift values carry an extra 256 (264 = 256 + 8,
// 272 = 256 + 16) — presumably marking the MSL rather than LSL form for the
// instruction printer/encoder; confirm against AArch64_AM.
15387 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
15389 Shift = 264;
15390 }
15391 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
15393 Shift = 272;
15394 }
15395
15396 if (isAdvSIMDModImm) {
15397 SDLoc DL(Op);
15398 SDValue Mov =
15399 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
15400 DAG.getConstant(Shift, DL, MVT::i32));
15401 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15402 }
15403 }
15404
15405 return SDValue();
15406 }
15407
15408 // Try 8-bit splatted SIMD immediate.
// Returns an NVCAST-wrapped per-byte MOVI-style node when the replicated
// value is encodable, otherwise an empty SDValue.
15409 static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15410 const APInt &Bits) {
15411 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15412 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15413 EVT VT = Op.getValueType();
15414 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
15415
// NOTE(review): the encodability check and re-encoding of Value (original
// lines 15416-15417, presumably AArch64_AM::isAdvSIMDModImmType9 /
// encodeAdvSIMDModImmType9) are missing from this extract.
15418
15419 SDLoc DL(Op);
15420 SDValue Mov =
15421 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15422 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15423 }
15424 }
15425
15426 return SDValue();
15427 }
15428
15429 // Try FP splatted SIMD immediate.
// Matches FMOV-encodable f32 splats (any width) and f64 splats (128-bit
// vectors only).
15430 static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15431 const APInt &Bits) {
15432 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15433 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15434 EVT VT = Op.getValueType();
15435 bool isWide = (VT.getSizeInBits() == 128);
15436 MVT MovTy;
15437 bool isAdvSIMDModImm = false;
15438
// The re-encoding of Value in each branch (original lines 15440/15445,
// presumably AArch64_AM::encodeAdvSIMDModImmType11/12) is missing from this
// extract.
15439 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
15441 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
15442 }
15443 else if (isWide &&
15444 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
15446 MovTy = MVT::v2f64;
15447 }
15448
15449 if (isAdvSIMDModImm) {
15450 SDLoc DL(Op);
15451 SDValue Mov =
15452 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15453 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15454 }
15455 }
15456
15457 return SDValue();
15458 }
15459
15460 // Specialized code to quickly find if PotentialBVec is a BuildVector that
15461 // consists of only the same constant int value, returned in reference arg
15462 // ConstVal
15463 static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
15464 uint64_t &ConstVal) {
15465 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
15466 if (!Bvec)
15467 return false;
// NOTE(review): the definition of FirstElt (original line 15468, presumably
// "ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));")
// is missing from this extract.
15469 if (!FirstElt)
15470 return false;
15471 EVT VT = Bvec->getValueType(0);
15472 unsigned NumElts = VT.getVectorNumElements();
// Node pointer comparison works because identical constants are CSE'd to the
// same SDNode within a SelectionDAG.
15473 for (unsigned i = 1; i < NumElts; ++i)
15474 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
15475 return false;
15476 ConstVal = FirstElt->getZExtValue();
15477 return true;
15478 }
15479
// Returns true when N is (possibly via REINTERPRET_CAST wrappers) a constant
// splat of all-zero elements, i.e. an all-inactive predicate.
// NOTE(review): the signature line (original 15480, presumably
// "static bool isAllInactivePredicate(SDValue N) {") is missing from this
// extract — confirm the name against upstream.
15481 // Look through cast.
15482 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
15483 N = N.getOperand(0);
15484
15485 return ISD::isConstantSplatVectorAllZeros(N.getNode());
15486 }
15487
// Returns true when N is known to be an all-active SVE predicate: either an
// all-ones constant splat or a "ptrue all" whose implicit element type is no
// larger than N's.
// NOTE(review): the signature line (original 15488, presumably
// "static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {") is
// missing from this extract.
15489 unsigned NumElts = N.getValueType().getVectorMinNumElements();
15490
15491 // Look through cast.
15492 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
15493 N = N.getOperand(0);
15494 // When reinterpreting from a type with fewer elements the "new" elements
15495 // are not active, so bail if they're likely to be used.
15496 if (N.getValueType().getVectorMinNumElements() < NumElts)
15497 return false;
15498 }
15499
15500 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
15501 return true;
15502
15503 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
15504 // or smaller than the implicit element type represented by N.
15505 // NOTE: A larger element count implies a smaller element type.
15506 if (N.getOpcode() == AArch64ISD::PTRUE &&
15507 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
15508 return N.getValueType().getVectorMinNumElements() >= NumElts;
15509
15510 return false;
15511 }
15512
15513// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
15514// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
15515// BUILD_VECTORs with constant element C1, C2 is a constant, and:
15516// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
15517// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
15518// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
// Returns the combined AArch64ISD::VSLI/VSRI node on success, or an empty
// SDValue so the caller falls through to generic OR lowering.
15520 EVT VT = N->getValueType(0);
15521
 // SLI/SRI only exist as vector instructions; nothing to do for scalar ORs.
15522 if (!VT.isVector())
15523 return SDValue();
15524
15525 SDLoc DL(N);
15526
15527 SDValue And;
15528 SDValue Shift;
15529
15530 SDValue FirstOp = N->getOperand(0);
15531 unsigned FirstOpc = FirstOp.getOpcode();
15532 SDValue SecondOp = N->getOperand(1);
15533 unsigned SecondOpc = SecondOp.getOpcode();
15534
15535 // Is one of the operands an AND or a BICi? The AND may have been optimised to
15536 // a BICi in order to use an immediate instead of a register.
15537 // Is the other operand an shl or lshr? This will have been turned into:
15538 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
15539 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
 // OR commutes, so accept the (and, shift) pair in either operand order.
15540 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
15541 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
15542 SecondOpc == AArch64ISD::SHL_PRED ||
15543 SecondOpc == AArch64ISD::SRL_PRED)) {
15544 And = FirstOp;
15545 Shift = SecondOp;
15546
15547 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
15548 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
15549 FirstOpc == AArch64ISD::SHL_PRED ||
15550 FirstOpc == AArch64ISD::SRL_PRED)) {
15551 And = SecondOp;
15552 Shift = FirstOp;
15553 } else
15554 return SDValue();
15555
15556 bool IsAnd = And.getOpcode() == ISD::AND;
15557 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
15558 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15559 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
15560 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15561
15562 // Is the shift amount constant and are all lanes active?
15563 uint64_t C2;
15564 if (ShiftHasPredOp) {
 // Predicated SVE shifts carry the governing predicate in operand 0; the
 // fold is only sound when every lane is active.
15565 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
15566 return SDValue();
15567 APInt C;
 // (bails out unless the vector shift amount is a constant splat —
 // presumably extracted into C just above; verify against upstream)
15569 return SDValue();
15570 C2 = C.getZExtValue();
15571 } else if (ConstantSDNode *C2node =
15573 C2 = C2node->getZExtValue();
15574 else
15575 return SDValue();
15576
15577 APInt C1AsAPInt;
15578 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
15579 if (IsAnd) {
15580 // Is the and mask vector all constant?
15581 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
15582 return SDValue();
15583 } else {
15584 // Reconstruct the corresponding AND immediate from the two BICi immediates.
 // BICi encodes (and X, ~(imm << shift)); recover that effective AND mask.
15585 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
15586 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
15587 assert(C1nodeImm && C1nodeShift);
15588 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
15589 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
15590 }
15591
15592 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
15593 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
15594 // how much one can shift elements of a particular size?
15595 if (C2 > ElemSizeInBits)
15596 return SDValue();
15597
 // The AND mask must keep exactly the bits the shifted value leaves free.
15598 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
15599 : APInt::getLowBitsSet(ElemSizeInBits, C2);
15600 if (C1AsAPInt != RequiredC1)
15601 return SDValue();
15602
15603 SDValue X = And.getOperand(0);
 // Predicated shifts hold the data vector in operand 1; NEON shifts in 0.
15604 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
15605 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
15606 : Shift.getOperand(1);
15607
15608 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
15609 return DAG.getNode(Inst, DL, VT, X, Y, Imm);
15610}
15611
// Try to lower (or (and ...), (and ...)) to an AArch64ISD::BSP (bitwise
// select) when the two AND masks are provably complementary. Returns an
// empty SDValue when no BSL/BSP form applies.
15613 EVT VT = N->getValueType(0);
15614 assert(VT.isVector() && "Expected vector type in tryLowerToBSL\n");
15615 SDLoc DL(N);
15616 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15617
 // Scalable-vector BSP requires SVE2; plain SVE has no bitwise select.
15618 if (VT.isScalableVector() && !Subtarget.hasSVE2())
15619 return SDValue();
15620
15621 SDValue N0 = N->getOperand(0);
15622 if (N0.getOpcode() != ISD::AND)
15623 return SDValue();
15624
15625 SDValue N1 = N->getOperand(1);
15626 if (N1.getOpcode() != ISD::AND)
15627 return SDValue();
15628
15629 // InstCombine does (not (neg a)) => (add a -1).
15630 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
15631 // Loop over all combinations of AND operands.
15632 for (int i = 1; i >= 0; --i) {
15633 for (int j = 1; j >= 0; --j) {
15634 SDValue O0 = N0->getOperand(i);
15635 SDValue O1 = N1->getOperand(j);
15636 SDValue Sub, Add, SubSibling, AddSibling;
15637
15638 // Find a SUB and an ADD operand, one from each AND.
15639 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
15640 Sub = O0;
15641 Add = O1;
15642 SubSibling = N0->getOperand(1 - i);
15643 AddSibling = N1->getOperand(1 - j);
15644 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
15645 Add = O0;
15646 Sub = O1;
15647 AddSibling = N0->getOperand(1 - i);
15648 SubSibling = N1->getOperand(1 - j);
15649 } else
15650 continue;
15651
 // The SUB must be a negation, i.e. (sub 0, a).
15652 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
15653 continue;
15654
15655 // Constant ones is always righthand operand of the Add.
15656 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
15657 continue;
15658
 // Both masks must be derived from the same value 'a'.
15659 if (Sub.getOperand(1) != Add.getOperand(0))
15660 continue;
15661
15662 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
15663 }
15664 }
15665
15666 // (or (and a b) (and (not a) c)) => (bsl a b c)
15667 // We only have to look for constant vectors here since the general, variable
15668 // case can be handled in TableGen.
15669 unsigned Bits = VT.getScalarSizeInBits();
15670 for (int i = 1; i >= 0; --i)
15671 for (int j = 1; j >= 0; --j) {
15672 APInt Val1, Val2;
15673
 // Fast path: both masks are constant splats and bitwise complements.
15674 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
15676 ~Val1.trunc(Bits) == Val2.trunc(Bits)) {
15677 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15678 N0->getOperand(1 - i), N1->getOperand(1 - j));
15679 }
 // Slow path: compare the two BUILD_VECTOR masks lane by lane.
 // (BVN0/BVN1 are presumably the operands cast to BuildVectorSDNode just
 // above — verify against upstream listing)
15682 if (!BVN0 || !BVN1)
15683 continue;
15684
15685 bool FoundMatch = true;
15686 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
 // Each lane pair must be constant and bitwise complementary.
15689 if (!CN0 || !CN1 ||
15690 CN0->getAPIntValue().trunc(Bits) !=
15691 ~CN1->getAsAPIntVal().trunc(Bits)) {
15692 FoundMatch = false;
15693 break;
15694 }
15695 }
15696 if (FoundMatch)
15697 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15698 N0->getOperand(1 - i), N1->getOperand(1 - j));
15699 }
15700
15701 return SDValue();
15702}
15703
// Lower a vector ISD::OR. Order matters: SVE fixed-length containers first,
// then the BSL and SLI/SRI combines, then ORR-with-immediate forms; any
// failure falls back to the plain OR node unchanged.
15704SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
15705 SelectionDAG &DAG) const {
15706 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15707 !Subtarget->isNeonAvailable()))
15708 return LowerToScalableOp(Op, DAG);
15709
 // (or (and a b) (and (not a) c)) => bitwise select.
15710 if (SDValue Res = tryLowerToBSL(Op, DAG))
15711 return Res;
15712
15713 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
15714 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
15715 return Res;
15716
15717 EVT VT = Op.getValueType();
 // The immediate-ORR forms below only exist for fixed-width NEON vectors.
15718 if (VT.isScalableVector())
15719 return Op;
15720
15721 SDValue LHS = Op.getOperand(0);
15722 BuildVectorSDNode *BVN =
15723 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
15724 if (!BVN) {
15725 // OR commutes, so try swapping the operands.
15726 LHS = Op.getOperand(1);
15727 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
15728 }
 // Neither operand is a constant vector: nothing to fold into ORRi.
15729 if (!BVN)
15730 return Op;
15731
15732 APInt DefBits(VT.getSizeInBits(), 0);
15733 APInt UndefBits(VT.getSizeInBits(), 0);
15734 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15735 SDValue NewOp;
15736
 // Try the defined bits first, then treat undef lanes as don't-care.
15737 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15738 DefBits, &LHS)) ||
15739 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15740 DefBits, &LHS)))
15741 return NewOp;
15742
15743 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15744 UndefBits, &LHS)) ||
15745 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15746 UndefBits, &LHS)))
15747 return NewOp;
15748 }
15749
15750 // We can always fall back to a non-immediate OR.
15751 return Op;
15752}
15753
15754// Normalize the operands of BUILD_VECTOR. The value of constant operands will
15755// be truncated to fit element width.
// Only applies to integer vectors with elements of 16 bits or fewer; every
// lane of the result is an i32 constant, an i32 undef, or an existing i32
// operand (promoted earlier by type legalization).
15757 SelectionDAG &DAG) {
15758 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
15759 SDLoc DL(Op);
15760 EVT VT = Op.getValueType();
15761 EVT EltTy= VT.getVectorElementType();
15762
 // f16/f32/f64 lanes and 32/64-bit integer lanes need no normalization.
15763 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
15764 return Op;
15765
15767 for (SDValue Lane : Op->ops()) {
15768 // For integer vectors, type legalization would have promoted the
15769 // operands already. Otherwise, if Op is a floating-point splat
15770 // (with operands cast to integers), then the only possibilities
15771 // are constants and UNDEFs.
15772 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
 // Truncate to the element width, then rebuild as an i32 constant.
15773 Lane = DAG.getConstant(
15774 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
15775 DL, MVT::i32);
15776 } else if (Lane.getNode()->isUndef()) {
15777 Lane = DAG.getUNDEF(MVT::i32);
15778 } else {
15779 assert(Lane.getValueType() == MVT::i32 &&
15780 "Unexpected BUILD_VECTOR operand type");
15781 }
15782 Ops.push_back(Lane);
15783 }
15784 return DAG.getBuildVector(VT, DL, Ops);
15785}
15786
// Materialize a 128-bit NEON constant whose two 64-bit halves are equal via
// an SVE DUP/DUPM immediate splat, then narrow back to the NEON type.
// Returns an empty SDValue when SVE is unavailable or no immediate encoding
// fits.
15788 const AArch64Subtarget *ST, APInt &DefBits) {
15789 EVT VT = Op.getValueType();
15790 // TODO: We should be able to support 64-bit destinations too
 // Requires both 128-bit halves to be identical so a 64-bit element splat
 // reproduces the full constant.
15791 if (!ST->hasSVE() || !VT.is128BitVector() ||
15792 DefBits.getHiBits(64) != DefBits.getLoBits(64))
15793 return SDValue();
15794
15795 // See if we can make use of the SVE dup instruction.
15796 APInt Val64 = DefBits.trunc(64);
15797 int32_t ImmVal, ShiftVal;
15798 uint64_t Encoding;
 // Accept either a CPY/DUP immediate or a DUPM logical-immediate encoding.
15799 if (!AArch64_AM::isSVECpyDupImm(64, Val64.getSExtValue(), ImmVal, ShiftVal) &&
15800 !AArch64_AM::isSVELogicalImm(64, Val64.getZExtValue(), Encoding))
15801 return SDValue();
15802
15803 SDLoc DL(Op);
 // Splat as nxv2i64, take the fixed-width v2i64 prefix, then NVCAST to VT.
15804 SDValue SplatVal = DAG.getSplatVector(MVT::nxv2i64, DL,
15805 DAG.getConstant(Val64, DL, MVT::i64));
15806 SDValue Res = convertFromScalableVector(DAG, MVT::v2i64, SplatVal);
15807 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Res);
15808}
15809
// Try to materialize a constant BUILD_VECTOR with a single NEON move
// immediate (MOVI/MVNI/FMOV in their various widths), an SVE splat, or a
// MOVI followed by FNEG. Returns an empty SDValue when no immediate form
// matches and the caller must fall back to another strategy.
15811 const AArch64Subtarget *ST) {
15812 EVT VT = Op.getValueType();
15813 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
15814 "Expected a legal NEON vector");
15815
15816 APInt DefBits(VT.getSizeInBits(), 0);
15817 APInt UndefBits(VT.getSizeInBits(), 0);
15819 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
 // Attempt every MOVI/MVNI/FMOV immediate encoding, widest first; the
 // MVNI variants encode the bitwise complement of the constant.
15820 auto TryMOVIWithBits = [&](APInt DefBits) {
15821 SDValue NewOp;
15822 if ((NewOp =
15823 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
15824 (NewOp =
15825 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15826 (NewOp =
15827 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
15828 (NewOp =
15829 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15830 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
15831 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
15832 return NewOp;
15833
15834 APInt NotDefBits = ~DefBits;
15835 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
15836 NotDefBits)) ||
15837 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
15838 NotDefBits)) ||
15839 (NewOp =
15840 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
15841 return NewOp;
15842 return SDValue();
15843 };
 // Defined bits first, then with undef lanes treated as don't-care.
15844 if (SDValue R = TryMOVIWithBits(DefBits))
15845 return R;
15846 if (SDValue R = TryMOVIWithBits(UndefBits))
15847 return R;
15848
15849 // Try to materialise the constant using SVE when available.
15850 if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits))
15851 return R;
15852
15853 // See if a fneg of the constant can be materialized with a MOVI, etc
15854 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
15855 // FNegate each sub-element of the constant
 // Build a mask with only the sign bit of each FVT-sized element set,
 // XOR it in, and see if the sign-flipped constant has a MOVI encoding.
15856 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
15857 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
15858 .zext(VT.getSizeInBits());
15859 APInt NegBits(VT.getSizeInBits(), 0);
15860 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
15861 for (unsigned i = 0; i < NumElts; i++)
15862 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
15863 NegBits = DefBits ^ NegBits;
15864
15865 // Try to create the new constants with MOVI, and if so generate a fneg
15866 // for it.
15867 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
15868 SDLoc DL(Op);
15869 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
15870 return DAG.getNode(
15871 AArch64ISD::NVCAST, DL, VT,
15872 DAG.getNode(ISD::FNEG, DL, VFVT,
15873 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
15874 }
15875 return SDValue();
15876 };
15877 SDValue R;
 // f16 is only legal to fneg with full FP16 support.
15878 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
15879 (R = TryWithFNeg(DefBits, MVT::f64)) ||
15880 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
15881 return R;
15882 }
15883
15884 return SDValue();
15885}
15886
// Lower a fixed-length BUILD_VECTOR using SVE: arithmetic sequences become
// constant + step-vector; other small power-of-two vectors are built with a
// log2-depth tree of ZIP1 interleaves. Returns an empty SDValue when the
// shape doesn't fit either strategy.
15887SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
15888 SDValue Op, SelectionDAG &DAG) const {
15889 EVT VT = Op.getValueType();
15890 SDLoc DL(Op);
15891 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
15892 auto *BVN = cast<BuildVectorSDNode>(Op);
15893
 // Arithmetic progression: start + i*step via INDEX-style step vector.
15894 if (auto SeqInfo = BVN->isConstantSequence()) {
15895 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
15896 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
15897 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
15898 return convertFromScalableVector(DAG, VT, Seq);
15899 }
15900
15901 unsigned NumElems = VT.getVectorNumElements();
15902 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
15903 NumElems <= 1 || BVN->isConstant())
15904 return SDValue();
15905
15906 auto IsExtractElt = [](SDValue Op) {
15907 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
15908 };
15909
15910 // For integer types that are not already in vectors limit to at most four
15911 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
15912 if (VT.getScalarType().isInteger() &&
15913 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
15914 return SDValue();
15915
15916 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
15917 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
 // Seed one single-element container per lane (undef lanes stay undef).
15919 Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
15920 return Op.isUndef() ? Undef
15921 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
15922 ContainerVT, Undef, Op, ZeroI64);
15923 });
15924
 // Pairwise-interleave until a single vector remains; element count of the
 // zip type halves (elements widen) on every round.
15925 ElementCount ZipEC = ContainerVT.getVectorElementCount();
15926 while (Intermediates.size() > 1) {
15927 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
15928
15929 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
15930 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
15931 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
 // Zipping with undef is a no-op; keep the defined half.
15932 Intermediates[I / 2] =
15933 Op1.isUndef() ? Op0
15934 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
15935 }
15936
15937 Intermediates.resize(Intermediates.size() / 2);
15938 ZipEC = ZipEC.divideCoefficientBy(2);
15939 }
15940
15941 assert(Intermediates.size() == 1);
15942 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
15943 return convertFromScalableVector(DAG, VT, Vec);
15944}
15945
// Main BUILD_VECTOR lowering. Tries, in order: SVE fixed-length lowering,
// operand normalization, pass-through of all-zero/all-one constants,
// immediate materialization, DUP/DUPLANE for splats, UZP1/UZP2 for strided
// extracts, splat-then-insert for mostly-constant vectors, shuffle
// reconstruction, DUP + inserts, two-value CONCAT/shuffle forms, and
// finally an element-by-element INSERT_VECTOR_ELT sequence.
15946SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
15947 SelectionDAG &DAG) const {
15948 EVT VT = Op.getValueType();
15949
15950 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15951 cast<BuildVectorSDNode>(Op)->isConstantSequence();
15952 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
15953 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
15954
15955 // Try to build a simple constant vector.
15956 Op = NormalizeBuildVector(Op, DAG);
15957 // Thought this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so,
15958 // abort.
15959 if (Op.getOpcode() != ISD::BUILD_VECTOR)
15960 return SDValue();
15961
15962 // Certain vector constants, used to express things like logical NOT and
15963 // arithmetic NEG, are passed through unmodified. This allows special
15964 // patterns for these operations to match, which will lower these constants
15965 // to whatever is proven necessary.
15966 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15967 if (BVN->isConstant()) {
15968 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
15969 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
15970 APInt Val(BitSize,
15971 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
 // All-zeros and (integer) all-ones survive for NOT/NEG pattern matching.
15972 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
15973 return Op;
15974 }
15975 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
15976 if (Const->isZero() && !Const->isNegative())
15977 return Op;
15978 }
15979
15980 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
15981 return V;
15982
15983 // Scan through the operands to find some interesting properties we can
15984 // exploit:
15985 // 1) If only one value is used, we can use a DUP, or
15986 // 2) if only the low element is not undef, we can just insert that, or
15987 // 3) if only one constant value is used (w/ some non-constant lanes),
15988 // we can splat the constant value into the whole vector then fill
15989 // in the non-constant lanes.
15990 // 4) FIXME: If different constant values are used, but we can intelligently
15991 // select the values we'll be overwriting for the non-constant
15992 // lanes such that we can directly materialize the vector
15993 // some other way (MOVI, e.g.), we can be sneaky.
15994 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
15995 SDLoc DL(Op);
15996 unsigned NumElts = VT.getVectorNumElements();
15997 bool isOnlyLowElement = true;
15998 bool usesOnlyOneValue = true;
15999 bool usesOnlyOneConstantValue = true;
16000 bool isConstant = true;
16001 bool AllLanesExtractElt = true;
16002 unsigned NumConstantLanes = 0;
16003 unsigned NumDifferentLanes = 0;
16004 unsigned NumUndefLanes = 0;
16005 SDValue Value;
16006 SDValue ConstantValue;
16007 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
16008 unsigned ConsecutiveValCount = 0;
16009 SDValue PrevVal;
 // Single pass over all lanes gathering the statistics listed above.
16010 for (unsigned i = 0; i < NumElts; ++i) {
16011 SDValue V = Op.getOperand(i);
16012 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16013 AllLanesExtractElt = false;
16014 if (V.isUndef()) {
16015 ++NumUndefLanes;
16016 continue;
16017 }
16018 if (i > 0)
16019 isOnlyLowElement = false;
16020 if (!isIntOrFPConstant(V))
16021 isConstant = false;
16022
16023 if (isIntOrFPConstant(V)) {
16024 ++NumConstantLanes;
16025 if (!ConstantValue.getNode())
16026 ConstantValue = V;
16027 else if (ConstantValue != V)
16028 usesOnlyOneConstantValue = false;
16029 }
16030
16031 if (!Value.getNode())
16032 Value = V;
16033 else if (V != Value) {
16034 usesOnlyOneValue = false;
16035 ++NumDifferentLanes;
16036 }
16037
16038 if (PrevVal != V) {
16039 ConsecutiveValCount = 0;
16040 PrevVal = V;
16041 }
16042
16043 // Keep different values and its last consecutive count. For example,
16044 //
16045 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
16046 // t24, t24, t24, t24, t24, t24, t24, t24
16047 // t23 = consecutive count 8
16048 // t24 = consecutive count 8
16049 // ------------------------------------------------------------------
16050 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
16051 // t24, t24, t24, t24, t24, t24, t24, t24
16052 // t23 = consecutive count 5
16053 // t24 = consecutive count 9
16054 DifferentValueMap[V] = ++ConsecutiveValCount;
16055 }
16056
16057 if (!Value.getNode()) {
16058 LLVM_DEBUG(
16059 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
16060 return DAG.getUNDEF(VT);
16061 }
16062
16063 // Convert BUILD_VECTOR where all elements but the lowest are undef into
16064 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
16065 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
16066 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
16067 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
16068 "SCALAR_TO_VECTOR node\n");
16069 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
16070 }
16071
 // Case 5: every lane extracted from one vector at even or odd indices
 // matches the de-interleave instructions UZP1/UZP2.
16072 if (AllLanesExtractElt) {
16073 SDNode *Vector = nullptr;
16074 bool Even = false;
16075 bool Odd = false;
16076 // Check whether the extract elements match the Even pattern <0,2,4,...> or
16077 // the Odd pattern <1,3,5,...>.
16078 for (unsigned i = 0; i < NumElts; ++i) {
16079 SDValue V = Op.getOperand(i);
16080 const SDNode *N = V.getNode();
16081 if (!isa<ConstantSDNode>(N->getOperand(1))) {
16082 Even = false;
16083 Odd = false;
16084 break;
16085 }
16086 SDValue N0 = N->getOperand(0);
16087
16088 // All elements are extracted from the same vector.
16089 if (!Vector) {
16090 Vector = N0.getNode();
16091 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
16092 // BUILD_VECTOR.
16093 if (VT.getVectorElementType() !=
16095 break;
16096 } else if (Vector != N0.getNode()) {
16097 Odd = false;
16098 Even = false;
16099 break;
16100 }
16101
16102 // Extracted values are either at Even indices <0,2,4,...> or at Odd
16103 // indices <1,3,5,...>.
16104 uint64_t Val = N->getConstantOperandVal(1);
16105 if (Val == 2 * i) {
16106 Even = true;
16107 continue;
16108 }
16109 if (Val - 1 == 2 * i) {
16110 Odd = true;
16111 continue;
16112 }
16113
16114 // Something does not match: abort.
16115 Odd = false;
16116 Even = false;
16117 break;
16118 }
16119 if (Even || Odd) {
 // Split the double-width source into low/high halves and de-interleave.
16120 SDValue LHS =
16122 DAG.getConstant(0, DL, MVT::i64));
16123 SDValue RHS =
16125 DAG.getConstant(NumElts, DL, MVT::i64));
16126
16127 if (Even && !Odd)
16128 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LHS, RHS);
16129 if (Odd && !Even)
16130 return DAG.getNode(AArch64ISD::UZP2, DL, VT, LHS, RHS);
16131 }
16132 }
16133
16134 // Use DUP for non-constant splats. For f32 constant splats, reduce to
16135 // i32 and try again.
16136 if (usesOnlyOneValue) {
16137 if (!isConstant) {
16138 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16139 Value.getValueType() != VT) {
16140 LLVM_DEBUG(
16141 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
16142 return DAG.getNode(AArch64ISD::DUP, DL, VT, Value);
16143 }
16144
16145 // This is actually a DUPLANExx operation, which keeps everything vectory.
16146
16147 SDValue Lane = Value.getOperand(1);
16148 Value = Value.getOperand(0);
16149 if (Value.getValueSizeInBits() == 64) {
16150 LLVM_DEBUG(
16151 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
16152 "widening it\n");
16153 Value = WidenVector(Value, DAG);
16154 }
16155
16156 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
16157 return DAG.getNode(Opcode, DL, VT, Value, Lane);
16158 }
16159
 // FP constant splat: bitcast lanes to integers of the same width and
 // re-run lowering so the integer MOVI paths can fire.
16162 EVT EltTy = VT.getVectorElementType();
16163 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
16164 EltTy == MVT::f64) && "Unsupported floating-point vector type");
16165 LLVM_DEBUG(
16166 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
16167 "BITCASTS, and try again\n");
16168 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
16169 for (unsigned i = 0; i < NumElts; ++i)
16170 Ops.push_back(DAG.getNode(ISD::BITCAST, DL, NewType, Op.getOperand(i)));
16171 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
16172 SDValue Val = DAG.getBuildVector(VecVT, DL, Ops);
16173 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
16174 Val.dump(););
16175 Val = LowerBUILD_VECTOR(Val, DAG);
16176 if (Val.getNode())
16177 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
16178 }
16179 }
16180
16181 // If we need to insert a small number of different non-constant elements and
16182 // the vector width is sufficiently large, prefer using DUP with the common
16183 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
16184 // skip the constant lane handling below.
16185 bool PreferDUPAndInsert =
16186 !isConstant && NumDifferentLanes >= 1 &&
16187 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
16188 NumDifferentLanes >= NumConstantLanes;
16189
16190 // If there was only one constant value used and for more than one lane,
16191 // start by splatting that value, then replace the non-constant lanes. This
16192 // is better than the default, which will perform a separate initialization
16193 // for each lane.
16194 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
16195 // Firstly, try to materialize the splat constant.
16196 SDValue Val = DAG.getSplatBuildVector(VT, DL, ConstantValue);
16197 unsigned BitSize = VT.getScalarSizeInBits();
16198 APInt ConstantValueAPInt(1, 0);
16199 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
16200 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
 // Zero and all-ones splats are kept as plain BUILD_VECTORs (cheap to
 // materialize); anything else goes through the immediate/DUP paths.
16201 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
16202 !ConstantValueAPInt.isAllOnes()) {
16203 Val = ConstantBuildVector(Val, DAG, Subtarget);
16204 if (!Val)
16205 // Otherwise, materialize the constant and splat it.
16206 Val = DAG.getNode(AArch64ISD::DUP, DL, VT, ConstantValue);
16207 }
16208
16209 // Now insert the non-constant lanes.
16210 for (unsigned i = 0; i < NumElts; ++i) {
16211 SDValue V = Op.getOperand(i);
16212 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
16213 if (!isIntOrFPConstant(V) && !V.isUndef())
16214 // Note that type legalization likely mucked about with the VT of the
16215 // source operand, so we may have to convert it here before inserting.
16216 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Val, V, LaneIdx);
16217 }
16218 return Val;
16219 }
16220
16221 // This will generate a load from the constant pool.
16222 if (isConstant) {
16223 LLVM_DEBUG(
16224 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
16225 "expansion\n");
16226 return SDValue();
16227 }
16228
16229 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
16230 // v4i32s. This is really a truncate, which we can construct out of (legal)
16231 // concats and truncate nodes.
16233 return M;
16234
16235 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
16236 if (NumElts >= 4) {
16237 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
16238 return Shuffle;
16239
16240 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
16241 return Shuffle;
16242 }
16243
16244 if (PreferDUPAndInsert) {
16245 // First, build a constant vector with the common element.
16247 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
16248 // Next, insert the elements that do not match the common value.
16249 for (unsigned I = 0; I < NumElts; ++I)
16250 if (Op.getOperand(I) != Value)
16251 NewVector =
16252 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
16253 Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
16254
16255 return NewVector;
16256 }
16257
16258 // If vector consists of two different values, try to generate two DUPs and
16259 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
16260 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
16262 // Check the consecutive count of the value is the half number of vector
16263 // elements. In this case, we can use CONCAT_VECTORS. For example,
16264 //
16265 // canUseVECTOR_CONCAT = true;
16266 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
16267 // t24, t24, t24, t24, t24, t24, t24, t24
16268 //
16269 // canUseVECTOR_CONCAT = false;
16270 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
16271 // t24, t24, t24, t24, t24, t24, t24, t24
16272 bool canUseVECTOR_CONCAT = true;
16273 for (auto Pair : DifferentValueMap) {
16274 // Check different values have same length which is NumElts / 2.
16275 if (Pair.second != NumElts / 2)
16276 canUseVECTOR_CONCAT = false;
16277 Vals.push_back(Pair.first);
16278 }
16279
16280 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
16281 // CONCAT_VECTORs. For example,
16282 //
16283 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
16284 // t24, t24, t24, t24, t24, t24, t24, t24
16285 // ==>
16286 // t26: v8i8 = AArch64ISD::DUP t23
16287 // t28: v8i8 = AArch64ISD::DUP t24
16288 // t29: v16i8 = concat_vectors t26, t28
16289 if (canUseVECTOR_CONCAT) {
16290 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16291 if (isTypeLegal(SubVT) && SubVT.isVector() &&
16292 SubVT.getVectorNumElements() >= 2) {
16293 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
16294 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
 // Recurse on each half so the splats become DUPs.
16295 SDValue DUP1 =
16296 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops1), DAG);
16297 SDValue DUP2 =
16298 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops2), DAG);
16300 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, DUP1, DUP2);
16301 return CONCAT_VECTORS;
16302 }
16303 }
16304
16305 // Let's try to generate VECTOR_SHUFFLE. For example,
16306 //
16307 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
16308 // ==>
16309 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
16310 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
16311 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
16312 if (NumElts >= 8) {
16313 SmallVector<int, 16> MaskVec;
16314 // Build mask for VECTOR_SHUFLLE.
16315 SDValue FirstLaneVal = Op.getOperand(0);
16316 for (unsigned i = 0; i < NumElts; ++i) {
16317 SDValue Val = Op.getOperand(i);
 // Lanes equal to lane 0's value select from VEC1, others from VEC2.
16318 if (FirstLaneVal == Val)
16319 MaskVec.push_back(i);
16320 else
16321 MaskVec.push_back(i + NumElts);
16322 }
16323
16324 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
16325 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
16326 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops1);
16327 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops2);
16329 DAG.getVectorShuffle(VT, DL, VEC1, VEC2, MaskVec);
16330 return VECTOR_SHUFFLE;
16331 }
16332 }
16333
16334 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
16335 // know the default expansion would otherwise fall back on something even
16336 // worse. For a vector with one or two non-undef values, that's
16337 // scalar_to_vector for the elements followed by a shuffle (provided the
16338 // shuffle is valid for the target) and materialization element by element
16339 // on the stack followed by a load for everything else.
16340 if (!isConstant && !usesOnlyOneValue) {
16341 LLVM_DEBUG(
16342 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
16343 "of INSERT_VECTOR_ELT\n");
16344
16345 SDValue Vec = DAG.getUNDEF(VT);
16346 SDValue Op0 = Op.getOperand(0);
16347 unsigned i = 0;
16348
16349 // Use SCALAR_TO_VECTOR for lane zero to
16350 // a) Avoid a RMW dependency on the full vector register, and
16351 // b) Allow the register coalescer to fold away the copy if the
16352 // value is already in an S or D register, and we're forced to emit an
16353 // INSERT_SUBREG that we can't fold anywhere.
16354 //
16355 // We also allow types like i8 and i16 which are illegal scalar but legal
16356 // vector element types. After type-legalization the inserted value is
16357 // extended (i32) and it is safe to cast them to the vector type by ignoring
16358 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
16359 if (!Op0.isUndef()) {
16360 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
16361 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Op0);
16362 ++i;
16363 }
16364 LLVM_DEBUG({
16365 if (i < NumElts)
16366 dbgs() << "Creating nodes for the other vector elements:\n";
16367 });
16368 for (; i < NumElts; ++i) {
16369 SDValue V = Op.getOperand(i);
16370 if (V.isUndef())
16371 continue;
16372 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
16373 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
16374 }
16375 return Vec;
16376 }
16377
16378 LLVM_DEBUG(
16379 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
16380 "better alternative\n");
16381 return SDValue();
16382}
16383
// Lower CONCAT_VECTORS: fixed-length vectors go to the SVE path; legal
// scalable concats of more than two operands are reduced pairwise until a
// single two-operand (legal) concat remains.
16384SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
16385 SelectionDAG &DAG) const {
16386 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
16387 !Subtarget->isNeonAvailable()))
16388 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
16389
16390 assert(Op.getValueType().isScalableVector() &&
16391 isTypeLegal(Op.getValueType()) &&
16392 "Expected legal scalable vector type!");
16393
16394 if (isTypeLegal(Op.getOperand(0).getValueType())) {
16395 unsigned NumOperands = Op->getNumOperands();
16396 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
16397 "Unexpected number of operands in CONCAT_VECTORS");
16398
 // Two legal operands: the node is already in its final form.
16399 if (NumOperands == 2)
16400 return Op;
16401
16402 // Concat each pair of subvectors and pack into the lower half of the array.
 // Each round halves the operand count and doubles the subvector width.
16403 SmallVector<SDValue> ConcatOps(Op->ops());
16404 while (ConcatOps.size() > 1) {
16405 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
16406 SDValue V1 = ConcatOps[I];
16407 SDValue V2 = ConcatOps[I + 1];
16408 EVT SubVT = V1.getValueType();
16409 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
16410 ConcatOps[I / 2] =
16411 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
16412 }
16413 ConcatOps.resize(ConcatOps.size() / 2);
16414 }
16415 return ConcatOps[0];
16416 }
16417
16418 return SDValue();
16419}
16420
// Lower ISD::INSERT_VECTOR_ELT. Predicate (i1-element) vectors are promoted
// to an integer vector, the insert is performed there, and the result is
// truncated back. Inserts with constant, in-range lane indices are returned
// unchanged for pattern-based selection.
SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                      SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");

  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
                                   !Subtarget->isNeonAvailable()))
    return LowerFixedLengthInsertVectorElt(Op, DAG);

  EVT VT = Op.getOperand(0).getValueType();

  if (VT.getScalarType() == MVT::i1) {
    // Promote the predicate vector, insert the (extended) scalar, then
    // truncate back to the predicate type.
    EVT VectorVT = getPromotedVTForPredicate(VT);
    SDLoc DL(Op);
    SDValue ExtendedVector =
        DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
    // Sub-32-bit scalars are inserted as i32; wider elements use the
    // promoted element type directly.
    SDValue ExtendedValue =
        DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
                             VectorVT.getScalarType().getSizeInBits() < 32
                                 ? MVT::i32
                                 : VectorVT.getScalarType());
    ExtendedVector =
        DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
                    ExtendedValue, Op.getOperand(2));
    return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
  }

  // Check for non-constant or out of range lane.
  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
    return SDValue();

  return Op;
}
16454
// Lower ISD::EXTRACT_VECTOR_ELT. Predicate vectors are extended to an
// integer vector before extraction; 64-bit NEON vectors are widened to
// 128 bits so the extraction can use the V128 patterns.
SDValue
AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
  EVT VT = Op.getOperand(0).getValueType();

  if (VT.getScalarType() == MVT::i1) {
    SDLoc DL(Op);
    // There are no operations to extend a nxv1i1 predicate to a nxv1i128 vector
    // An easy lowering is widening the input predicate to nxv2i1.
    if (VT == MVT::nxv1i1) {
      SDValue WidenedPred = DAG.getInsertSubvector(
          DL, DAG.getPOISON(MVT::nxv2i1), Op->getOperand(0), 0);
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
                         WidenedPred, Op.getOperand(1));
    }
    // We can't directly extract from an SVE predicate; extend it first.
    // (This isn't the only possible lowering, but it's straightforward.)
    EVT VectorVT = getPromotedVTForPredicate(VT);
    SDValue Extend =
        DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
    MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
                                  Extend, Op.getOperand(1));
    return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
  }

  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
    return LowerFixedLengthExtractVectorElt(Op, DAG);

  // Check for non-constant or out of range lane.
  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
    return SDValue();

  // Insertion/extraction are legal for V128 types.
  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
      VT == MVT::v8f16 || VT == MVT::v8bf16)
    return Op;

  // Any other type that isn't one of the 64-bit NEON vectors below is
  // rejected and falls back to default expansion.
  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
      VT != MVT::v4bf16)
    return SDValue();

  // For V64 types, we perform extraction by expanding the value
  // to a V128 type and perform the extraction on that.
  SDLoc DL(Op);
  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
  EVT WideTy = WideVec.getValueType();

  // Sub-32-bit integer elements are extracted as i32.
  EVT ExtrTy = WideTy.getVectorElementType();
  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
    ExtrTy = MVT::i32;

  // For extractions, we just return the result directly.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
                     Op.getOperand(1));
}
16515
16516SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
16517 SelectionDAG &DAG) const {
16518 EVT VT = Op.getValueType();
16520 "Only cases that extract a fixed length vector are supported!");
16521 EVT InVT = Op.getOperand(0).getValueType();
16522
16523 // If we don't have legal types yet, do nothing
16524 if (!isTypeLegal(InVT))
16525 return SDValue();
16526
16527 if (InVT.is128BitVector()) {
16528 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
16529 unsigned Idx = Op.getConstantOperandVal(1);
16530
16531 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
16532 if (Idx == 0)
16533 return Op;
16534
16535 // If this is extracting the upper 64-bits of a 128-bit vector, we match
16536 // that directly.
16537 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
16538 return Op;
16539 }
16540
16541 if (InVT.isScalableVector() ||
16542 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
16543 SDLoc DL(Op);
16544 SDValue Vec = Op.getOperand(0);
16545 SDValue Idx = Op.getOperand(1);
16546
16547 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
16548 if (PackedVT != InVT) {
16549 // Pack input into the bottom part of an SVE register and try again.
16550 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
16551 DAG.getUNDEF(PackedVT), Vec,
16552 DAG.getVectorIdxConstant(0, DL));
16553 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
16554 }
16555
16556 // This will get matched by custom code during ISelDAGToDAG.
16557 if (isNullConstant(Idx))
16558 return Op;
16559
16560 assert(InVT.isScalableVector() && "Unexpected vector type!");
16561 // Move requested subvector to the start of the vector and try again.
16562 SDValue Splice =
16563 DAG.getNode(ISD::VECTOR_SPLICE_LEFT, DL, InVT, Vec, Vec, Idx);
16564 return convertFromScalableVector(DAG, VT, Splice);
16565 }
16566
16567 return SDValue();
16568}
16569
16570SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
16571 SelectionDAG &DAG) const {
16572 assert(Op.getValueType().isScalableVector() &&
16573 "Only expect to lower inserts into scalable vectors!");
16574
16575 EVT InVT = Op.getOperand(1).getValueType();
16576 unsigned Idx = Op.getConstantOperandVal(2);
16577
16578 SDValue Vec0 = Op.getOperand(0);
16579 SDValue Vec1 = Op.getOperand(1);
16580 SDLoc DL(Op);
16581 EVT VT = Op.getValueType();
16582
16583 if (InVT.isScalableVector()) {
16584 if (!isTypeLegal(VT))
16585 return SDValue();
16586
16587 // Break down insert_subvector into simpler parts.
16588 if (VT.getVectorElementType() == MVT::i1) {
16589 unsigned NumElts = VT.getVectorMinNumElements();
16590 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16591
16592 SDValue Lo, Hi;
16593 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16594 DAG.getVectorIdxConstant(0, DL));
16595 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16596 DAG.getVectorIdxConstant(NumElts / 2, DL));
16597 if (Idx < (NumElts / 2))
16598 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
16599 DAG.getVectorIdxConstant(Idx, DL));
16600 else
16601 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
16602 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
16603
16604 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16605 }
16606
16607 // We can select these directly.
16608 if (isTypeLegal(InVT) && Vec0.isUndef())
16609 return Op;
16610
16611 // Ensure the subvector is half the size of the main vector.
16612 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
16613 return SDValue();
16614
16615 // Here narrow and wide refers to the vector element types. After "casting"
16616 // both vectors must have the same bit length and so because the subvector
16617 // has fewer elements, those elements need to be bigger.
16618 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
16619 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
16620
16621 // NOP cast operands to the largest legal vector of the same element count.
16622 if (VT.isFloatingPoint()) {
16623 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
16624 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
16625 } else {
16626 // Legal integer vectors are already their largest so Vec0 is fine as is.
16627 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
16628 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
16629 }
16630
16631 // To replace the top/bottom half of vector V with vector SubV we widen the
16632 // preserved half of V, concatenate this to SubV (the order depending on the
16633 // half being replaced) and then narrow the result.
16634 SDValue Narrow;
16635 if (Idx == 0) {
16636 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
16637 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
16638 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
16639 } else {
16640 assert(Idx == InVT.getVectorMinNumElements() &&
16641 "Invalid subvector index!");
16642 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
16643 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
16644 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
16645 }
16646
16647 return getSVESafeBitCast(VT, Narrow, DAG);
16648 }
16649
16650 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
16651 // This will be matched by custom code during ISelDAGToDAG.
16652 if (Vec0.isUndef())
16653 return Op;
16654
16655 std::optional<unsigned> PredPattern =
16657 auto PredTy = VT.changeVectorElementType(*DAG.getContext(), MVT::i1);
16658 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
16659 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
16660 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
16661 }
16662
16663 return SDValue();
16664}
16665
16666static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
16667 if (Op.getOpcode() != AArch64ISD::DUP &&
16668 Op.getOpcode() != ISD::SPLAT_VECTOR &&
16669 Op.getOpcode() != ISD::BUILD_VECTOR)
16670 return false;
16671
16672 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
16673 !isAllConstantBuildVector(Op, SplatVal))
16674 return false;
16675
16676 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
16677 !isa<ConstantSDNode>(Op->getOperand(0)))
16678 return false;
16679
16680 SplatVal = Op->getConstantOperandVal(0);
16681 if (Op.getValueType().getVectorElementType() != MVT::i64)
16682 SplatVal = (int32_t)SplatVal;
16683
16684 Negated = false;
16685 if (isPowerOf2_64(SplatVal))
16686 return true;
16687
16688 Negated = true;
16689 if (isPowerOf2_64(-SplatVal)) {
16690 SplatVal = -SplatVal;
16691 return true;
16692 }
16693
16694 return false;
16695}
16696
16697SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
16698 EVT VT = Op.getValueType();
16699 SDLoc DL(Op);
16700
16701 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
16702 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
16703
16704 assert(VT.isScalableVector() && "Expected a scalable vector.");
16705
16706 bool Signed = Op.getOpcode() == ISD::SDIV;
16707 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
16708
16709 bool Negated;
16710 uint64_t SplatVal;
16711 // NOTE: SRAD cannot be used to represent sdiv-by-one.
16712 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) &&
16713 SplatVal > 1) {
16715 SDValue Res =
16716 DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
16717 DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
16718 if (Negated)
16719 Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
16720
16721 return Res;
16722 }
16723
16724 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
16725 return LowerToPredicatedOp(Op, DAG, PredOpcode);
16726
16727 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
16728 // operations, and truncate the result.
16729 EVT WidenedVT;
16730 if (VT == MVT::nxv16i8)
16731 WidenedVT = MVT::nxv8i16;
16732 else if (VT == MVT::nxv8i16)
16733 WidenedVT = MVT::nxv4i32;
16734 else
16735 llvm_unreachable("Unexpected Custom DIV operation");
16736
16737 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
16738 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
16739 SDValue Op0Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(0));
16740 SDValue Op1Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(1));
16741 SDValue Op0Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(0));
16742 SDValue Op1Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(1));
16743 SDValue ResultLo = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Lo, Op1Lo);
16744 SDValue ResultHi = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Hi, Op1Hi);
16745 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultLo);
16746 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultHi);
16747 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ResultLoCast, ResultHiCast);
16748}
16749
16750bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
16751 EVT VT, unsigned DefinedValues) const {
16752 if (!Subtarget->isNeonAvailable())
16753 return false;
16755}
16756
16758 // Currently no fixed length shuffles that require SVE are legal.
16759 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16760 return false;
16761
16762 if (VT.getVectorNumElements() == 4 &&
16763 (VT.is128BitVector() || VT.is64BitVector())) {
16764 unsigned Cost = getPerfectShuffleCost(M);
16765 if (Cost <= 1)
16766 return true;
16767 }
16768
16769 bool DummyBool;
16770 int DummyInt;
16771 unsigned DummyUnsigned;
16772
16773 unsigned EltSize = VT.getScalarSizeInBits();
16774 unsigned NumElts = VT.getVectorNumElements();
16776 isREVMask(M, EltSize, NumElts, 64) ||
16777 isREVMask(M, EltSize, NumElts, 32) ||
16778 isREVMask(M, EltSize, NumElts, 16) ||
16779 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
16780 isSingletonEXTMask(M, VT, DummyUnsigned) ||
16781 isTRNMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
16782 isUZPMask(M, NumElts, DummyUnsigned) ||
16783 isZIPMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
16784 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
16785 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
16786 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
16787 isINSMask(M, NumElts, DummyBool, DummyInt) ||
16788 isConcatMask(M, VT, VT.getSizeInBits() == 128));
16789}
16790
16792 EVT VT) const {
16793 // Just delegate to the generic legality, clear masks aren't special.
16794 return isShuffleMaskLegal(M, VT);
16795}
16796
16797/// getVShiftImm - Check if this is a valid build_vector for the immediate
16798/// operand of a vector shift operation, where all the elements of the
16799/// build_vector must have the same constant integer value.
16800static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
16801 // Ignore bit_converts.
16802 while (Op.getOpcode() == ISD::BITCAST)
16803 Op = Op.getOperand(0);
16805 APInt SplatBits, SplatUndef;
16806 unsigned SplatBitSize;
16807 bool HasAnyUndefs;
16808 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
16809 HasAnyUndefs, ElementBits) ||
16810 SplatBitSize > ElementBits)
16811 return false;
16812 Cnt = SplatBits.getSExtValue();
16813 return true;
16814}
16815
16816/// isVShiftLImm - Check if this is a valid build_vector for the immediate
16817/// operand of a vector shift left operation. That value must be in the range:
16818/// 0 <= Value < ElementBits for a left shift; or
16819/// 0 <= Value <= ElementBits for a long left shift.
16820static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
16821 assert(VT.isVector() && "vector shift count is not a vector type");
16822 int64_t ElementBits = VT.getScalarSizeInBits();
16823 if (!getVShiftImm(Op, ElementBits, Cnt))
16824 return false;
16825 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
16826}
16827
16828/// isVShiftRImm - Check if this is a valid build_vector for the immediate
16829/// operand of a vector shift right operation. The value must be in the range:
16830/// 1 <= Value <= ElementBits for a right shift; or
16831static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
16832 assert(VT.isVector() && "vector shift count is not a vector type");
16833 int64_t ElementBits = VT.getScalarSizeInBits();
16834 if (!getVShiftImm(Op, ElementBits, Cnt))
16835 return false;
16836 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
16837}
16838
16839SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
16840 SelectionDAG &DAG) const {
16841 EVT VT = Op.getValueType();
16842
16843 if (VT.getScalarType() == MVT::i1) {
16844 // Lower i1 truncate to `(x & 1) != 0`.
16845 SDLoc DL(Op);
16846 EVT OpVT = Op.getOperand(0).getValueType();
16847 SDValue Zero = DAG.getConstant(0, DL, OpVT);
16848 SDValue One = DAG.getConstant(1, DL, OpVT);
16849 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Op.getOperand(0), One);
16850 return DAG.getSetCC(DL, VT, And, Zero, ISD::SETNE);
16851 }
16852
16853 if (!VT.isVector() || VT.isScalableVector())
16854 return SDValue();
16855
16856 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16857 !Subtarget->isNeonAvailable()))
16858 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
16859
16860 return SDValue();
16861}
16862
16863// Check if we can we lower this SRL to a rounding shift instruction. ResVT is
16864// possibly a truncated type, it tells how many bits of the value are to be
16865// used.
16867 SelectionDAG &DAG,
16868 unsigned &ShiftValue,
16869 SDValue &RShOperand) {
16870 if (Shift->getOpcode() != ISD::SRL)
16871 return false;
16872
16873 EVT VT = Shift.getValueType();
16874 assert(VT.isScalableVT());
16875
16876 auto ShiftOp1 =
16878 if (!ShiftOp1)
16879 return false;
16880
16881 ShiftValue = ShiftOp1->getZExtValue();
16882 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
16883 return false;
16884
16885 SDValue Add = Shift->getOperand(0);
16886 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
16887 return false;
16888
16890 "ResVT must be truncated or same type as the shift.");
16891 // Check if an overflow can lead to incorrect results.
16892 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
16893 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
16894 return false;
16895
16896 auto AddOp1 =
16898 if (!AddOp1)
16899 return false;
16900 uint64_t AddValue = AddOp1->getZExtValue();
16901 if (AddValue != 1ULL << (ShiftValue - 1))
16902 return false;
16903
16904 RShOperand = Add->getOperand(0);
16905 return true;
16906}
16907
16908SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
16909 SelectionDAG &DAG) const {
16910 EVT VT = Op.getValueType();
16911 SDLoc DL(Op);
16912 int64_t Cnt;
16913
16914 if (!Op.getOperand(1).getValueType().isVector())
16915 return Op;
16916 unsigned EltSize = VT.getScalarSizeInBits();
16917
16918 switch (Op.getOpcode()) {
16919 case ISD::SHL:
16920 if (VT.isScalableVector() ||
16921 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16922 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
16923
16924 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
16925 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
16926 DAG.getTargetConstant(Cnt, DL, MVT::i32));
16927 return DAG.getNode(
16929 DAG.getTargetConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32),
16930 Op.getOperand(0), Op.getOperand(1));
16931 case ISD::SRA:
16932 case ISD::SRL:
16933 if (VT.isScalableVector() &&
16934 (Subtarget->hasSVE2() ||
16935 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
16936 SDValue RShOperand;
16937 unsigned ShiftValue;
16938 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
16939 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
16940 getPredicateForVector(DAG, DL, VT), RShOperand,
16941 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
16942 }
16943
16944 if (VT.isScalableVector() ||
16945 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
16946 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
16947 : AArch64ISD::SRL_PRED;
16948 return LowerToPredicatedOp(Op, DAG, Opc);
16949 }
16950
16951 // Right shift immediate
16952 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
16953 unsigned Opc =
16954 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
16955 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
16956 DAG.getTargetConstant(Cnt, DL, MVT::i32),
16957 Op->getFlags());
16958 }
16959
16960 // Right shift register. Note, there is not a shift right register
16961 // instruction, but the shift left register instruction takes a signed
16962 // value, where negative numbers specify a right shift.
16963 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
16964 : Intrinsic::aarch64_neon_ushl;
16965 // negate the shift amount
16966 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
16967 Op.getOperand(1));
16968 SDValue NegShiftLeft =
16970 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
16971 NegShift);
16972 return NegShiftLeft;
16973 }
16974
16975 llvm_unreachable("unexpected shift opcode");
16976}
16977
// Lower vector SETCC. Scalable vectors use the predicated merge-zero setcc;
// integer fixed-length compares are already selectable; floating-point
// compares are mapped onto one or two AArch64 condition codes, emitting a
// second comparison ORed in when a single condition cannot express the
// LLVM predicate.
SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
                                           SelectionDAG &DAG) const {
  if (Op.getValueType().isScalableVector())
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);

  if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
                                   !Subtarget->isNeonAvailable()))
    return LowerFixedLengthVectorSetccToSVE(Op, DAG);

  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
  SDLoc DL(Op);

  // Integer vector compares are selectable directly.
  if (LHS.getValueType().getVectorElementType().isInteger())
    return Op;

  // NOTE(review): this ||-chained assert is trivially satisfiable for any
  // single element type; it likely intends &&-conjoined exclusions — confirm
  // against upstream before relying on it.
  assert(((!Subtarget->hasFullFP16() &&
           LHS.getValueType().getVectorElementType() != MVT::f16) ||
          LHS.getValueType().getVectorElementType() != MVT::bf16 ||
          LHS.getValueType().getVectorElementType() != MVT::f128) &&
         "Unexpected type!");

  // Lower isnan(x) | isnan(never-nan) to x != x.
  // Lower !isnan(x) & !isnan(never-nan) to x == x.
  if (CC == ISD::SETUO || CC == ISD::SETO) {
    bool OneNaN = false;
    if (LHS == RHS) {
      OneNaN = true;
    } else if (DAG.isKnownNeverNaN(RHS)) {
      OneNaN = true;
      RHS = LHS;
    } else if (DAG.isKnownNeverNaN(LHS)) {
      OneNaN = true;
      LHS = RHS;
    }
    if (OneNaN) {
      CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
    }
  }

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two branches to implement.
  AArch64CC::CondCode CC1, CC2;
  bool ShouldInvert;
  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);

  bool NoNaNs =
      getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
  SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, DL, DAG);
  if (!Cmp.getNode())
    return SDValue();

  // Two-condition predicates need a second comparison ORed with the first.
  if (CC2 != AArch64CC::AL) {
    SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, DL, DAG);
    if (!Cmp2.getNode())
      return SDValue();

    Cmp = DAG.getNode(ISD::OR, DL, CmpVT, Cmp, Cmp2);
  }

  Cmp = DAG.getSExtOrTrunc(Cmp, DL, Op.getValueType());

  if (ShouldInvert)
    Cmp = DAG.getNOT(DL, Cmp, Cmp.getValueType());

  return Cmp;
}
17047
17048static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
17049 SelectionDAG &DAG) {
17050 SDValue VecOp = ScalarOp.getOperand(0);
17051 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
17052 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
17053 DAG.getConstant(0, DL, MVT::i64));
17054}
17055
/// Lower a bitwise VECREDUCE (AND/OR/XOR) over a fixed-length vector.
/// Boolean (i1) vectors are reduced via umin/umax/add on a sign- or
/// any-extended vector; other element types are halved with the scalar
/// bitwise op until 64 bits remain, then finished with scalar shifts.
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
                                      SDLoc DL, SelectionDAG &DAG) {
  unsigned ScalarOpcode;
  switch (Opcode) {
  case ISD::VECREDUCE_AND:
    ScalarOpcode = ISD::AND;
    break;
  case ISD::VECREDUCE_OR:
    ScalarOpcode = ISD::OR;
    break;
  case ISD::VECREDUCE_XOR:
    ScalarOpcode = ISD::XOR;
    break;
  default:
    llvm_unreachable("Expected bitwise vector reduction");
    return SDValue();
  }

  EVT VecVT = Vec.getValueType();
  assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
         "Expected power-of-2 length vector");

  EVT ElemVT = VecVT.getVectorElementType();

  SDValue Result;
  unsigned NumElems = VecVT.getVectorNumElements();

  // Special case for boolean reductions
  if (ElemVT == MVT::i1) {
    // Split large vectors into smaller ones
    if (NumElems > 16) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
      EVT HalfVT = Lo.getValueType();
      SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
      return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
    }

    // Results of setcc operations get widened to 128 bits if their input
    // operands are 128 bits wide, otherwise vectors that are less than 64 bits
    // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
    // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
    // size leads to the best codegen, since e.g. setcc results might need to be
    // truncated otherwise.
    unsigned ExtendedWidth = 64;
    if (Vec.getOpcode() == ISD::SETCC &&
        Vec.getOperand(0).getValueSizeInBits() >= 128) {
      ExtendedWidth = 128;
    }
    EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));

    // any_ext doesn't work with umin/umax, so only use it for uadd.
    unsigned ExtendOp =
        ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
    SDValue Extended = DAG.getNode(
        ExtendOp, DL,
        VecVT.changeVectorElementType(*DAG.getContext(), ExtendedVT), Vec);
    // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
    // in that case we bitcast the sign extended values from v2i64 to v4i32
    // before reduction for optimal code generation.
    if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
        NumElems == 2 && ExtendedWidth == 128) {
      Extended = DAG.getBitcast(MVT::v4i32, Extended);
      ExtendedVT = MVT::i32;
    }
    // AND reduces to all-ones iff umin is all-ones; OR to non-zero iff umax
    // is non-zero; XOR is the parity, i.e. the low bit of the element sum.
    switch (ScalarOpcode) {
    case ISD::AND:
      Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
      break;
    case ISD::OR:
      Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
      break;
    case ISD::XOR:
      Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
      break;
    default:
      llvm_unreachable("Unexpected Opcode");
    }

    Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
  } else {
    // Iteratively split the vector in half and combine using the bitwise
    // operation until it fits in a 64 bit register.
    while (VecVT.getSizeInBits() > 64) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
      VecVT = Lo.getValueType();
      NumElems = VecVT.getVectorNumElements();
      Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
    }

    EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());

    // Do the remaining work on a scalar since it allows the code generator to
    // combine the shift and bitwise operation into one instruction and since
    // integer instructions can have higher throughput than vector instructions.
    SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);

    // Iteratively combine the lower and upper halves of the scalar using the
    // bitwise operation, halving the relevant region of the scalar in each
    // iteration, until the relevant region is just one element of the original
    // vector.
    for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
      SDValue ShiftAmount =
          DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
      SDValue Shifted =
          DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
      Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
    }

    Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
  }

  return DAG.getAnyExtOrTrunc(Result, DL, VT);
}
17171
17172SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
17173 SelectionDAG &DAG) const {
17174 SDValue Src = Op.getOperand(0);
17175 EVT SrcVT = Src.getValueType();
17176
17177 // Scalarize v2f16 to turn it into a faddp. This will be more efficient than
17178 // widening by inserting zeroes.
17179 if (Subtarget->hasFullFP16() && Op.getOpcode() == ISD::VECREDUCE_FADD &&
17180 SrcVT == MVT::v2f16) {
17181 SDLoc DL(Op);
17182 return DAG.getNode(ISD::FADD, DL, MVT::f16,
17183 DAG.getExtractVectorElt(DL, MVT::f16, Src, 0),
17184 DAG.getExtractVectorElt(DL, MVT::f16, Src, 1));
17185 }
17186
17187 // Try to lower fixed length reductions to SVE.
17188 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
17189 Op.getOpcode() == ISD::VECREDUCE_AND ||
17190 Op.getOpcode() == ISD::VECREDUCE_OR ||
17191 Op.getOpcode() == ISD::VECREDUCE_XOR ||
17192 Op.getOpcode() == ISD::VECREDUCE_FADD ||
17193 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
17194 SrcVT.getVectorElementType() == MVT::i64);
17195 if (SrcVT.isScalableVector() ||
17197 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
17198
17199 if (SrcVT.getVectorElementType() == MVT::i1)
17200 return LowerPredReductionToSVE(Op, DAG);
17201
17202 switch (Op.getOpcode()) {
17203 case ISD::VECREDUCE_ADD:
17204 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
17205 case ISD::VECREDUCE_AND:
17206 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
17207 case ISD::VECREDUCE_OR:
17208 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
17210 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
17212 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
17214 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
17216 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
17217 case ISD::VECREDUCE_XOR:
17218 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
17220 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
17222 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
17224 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
17226 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
17228 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
17229 default:
17230 llvm_unreachable("Unhandled fixed length reduction");
17231 }
17232 }
17233
17234 // Lower NEON reductions.
17235 SDLoc DL(Op);
17236 switch (Op.getOpcode()) {
17237 case ISD::VECREDUCE_AND:
17238 case ISD::VECREDUCE_OR:
17239 case ISD::VECREDUCE_XOR:
17240 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
17241 Op.getValueType(), DL, DAG);
17242 case ISD::VECREDUCE_ADD:
17243 return getReductionSDNode(AArch64ISD::UADDV, DL, Op, DAG);
17245 return getReductionSDNode(AArch64ISD::SMAXV, DL, Op, DAG);
17247 return getReductionSDNode(AArch64ISD::SMINV, DL, Op, DAG);
17249 return getReductionSDNode(AArch64ISD::UMAXV, DL, Op, DAG);
17251 return getReductionSDNode(AArch64ISD::UMINV, DL, Op, DAG);
17252 default:
17253 llvm_unreachable("Unhandled reduction");
17254 }
17255}
17256
SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();
  assert(SrcVT.isScalableVector() && "Unexpected operand type!");

  // The deinterleave below produces two results of the source type, which are
  // combined pairwise at each stage of the tree reduction.
  SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT);
  unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
  // Neutral element of the base operation, used to pad the vector so the
  // surplus reduction stages are no-ops.
  SDValue Identity = DAG.getNeutralElement(BaseOpc, DL, SrcVT, Op->getFlags());

  // Whilst we don't know the size of the vector we do know the maximum size so
  // can perform a tree reduction with an identity vector, which means once we
  // arrive at the result the remaining stages (when the vector is smaller than
  // the maximum) have no effect.

  // NOTE(review): the declaration of `Segments` appears to have been dropped
  // by extraction on the line above — restore from upstream before building.
  unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());

  for (unsigned I = 0; I < Stages; ++I) {
    Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity);
    Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1));
  }

  // After the final stage the reduced value lives in lane 0.
  return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0);
}
17283
17284SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
17285 SelectionDAG &DAG) const {
17286 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17287 // No point replacing if we don't have the relevant instruction/libcall anyway
17288 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
17289 return SDValue();
17290
17291 // LSE has an atomic load-clear instruction, but not a load-and.
17292 SDLoc DL(Op);
17293 MVT VT = Op.getSimpleValueType();
17294 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
17295 SDValue RHS = Op.getOperand(2);
17296 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
17297 RHS = DAG.getNode(ISD::XOR, DL, VT, DAG.getAllOnesConstant(DL, VT), RHS);
17298 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, DL, AN->getMemoryVT(),
17299 Op.getOperand(0), Op.getOperand(1), RHS,
17300 AN->getMemOperand());
17301}
17302
SDValue
AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {

  SDLoc DL(Op);
  // Get the inputs.
  SDNode *Node = Op.getNode();
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  MaybeAlign Align =
      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
  EVT VT = Node->getValueType(0);

  // NOTE(review): a line was dropped by extraction here — presumably the
  // `if (...hasFnAttribute(` that pairs with the string literal below.
                          "no-stack-arg-probe")) {
    // With probing disabled: bump SP directly and realign if requested.
    SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
    Chain = SP.getValue(1);
    SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
    if (Align)
      SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
                       DAG.getSignedConstant(-Align->value(), DL, VT));
    Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
    SDValue Ops[2] = {SP, Chain};
    return DAG.getMergeValues(Ops, DL);
  }

  // Otherwise the allocation must go through the stack-probe helper.
  RTLIB::LibcallImpl ChkStkImpl = getLibcallImpl(RTLIB::STACK_PROBE);
  if (ChkStkImpl == RTLIB::Unsupported)
    return SDValue();

  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // NOTE(review): the `SDValue Callee = DAG.getTargetExternalSymbol(` line
  // appears to have been dropped by extraction here.
      getLibcallImplName(ChkStkImpl).data(), PtrVT, 0);

  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
  if (Subtarget->hasCustomCallingConv())
    TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);

  // The probe helper takes the allocation size in X15, in units of 16 bytes
  // (hence the shift right by 4).
  Size = DAG.getNode(ISD::SRL, DL, MVT::i64, Size,
                     DAG.getConstant(4, DL, MVT::i64));
  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X15, Size, SDValue());
  Chain =
      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  // To match the actual intent better, we should read the output from X15 here
  // again (instead of potentially spilling it to the stack), but rereading Size
  // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
  // here.

  // Scale the size back up to bytes.
  Size = DAG.getNode(ISD::SHL, DL, MVT::i64, Size,
                     DAG.getConstant(4, DL, MVT::i64));

  SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
  Chain = SP.getValue(1);
  SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
  if (Align)
    SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
                     DAG.getSignedConstant(-Align->value(), DL, VT));
  Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);

  Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);

  SDValue Ops[2] = {SP, Chain};
  return DAG.getMergeValues(Ops, DL);
}
17372
17373SDValue
17374AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
17375 SelectionDAG &DAG) const {
17376 // Get the inputs.
17377 SDNode *Node = Op.getNode();
17378 SDValue Chain = Op.getOperand(0);
17379 SDValue Size = Op.getOperand(1);
17380
17381 MaybeAlign Align =
17382 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
17383 SDLoc DL(Op);
17384 EVT VT = Node->getValueType(0);
17385
17386 // Construct the new SP value in a GPR.
17387 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17388 Chain = SP.getValue(1);
17389 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17390 if (Align)
17391 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17392 DAG.getSignedConstant(-Align->value(), DL, VT));
17393
17394 // Set the real SP to the new value with a probing loop.
17395 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP);
17396 SDValue Ops[2] = {SP, Chain};
17397 return DAG.getMergeValues(Ops, DL);
17398}
17399
17400SDValue
17401AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
17402 SelectionDAG &DAG) const {
17403 MachineFunction &MF = DAG.getMachineFunction();
17404
17405 if (Subtarget->isTargetWindows())
17406 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
17407 else if (hasInlineStackProbe(MF))
17408 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
17409 else
17410 return SDValue();
17411}
17412
17413SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
17414 unsigned NewOp) const {
17415 if (Subtarget->hasSVE2())
17416 return LowerToPredicatedOp(Op, DAG, NewOp);
17417
17418 // Default to expand.
17419 return SDValue();
17420}
17421
17422SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
17423 SelectionDAG &DAG) const {
17424 EVT VT = Op.getValueType();
17425 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
17426
17427 SDLoc DL(Op);
17428 APInt MulImm = Op.getConstantOperandAPInt(0);
17429 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
17430 VT);
17431}
17432
/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
template <unsigned NumVecs>
static bool
// NOTE(review): the parameter list of this helper (presumably TLI, DL, Info
// and the call CI) was dropped by extraction on the lines above — restore
// from upstream before building.
  Info.opc = ISD::INTRINSIC_VOID;
  // Retrieve EC from first vector argument.
  const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
  // NOTE(review): the declaration of `EC` (used below) appears to have been
  // dropped by extraction here.
#ifndef NDEBUG
  // Check the assumption that all input vectors are the same type.
  for (unsigned I = 0; I < NumVecs; ++I)
    assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
           "Invalid type.");
#endif
  // memVT is `NumVecs * VT`.
  Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
                                EC * NumVecs);
  // The pointer operand is always the final argument of the st<N> intrinsic.
  Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
  Info.offset = 0;
  Info.align.reset();
  Info.flags = MachineMemOperand::MOStore;
  return true;
}
17457
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
// NOTE(review): extraction dropped the first line of this definition (the
// function name and its `IntrinsicInfo &Info` parameter) as well as the
// `Info.flags = ...` assignments before several of the returns below —
// restore these from upstream before building.
                                               const CallBase &I,
                                               MachineFunction &MF,
                                               unsigned Intrinsic) const {
  auto &DL = I.getDataLayout();
  switch (Intrinsic) {
  // SVE structured stores delegate to the templated helper above.
  case Intrinsic::aarch64_sve_st2:
    return setInfoSVEStN<2>(*this, DL, Info, I);
  case Intrinsic::aarch64_sve_st3:
    return setInfoSVEStN<3>(*this, DL, Info, I);
  case Intrinsic::aarch64_sve_st4:
    return setInfoSVEStN<4>(*this, DL, Info, I);
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_ld1x2:
  case Intrinsic::aarch64_neon_ld1x3:
  case Intrinsic::aarch64_neon_ld1x4: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Describe the whole structured load as one vector of i64 elements
    // covering the full returned size.
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::aarch64_neon_ld2lane:
  case Intrinsic::aarch64_neon_ld3lane:
  case Intrinsic::aarch64_neon_ld4lane:
  case Intrinsic::aarch64_neon_ld2r:
  case Intrinsic::aarch64_neon_ld3r:
  case Intrinsic::aarch64_neon_ld4r: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // ldx return struct with the same vec type
    Type *RetTy = I.getType();
    auto *StructTy = cast<StructType>(RetTy);
    unsigned NumElts = StructTy->getNumElements();
    Type *VecTy = StructTy->getElementType(0);
    MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
  case Intrinsic::aarch64_neon_st1x2:
  case Intrinsic::aarch64_neon_st1x3:
  case Intrinsic::aarch64_neon_st1x4: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Sum the sizes of the leading vector arguments — those are the data
    // operands; the remaining arguments (e.g. the pointer) are scalars.
    unsigned NumElts = 0;
    for (const Value *Arg : I.args()) {
      Type *ArgTy = Arg->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::aarch64_neon_st2lane:
  case Intrinsic::aarch64_neon_st3lane:
  case Intrinsic::aarch64_neon_st4lane: {
    Info.opc = ISD::INTRINSIC_VOID;
    unsigned NumElts = 0;
    // all the vector type is same
    Type *VecTy = I.getArgOperand(0)->getType();
    MVT EleVT = MVT::getVT(VecTy).getVectorElementType();

    // One element is stored per leading vector argument.
    for (const Value *Arg : I.args()) {
      Type *ArgTy = Arg->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += 1;
    }

    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::aarch64_ldaxr:
  case Intrinsic::aarch64_ldxr: {
    // Exclusive loads: the loaded type is the pointer's element type.
    Type *ValTy = I.getParamElementType(0);
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(ValTy);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(ValTy);
    // NOTE(review): the Info.flags assignment was dropped by extraction here.
    return true;
  }
  case Intrinsic::aarch64_stlxr:
  case Intrinsic::aarch64_stxr: {
    // Exclusive stores: operand 0 is the value, operand 1 is the pointer.
    Type *ValTy = I.getParamElementType(1);
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(ValTy);
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(ValTy);
    // NOTE(review): the Info.flags assignment was dropped by extraction here.
    return true;
  }
  case Intrinsic::aarch64_ldaxp:
  case Intrinsic::aarch64_ldxp:
    // Exclusive load-pair: a single 16-byte access.
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(16);
    // NOTE(review): the Info.flags assignment was dropped by extraction here.
    return true;
  case Intrinsic::aarch64_stlxp:
  case Intrinsic::aarch64_stxp:
    // Exclusive store-pair: operands 0/1 are the values, operand 2 the ptr.
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = Align(16);
    // NOTE(review): the Info.flags assignment was dropped by extraction here.
    return true;
  case Intrinsic::aarch64_sve_ldnt1: {
    Type *ElTy = cast<VectorType>(I.getType())->getElementType();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(I.getType());
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(ElTy);
    // NOTE(review): the Info.flags assignment was dropped by extraction here.
    return true;
  }
  case Intrinsic::aarch64_sve_stnt1: {
    Type *ElTy =
        cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(I.getOperand(0)->getType());
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(ElTy);
    // NOTE(review): the Info.flags assignment was dropped by extraction here.
    return true;
  }
  case Intrinsic::aarch64_mops_memset_tag: {
    Value *Dst = I.getArgOperand(0);
    Value *Val = I.getArgOperand(1);
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(Val->getType());
    Info.ptrVal = Dst;
    Info.offset = 0;
    Info.align = I.getParamAlign(0).valueOrOne();
    Info.flags = MachineMemOperand::MOStore;
    // The size of the memory being operated on is unknown at this point
    Info.size = MemoryLocation::UnknownSize;
    return true;
  }
  default:
    break;
  }

  // Not a recognized memory intrinsic.
  return false;
}
17636
// NOTE(review): the first line of this definition was dropped by extraction —
// presumably `bool AArch64TargetLowering::shouldReduceLoadWidth(` given the
// base-class call below; confirm against upstream.
    SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
    std::optional<unsigned> ByteOffset) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
                                                 ByteOffset))
    return false;

  // If we're reducing the load width in order to avoid having to use an extra
  // instruction to do extension then it's probably a good idea.
  if (ExtTy != ISD::NON_EXTLOAD)
    return true;
  // Don't reduce load width if it would prevent us from combining a shift into
  // the offset.
  MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
  assert(Mem);
  const SDValue &Base = Mem->getBasePtr();
  // Look for base = add(x, shl(y, C)) with the shl feeding only this address.
  if (Base.getOpcode() == ISD::ADD &&
      Base.getOperand(1).getOpcode() == ISD::SHL &&
      Base.getOperand(1).hasOneUse() &&
      Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
    // It's unknown whether a scalable vector has a power-of-2 bitwidth.
    if (Mem->getMemoryVT().isScalableVector())
      return false;
    // The shift can be combined if it matches the size of the value being
    // loaded (and so reducing the width would make it not match).
    uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
    uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
    if (ShiftAmount == Log2_32(LoadBytes))
      return false;
  }
  // We have no reason to disallow reducing the load width, so allow it.
  return true;
}
17671
// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
// NOTE(review): this definition's signature line was dropped by extraction;
// the body reads a single SDValue parameter named `Extend` — confirm the
// function name against upstream.
  EVT VT = Extend.getValueType();
  if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
    SDValue Extract = Extend.getOperand(0);
    // Look through a single-use any_extend of the extracted element.
    if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
      Extract = Extract.getOperand(0);
    if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
      EVT VecVT = Extract.getOperand(0).getValueType();
      // i8/i16 element extracts — presumably these can extract with
      // sign-extension directly (smov), making the extend redundant; confirm.
      if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
        return false;
    }
  }
  return true;
}
17687
// Truncations from 64-bit GPR to 32-bit GPR is free.
// NOTE(review): the signature line (presumably
// `bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {`)
// was dropped by extraction — confirm against upstream.
  // Only scalar integer truncations are free.
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
  uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
  // Any narrowing between scalar integers is free (sub-register use).
  return NumBits1 > NumBits2;
}
// NOTE(review): the signature line (presumably the EVT overload
// `bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {`)
// was dropped by extraction — confirm against upstream.
  // Only scalar integer truncations are free.
  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
    return false;
  uint64_t NumBits1 = VT1.getFixedSizeInBits();
  uint64_t NumBits2 = VT2.getFixedSizeInBits();
  // Any narrowing between scalar integers is free (sub-register use).
  return NumBits1 > NumBits2;
}
17703
/// Check if it is profitable to hoist instruction in then/else to if.
/// Not profitable if I and it's user can form a FMA instruction
/// because we prefer FMSUB/FMADD.
// NOTE(review): the signature line of this definition (taking the candidate
// `Instruction *I`) was dropped by extraction — confirm against upstream.
  // Anything other than fmul is always profitable to hoist.
  if (I->getOpcode() != Instruction::FMul)
    return true;

  // Only a single-use fmul can fold into an FMA with its user.
  if (!I->hasOneUse())
    return true;

  Instruction *User = I->user_back();

  // Only fmul feeding an fadd/fsub can become FMADD/FMSUB.
  if (!(User->getOpcode() == Instruction::FSub ||
        User->getOpcode() == Instruction::FAdd))
    return true;

  // NOTE(review): the declaration of `Options` (the TargetOptions consulted
  // below) was dropped by extraction here.
  const Function *F = I->getFunction();
  const DataLayout &DL = F->getDataLayout();
  Type *Ty = User->getOperand(0)->getType();

  // Hoisting is unprofitable only if an FMA would actually be formed: FMA
  // must be faster than separate fmul+fadd and contraction must be allowed.
  return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
           // NOTE(review): a conjunct was dropped by extraction here —
           // presumably the ISD::FMA legality check that uses DL.
           (Options.AllowFPOpFusion == FPOpFusion::Fast ||
            I->getFastMathFlags().allowContract()));
}
17730
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
// 64-bit GPR.
// NOTE(review): the signature line (presumably
// `bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {`)
// was dropped by extraction — confirm against upstream.
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  // Only the exact i32 -> i64 widening is free.
  return NumBits1 == 32 && NumBits2 == 64;
}
// NOTE(review): the signature line (presumably the EVT overload
// `bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {`)
// was dropped by extraction — confirm against upstream.
  // Only scalar integer widenings qualify.
  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  // Only the exact i32 -> i64 widening is free.
  return NumBits1 == 32 && NumBits2 == 64;
}
17747
// NOTE(review): the signature line (presumably
// `bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {`)
// was dropped by extraction — confirm against upstream.
  EVT VT1 = Val.getValueType();
  // Free if the plain type-pair check already says so (i32 -> i64).
  if (isZExtFree(VT1, VT2)) {
    return true;
  }

  if (Val.getOpcode() != ISD::LOAD)
    return false;

  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
  return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
          VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
          VT1.getSizeInBits() <= 32);
}
17762
17763bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
17764 if (isa<FPExtInst>(Ext))
17765 return false;
17766
17767 // Vector types are not free.
17768 if (Ext->getType()->isVectorTy())
17769 return false;
17770
17771 for (const Use &U : Ext->uses()) {
17772 // The extension is free if we can fold it with a left shift in an
17773 // addressing mode or an arithmetic operation: add, sub, and cmp.
17774
17775 // Is there a shift?
17776 const Instruction *Instr = cast<Instruction>(U.getUser());
17777
17778 // Is this a constant shift?
17779 switch (Instr->getOpcode()) {
17780 case Instruction::Shl:
17781 if (!isa<ConstantInt>(Instr->getOperand(1)))
17782 return false;
17783 break;
17784 case Instruction::GetElementPtr: {
17785 gep_type_iterator GTI = gep_type_begin(Instr);
17786 auto &DL = Ext->getDataLayout();
17787 std::advance(GTI, U.getOperandNo()-1);
17788 Type *IdxTy = GTI.getIndexedType();
17789 // This extension will end up with a shift because of the scaling factor.
17790 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
17791 // Get the shift amount based on the scaling factor:
17792 // log2(sizeof(IdxTy)) - log2(8).
17793 if (IdxTy->isScalableTy())
17794 return false;
17795 uint64_t ShiftAmt =
17796 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
17797 3;
17798 // Is the constant foldable in the shift of the addressing mode?
17799 // I.e., shift amount is between 1 and 4 inclusive.
17800 if (ShiftAmt == 0 || ShiftAmt > 4)
17801 return false;
17802 break;
17803 }
17804 case Instruction::Trunc:
17805 // Check if this is a noop.
17806 // trunc(sext ty1 to ty2) to ty1.
17807 if (Instr->getType() == Ext->getOperand(0)->getType())
17808 continue;
17809 [[fallthrough]];
17810 default:
17811 return false;
17812 }
17813
17814 // At this point we can use the bfm family, so this extension is free
17815 // for that use.
17816 }
17817 return true;
17818}
17819
17820static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
17821 unsigned NumElts, bool IsLittleEndian,
17822 SmallVectorImpl<int> &Mask) {
17823 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
17824 return false;
17825
17826 assert(DstWidth % SrcWidth == 0 &&
17827 "TBL lowering is not supported for a conversion instruction with this "
17828 "source and destination element type.");
17829
17830 unsigned Factor = DstWidth / SrcWidth;
17831 unsigned MaskLen = NumElts * Factor;
17832
17833 Mask.clear();
17834 Mask.resize(MaskLen, NumElts);
17835
17836 unsigned SrcIndex = 0;
17837 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
17838 Mask[I] = SrcIndex++;
17839
17840 return true;
17841}
17842
// NOTE(review): the first line of this helper (presumably
// `static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,`)
// was dropped by extraction — confirm against upstream.
                                     FixedVectorType *ZExtTy,
                                     FixedVectorType *DstTy,
                                     bool IsLittleEndian) {
  auto *SrcTy = cast<FixedVectorType>(Op->getType());
  unsigned NumElts = SrcTy->getNumElements();
  auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
  auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();

  // Build a mask scattering the narrow source elements across the wider
  // lanes; bail out if this width combination is unsupported.
  SmallVector<int> Mask;
  if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
    return nullptr;

  // Shuffle against a vector whose element 0 is zero so the padding lanes
  // (mask index NumElts) read zero bits.
  auto *FirstEltZero = Builder.CreateInsertElement(
      PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
  Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
  Result = Builder.CreateBitCast(Result, DstTy);
  // If the requested zext type is wider than what the shuffle produced,
  // finish with an ordinary zext.
  if (DstTy != ZExtTy)
    Result = Builder.CreateZExt(Result, ZExtTy);
  return Result;
}
17864
// NOTE(review): the first line of this helper (presumably
// `static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,`)
// was dropped by extraction — confirm against upstream.
                                     FixedVectorType *DstTy,
                                     bool IsLittleEndian) {
  auto *SrcTy = cast<FixedVectorType>(Op->getType());
  auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
  auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();

  // Note the inverted endianness: the source lanes are placed in the HIGH
  // sub-lane so the caller can arithmetic-shift-right to sign extend.
  SmallVector<int> Mask;
  if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
                            !IsLittleEndian, Mask))
    return nullptr;

  // Shuffle against a vector whose element 0 is zero so the padding lanes
  // read zero bits.
  auto *FirstEltZero = Builder.CreateInsertElement(
      PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));

  return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
}
17882
// Lower a fixed-vector truncate-to-i8 into NEON TBL instructions that select
// the relevant byte of each source lane (lowest byte for little-endian,
// highest for big-endian). Replaces and erases TI.
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
  IRBuilder<> Builder(TI);
  // NOTE(review): a declaration line appears to have been dropped by
  // extraction here — restore from upstream.
  int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
  auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
  auto *DstTy = cast<FixedVectorType>(TI->getType());
  assert(SrcTy->getElementType()->isIntegerTy() &&
         "Non-integer type source vector element is not supported");
  assert(DstTy->getElementType()->isIntegerTy(8) &&
         "Unsupported destination vector element type");
  unsigned SrcElemTySz =
      cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
  unsigned DstElemTySz =
      cast<IntegerType>(DstTy->getElementType())->getBitWidth();
  assert((SrcElemTySz % DstElemTySz == 0) &&
         "Cannot lower truncate to tbl instructions for a source element size "
         "that is not divisible by the destination element size");
  unsigned TruncFactor = SrcElemTySz / DstElemTySz;
  assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
         "Unsupported source vector element type size");
  // TBL always operates on 16 x i8 registers.
  Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);

  // Create a mask to choose every nth byte from the source vector table of
  // bytes to create the truncated destination vector, where 'n' is the truncate
  // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
  // 0,8,16,..Y*8th bytes for the little-endian format
  // NOTE(review): the declaration of `MaskConst` appears to have been dropped
  // by extraction here — restore from upstream.
  for (int Itr = 0; Itr < 16; Itr++) {
    if (Itr < NumElements)
      MaskConst.push_back(Builder.getInt8(
          IsLittleEndian ? Itr * TruncFactor
                         : Itr * TruncFactor + (TruncFactor - 1)));
    else
      // Out-of-range TBL indices produce zero; 255 marks unused lanes.
      MaskConst.push_back(Builder.getInt8(255));
  }

  // TBL can consume at most 4 x 128-bit table registers.
  int MaxTblSz = 128 * 4;
  int MaxSrcSz = SrcElemTySz * NumElements;
  int ElemsPerTbl =
      (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
  assert(ElemsPerTbl <= 16 &&
         "Maximum elements selected using TBL instruction cannot exceed 16!");

  // Number of source lanes that fit in one 128-bit register.
  int ShuffleCount = 128 / SrcElemTySz;
  SmallVector<int> ShuffleLanes;
  for (int i = 0; i < ShuffleCount; ++i)
    ShuffleLanes.push_back(i);

  // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
  // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
  // call TBL & save the result in a vector of TBL results for combining later.
  // NOTE(review): the declarations of `Parts` and `Results` appear to have
  // been dropped by extraction here — restore from upstream.
  while (ShuffleLanes.back() < NumElements) {
    Parts.push_back(Builder.CreateBitCast(
        Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));

    if (Parts.size() == 4) {
      // Four table registers gathered: issue a tbl4 and start over.
      Parts.push_back(ConstantVector::get(MaskConst));
      Results.push_back(
          Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
      Parts.clear();
    }

    // Advance to the next 128-bit slice of the source.
    for (int i = 0; i < ShuffleCount; ++i)
      ShuffleLanes[i] += ShuffleCount;
  }

  assert((Parts.empty() || Results.empty()) &&
         "Lowering trunc for vectors requiring different TBL instructions is "
         "not supported!");
  // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
  // registers
  if (!Parts.empty()) {
    Intrinsic::ID TblID;
    switch (Parts.size()) {
    case 1:
      TblID = Intrinsic::aarch64_neon_tbl1;
      break;
    case 2:
      TblID = Intrinsic::aarch64_neon_tbl2;
      break;
    case 3:
      TblID = Intrinsic::aarch64_neon_tbl3;
      break;
    }

    Parts.push_back(ConstantVector::get(MaskConst));
    Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
  }

  // Extract the destination vector from TBL result(s) after combining them
  // where applicable. Currently, at most two TBLs are supported.
  assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
                                "more than 2 tbl instructions!");
  Value *FinalResult = Results[0];
  if (Results.size() == 1) {
    // Single TBL: narrow its 16-byte result down to the used lanes if needed.
    if (ElemsPerTbl < 16) {
      SmallVector<int> FinalMask(ElemsPerTbl);
      std::iota(FinalMask.begin(), FinalMask.end(), 0);
      FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
    }
  } else {
    // Two TBLs: concatenate the used lanes of both results.
    SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
    if (ElemsPerTbl < 16) {
      std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
      std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
    } else {
      std::iota(FinalMask.begin(), FinalMask.end(), 0);
    }
    FinalResult =
        Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
  }

  TI->replaceAllUsesWith(FinalResult);
  TI->eraseFromParent();
}
17999
// NOTE(review): the first line of this definition (the qualified function
// name — presumably optimizeExtendOrTruncateConversion — and its first
// parameter) was dropped by extraction; confirm against upstream.
    Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
  // shuffle_vector instructions are serialized when targeting SVE,
  // see LowerSPLAT_VECTOR. This peephole is not beneficial.
  if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
    return false;

  // Try to optimize conversions using tbl. This requires materializing constant
  // index vectors, which can increase code size and add loads. Skip the
  // transform unless the conversion is in a loop block guaranteed to execute
  // and we are not optimizing for size.
  Function *F = I->getParent()->getParent();
  if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
    return false;

  // Only fixed-width vector-to-vector conversions are handled.
  auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
  auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
  if (!SrcTy || !DstTy)
    return false;

  // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
  // lowered to tbl instructions to insert the original i8 elements
  // into i8x lanes. This is enabled for cases where it is beneficial.
  auto *ZExt = dyn_cast<ZExtInst>(I);
  if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
    auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
    if (DstWidth % 8 != 0)
      return false;

    auto *TruncDstType =
    // NOTE(review): the initializer of `TruncDstType` was dropped by
    // extraction here — restore from upstream.
    // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
    // the remaining ZExt folded into the user, don't use tbl lowering.
    auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
    if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
    // NOTE(review): the remaining arguments of getCastInstrCost and the
    // enclosing comparison were dropped by extraction here.
      if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
        return false;

      DstTy = TruncDstType;
    }

    // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
    // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
    // most one extra extend step is needed and using tbl is not profitable.
    // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
    // udot instruction.
    if (SrcWidth * 4 <= DstWidth) {
      if (all_of(I->users(), [&](auto *U) {
            using namespace llvm::PatternMatch;
            auto *SingleUser = cast<Instruction>(&*U);
            if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
              return true;
            if (match(SingleUser,
                      m_Intrinsic<Intrinsic::vector_partial_reduce_add>(
                          m_Value(), m_Specific(I))))
              return true;
            return false;
          }))
        return false;
    }

    if (DstTy->getScalarSizeInBits() >= 64)
      return false;

    IRBuilder<> Builder(ZExt);
    // NOTE(review): the `Value *Result = createTblShuffleForZExt(` line was
    // dropped by extraction here.
        Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
        DstTy, Subtarget->isLittleEndian());
    if (!Result)
      return false;
    ZExt->replaceAllUsesWith(Result);
    ZExt->eraseFromParent();
    return true;
  }

  // uitofp of i8->float / i16->double: zext via tbl, then convert.
  auto *UIToFP = dyn_cast<UIToFPInst>(I);
  if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
                  DstTy->getElementType()->isFloatTy()) ||
                 (SrcTy->getElementType()->isIntegerTy(16) &&
                  DstTy->getElementType()->isDoubleTy()))) {
    IRBuilder<> Builder(I);
    // NOTE(review): the line starting the createTblShuffleForZExt call was
    // dropped by extraction here.
        Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
        FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
    assert(ZExt && "Cannot fail for the i8 to float conversion");
    auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
    I->replaceAllUsesWith(UI);
    I->eraseFromParent();
    return true;
  }

  // sitofp of i8->float: place the byte in the high lane via tbl, then
  // arithmetic-shift-right by 24 to sign-extend before converting.
  auto *SIToFP = dyn_cast<SIToFPInst>(I);
  if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
      DstTy->getElementType()->isFloatTy()) {
    IRBuilder<> Builder(I);
    auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
    // NOTE(review): a continuation line of this call (its destination-type
    // argument) was dropped by extraction here.
                                            Subtarget->isLittleEndian());
    assert(Shuffle && "Cannot fail for the i8 to float conversion");
    auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
    auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
    auto *SI = Builder.CreateSIToFP(AShr, DstTy);
    I->replaceAllUsesWith(SI);
    I->eraseFromParent();
    return true;
  }

  // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
  // followed by a truncate lowered to using tbl.4.
  auto *FPToUI = dyn_cast<FPToUIInst>(I);
  if (FPToUI &&
      (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
      SrcTy->getElementType()->isFloatTy() &&
      DstTy->getElementType()->isIntegerTy(8)) {
    IRBuilder<> Builder(I);
    auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
                                          VectorType::getInteger(SrcTy));
    auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
    I->replaceAllUsesWith(TruncI);
    I->eraseFromParent();
    createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
    return true;
  }

  // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
  // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
  // per lane of the input that is represented using 1,2,3 or 4 128-bit table
  // registers
  auto *TI = dyn_cast<TruncInst>(I);
  if (TI && DstTy->getElementType()->isIntegerTy(8) &&
      ((SrcTy->getElementType()->isIntegerTy(32) ||
        SrcTy->getElementType()->isIntegerTy(64)) &&
       (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
    createTblForTrunc(TI, Subtarget->isLittleEndian());
    return true;
  }

  return false;
}
18141
// NOTE(review): the line carrying this function's name (presumably
// AArch64TargetLowering::hasPairedLoad(EVT LoadedType, ...)) is missing from
// this extraction; the body below starts mid-signature.
// Reports whether a paired load is profitable for the given type: only simple
// integer/FP types of exactly 32 or 64 bits qualify, with no alignment
// requirement (Align(1)).
18143                                        Align &RequiredAlignment) const {
18144   if (!LoadedType.isSimple() ||
18145       (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
18146     return false;
18147   // Cyclone supports unaligned accesses.
18148   RequiredAlignment = Align(1);
18149   unsigned NumBits = LoadedType.getSizeInBits();
18150   return NumBits == 32 || NumBits == 64;
18151 }
18152
18153 /// A helper function for determining the number of interleaved accesses we
18154 /// will generate when lowering accesses of the given type.
// NOTE(review): the line naming this function (presumably
// AArch64TargetLowering::getNumInterleavedAccesses) is missing from this
// extraction; the body below starts mid-signature.
18156     VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
18157   unsigned VecSize = 128;
18158   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
18159   unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
  // When lowering a fixed-length vector with SVE, size each access by the
  // guaranteed minimum SVE register width (never below 128 bits).
18160   if (UseScalable && isa<FixedVectorType>(VecTy))
18161     VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
  // Ceiling division of the total bit size by the access size; at least one
  // access is always generated.
18162   return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
18163 }
18164
// NOTE(review): the signature line of this function (presumably
// AArch64TargetLowering::getTargetMMOFlags(const Instruction &I)) is missing
// from this extraction.
// Tags Falkor strided-access loads with the MOStridedAccess MMO flag so later
// passes can recognize them.
18167   if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
18168       I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
18169     return MOStridedAccess;
// NOTE(review): the default return line (presumably returning
// MachineMemOperand::MONone) is missing from this extraction.
18171 }
18172
// NOTE(review): the line naming this function (presumably
// AArch64TargetLowering::isLegalInterleavedAccessType) is missing from this
// extraction; the body below starts mid-signature. Decides whether an
// interleaved access of \p VecTy can be lowered to ldN/stN, and sets
// \p UseScalable when SVE forms should be used.
18174     VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
18175   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
18176   auto EC = VecTy->getElementCount();
18177   unsigned MinElts = EC.getKnownMinValue();
18178
18179   UseScalable = false;
18180
  // Fixed-length vectors need NEON, or SVE configured for fixed-length use.
18181   if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
18182       (!Subtarget->useSVEForFixedLengthVectors() ||
// NOTE(review): the remainder of this condition (one line) is missing from
// this extraction.
18184     return false;
18185
18186   if (isa<ScalableVectorType>(VecTy) &&
18187       !Subtarget->isSVEorStreamingSVEAvailable())
18188     return false;
18189
18190   // Ensure the number of vector elements is greater than 1.
18191   if (MinElts < 2)
18192     return false;
18193
18194   // Ensure the element type is legal.
18195   if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
18196     return false;
18197
  // Scalable types: require a power-of-two element count filling a multiple
  // of 128 bits.
18198   if (EC.isScalable()) {
18199     UseScalable = true;
18200     return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
18201   }
18202
18203   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
18204   if (Subtarget->useSVEForFixedLengthVectors()) {
18205     unsigned MinSVEVectorSize =
18206         std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
18207     if (VecSize % MinSVEVectorSize == 0 ||
18208         (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
18209          (!Subtarget->isNeonAvailable() || VecSize > 128))) {
18210       UseScalable = true;
18211       return true;
18212     }
18213   }
18214
18215   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
18216   // 128 will be split into multiple interleaved accesses.
18217   return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
18218 }
18219
// NOTE(review): the signature line of this static helper (presumably
// getSVEContainerIRType(FixedVectorType *VTy)) is missing from this
// extraction. It maps a fixed vector's element type to the scalable vector
// type filling one 128-bit SVE container (e.g. f64 -> <vscale x 2 x double>,
// i8 -> <vscale x 16 x i8>).
18221   if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
18222     return ScalableVectorType::get(VTy->getElementType(), 2);
18223
18224   if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
18225     return ScalableVectorType::get(VTy->getElementType(), 4);
18226
18227   if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
18228     return ScalableVectorType::get(VTy->getElementType(), 8);
18229
18230   if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
18231     return ScalableVectorType::get(VTy->getElementType(), 8);
18232
18233   if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
18234     return ScalableVectorType::get(VTy->getElementType(), 2);
18235
18236   if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
18237     return ScalableVectorType::get(VTy->getElementType(), 4);
18238
18239   if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
18240     return ScalableVectorType::get(VTy->getElementType(), 8);
18241
18242   if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
18243     return ScalableVectorType::get(VTy->getElementType(), 16);
18244
18245   llvm_unreachable("Cannot handle input vector type");
18246 }
18247
18248static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
18249 bool Scalable, Type *LDVTy,
18250 Type *PtrTy) {
18251 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
18252 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
18253 Intrinsic::aarch64_sve_ld3_sret,
18254 Intrinsic::aarch64_sve_ld4_sret};
18255 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
18256 Intrinsic::aarch64_neon_ld3,
18257 Intrinsic::aarch64_neon_ld4};
18258 if (Scalable)
18259 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
18260
18261 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
18262 {LDVTy, PtrTy});
18263}
18264
18265static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
18266 bool Scalable, Type *STVTy,
18267 Type *PtrTy) {
18268 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
18269 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
18270 Intrinsic::aarch64_sve_st3,
18271 Intrinsic::aarch64_sve_st4};
18272 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
18273 Intrinsic::aarch64_neon_st3,
18274 Intrinsic::aarch64_neon_st4};
18275 if (Scalable)
18276 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
18277
18278 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
18279 {STVTy, PtrTy});
18280}
18281
18282 /// Lower an interleaved load into a ldN intrinsic.
18283 ///
18284 /// E.g. Lower an interleaved load (Factor = 2):
18285 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
18286 ///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
18287 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
18288 ///
18289 ///      Into:
18290 ///        %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
18291 ///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
18292 ///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
// NOTE(review): the line naming this function (presumably
// AArch64TargetLowering::lowerInterleavedLoad) is missing from this
// extraction; the body below starts mid-signature.
18294     Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
18295     ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
18296   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18297          "Invalid interleave factor");
18298   assert(!Shuffles.empty() && "Empty shufflevector input");
18299   assert(Shuffles.size() == Indices.size() &&
18300          "Unmatched number of shufflevectors and indices");
18301
  // Only plain (unmasked) loads are handled here.
18302   auto *LI = dyn_cast<LoadInst>(Load);
18303   if (!LI)
18304     return false;
18305   assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
18306
18307   const DataLayout &DL = LI->getDataLayout();
18308
18309   VectorType *VTy = Shuffles[0]->getType();
18310
18311   // Skip if we do not have NEON and skip illegal vector types. We can
18312   // "legalize" wide vector types into multiple interleaved accesses as long as
18313   // the vector types are divisible by 128.
18314   bool UseScalable;
18315   if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18316     return false;
18317
18318   // Check if the interleave is a zext(shuffle), that can be better optimized
18319   // into shift / and masks. For the moment we do this just for uitofp (not
18320   // zext) to avoid issues with widening instructions.
18321   if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
18322         using namespace llvm::PatternMatch;
18323         return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
18324                SI->getType()->getScalarSizeInBits() * 4 ==
18325                    SI->user_back()->getType()->getScalarSizeInBits();
18326       }))
18327     return false;
18328
18329   unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18330
18331   auto *FVTy = cast<FixedVectorType>(VTy);
18332
18333   // A pointer vector can not be the return type of the ldN intrinsics. Need to
18334   // load integer vectors first and then convert to pointer vectors.
18335   Type *EltTy = FVTy->getElementType();
18336   if (EltTy->isPointerTy())
18337     FVTy =
18338         FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
18339
18340   // If we're going to generate more than one load, reset the sub-vector type
18341   // to something legal.
18342   FVTy = FixedVectorType::get(FVTy->getElementType(),
18343                               FVTy->getNumElements() / NumLoads);
18344
18345   auto *LDVTy =
18346       UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
18347
18348   IRBuilder<> Builder(LI);
18349
18350   // The base address of the load.
18351   Value *BaseAddr = LI->getPointerOperand();
18352
18353   Type *PtrTy = LI->getPointerOperandType();
18354   Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
18355                                  LDVTy->getElementCount());
18356
18357   Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18358                                                 UseScalable, LDVTy, PtrTy);
18359
18360   // Holds sub-vectors extracted from the load intrinsic return values. The
18361   // sub-vectors are associated with the shufflevector instructions they will
18362   // replace.
// NOTE(review): the declaration of the SubVecs map (one line, used below) is
// missing from this extraction.
18364
  // For SVE forms, build a ptrue predicate covering the fixed element count
  // (or 'all' when the exact register size is known to match).
18365   Value *PTrue = nullptr;
18366   if (UseScalable) {
18367     std::optional<unsigned> PgPattern =
18368         getSVEPredPatternFromNumElements(FVTy->getNumElements());
18369     if (Subtarget->getMinSVEVectorSizeInBits() ==
18370             Subtarget->getMaxSVEVectorSizeInBits() &&
18371         Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
18372       PgPattern = AArch64SVEPredPattern::all;
18373
18374     auto *PTruePat =
18375         ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
18376     PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18377                                     {PTruePat});
18378   }
18379
18380   for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
18381
18382     // If we're generating more than one load, compute the base address of
18383     // subsequent loads as an offset from the previous.
18384     if (LoadCount > 0)
18385       BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
18386                                             FVTy->getNumElements() * Factor);
18387
18388     CallInst *LdN;
18389     if (UseScalable)
18390       LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
18391     else
18392       LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18393
18394     // Extract and store the sub-vectors returned by the load intrinsic.
18395     for (unsigned i = 0; i < Shuffles.size(); i++) {
18396       ShuffleVectorInst *SVI = Shuffles[i];
18397       unsigned Index = Indices[i];
18398
18399       Value *SubVec = Builder.CreateExtractValue(LdN, Index);
18400
18401       if (UseScalable)
18402         SubVec = Builder.CreateExtractVector(FVTy, SubVec, uint64_t(0));
18403
18404       // Convert the integer vector to pointer vector if the element is pointer.
18405       if (EltTy->isPointerTy())
18406         SubVec = Builder.CreateIntToPtr(
// NOTE(review): the first argument line of this CreateIntToPtr call (one
// line) is missing from this extraction.
18408                          FVTy->getNumElements()));
18409
18410       SubVecs[SVI].push_back(SubVec);
18411     }
18412   }
18413
18414   // Replace uses of the shufflevector instructions with the sub-vectors
18415   // returned by the load intrinsic. If a shufflevector instruction is
18416   // associated with more than one sub-vector, those sub-vectors will be
18417   // concatenated into a single wide vector.
18418   for (ShuffleVectorInst *SVI : Shuffles) {
18419     auto &SubVec = SubVecs[SVI];
18420     auto *WideVec =
18421         SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
18422     SVI->replaceAllUsesWith(WideVec);
18423   }
18424
18425   return true;
18426 }
18427
// Scan forward from \p It (up to ~20 non-debug instructions) for a StoreInst
// whose pointer resolves to the same base as \p Ptr at a constant offset of
// exactly +/-16 bytes, i.e. a store that could pair (stp) with a store at
// \p Ptr. Used to decide when an st2 is better left as zip+stp.
18428 template <typename Iter>
18429 bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
18430   int MaxLookupDist = 20;
18431   unsigned IdxWidth = DL.getIndexSizeInBits(0);
18432   APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
18433   const Value *PtrA1 =
// NOTE(review): the initializer of PtrA1 (presumably
// Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA)) is missing
// from this extraction.
18435
18436   while (++It != End) {
18437     if (It->isDebugOrPseudoInst())
18438       continue;
18439     if (MaxLookupDist-- == 0)
18440       break;
18441     if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
18442       const Value *PtrB1 =
18443           SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
18444               DL, OffsetB);
      // Same base and the offsets differ by exactly 16 bytes (one q-register).
18445       if (PtrA1 == PtrB1 &&
18446           (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
18447                   .abs() == 16)
18448         return true;
18449     }
18450   }
18451
18452   return false;
18453 }
18454
18455 /// Lower an interleaved store into a stN intrinsic.
18456 ///
18457 /// E.g. Lower an interleaved store (Factor = 3):
18458 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
18459 ///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
18460 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
18461 ///
18462 ///      Into:
18463 ///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
18464 ///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
18465 ///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
18466 ///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
18467 ///
18468 /// Note that the new shufflevectors will be removed and we'll only generate one
18469 /// st3 instruction in CodeGen.
18470 ///
18471 /// Example for a more general valid mask (Factor 3). Lower:
18472 ///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
18473 ///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
18474 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
18475 ///
18476 ///      Into:
18477 ///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
18478 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
18479 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
18480 ///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
// NOTE(review): the line naming this function (presumably
// AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, ...)) is
// missing from this extraction; the body below starts mid-signature.
18482                                                   Value *LaneMask,
18483                                                   ShuffleVectorInst *SVI,
18484                                                   unsigned Factor,
18485                                                   const APInt &GapMask) const {
18486
18487   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18488          "Invalid interleave factor");
  // Only plain (unmasked) stores are handled here.
18489   auto *SI = dyn_cast<StoreInst>(Store);
18490   if (!SI)
18491     return false;
18492   assert(!LaneMask && GapMask.popcount() == Factor &&
18493          "Unexpected mask on store");
18494
18495   auto *VecTy = cast<FixedVectorType>(SVI->getType());
18496   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
18497
18498   unsigned LaneLen = VecTy->getNumElements() / Factor;
18499   Type *EltTy = VecTy->getElementType();
18500   auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
18501
18502   const DataLayout &DL = SI->getDataLayout();
18503   bool UseScalable;
18504
18505   // Skip if we do not have NEON and skip illegal vector types. We can
18506   // "legalize" wide vector types into multiple interleaved accesses as long as
18507   // the vector types are divisible by 128.
18508   if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
18509     return false;
18510
18511   unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
18512
18513   Value *Op0 = SVI->getOperand(0);
18514   Value *Op1 = SVI->getOperand(1);
18515   IRBuilder<> Builder(SI);
18516
18517   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
18518   // vectors to integer vectors.
18519   if (EltTy->isPointerTy()) {
18520     Type *IntTy = DL.getIntPtrType(EltTy);
18521     unsigned NumOpElts =
18522         cast<FixedVectorType>(Op0->getType())->getNumElements();
18523
18524     // Convert to the corresponding integer vector.
18525     auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
18526     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
18527     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
18528
18529     SubVecTy = FixedVectorType::get(IntTy, LaneLen);
18530   }
18531
18532   // If we're going to generate more than one store, reset the lane length
18533   // and sub-vector type to something legal.
18534   LaneLen /= NumStores;
18535   SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
18536
18537   auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
18538                             : SubVecTy;
18539
18540   // The base address of the store.
18541   Value *BaseAddr = SI->getPointerOperand();
18542
18543   auto Mask = SVI->getShuffleMask();
18544
18545   // Sanity check if all the indices are NOT in range.
18546   // If mask is `poison`, `Mask` may be a vector of -1s.
18547   // If all of them are `poison`, OOB read will happen later.
18548   if (llvm::all_of(Mask, equal_to(PoisonMaskElem))) {
18549     return false;
18550   }
18551   // A 64bit st2 which does not start at element 0 will involved adding extra
18552   // ext elements making the st2 unprofitable, and if there is a nearby store
18553   // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
18554   // zip;ldp pair which has higher throughput.
18555   if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
18556       (Mask[0] != 0 ||
18557        hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
18558                             DL) ||
18559        hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
18560                             BaseAddr, DL)))
18561     return false;
18562
18563   Type *PtrTy = SI->getPointerOperandType();
18564   Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
18565                                  STVTy->getElementCount());
18566
18567   Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18568                                                  UseScalable, STVTy, PtrTy);
18569
  // For SVE forms, build a ptrue predicate covering the fixed element count
  // (or 'all' when the exact register size is known to match).
18570   Value *PTrue = nullptr;
18571   if (UseScalable) {
18572     std::optional<unsigned> PgPattern =
18573         getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
18574     if (Subtarget->getMinSVEVectorSizeInBits() ==
18575             Subtarget->getMaxSVEVectorSizeInBits() &&
18576         Subtarget->getMinSVEVectorSizeInBits() ==
18577             DL.getTypeSizeInBits(SubVecTy))
18578       PgPattern = AArch64SVEPredPattern::all;
18579
18580     auto *PTruePat =
18581         ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
18582     PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18583                                     {PTruePat});
18584   }
18585
18586   for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
18587
// NOTE(review): the declaration of the Ops operand vector (one line, used
// below) is missing from this extraction.
18589
18590     // Split the shufflevector operands into sub vectors for the new stN call.
18591     for (unsigned i = 0; i < Factor; i++) {
18592       Value *Shuffle;
18593       unsigned IdxI = StoreCount * LaneLen * Factor + i;
18594       if (Mask[IdxI] >= 0) {
18595         Shuffle = Builder.CreateShuffleVector(
18596             Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
18597       } else {
18598         unsigned StartMask = 0;
18599         for (unsigned j = 1; j < LaneLen; j++) {
18600           unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
18601           if (Mask[IdxJ] >= 0) {
18602             StartMask = Mask[IdxJ] - j;
18603             break;
18604           }
18605         }
18606         // Note: Filling undef gaps with random elements is ok, since
18607         // those elements were being written anyway (with undefs).
18608         // In the case of all undefs we're defaulting to using elems from 0
18609         // Note: StartMask cannot be negative, it's checked in
18610         // isReInterleaveMask
18611         Shuffle = Builder.CreateShuffleVector(
18612             Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
18613       }
18614
18615       if (UseScalable)
18616         Shuffle = Builder.CreateInsertVector(STVTy, PoisonValue::get(STVTy),
18617                                              Shuffle, uint64_t(0));
18618
18619       Ops.push_back(Shuffle);
18620     }
18621
18622     if (UseScalable)
18623       Ops.push_back(PTrue);
18624
18625     // If we generating more than one store, we compute the base address of
18626     // subsequent stores as an offset from the previous.
18627     if (StoreCount > 0)
18628       BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
18629                                             BaseAddr, LaneLen * Factor);
18630
18631     Ops.push_back(BaseAddr);
18632     Builder.CreateCall(StNFunc, Ops);
18633   }
18634   return true;
18635 }
18636
// NOTE(review): the line naming this function (presumably
// AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad) is missing from
// this extraction; the body below starts mid-signature. Lowers a load feeding
// a vector.deinterleaveN intrinsic into ldN intrinsic calls.
18638     Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
18639   const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
18640   if (Factor != 2 && Factor != 3 && Factor != 4) {
18641     LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
18642     return false;
18643   }
18644   auto *LI = dyn_cast<LoadInst>(Load);
18645   if (!LI)
18646     return false;
18647   assert(!Mask && "Unexpected mask on a load\n");
18648
// NOTE(review): the definition of VTy (the deinterleaved vector type, one
// line) is missing from this extraction.
18650
18651   const DataLayout &DL = LI->getModule()->getDataLayout();
18652   bool UseScalable;
18653   if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18654     return false;
18655
18656   // TODO: Add support for using SVE instructions with fixed types later, using
18657   // the code from lowerInterleavedLoad to obtain the correct container type.
18658   if (UseScalable && !VTy->isScalableTy())
18659     return false;
18660
18661   unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
  // LdTy is VTy shrunk to the per-access element count.
18662   VectorType *LdTy =
// NOTE(review): the first argument line of this VectorType::get call is
// missing from this extraction.
18664                       VTy->getElementCount().divideCoefficientBy(NumLoads));
18665
18666   Type *PtrTy = LI->getPointerOperandType();
18667   Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18668                                                 UseScalable, LdTy, PtrTy);
18669
18670   IRBuilder<> Builder(LI);
18671   Value *Pred = nullptr;
18672   if (UseScalable)
18673     Pred =
18674         Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
18675
18676   Value *BaseAddr = LI->getPointerOperand();
18677   Value *Result = nullptr;
18678   if (NumLoads > 1) {
18679     // Create multiple legal small ldN.
18680     SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
18681     for (unsigned I = 0; I < NumLoads; ++I) {
18682       Value *Offset = Builder.getInt64(I * Factor);
18683
18684       Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
18685       Value *LdN = nullptr;
18686       if (UseScalable)
18687         LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
18688       else
18689         LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
18690       Value *Idx =
18691           Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
      // Accumulate each factor's partial result into the wide per-factor
      // vectors.
18692       for (unsigned J = 0; J < Factor; ++J) {
18693         ExtractedLdValues[J] = Builder.CreateInsertVector(
18694             VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
18695       }
18696       LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
18697     }
18698
18699     // Merge the values from different factors.
18700     Result = PoisonValue::get(DI->getType());
18701     for (unsigned J = 0; J < Factor; ++J)
18702       Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
18703   } else {
18704     if (UseScalable)
18705       Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
18706     else
18707       Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18708   }
18709
18710   // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
18711   DI->replaceAllUsesWith(Result);
18712   return true;
18713 }
18714
// NOTE(review): the line naming this function (presumably
// AArch64TargetLowering::lowerInterleaveIntrinsicToStore) is missing from
// this extraction; the body below starts mid-signature. Lowers a store of a
// vector.interleaveN result into stN intrinsic calls.
18716     Instruction *Store, Value *Mask,
18717     ArrayRef<Value *> InterleavedValues) const {
18718   unsigned Factor = InterleavedValues.size();
18719   if (Factor != 2 && Factor != 3 && Factor != 4) {
18720     LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n");
18721     return false;
18722   }
// NOTE(review): the definition of SI (presumably
// auto *SI = dyn_cast<StoreInst>(Store)) is missing from this extraction.
18724   if (!SI)
18725     return false;
18726   assert(!Mask && "Unexpected mask on plain store");
18727
18728   VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
18729   const DataLayout &DL = SI->getModule()->getDataLayout();
18730
18731   bool UseScalable;
18732   if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18733     return false;
18734
18735   // TODO: Add support for using SVE instructions with fixed types later, using
18736   // the code from lowerInterleavedStore to obtain the correct container type.
18737   if (UseScalable && !VTy->isScalableTy())
18738     return false;
18739
18740   unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
18741
  // StTy is VTy shrunk to the per-access element count.
18742   VectorType *StTy =
// NOTE(review): the first argument line of this VectorType::get call is
// missing from this extraction.
18744                       VTy->getElementCount().divideCoefficientBy(NumStores));
18745
18746   Type *PtrTy = SI->getPointerOperandType();
18747   Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18748                                                  UseScalable, StTy, PtrTy);
18749
18750   IRBuilder<> Builder(SI);
18751
18752   Value *BaseAddr = SI->getPointerOperand();
18753   Value *Pred = nullptr;
18754
18755   if (UseScalable)
18756     Pred =
18757         Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
18758
18759   auto ExtractedValues = InterleavedValues;
  // Operand layout: per-factor values [, predicate], then the address.
18760   SmallVector<Value *, 4> StoreOperands(InterleavedValues);
18761   if (UseScalable)
18762     StoreOperands.push_back(Pred);
18763   StoreOperands.push_back(BaseAddr);
18764   for (unsigned I = 0; I < NumStores; ++I) {
18765     Value *Address = BaseAddr;
18766     if (NumStores > 1) {
18767       Value *Offset = Builder.getInt64(I * Factor);
18768       Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
18769       Value *Idx =
18770           Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
      // Slice the per-factor wide vectors down to this access's sub-vector.
18771       for (unsigned J = 0; J < Factor; J++) {
18772         StoreOperands[J] =
18773             Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
18774       }
18775       // update the address
18776       StoreOperands[StoreOperands.size() - 1] = Address;
18777     }
18778     Builder.CreateCall(StNFunc, StoreOperands);
18779   }
18780   return true;
18781 }
18782
// NOTE(review): the line naming this function (presumably
// AArch64TargetLowering::getOptimalMemOpType) is missing from this
// extraction; the body below starts mid-signature. Picks the widest legal
// type for expanding a memcpy/memset inline, honoring noimplicitfloat.
18784     LLVMContext &Context, const MemOp &Op,
18785     const AttributeList &FuncAttributes) const {
18786   bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18787   bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18788   bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18789   // Only use AdvSIMD to implement memset of 32-byte and above. It would have
18790   // taken one instruction to materialize the v2i64 zero and one store (with
18791   // restrictive addressing mode). Just do i64 stores.
18792   bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
  // A candidate type is usable if the op is suitably aligned, or the target
  // allows fast misaligned accesses of that type.
18793   auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18794     if (Op.isAligned(AlignCheck))
18795       return true;
18796     unsigned Fast;
18797     return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
// NOTE(review): the remaining arguments of this call (one line) are missing
// from this extraction.
18799            Fast;
18800   };
18801
18802   if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18803       AlignmentIsAcceptable(MVT::v16i8, Align(16)))
18804     return MVT::v16i8;
18805   if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18806     return MVT::f128;
18807   if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18808     return MVT::i64;
18809   if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18810     return MVT::i32;
18811   return MVT::Other;
18812 }
18813
// NOTE(review): the line naming this function (presumably
// AArch64TargetLowering::getOptimalMemOpLLT) is missing from this extraction;
// the body below starts mid-signature. GlobalISel (LLT) twin of
// getOptimalMemOpType above; keep the two in sync.
18815     const MemOp &Op, const AttributeList &FuncAttributes) const {
18816   bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18817   bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18818   bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18819   // Only use AdvSIMD to implement memset of 32-byte and above. It would have
18820   // taken one instruction to materialize the v2i64 zero and one store (with
18821   // restrictive addressing mode). Just do i64 stores.
18822   bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
  // A candidate type is usable if the op is suitably aligned, or the target
  // allows fast misaligned accesses of that type.
18823   auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18824     if (Op.isAligned(AlignCheck))
18825       return true;
18826     unsigned Fast;
18827     return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
// NOTE(review): the remaining arguments of this call (one line) are missing
// from this extraction.
18829            Fast;
18830   };
18831
18832   if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18833       AlignmentIsAcceptable(MVT::v2i64, Align(16)))
18834     return LLT::fixed_vector(2, 64);
18835   if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18836     return LLT::scalar(128);
18837   if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18838     return LLT::scalar(64);
18839   if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18840     return LLT::scalar(32);
18841   return LLT();
18842 }
18843
18844 // 12-bit optionally shifted immediates are legal for adds.
// NOTE(review): the signature line (presumably
// bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const) is
// missing from this extraction.
  // INT64_MIN has no positive counterpart, so std::abs below would be UB;
  // reject it up front.
18846   if (Immed == std::numeric_limits<int64_t>::min()) {
18847     return false;
18848   }
18849   // Same encoding for add/sub, just flip the sign.
18850   return isLegalArithImmed((uint64_t)std::abs(Immed));
18851 }
18852
// NOTE(review): the signature line (presumably
// bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const)
// is missing from this extraction. Checks whether Imm * vscale can be
// materialized with addvl or inc/dec[h|w|d].
18854   // We will only emit addvl/inc* instructions for SVE2
18855   if (!Subtarget->hasSVE2())
18856     return false;
18857
18858   // addvl's immediates are in terms of the number of bytes in a register.
18859   // Since there are 16 in the base supported size (128bits), we need to
18860   // divide the immediate by that much to give us a useful immediate to
18861   // multiply by vscale. We can't have a remainder as a result of this.
18862   if (Imm % 16 == 0)
18863     return isInt<6>(Imm / 16);
18864
18865   // Inc[b|h|w|d] instructions take a pattern and a positive immediate
18866   // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
18867   // of addvl as a result, so only take h|w|d into account.
18868   // Dec[h|w|d] will cover subtractions.
18869   // Immediates are in the range [1,16], so we can't do a 2's complement check.
18870   // FIXME: Can we make use of other patterns to cover other immediates?
18871
18872   // inch|dech
18873   if (Imm % 8 == 0)
18874     return std::abs(Imm / 8) <= 16;
18875   // incw|decw
18876   if (Imm % 4 == 0)
18877     return std::abs(Imm / 4) <= 16;
18878   // incd|decd
18879   if (Imm % 2 == 0)
18880     return std::abs(Imm / 2) <= 16;
18881
18882   return false;
18883 }
18884
18885 // Return false to prevent folding
18886 // (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
18887 // if the folding leads to worse code.
// NOTE(review): the line naming this function (presumably
// AArch64TargetLowering::isMulAddWithConstProfitable) is missing from this
// extraction; the body below starts mid-signature.
18889     SDValue AddNode, SDValue ConstNode) const {
18890   // Let the DAGCombiner decide for vector types and large types.
18891   const EVT VT = AddNode.getValueType();
18892   if (VT.isVector() || VT.getScalarSizeInBits() > 64)
18893     return true;
18894
18895   // It is worse if c1 is legal add immediate, while c1*c2 is not
18896   // and has to be composed by at least two instructions.
18897   const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
18898   const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
18899   const int64_t C1 = C1Node->getSExtValue();
18900   const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
// NOTE(review): the guard condition using C1 vs C1C2 legality (one line) is
// missing from this extraction.
18902     return true;
// NOTE(review): the declaration of the Insn vector (one line, presumably a
// SmallVector of AArch64_IMM insn models) is missing from this extraction.
18904   // Adapt to the width of a register.
18905   unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
18906   AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
  // Folding is unprofitable when the combined constant needs more than one
  // mov instruction to materialize.
18907   if (Insn.size() > 1)
18908     return false;
18909
18910   // Default to true and let the DAGCombiner decide.
18911   return true;
18912 }
18913
18914 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid
18915 // immediates is the same as for an add or a sub.
// NOTE(review): the signature line (presumably
// bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const) is
// missing from this extraction.
18917   return isLegalAddImmediate(Immed);
18918 }
18919
18920 /// isLegalAddressingMode - Return true if the addressing mode represented
18921 /// by AM is legal for this target, for a load/store of the specified type.
// NOTE(review): the line naming this function (presumably
// bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
// ...)) is missing from this extraction; the body below starts mid-signature.
18923                                                   const AddrMode &AMode, Type *Ty,
18924                                                   unsigned AS, Instruction *I) const {
18925   // AArch64 has five basic addressing modes:
18926   //  reg
18927   //  reg + 9-bit signed offset
18928   //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
18929   //  reg1 + reg2
18930   //  reg + SIZE_IN_BYTES * reg
18931
18932   // No global is ever allowed as a base.
18933   if (AMode.BaseGV)
18934     return false;
18935
18936   // No reg+reg+imm addressing.
18937   if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
18938     return false;
18939
18940   // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
18941   // `2*ScaledReg` into `BaseReg + ScaledReg`
18942   AddrMode AM = AMode;
18943   if (AM.Scale && !AM.HasBaseReg) {
18944     if (AM.Scale == 1) {
18945       AM.HasBaseReg = true;
18946       AM.Scale = 0;
18947     } else if (AM.Scale == 2) {
18948       AM.HasBaseReg = true;
18949       AM.Scale = 1;
18950     } else {
18951       return false;
18952     }
18953   }
18954
18955   // A base register is required in all addressing modes.
18956   if (!AM.HasBaseReg)
18957     return false;
18958
18959   if (Ty->isScalableTy()) {
18960     if (isa<ScalableVectorType>(Ty)) {
18961       // See if we have a foldable vscale-based offset, for vector types which
18962       // are either legal or smaller than the minimum; more work will be
18963       // required if we need to consider addressing for types which need
18964       // legalization by splitting.
18965       uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
18966       if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
18967           (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
18968           isPowerOf2_64(VecNumBytes))
18969         return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
18970
18971       uint64_t VecElemNumBytes =
18972           DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
18973       return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
18974              (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
18975     }
18976
    // Non-vector scalable types: base register only.
18977     return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
18978   }
18979
18980   // No scalable offsets allowed for non-scalable types.
18981   if (AM.ScalableOffset)
18982     return false;
18983
18984   // check reg + imm case:
18985   // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
18986   uint64_t NumBytes = 0;
18987   if (Ty->isSized()) {
18988     uint64_t NumBits = DL.getTypeSizeInBits(Ty);
18989     NumBytes = NumBits / 8;
    // Non-power-of-two sizes cannot use scaled-immediate addressing.
18990     if (!isPowerOf2_64(NumBits))
18991       NumBytes = 0;
18992   }
18993
18994   return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
18995                                                           AM.Scale);
18996 }
18997
// Check whether the two offsets belong to the same imm24 range and have the
// same high 12 bits; if so, their common high part can be materialized by the
// immediate offset of an ADD.
19000int64_t
19002 int64_t MaxOffset) const {
19003 int64_t HighPart = MinOffset & ~0xfffULL;
19004 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
19005 // Rebase the value to an integer multiple of imm12.
19006 return HighPart;
19007 }
19008
19009 return 0;
19010}
19011
19013 // Consider splitting large offset of struct or array.
19014 return true;
19015}
19016
19018 const MachineFunction &MF, EVT VT) const {
19019 EVT ScalarVT = VT.getScalarType();
19020
19021 if (!ScalarVT.isSimple())
19022 return false;
19023
19024 switch (ScalarVT.getSimpleVT().SimpleTy) {
19025 case MVT::f16:
19026 return Subtarget->hasFullFP16();
19027 case MVT::f32:
19028 case MVT::f64:
19029 return true;
19030 case MVT::bf16:
19031 return VT.isScalableVector() && Subtarget->hasBF16() &&
19032 Subtarget->isNonStreamingSVEorSME2Available();
19033 default:
19034 break;
19035 }
19036
19037 return false;
19038}
19039
19041 Type *Ty) const {
19042 switch (Ty->getScalarType()->getTypeID()) {
19043 case Type::FloatTyID:
19044 case Type::DoubleTyID:
19045 return true;
19046 default:
19047 return false;
19048 }
19049}
19050
19052 EVT VT, CodeGenOptLevel OptLevel) const {
19053 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
19055}
19056
19057const MCPhysReg *
19059 // LR is a callee-save register, but we must treat it as clobbered by any call
19060 // site. Hence we include LR in the scratch registers, which are in turn added
19061 // as implicit-defs for stackmaps and patchpoints.
19062 static const MCPhysReg ScratchRegs[] = {
19063 AArch64::X16, AArch64::X17, AArch64::LR, 0
19064 };
19065 return ScratchRegs;
19066}
19067
19069 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
19070 return RCRegs;
19071}
19072
19073bool
19075 CombineLevel Level) const {
19076 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
19077 N->getOpcode() == ISD::SRL) &&
19078 "Expected shift op");
19079
19080 SDValue ShiftLHS = N->getOperand(0);
19081 EVT VT = N->getValueType(0);
19082
19083 if (!ShiftLHS->hasOneUse())
19084 return false;
19085
19086 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
19087 !ShiftLHS.getOperand(0)->hasOneUse())
19088 return false;
19089
19090 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
19091 // combine it with shift 'N' to let it be lowered to UBFX except:
19092 // ((x >> C) & mask) << C.
19093 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
19094 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
19095 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
19096 if (isMask_64(TruncMask)) {
19097 SDValue AndLHS = ShiftLHS.getOperand(0);
19098 if (AndLHS.getOpcode() == ISD::SRL) {
19099 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
19100 if (N->getOpcode() == ISD::SHL)
19101 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
19102 return SRLC->getZExtValue() == SHLC->getZExtValue();
19103 return false;
19104 }
19105 }
19106 }
19107 }
19108 return true;
19109}
19110
19112 const SDNode *N) const {
19113 assert(N->getOpcode() == ISD::XOR &&
19114 (N->getOperand(0).getOpcode() == ISD::SHL ||
19115 N->getOperand(0).getOpcode() == ISD::SRL) &&
19116 "Expected XOR(SHIFT) pattern");
19117
19118 // Only commute if the entire NOT mask is a hidden shifted mask.
19119 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
19120 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
19121 if (XorC && ShiftC) {
19122 unsigned MaskIdx, MaskLen;
19123 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
19124 unsigned ShiftAmt = ShiftC->getZExtValue();
19125 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
19126 if (N->getOperand(0).getOpcode() == ISD::SHL)
19127 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
19128 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
19129 }
19130 }
19131
19132 return false;
19133}
19134
19136 const SDNode *N) const {
19137 assert(((N->getOpcode() == ISD::SHL &&
19138 N->getOperand(0).getOpcode() == ISD::SRL) ||
19139 (N->getOpcode() == ISD::SRL &&
19140 N->getOperand(0).getOpcode() == ISD::SHL)) &&
19141 "Expected shift-shift mask");
19142 // Don't allow multiuse shift folding with the same shift amount.
19143 if (!N->getOperand(0)->hasOneUse())
19144 return false;
19145
19146 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
19147 EVT VT = N->getValueType(0);
19148 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
19149 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
19150 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
19151 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
19152 }
19153
19154 // We do not need to fold when this shifting used in specific load case:
19155 // (ldr x, (add x, (shl (srl x, c1) 2)))
19156 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
19157 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
19158 unsigned ShlAmt = C2->getZExtValue();
19159 if (auto ShouldADD = *N->user_begin();
19160 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
19161 if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
19162 EVT MemVT = Load->getMemoryVT();
19163
19164 if (Load->getValueType(0).isScalableVector())
19165 return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
19166
19167 if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
19168 return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
19169 }
19170 }
19171 }
19172 }
19173
19174 return true;
19175}
19176
19178 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
19179 SDValue Y) const {
19180 return VT.isScalableVector() && isTypeLegal(VT) &&
19181 SelectOpcode == ISD::VSELECT;
19182}
19183
19185 Type *Ty) const {
19186 assert(Ty->isIntegerTy());
19187
19188 unsigned BitSize = Ty->getPrimitiveSizeInBits();
19189 if (BitSize == 0)
19190 return false;
19191
19192 int64_t Val = Imm.getSExtValue();
19193 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
19194 return true;
19195
19196 if (Val < 0)
19197 Val = ~Val;
19198 if (BitSize == 32)
19199 Val &= (1LL << 32) - 1;
19200
19201 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
19202 // MOVZ is free so return true for one or fewer MOVK.
19203 return Shift < 3;
19204}
19205
19207 unsigned Index) const {
19209 return false;
19210
19211 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
19212}
19213
19215 LLVMContext &Context, EVT VT) const {
19216 if (getTypeAction(Context, VT) != TypeExpandInteger)
19217 return false;
19218
19219 EVT LegalTy = EVT::getIntegerVT(Context, VT.getSizeInBits() / 2);
19220 return getTypeAction(Context, LegalTy) == TargetLowering::TypeLegal;
19221}
19222
19223/// Turn vector tests of the signbit in the form of:
19224/// xor (sra X, elt_size(X)-1), -1
19225/// into:
19226/// cmge X, X, #0
19228 const AArch64Subtarget *Subtarget) {
19229 EVT VT = N->getValueType(0);
19230 if (!Subtarget->hasNEON() || !VT.isVector())
19231 return SDValue();
19232
19233 // There must be a shift right algebraic before the xor, and the xor must be a
19234 // 'not' operation.
19235 SDValue Shift = N->getOperand(0);
19236 SDValue Ones = N->getOperand(1);
19237 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
19239 return SDValue();
19240
19241 // The shift should be smearing the sign bit across each vector element.
19242 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
19243 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
19244 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
19245 return SDValue();
19246
19247 SDLoc DL(N);
19248 SDValue Zero = DAG.getConstant(0, DL, Shift.getValueType());
19249 return DAG.getSetCC(DL, VT, Shift.getOperand(0), Zero, ISD::SETGE);
19250}
19251
19252// Given a vecreduce_add node, detect the below pattern and convert it to the
19253// node sequence with UABDL, [S|U]ADB and UADDLP.
19254//
19255// i32 vecreduce_add(
19256// v16i32 abs(
19257// v16i32 sub(
19258// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
19259//
19260// or
19261//
19262// i32 vecreduce_add(
19263// v16i32 zext(
19264// v16i16 abs(
19265// v16i16 sub(
19266// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
19267//
19268// =================>
19269// i32 vecreduce_add(
19270// v4i32 UADDLP(
19271// v8i16 add(
19272// v8i16 zext(
19273// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
19274// v8i16 zext(
19275// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
19277 SelectionDAG &DAG) {
19278 // Assumed i32 vecreduce_add
19279 if (N->getValueType(0) != MVT::i32)
19280 return SDValue();
19281
19282 SDValue VecReduceOp0 = N->getOperand(0);
19283 bool SawTrailingZext = false;
19284 // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
19285 if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
19286 VecReduceOp0->getValueType(0) == MVT::v16i32 &&
19287 VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
19288 VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
19289 SawTrailingZext = true;
19290 VecReduceOp0 = VecReduceOp0.getOperand(0);
19291 }
19292
  // The expected ABS input type: v16i16 if a trailing zext was peeled off
  // above, otherwise v16i32.
19294 MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
19295 // Assumed v16i16 or v16i32 abs input
19296 unsigned Opcode = VecReduceOp0.getOpcode();
19297 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
19298 return SDValue();
19299
19300 SDValue ABS = VecReduceOp0;
19301 // Assumed v16i16 or v16i32 sub
19302 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
19303 ABS->getOperand(0)->getValueType(0) != AbsInputVT)
19304 return SDValue();
19305
19306 SDValue SUB = ABS->getOperand(0);
19307 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
19308 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
19309 // Assumed v16i16 or v16i32 type
19310 if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
19311 SUB->getOperand(1)->getValueType(0) != AbsInputVT)
19312 return SDValue();
19313
19314 // Assumed zext or sext
19315 bool IsZExt = false;
19316 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
19317 IsZExt = true;
19318 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
19319 IsZExt = false;
19320 } else
19321 return SDValue();
19322
19323 SDValue EXT0 = SUB->getOperand(0);
19324 SDValue EXT1 = SUB->getOperand(1);
19325 // Assumed zext's operand has v16i8 type
19326 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
19327 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
19328 return SDValue();
19329
19330 // Pattern is detected. Let's convert it to sequence of nodes.
19331 SDLoc DL(N);
19332
19333 // First, create the node pattern of UABD/SABD.
19334 SDValue UABDHigh8Op0 =
19335 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
19336 DAG.getConstant(8, DL, MVT::i64));
19337 SDValue UABDHigh8Op1 =
19338 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
19339 DAG.getConstant(8, DL, MVT::i64));
19340 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
19341 UABDHigh8Op0, UABDHigh8Op1);
19342 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
19343
19344 // Second, create the node pattern of UABAL.
19345 SDValue UABDLo8Op0 =
19346 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
19347 DAG.getConstant(0, DL, MVT::i64));
19348 SDValue UABDLo8Op1 =
19349 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
19350 DAG.getConstant(0, DL, MVT::i64));
19351 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
19352 UABDLo8Op0, UABDLo8Op1);
19353 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
19354 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
19355
19356 // Third, create the node of UADDLP.
19357 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
19358
19359 // Fourth, create the node of VECREDUCE_ADD.
19360 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
19361}
19362
19363static SDValue
19365 const AArch64Subtarget *ST) {
19366 if (DCI.isBeforeLegalize())
19367 return SDValue();
19368
19369 if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false,
19370 /*IsEqual=*/false))
19371 return While;
19372
19373 if (!N->getValueType(0).isScalableVector() ||
19374 (!ST->hasSVE2p1() && !(ST->hasSME2() && ST->isStreaming())))
19375 return SDValue();
19376
19377 // Count the number of users which are extract_vectors.
19378 unsigned NumExts = count_if(N->users(), [](SDNode *Use) {
19379 return Use->getOpcode() == ISD::EXTRACT_SUBVECTOR;
19380 });
19381
19382 auto MaskEC = N->getValueType(0).getVectorElementCount();
19383 if (!MaskEC.isKnownMultipleOf(NumExts))
19384 return SDValue();
19385
19386 ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumExts);
19387 if (ExtMinEC.getKnownMinValue() < 2)
19388 return SDValue();
19389
19390 SmallVector<SDNode *> Extracts(NumExts, nullptr);
19391 for (SDNode *Use : N->users()) {
19392 if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR)
19393 continue;
19394
19395 // Ensure the extract type is correct (e.g. if NumExts is 4 and
19396 // the mask return type is nxv8i1, each extract should be nxv2i1.
19397 if (Use->getValueType(0).getVectorElementCount() != ExtMinEC)
19398 return SDValue();
19399
19400 // There should be exactly one extract for each part of the mask.
19401 unsigned Offset = Use->getConstantOperandVal(1);
19402 unsigned Part = Offset / ExtMinEC.getKnownMinValue();
19403 if (Extracts[Part] != nullptr)
19404 return SDValue();
19405
19406 Extracts[Part] = Use;
19407 }
19408
19409 SelectionDAG &DAG = DCI.DAG;
19410 SDLoc DL(N);
19411 SDValue ID =
19412 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
19413
19414 SDValue Idx = N->getOperand(0);
19415 SDValue TC = N->getOperand(1);
19416 if (Idx.getValueType() != MVT::i64) {
19417 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
19418 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
19419 }
19420
19421 // Create the whilelo_x2 intrinsics from each pair of extracts
19422 EVT ExtVT = Extracts[0]->getValueType(0);
19423 EVT DoubleExtVT = ExtVT.getDoubleNumVectorElementsVT(*DAG.getContext());
19424 auto R =
19425 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
19426 DCI.CombineTo(Extracts[0], R.getValue(0));
19427 DCI.CombineTo(Extracts[1], R.getValue(1));
19428 SmallVector<SDValue> Concats = {DAG.getNode(
19429 ISD::CONCAT_VECTORS, DL, DoubleExtVT, R.getValue(0), R.getValue(1))};
19430
19431 if (NumExts == 2) {
19432 assert(N->getValueType(0) == DoubleExtVT);
19433 return Concats[0];
19434 }
19435
19436 auto Elts =
19437 DAG.getElementCount(DL, MVT::i64, ExtVT.getVectorElementCount() * 2);
19438 for (unsigned I = 2; I < NumExts; I += 2) {
19439 // After the first whilelo_x2, we need to increment the starting value.
19440 Idx = DAG.getNode(ISD::UADDSAT, DL, MVT::i64, Idx, Elts);
19441 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
19442 DCI.CombineTo(Extracts[I], R.getValue(0));
19443 DCI.CombineTo(Extracts[I + 1], R.getValue(1));
19444 Concats.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, DoubleExtVT,
19445 R.getValue(0), R.getValue(1)));
19446 }
19447
19448 return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), Concats);
19449}
19450
19451// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
19452// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
19453// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
19454// If we have vectors larger than v16i8 we extract v16i8 vectors,
19455// Follow the same steps above to get DOT instructions concatenate them
19456// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
19458 const AArch64Subtarget *ST) {
19459 if (!ST->isNeonAvailable())
19460 return SDValue();
19461
19462 if (!ST->hasDotProd())
19464
19465 SDValue Op0 = N->getOperand(0);
19466 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
19467 Op0.getValueType().getVectorElementType() != MVT::i32)
19468 return SDValue();
19469
19470 unsigned ExtOpcode = Op0.getOpcode();
19471 SDValue A = Op0;
19472 SDValue B;
19473 unsigned DotOpcode;
19474 if (ExtOpcode == ISD::MUL) {
19475 A = Op0.getOperand(0);
19476 B = Op0.getOperand(1);
19477 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
19478 return SDValue();
19479 auto OpCodeA = A.getOpcode();
19480 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
19481 return SDValue();
19482
19483 auto OpCodeB = B.getOpcode();
19484 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
19485 return SDValue();
19486
19487 if (OpCodeA == OpCodeB) {
19488 DotOpcode =
19489 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
19490 } else {
      // Check for USDOT support
19492 if (!ST->hasMatMulInt8())
19493 return SDValue();
19494 DotOpcode = AArch64ISD::USDOT;
19495 if (OpCodeA == ISD::SIGN_EXTEND)
19496 std::swap(A, B);
19497 }
19498 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
19499 DotOpcode = AArch64ISD::UDOT;
19500 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
19501 DotOpcode = AArch64ISD::SDOT;
19502 } else {
19503 return SDValue();
19504 }
19505
19506 EVT Op0VT = A.getOperand(0).getValueType();
19507 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
19508 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
19509 if (!IsValidElementCount || !IsValidSize)
19510 return SDValue();
19511
19512 SDLoc DL(Op0);
19513 // For non-mla reductions B can be set to 1. For MLA we take the operand of
19514 // the extend B.
19515 if (!B)
19516 B = DAG.getConstant(1, DL, Op0VT);
19517 else
19518 B = B.getOperand(0);
19519
19520 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
19521 unsigned NumOfVecReduce;
19522 EVT TargetType;
19523 if (IsMultipleOf16) {
19524 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
19525 TargetType = MVT::v4i32;
19526 } else {
19527 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
19528 TargetType = MVT::v2i32;
19529 }
19530 // Handle the case where we need to generate only one Dot operation.
19531 if (NumOfVecReduce == 1) {
19532 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
19533 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
19534 A.getOperand(0), B);
19535 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19536 }
19537 // Generate Dot instructions that are multiple of 16.
19538 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
19539 SmallVector<SDValue, 4> SDotVec16;
19540 unsigned I = 0;
19541 for (; I < VecReduce16Num; I += 1) {
19542 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
19543 SDValue Op0 =
19544 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
19545 DAG.getConstant(I * 16, DL, MVT::i64));
19546 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
19547 DAG.getConstant(I * 16, DL, MVT::i64));
19548 SDValue Dot =
19549 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
19550 SDotVec16.push_back(Dot);
19551 }
19552 // Concatenate dot operations.
19553 EVT SDot16EVT =
19554 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
19555 SDValue ConcatSDot16 =
19556 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
19557 SDValue VecReduceAdd16 =
19558 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
19559 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
19560 if (VecReduce8Num == 0)
19561 return VecReduceAdd16;
19562
19563 // Generate the remainder Dot operation that is multiple of 8.
19564 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
19565 SDValue Vec8Op0 =
19566 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
19567 DAG.getConstant(I * 16, DL, MVT::i64));
19568 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
19569 DAG.getConstant(I * 16, DL, MVT::i64));
19570 SDValue Dot =
19571 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
19572 SDValue VecReduceAdd8 =
19573 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19574 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
19575 VecReduceAdd8);
19576}
19577
19578// Given an (integer) vecreduce, we know the order of the inputs does not
19579// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
19580// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
19581// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
19583 auto DetectAddExtract = [&](SDValue A) {
19584 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
19585 // UADDLP(x) if found.
19586 assert(A.getOpcode() == ISD::ADD);
19587 EVT VT = A.getValueType();
19588 SDValue Op0 = A.getOperand(0);
19589 SDValue Op1 = A.getOperand(1);
19590 if (Op0.getOpcode() != Op1.getOpcode() ||
19591 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
19592 Op0.getOpcode() != ISD::SIGN_EXTEND))
19593 return SDValue();
19594 SDValue Ext0 = Op0.getOperand(0);
19595 SDValue Ext1 = Op1.getOperand(0);
19596 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19598 Ext0.getOperand(0) != Ext1.getOperand(0) ||
19600 return SDValue();
19601 // Check that the type is twice the add types, and the extract are from
19602 // upper/lower parts of the same source.
19604 VT.getVectorNumElements() * 2)
19605 return SDValue();
19606 if ((Ext0.getConstantOperandVal(1) != 0 ||
19608 (Ext1.getConstantOperandVal(1) != 0 ||
19610 return SDValue();
19611 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
19612 : AArch64ISD::SADDLP;
19613 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
19614 };
19615
19616 if (SDValue R = DetectAddExtract(A))
19617 return R;
19618
19619 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
19620 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
19621 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19622 A.getOperand(1));
19623 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
19624 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
19625 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19626 A.getOperand(0));
19627 return SDValue();
19628}
19629
19630// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
19631// UADDLV(concat), where the concat represents the 64-bit zext sources.
19633 // Look for add(zext(64-bit source), zext(64-bit source)), returning
19634 // UADDLV(concat(zext, zext)) if found.
19635 assert(A.getOpcode() == ISD::ADD);
19636 EVT VT = A.getValueType();
19637 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19638 return SDValue();
19639 SDValue Op0 = A.getOperand(0);
19640 SDValue Op1 = A.getOperand(1);
19641 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
19642 return SDValue();
19643 SDValue Ext0 = Op0.getOperand(0);
19644 SDValue Ext1 = Op1.getOperand(0);
19645 EVT ExtVT0 = Ext0.getValueType();
19646 EVT ExtVT1 = Ext1.getValueType();
19647 // Check zext VTs are the same and 64-bit length.
19648 if (ExtVT0 != ExtVT1 ||
19649 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
19650 return SDValue();
19651 // Get VT for concat of zext sources.
19652 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
19653 SDValue Concat =
19654 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
19655
19656 switch (VT.getSimpleVT().SimpleTy) {
19657 case MVT::v2i64:
19658 case MVT::v4i32:
19659 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
19660 case MVT::v8i16: {
19661 SDValue Uaddlv =
19662 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
19663 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
19664 }
19665 default:
19666 llvm_unreachable("Unhandled vector type");
19667 }
19668}
19669
19671 SDValue A = N->getOperand(0);
19672 if (A.getOpcode() == ISD::ADD) {
19673 if (SDValue R = performUADDVAddCombine(A, DAG))
19674 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
19675 else if (SDValue R = performUADDVZextCombine(A, DAG))
19676 return R;
19677 }
19678
19679 // uaddv(A) --> A if all lanes of A are known to be zeros except the 0th lane.
19680 MVT OpVT = A.getSimpleValueType();
19681 assert(N->getSimpleValueType(0) == OpVT &&
19682 "The operand type should be consistent with the result type of UADDV");
19684 Mask.clearBit(0);
19685 KnownBits KnownLeadingLanes = DAG.computeKnownBits(A, Mask);
19686 if (KnownLeadingLanes.isZero())
19687 return A;
19688
19689 return SDValue();
19690}
19691
19695 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
19696 APInt DemandedElts =
19697 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
19698
19700 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
19701 return SDValue(N, 0);
19702 return SDValue();
19703}
19704
19707 const AArch64Subtarget *Subtarget) {
19708 if (DCI.isBeforeLegalizeOps())
19709 return SDValue();
19710
19711 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
19712}
19713
19714SDValue
19715AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
19716 SelectionDAG &DAG,
19717 SmallVectorImpl<SDNode *> &Created) const {
19718 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19719 if (isIntDivCheap(N->getValueType(0), Attr))
19720 return SDValue(N, 0); // Lower SDIV as SDIV
19721
19722 EVT VT = N->getValueType(0);
19723
19724 // If SVE is available, we can generate
19725 // sdiv(x,y) -> ptrue + asrd , where 'y' is positive pow-2 divisor.
19726 // sdiv(x,y) -> ptrue + asrd + subr , where 'y' is negative pow-2 divisor.
19727 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
19728 return SDValue(N, 0);
19729
19730 // fold (sdiv X, pow2)
19731 if ((VT != MVT::i32 && VT != MVT::i64) ||
19732 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19733 return SDValue();
19734
19735 // If the divisor is 2 or -2, the default expansion is better. It will add
19736 // (N->getValueType(0) >> (BitWidth - 1)) to it before shifting right.
19737 if (Divisor == 2 ||
19738 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
19739 return SDValue();
19740
19741 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
19742}
19743
19744SDValue
19745AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
19746 SelectionDAG &DAG,
19747 SmallVectorImpl<SDNode *> &Created) const {
19748 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19749 if (isIntDivCheap(N->getValueType(0), Attr))
19750 return SDValue(N, 0); // Lower SREM as SREM
19751
19752 EVT VT = N->getValueType(0);
19753
19754 // For scalable and fixed types, mark them as cheap so we can handle it much
19755 // later. This allows us to handle larger than legal types.
19756 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
19757 return SDValue(N, 0);
19758
19759 // fold (srem X, pow2)
19760 if ((VT != MVT::i32 && VT != MVT::i64) ||
19761 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19762 return SDValue();
19763
19764 unsigned Lg2 = Divisor.countr_zero();
19765 if (Lg2 == 0)
19766 return SDValue();
19767
19768 SDLoc DL(N);
19769 SDValue N0 = N->getOperand(0);
19770 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
19771 SDValue Zero = DAG.getConstant(0, DL, VT);
19772 SDValue CCVal, CSNeg;
19773 if (Lg2 == 1) {
19774 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
19775 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19776 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
19777
19778 Created.push_back(Cmp.getNode());
19779 Created.push_back(And.getNode());
19780 } else {
19781 SDValue CCVal = getCondCode(DAG, AArch64CC::MI);
19782 SDVTList VTs = DAG.getVTList(VT, FlagsVT);
19783
19784 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
19785 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19786 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
19787 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
19788 Negs.getValue(1));
19789
19790 Created.push_back(Negs.getNode());
19791 Created.push_back(AndPos.getNode());
19792 Created.push_back(AndNeg.getNode());
19793 }
19794
19795 return CSNeg;
19796}
19797
19799 switch(getIntrinsicID(S.getNode())) {
19800 default:
19801 break;
19802 case Intrinsic::aarch64_sve_cntb:
19803 case Intrinsic::aarch64_sve_cnth:
19804 case Intrinsic::aarch64_sve_cntw:
19805 case Intrinsic::aarch64_sve_cntd:
19806 return true;
19807 }
19808 return false;
19809}
19810
19811// Returns the maximum (scalable) value that can be returned by an SVE count
19812// intrinsic. Returns std::nullopt if \p Op is not aarch64_sve_cnt*.
19813static std::optional<ElementCount> getMaxValueForSVECntIntrinsic(SDValue Op) {
19814 Intrinsic::ID IID = getIntrinsicID(Op.getNode());
19815 if (IID == Intrinsic::aarch64_sve_cntp)
19816 return Op.getOperand(1).getValueType().getVectorElementCount();
19817 switch (IID) {
19818 case Intrinsic::aarch64_sve_cntd:
19819 return ElementCount::getScalable(2);
19820 case Intrinsic::aarch64_sve_cntw:
19821 return ElementCount::getScalable(4);
19822 case Intrinsic::aarch64_sve_cnth:
19823 return ElementCount::getScalable(8);
19824 case Intrinsic::aarch64_sve_cntb:
19825 return ElementCount::getScalable(16);
19826 default:
19827 return std::nullopt;
19828 }
19829}
19830
19831/// Calculates what the pre-extend type is, based on the extension
19832/// operation node provided by \p Extend.
19833///
19834/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
19835/// pre-extend type is pulled directly from the operand, while other extend
19836/// operations need a bit more inspection to get this information.
19837///
19838/// \param Extend The SDNode from the DAG that represents the extend operation
19839///
19840/// \returns The type representing the \p Extend source type, or \p MVT::Other
19841/// if no valid type can be determined
19843 switch (Extend.getOpcode()) {
19844 case ISD::SIGN_EXTEND:
19845 case ISD::ZERO_EXTEND:
19846 case ISD::ANY_EXTEND:
19847 return Extend.getOperand(0).getValueType();
19848 case ISD::AssertSext:
19849 case ISD::AssertZext:
19851 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
19852 if (!TypeNode)
19853 return MVT::Other;
19854 return TypeNode->getVT();
19855 }
19856 case ISD::AND: {
19859 if (!Constant)
19860 return MVT::Other;
19861
19862 uint32_t Mask = Constant->getZExtValue();
19863
19864 if (Mask == UCHAR_MAX)
19865 return MVT::i8;
19866 else if (Mask == USHRT_MAX)
19867 return MVT::i16;
19868 else if (Mask == UINT_MAX)
19869 return MVT::i32;
19870
19871 return MVT::Other;
19872 }
19873 default:
19874 return MVT::Other;
19875 }
19876}
19877
19878/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
19879/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
19880/// SExt/ZExt rather than the scalar SExt/ZExt
19882 EVT VT = BV.getValueType();
19883 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
19885 return SDValue();
19886
19887 // Use the first item in the buildvector/shuffle to get the size of the
19888 // extend, and make sure it looks valid.
19889 SDValue Extend = BV->getOperand(0);
19890 unsigned ExtendOpcode = Extend.getOpcode();
19891 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
19892 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
19893 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
19894 ExtendOpcode == ISD::AssertSext;
19895 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
19896 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
19897 return SDValue();
19898 // Shuffle inputs are vector, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
19899 // ensure calculatePreExtendType will work without issue.
19900 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
19901 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
19902 return SDValue();
19903
19904 // Restrict valid pre-extend data type
19905 EVT PreExtendType = calculatePreExtendType(Extend);
19906 if (PreExtendType == MVT::Other ||
19907 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
19908 return SDValue();
19909
19910 // Make sure all other operands are equally extended.
19911 bool SeenZExtOrSExt = !IsAnyExt;
19912 for (SDValue Op : drop_begin(BV->ops())) {
19913 if (Op.isUndef())
19914 continue;
19915
19916 if (calculatePreExtendType(Op) != PreExtendType)
19917 return SDValue();
19918
19919 unsigned Opc = Op.getOpcode();
19920 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
19922 return SDValue();
19923
19924 if (Opc == ISD::ANY_EXTEND)
19925 continue;
19926
19927 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
19929
19930 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
19931 return SDValue();
19932
19933 IsSExt = OpcIsSExt;
19934 SeenZExtOrSExt = true;
19935 }
19936
19937 SDValue NBV;
19938 SDLoc DL(BV);
19939 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
19940 EVT PreExtendVT =
19941 VT.changeVectorElementType(*DAG.getContext(), PreExtendType);
19942 EVT PreExtendLegalType =
19943 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
19945 for (SDValue Op : BV->ops())
19946 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
19947 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
19948 PreExtendLegalType));
19949 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
19950 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
19951 EVT PreExtendVT = VT.changeVectorElementType(*DAG.getContext(),
19952 PreExtendType.getScalarType());
19953 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
19954 BV.getOperand(1).isUndef()
19955 ? DAG.getUNDEF(PreExtendVT)
19956 : BV.getOperand(1).getOperand(0),
19957 cast<ShuffleVectorSDNode>(BV)->getMask());
19958 }
19959 unsigned ExtOpc = !SeenZExtOrSExt
19961 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
19962 return DAG.getNode(ExtOpc, DL, VT, NBV);
19963}
19964
19965/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
19966/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
19968 // If the value type isn't a vector, none of the operands are going to be dups
19969 EVT VT = Mul->getValueType(0);
19970 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19971 return SDValue();
19972
19973 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
19974 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
19975
19976 // Neither operands have been changed, don't make any further changes
19977 if (!Op0 && !Op1)
19978 return SDValue();
19979
19980 SDLoc DL(Mul);
19981 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
19982 Op1 ? Op1 : Mul->getOperand(1));
19983}
19984
19985// Multiplying an RDSVL value by a constant can sometimes be done cheaper by
19986// folding a power-of-two factor of the constant into the RDSVL immediate and
19987// compensating with an extra shift.
19988//
19989// We rewrite:
19990// (mul (srl (rdsvl 1), w), x)
19991// to one of:
19992// (shl (rdsvl y), z) if z > 0
19993// (srl (rdsvl y), abs(z)) if z < 0
19994// where integers y, z satisfy x = y * 2^(w + z) and y ∈ [-32, 31].
19996 SDLoc DL(Mul);
19997 EVT VT = Mul->getValueType(0);
19998 SDValue MulOp0 = Mul->getOperand(0);
19999 int ConstMultiplier =
20000 cast<ConstantSDNode>(Mul->getOperand(1))->getSExtValue();
20001 if ((MulOp0->getOpcode() != ISD::SRL) ||
20002 (MulOp0->getOperand(0).getOpcode() != AArch64ISD::RDSVL))
20003 return SDValue();
20004
20005 unsigned AbsConstValue = abs(ConstMultiplier);
20006 unsigned OperandShift =
20007 cast<ConstantSDNode>(MulOp0->getOperand(1))->getZExtValue();
20008
20009 // z ≤ ctz(|x|) - w (largest extra shift we can take while keeping y
20010 // integral)
20011 int UpperBound = llvm::countr_zero(AbsConstValue) - OperandShift;
20012
20013 // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need:
20014 // 2^(w + z) ≥ ceil(x / B) ⇒ z ≥ ceil_log2(ceil(x / B)) - w (LowerBound).
20015 unsigned B = ConstMultiplier < 0 ? 32 : 31;
20016 unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B)
20017 int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - OperandShift;
20018
20019 // No valid solution found.
20020 if (LowerBound > UpperBound)
20021 return SDValue();
20022
20023 // Any value of z in [LowerBound, UpperBound] is valid. Prefer no extra
20024 // shift if possible.
20025 int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound);
20026
20027 // y = x / 2^(w + z)
20028 int32_t RdsvlMul = (AbsConstValue >> (OperandShift + Shift)) *
20029 (ConstMultiplier < 0 ? -1 : 1);
20030 auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
20031 DAG.getSignedConstant(RdsvlMul, DL, MVT::i32));
20032
20033 if (Shift == 0)
20034 return Rdsvl;
20035 return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl,
20036 DAG.getConstant(abs(Shift), DL, MVT::i32),
20038}
20039
20040// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
20041// Same for other types with equivalent constants.
20043 EVT VT = N->getValueType(0);
20044 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
20045 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
20046 return SDValue();
20047 if (N->getOperand(0).getOpcode() != ISD::AND ||
20048 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
20049 return SDValue();
20050
20051 SDValue And = N->getOperand(0);
20052 SDValue Srl = And.getOperand(0);
20053
20054 APInt V1, V2, V3;
20055 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
20056 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
20058 return SDValue();
20059
20060 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
20061 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
20062 V3 != (HalfSize - 1))
20063 return SDValue();
20064
20065 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
20066 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
20067 VT.getVectorElementCount() * 2);
20068
20069 SDLoc DL(N);
20070 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
20071 SDValue Zero = DAG.getConstant(0, DL, In.getValueType());
20072 SDValue CM = DAG.getSetCC(DL, HalfVT, Zero, In, ISD::SETGT);
20073 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
20074}
20075
20076// Transform vector add(zext i8 to i32, zext i8 to i32)
20077// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
20078// This allows extra uses of saddl/uaddl at the lower vector widths, and less
20079// extends.
20081 EVT VT = N->getValueType(0);
20082 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
20083 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
20084 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
20085 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
20086 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
20087 N->getOperand(0).getOperand(0).getValueType() !=
20088 N->getOperand(1).getOperand(0).getValueType())
20089 return SDValue();
20090
20091 if (N->getOpcode() == ISD::MUL &&
20092 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
20093 return SDValue();
20094
20095 SDValue N0 = N->getOperand(0).getOperand(0);
20096 SDValue N1 = N->getOperand(1).getOperand(0);
20097 EVT InVT = N0.getValueType();
20098
20099 EVT S1 = InVT.getScalarType();
20100 EVT S2 = VT.getScalarType();
20101 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
20102 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
20103 SDLoc DL(N);
20104 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
20107 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
20108 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
20109 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
20110 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
20111 : (unsigned)ISD::SIGN_EXTEND,
20112 DL, VT, NewOp);
20113 }
20114 return SDValue();
20115}
20116
20119 const AArch64Subtarget *Subtarget) {
20120
20121 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
20122 return Ext;
20124 return Ext;
20125 if (SDValue Ext = performVectorExtCombine(N, DAG))
20126 return Ext;
20127
20128 if (DCI.isBeforeLegalizeOps())
20129 return SDValue();
20130
20131 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
20132 // and in MachineCombiner pass, add+mul will be combined into madd.
20133 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
20134 SDLoc DL(N);
20135 EVT VT = N->getValueType(0);
20136 SDValue N0 = N->getOperand(0);
20137 SDValue N1 = N->getOperand(1);
20138 SDValue MulOper;
20139 unsigned AddSubOpc;
20140
20141 auto IsAddSubWith1 = [&](SDValue V) -> bool {
20142 AddSubOpc = V->getOpcode();
20143 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
20144 SDValue Opnd = V->getOperand(1);
20145 MulOper = V->getOperand(0);
20146 if (AddSubOpc == ISD::SUB)
20147 std::swap(Opnd, MulOper);
20148 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
20149 return C->isOne();
20150 }
20151 return false;
20152 };
20153
20154 if (IsAddSubWith1(N0)) {
20155 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
20156 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
20157 }
20158
20159 if (IsAddSubWith1(N1)) {
20160 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
20161 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
20162 }
20163
20164 // The below optimizations require a constant RHS.
20165 if (!isa<ConstantSDNode>(N1))
20166 return SDValue();
20167
20168 if (SDValue Ext = performMulRdsvlCombine(N, DAG))
20169 return Ext;
20170
20172 const APInt &ConstValue = C->getAPIntValue();
20173
20174 // Allow the scaling to be folded into the `cnt` instruction by preventing
20175 // the scaling to be obscured here. This makes it easier to pattern match.
20176 if (IsSVECntIntrinsic(N0) ||
20177 (N0->getOpcode() == ISD::TRUNCATE &&
20178 (IsSVECntIntrinsic(N0->getOperand(0)))))
20179 if (ConstValue.sge(1) && ConstValue.sle(16))
20180 return SDValue();
20181
20182 // Multiplication of a power of two plus/minus one can be done more
20183 // cheaply as shift+add/sub. For now, this is true unilaterally. If
20184 // future CPUs have a cheaper MADD instruction, this may need to be
20185 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
20186 // 64-bit is 5 cycles, so this is always a win.
20187 // More aggressively, some multiplications N0 * C can be lowered to
20188 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
20189 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
20190 // TODO: lower more cases.
20191
20192 // TrailingZeroes is used to test if the mul can be lowered to
20193 // shift+add+shift.
20194 unsigned TrailingZeroes = ConstValue.countr_zero();
20195 if (TrailingZeroes) {
20196 // Conservatively do not lower to shift+add+shift if the mul might be
20197 // folded into smul or umul.
20198 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
20199 isZeroExtended(N0, DAG)))
20200 return SDValue();
20201 // Conservatively do not lower to shift+add+shift if the mul might be
20202 // folded into madd or msub.
20203 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
20204 N->user_begin()->getOpcode() == ISD::SUB))
20205 return SDValue();
20206 }
20207 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
20208 // and shift+add+shift.
20209 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
20210 unsigned ShiftAmt;
20211
20212 auto Shl = [&](SDValue N0, unsigned N1) {
20213 if (!N0.getNode())
20214 return SDValue();
20215 // If shift causes overflow, ignore this combine.
20216 if (N1 >= N0.getValueSizeInBits())
20217 return SDValue();
20218 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
20219 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
20220 };
20221 auto Add = [&](SDValue N0, SDValue N1) {
20222 if (!N0.getNode() || !N1.getNode())
20223 return SDValue();
20224 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
20225 };
20226 auto Sub = [&](SDValue N0, SDValue N1) {
20227 if (!N0.getNode() || !N1.getNode())
20228 return SDValue();
20229 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
20230 };
20231 auto Negate = [&](SDValue N) {
20232 if (!N0.getNode())
20233 return SDValue();
20234 SDValue Zero = DAG.getConstant(0, DL, VT);
20235 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
20236 };
20237
20238 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
20239 // C = 45 is equal to (1+4)*(1+8), we don't decompose it into (1+2)*(16-1) as
20240 // the (2^N - 1) can't be execused via a single instruction.
20241 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
20242 unsigned BitWidth = C.getBitWidth();
20243 for (unsigned i = 1; i < BitWidth / 2; i++) {
20244 APInt Rem;
20245 APInt X(BitWidth, (1 << i) + 1);
20246 APInt::sdivrem(C, X, N, Rem);
20247 APInt NVMinus1 = N - 1;
20248 if (Rem == 0 && NVMinus1.isPowerOf2()) {
20249 M = X;
20250 return true;
20251 }
20252 }
20253 return false;
20254 };
20255
20256 // Can the const C be decomposed into (2^M + 1) * 2^N + 1), eg:
20257 // C = 11 is equal to (1+4)*2+1, we don't decompose it into (1+2)*4-1 as
20258 // the (2^N - 1) can't be execused via a single instruction.
20259 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
20260 APInt CVMinus1 = C - 1;
20261 if (CVMinus1.isNegative())
20262 return false;
20263 unsigned TrailingZeroes = CVMinus1.countr_zero();
20264 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
20265 if (SCVMinus1.isPowerOf2()) {
20266 unsigned BitWidth = SCVMinus1.getBitWidth();
20267 M = APInt(BitWidth, SCVMinus1.logBase2());
20268 N = APInt(BitWidth, TrailingZeroes);
20269 return true;
20270 }
20271 return false;
20272 };
20273
20274 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
20275 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
20276 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
20277 APInt CVMinus1 = C - 1;
20278 if (CVMinus1.isNegative())
20279 return false;
20280 unsigned TrailingZeroes = CVMinus1.countr_zero();
20281 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
20282 if (CVPlus1.isPowerOf2()) {
20283 unsigned BitWidth = CVPlus1.getBitWidth();
20284 M = APInt(BitWidth, CVPlus1.logBase2());
20285 N = APInt(BitWidth, TrailingZeroes);
20286 return true;
20287 }
20288 return false;
20289 };
20290
20291 if (ConstValue.isNonNegative()) {
20292 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
20293 // (mul x, 2^N - 1) => (sub (shl x, N), x)
20294 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
20295 // (mul x, (2^M + 1) * (2^N + 1))
20296 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
20297 // (mul x, (2^M + 1) * 2^N + 1))
20298 // => MV = add (shl x, M), x); add (shl MV, N), x)
20299 // (mul x, 1 - (1 - 2^M) * 2^N))
20300 // => MV = sub (x - (shl x, M)); sub (x - (shl MV, N))
20301 APInt SCVMinus1 = ShiftedConstValue - 1;
20302 APInt SCVPlus1 = ShiftedConstValue + 1;
20303 APInt CVPlus1 = ConstValue + 1;
20304 APInt CVM, CVN;
20305 if (SCVMinus1.isPowerOf2()) {
20306 ShiftAmt = SCVMinus1.logBase2();
20307 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
20308 } else if (CVPlus1.isPowerOf2()) {
20309 ShiftAmt = CVPlus1.logBase2();
20310 return Sub(Shl(N0, ShiftAmt), N0);
20311 } else if (SCVPlus1.isPowerOf2()) {
20312 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
20313 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
20314 }
20315 if (Subtarget->hasALULSLFast() &&
20316 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
20317 APInt CVMMinus1 = CVM - 1;
20318 APInt CVNMinus1 = CVN - 1;
20319 unsigned ShiftM1 = CVMMinus1.logBase2();
20320 unsigned ShiftN1 = CVNMinus1.logBase2();
20321 // ALULSLFast implicate that Shifts <= 4 places are fast
20322 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
20323 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
20324 return Add(Shl(MVal, ShiftN1), MVal);
20325 }
20326 }
20327 if (Subtarget->hasALULSLFast() &&
20328 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
20329 unsigned ShiftM = CVM.getZExtValue();
20330 unsigned ShiftN = CVN.getZExtValue();
20331 // ALULSLFast implicate that Shifts <= 4 places are fast
20332 if (ShiftM <= 4 && ShiftN <= 4) {
20333 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
20334 return Add(Shl(MVal, CVN.getZExtValue()), N0);
20335 }
20336 }
20337
20338 if (Subtarget->hasALULSLFast() &&
20339 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
20340 unsigned ShiftM = CVM.getZExtValue();
20341 unsigned ShiftN = CVN.getZExtValue();
20342 // ALULSLFast implicate that Shifts <= 4 places are fast
20343 if (ShiftM <= 4 && ShiftN <= 4) {
20344 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
20345 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
20346 }
20347 }
20348 } else {
20349 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
20350 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
20351 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
20352 APInt SCVPlus1 = -ShiftedConstValue + 1;
20353 APInt CVNegPlus1 = -ConstValue + 1;
20354 APInt CVNegMinus1 = -ConstValue - 1;
20355 if (CVNegPlus1.isPowerOf2()) {
20356 ShiftAmt = CVNegPlus1.logBase2();
20357 return Sub(N0, Shl(N0, ShiftAmt));
20358 } else if (CVNegMinus1.isPowerOf2()) {
20359 ShiftAmt = CVNegMinus1.logBase2();
20360 return Negate(Add(Shl(N0, ShiftAmt), N0));
20361 } else if (SCVPlus1.isPowerOf2()) {
20362 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
20363 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
20364 }
20365 }
20366
20367 return SDValue();
20368}
20369
20371 SelectionDAG &DAG) {
20372 // Take advantage of vector comparisons producing 0 or -1 in each lane to
20373 // optimize away operation when it's from a constant.
20374 //
20375 // The general transformation is:
20376 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
20377 // AND(VECTOR_CMP(x,y), constant2)
20378 // constant2 = UNARYOP(constant)
20379
20380 // Early exit if this isn't a vector operation, the operand of the
20381 // unary operation isn't a bitwise AND, or if the sizes of the operations
20382 // aren't the same.
20383 EVT VT = N->getValueType(0);
20384 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
20385 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
20386 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
20387 return SDValue();
20388
20389 // Now check that the other operand of the AND is a constant. We could
20390 // make the transformation for non-constant splats as well, but it's unclear
20391 // that would be a benefit as it would not eliminate any operations, just
20392 // perform one more step in scalar code before moving to the vector unit.
20393 if (BuildVectorSDNode *BV =
20394 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
20395 // Bail out if the vector isn't a constant.
20396 if (!BV->isConstant())
20397 return SDValue();
20398
20399 // Everything checks out. Build up the new and improved node.
20400 SDLoc DL(N);
20401 EVT IntVT = BV->getValueType(0);
20402 // Create a new constant of the appropriate type for the transformed
20403 // DAG.
20404 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
20405 // The AND node needs bitcasts to/from an integer vector type around it.
20406 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
20407 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
20408 N->getOperand(0)->getOperand(0), MaskConst);
20409 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
20410 return Res;
20411 }
20412
20413 return SDValue();
20414}
20415
20416/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
20417/// functions, this can help to reduce the number of fmovs to/from GPRs.
20418static SDValue
20421 const AArch64Subtarget *Subtarget) {
20422 if (N->isStrictFPOpcode())
20423 return SDValue();
20424
20425 if (DCI.isBeforeLegalizeOps())
20426 return SDValue();
20427
20428 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
20429 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
20430 return SDValue();
20431
20432 auto isSupportedType = [](EVT VT) {
20433 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
20434 };
20435
20436 SDValue SrcVal = N->getOperand(0);
20437 EVT SrcTy = SrcVal.getValueType();
20438 EVT DestTy = N->getValueType(0);
20439
20440 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
20441 return SDValue();
20442
20443 EVT SrcVecTy;
20444 EVT DestVecTy;
20445 if (DestTy.bitsGT(SrcTy)) {
20446 DestVecTy = getPackedSVEVectorVT(DestTy);
20447 SrcVecTy = DestVecTy.changeVectorElementType(*DAG.getContext(), SrcTy);
20448 } else {
20449 SrcVecTy = getPackedSVEVectorVT(SrcTy);
20450 DestVecTy = SrcVecTy.changeVectorElementType(*DAG.getContext(), DestTy);
20451 }
20452
20453 // Ensure the resulting src/dest vector type is legal.
20454 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
20455 return SDValue();
20456
20457 SDLoc DL(N);
20458 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
20459 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
20460 DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
20461 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
20462 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
20463}
20464
20467 const AArch64Subtarget *Subtarget) {
20468 // First try to optimize away the conversion when it's conditionally from
20469 // a constant. Vectors only.
20471 return Res;
20472
20473 if (SDValue Res =
20474 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
20475 return Res;
20476
20477 EVT VT = N->getValueType(0);
20478 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64)
20479 return SDValue();
20480 if (VT == MVT::f16 && !Subtarget->hasFullFP16())
20481 return SDValue();
20482
20483 // Only optimize when the source and destination types have the same width.
20484 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
20485 return SDValue();
20486
20487 // If the result of an integer load is only used by an integer-to-float
20488 // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
20489 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
20490 SDValue N0 = N->getOperand(0);
20491 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
20492 N0.hasOneUse() &&
20493 // Do not change the width of a volatile load.
20494 !cast<LoadSDNode>(N0)->isVolatile()) {
20495 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
20496 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
20497 LN0->getPointerInfo(), LN0->getAlign(),
20498 LN0->getMemOperand()->getFlags());
20499
20500 // Make sure successors of the original load stay after it by updating them
20501 // to use the new Chain.
20502 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
20503
20504 unsigned Opcode =
20505 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
20506 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
20507 }
20508
20509 return SDValue();
20510}
20511
20512/// Fold a floating-point multiply by power of two into floating-point to
20513/// fixed-point conversion.
20516 const AArch64Subtarget *Subtarget) {
20517 if (SDValue Res =
20518 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
20519 return Res;
20520
20521 if (!Subtarget->isNeonAvailable())
20522 return SDValue();
20523
20524 if (!N->getValueType(0).isSimple())
20525 return SDValue();
20526
20527 SDValue Op = N->getOperand(0);
20528 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
20529 return SDValue();
20530
20531 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
20532 return SDValue();
20533
20534 SDValue ConstVec = Op->getOperand(1);
20535 if (!isa<BuildVectorSDNode>(ConstVec))
20536 return SDValue();
20537
20538 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
20539 uint32_t FloatBits = FloatTy.getSizeInBits();
20540 if (FloatBits != 32 && FloatBits != 64 &&
20541 (FloatBits != 16 || !Subtarget->hasFullFP16()))
20542 return SDValue();
20543
20544 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
20545 uint32_t IntBits = IntTy.getSizeInBits();
20546 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
20547 return SDValue();
20548
20549 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
20550 if (IntBits > FloatBits)
20551 return SDValue();
20552
20553 BitVector UndefElements;
20555 int32_t Bits = IntBits == 64 ? 64 : 32;
20556 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
20557 if (C == -1 || C == 0 || C > Bits)
20558 return SDValue();
20559
20560 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
20561 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
20562 return SDValue();
20563
20564 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
20565 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
20566 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
20567 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
20568 return SDValue();
20569 }
20570
20571 SDLoc DL(N);
20572 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
20573 N->getOpcode() == ISD::FP_TO_SINT_SAT);
20574 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
20575 : Intrinsic::aarch64_neon_vcvtfp2fxu;
20576 SDValue FixConv =
20578 DAG.getTargetConstant(IntrinsicOpcode, DL, MVT::i32),
20579 Op->getOperand(0), DAG.getTargetConstant(C, DL, MVT::i32));
20580 // We can handle smaller integers by generating an extra trunc.
20581 if (IntBits < FloatBits)
20582 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
20583
20584 return FixConv;
20585}
20586
20587// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
20588// convert to csel(ccmp(.., cc0)), depending on cc1:
20589
20590// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
20591// =>
20592// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
20593//
20594// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
20595// =>
20596// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
20598 EVT VT = N->getValueType(0);
20599 SDValue CSel0 = N->getOperand(0);
20600 SDValue CSel1 = N->getOperand(1);
20601
20602 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
20603 CSel1.getOpcode() != AArch64ISD::CSEL)
20604 return SDValue();
20605
20606 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
20607 return SDValue();
20608