LLVM 23.0.0git
AArch64ISelLowering.cpp
Go to the documentation of this file.
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
20#include "AArch64Subtarget.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/ArrayRef.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringRef.h"
33#include "llvm/ADT/Twine.h"
61#include "llvm/IR/Attributes.h"
62#include "llvm/IR/Constants.h"
63#include "llvm/IR/DataLayout.h"
64#include "llvm/IR/DebugLoc.h"
66#include "llvm/IR/Function.h"
68#include "llvm/IR/GlobalValue.h"
69#include "llvm/IR/IRBuilder.h"
70#include "llvm/IR/Instruction.h"
73#include "llvm/IR/Intrinsics.h"
74#include "llvm/IR/IntrinsicsAArch64.h"
75#include "llvm/IR/Module.h"
77#include "llvm/IR/Type.h"
78#include "llvm/IR/Use.h"
79#include "llvm/IR/Value.h"
84#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <bitset>
96#include <cassert>
97#include <cctype>
98#include <cstdint>
99#include <cstdlib>
100#include <iterator>
101#include <limits>
102#include <optional>
103#include <tuple>
104#include <utility>
105#include <vector>
106
107using namespace llvm;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
// Hidden command-line switch "aarch64-enable-logical-imm"; it defaults to true
// and, per its description, enables the AArch64 logical immediate instruction
// optimization.
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in future when both implementations will be based off MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
// Hidden command-line switch "aarch64-enable-ext-to-tbl"; it defaults to true
// and, per its description, allows ext and trunc to be combined into TBL.
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// XOR, OR and CMP all use ALU ports, and data dependency will become the
143// bottleneck after this transform on high-end CPUs. This limit on the number
144// of leaf nodes guards that the cmp+ccmp form stays profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// By turning this on, we will not fall back to DAG ISel when encountering
149// scalable vector types for all instructions, even if SVE is not yet supported
150// with some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157// TODO: This option should be removed once we switch to always using PTRADD in
158// the SelectionDAG.
160 "aarch64-use-featcpa-codegen", cl::Hidden,
161 cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
162 "SelectionDAG for FEAT_CPA"),
163 cl::init(false));
164
165/// Value type used for condition codes.
166constexpr MVT CondCodeVT = MVT::i32;
167
168/// Value type used for NZCV flags.
169constexpr MVT FlagsVT = MVT::i32;
170
// Tables of physical registers: X0-X7 for the general-purpose table and Q0-Q7
// for the floating-point/vector table.
// NOTE(review): presumably these are the registers used for passing arguments,
// as the names suggest; confirm against the calling-convention code.
171static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
172 AArch64::X3, AArch64::X4, AArch64::X5,
173 AArch64::X6, AArch64::X7};
174static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
175 AArch64::Q3, AArch64::Q4, AArch64::Q5,
176 AArch64::Q6, AArch64::Q7};
177
179
181
182static inline EVT getPackedSVEVectorVT(EVT VT) {
183 switch (VT.getSimpleVT().SimpleTy) {
184 default:
185 llvm_unreachable("unexpected element type for vector");
186 case MVT::i8:
187 return MVT::nxv16i8;
188 case MVT::i16:
189 return MVT::nxv8i16;
190 case MVT::i32:
191 return MVT::nxv4i32;
192 case MVT::i64:
193 return MVT::nxv2i64;
194 case MVT::f16:
195 return MVT::nxv8f16;
196 case MVT::f32:
197 return MVT::nxv4f32;
198 case MVT::f64:
199 return MVT::nxv2f64;
200 case MVT::bf16:
201 return MVT::nxv8bf16;
202 }
203}
204
205// NOTE: Currently there's only a need to return integer vector types. If this
206// changes then just add an extra "type" parameter.
 // Pick the integer scalable vector type whose minimum element count equals
 // EC's known minimum value; only 16, 8, 4 and 2 are accepted.
208 switch (EC.getKnownMinValue()) {
209 default:
210 llvm_unreachable("unexpected element count for vector");
211 case 16:
212 return MVT::nxv16i8;
213 case 8:
214 return MVT::nxv8i16;
215 case 4:
216 return MVT::nxv4i32;
217 case 2:
218 return MVT::nxv2i64;
219 }
220}
221
 // VT must be a scalable vector of i1 (checked by the assert). The result is
 // the integer scalable vector type with the same minimum number of elements:
 // 2 -> nxv2i64, 4 -> nxv4i32, 8 -> nxv8i16, 16 -> nxv16i8.
223 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
224 "Expected scalable predicate vector type!");
225 switch (VT.getVectorMinNumElements()) {
226 default:
227 llvm_unreachable("unexpected element count for vector");
228 case 2:
229 return MVT::nxv2i64;
230 case 4:
231 return MVT::nxv4i32;
232 case 8:
233 return MVT::nxv8i16;
234 case 16:
235 return MVT::nxv16i8;
236 }
237}
238
239/// Returns true if VT's elements occupy the lowest bit positions of its
240/// associated register class without any intervening space.
241///
242/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
243/// same register class, but only nxv8f16 can be treated as a packed vector.
244static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
246 "Expected legal vector type!");
 // Any fixed-length vector is reported as packed.
247 return VT.isFixedLengthVector() ||
249}
250
/// Returns true only when \p VT is nxv16i1; no other type is reported as a
/// packed predicate type.
251static inline bool isPackedPredicateType(EVT VT, SelectionDAG &DAG) {
253 "Expected legal type!");
254 return VT == MVT::nxv16i1;
255}
256
257/// Returns true if the conceptual representation for \p VT does not map
258/// directly to its physical register representation, meaning there are gaps
259/// between elements in the register. In practice, the vector elements will be
260/// strided by a power of two and placed starting from lane 0. For example,
261/// nxv8i1 or nxv2f32 are unpacked types.
262///
263///\pre VT is a legal type.
264static inline bool isUnpackedType(EVT VT, SelectionDAG &DAG) {
265 bool Res = !isPackedVectorType(VT, DAG) && !isPackedPredicateType(VT, DAG);
266 assert((!Res || VT.isScalableVector()) &&
267 "Unexpected fixed-size unpacked type.");
268 return Res;
269}
270
271// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
272// predicate and end with a passthru value matching the result type.
273static bool isMergePassthruOpcode(unsigned Opc) {
274 switch (Opc) {
275 default:
276 return false;
277 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
278 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
279 case AArch64ISD::REVH_MERGE_PASSTHRU:
280 case AArch64ISD::REVW_MERGE_PASSTHRU:
281 case AArch64ISD::REVD_MERGE_PASSTHRU:
282 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
283 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
284 case AArch64ISD::DUP_MERGE_PASSTHRU:
285 case AArch64ISD::ABS_MERGE_PASSTHRU:
286 case AArch64ISD::NEG_MERGE_PASSTHRU:
287 case AArch64ISD::FNEG_MERGE_PASSTHRU:
288 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
289 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
290 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
291 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
292 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
293 case AArch64ISD::FRINT_MERGE_PASSTHRU:
294 case AArch64ISD::FRINT32_MERGE_PASSTHRU:
295 case AArch64ISD::FRINT64_MERGE_PASSTHRU:
296 case AArch64ISD::FROUND_MERGE_PASSTHRU:
297 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
298 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
299 case AArch64ISD::FTRUNC32_MERGE_PASSTHRU:
300 case AArch64ISD::FTRUNC64_MERGE_PASSTHRU:
301 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
302 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
303 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
304 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
305 case AArch64ISD::FCVTX_MERGE_PASSTHRU:
306 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
307 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
308 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
309 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
310 case AArch64ISD::FABS_MERGE_PASSTHRU:
311 return true;
312 }
313}
314
315// Returns true if inactive lanes are known to be zeroed by construction.
 // Outer dispatch on the node's opcode; opcodes not handled below report
 // false.
317 switch (Op.getOpcode()) {
318 default:
319 return false;
320 // We guarantee i1 splat_vectors to zero the other lanes
323 case AArch64ISD::PTRUE:
324 case AArch64ISD::SETCC_MERGE_ZERO:
325 return true;
 // Nested dispatch on the constant value of operand 0, compared against the
 // SVE intrinsic IDs listed below; any ID not listed reports false.
327 switch (Op.getConstantOperandVal(0)) {
328 default:
329 return false;
330 case Intrinsic::aarch64_sve_ptrue:
331 case Intrinsic::aarch64_sve_pnext:
332 case Intrinsic::aarch64_sve_cmpeq:
333 case Intrinsic::aarch64_sve_cmpne:
334 case Intrinsic::aarch64_sve_cmpge:
335 case Intrinsic::aarch64_sve_cmpgt:
336 case Intrinsic::aarch64_sve_cmphs:
337 case Intrinsic::aarch64_sve_cmphi:
338 case Intrinsic::aarch64_sve_cmpeq_wide:
339 case Intrinsic::aarch64_sve_cmpne_wide:
340 case Intrinsic::aarch64_sve_cmpge_wide:
341 case Intrinsic::aarch64_sve_cmpgt_wide:
342 case Intrinsic::aarch64_sve_cmplt_wide:
343 case Intrinsic::aarch64_sve_cmple_wide:
344 case Intrinsic::aarch64_sve_cmphs_wide:
345 case Intrinsic::aarch64_sve_cmphi_wide:
346 case Intrinsic::aarch64_sve_cmplo_wide:
347 case Intrinsic::aarch64_sve_cmpls_wide:
348 case Intrinsic::aarch64_sve_fcmpeq:
349 case Intrinsic::aarch64_sve_fcmpne:
350 case Intrinsic::aarch64_sve_fcmpge:
351 case Intrinsic::aarch64_sve_fcmpgt:
352 case Intrinsic::aarch64_sve_fcmpuo:
353 case Intrinsic::aarch64_sve_facgt:
354 case Intrinsic::aarch64_sve_facge:
355 case Intrinsic::aarch64_sve_whilege:
356 case Intrinsic::aarch64_sve_whilegt:
357 case Intrinsic::aarch64_sve_whilehi:
358 case Intrinsic::aarch64_sve_whilehs:
359 case Intrinsic::aarch64_sve_whilele:
360 case Intrinsic::aarch64_sve_whilelo:
361 case Intrinsic::aarch64_sve_whilels:
362 case Intrinsic::aarch64_sve_whilelt:
363 case Intrinsic::aarch64_sve_match:
364 case Intrinsic::aarch64_sve_nmatch:
365 case Intrinsic::aarch64_sve_whilege_x2:
366 case Intrinsic::aarch64_sve_whilegt_x2:
367 case Intrinsic::aarch64_sve_whilehi_x2:
368 case Intrinsic::aarch64_sve_whilehs_x2:
369 case Intrinsic::aarch64_sve_whilele_x2:
370 case Intrinsic::aarch64_sve_whilelo_x2:
371 case Intrinsic::aarch64_sve_whilels_x2:
372 case Intrinsic::aarch64_sve_whilelt_x2:
373 return true;
374 }
375 }
376}
377
// Splits the discriminator Disc into a (constant discriminator, address
// discriminator) pair.
//  - When Disc is a ptrauth_blend intrinsic node, operand 1 is taken as the
//    address part and operand 2 as the constant part; otherwise the whole
//    value is treated as the candidate constant part.
//  - When the constant part is not a constant that fits in 16 unsigned bits,
//    the result is (i64 target constant 0, Disc), i.e. Disc is passed through
//    unchanged as the address part.
//  - Otherwise the result is (i64 target constant holding the constant part,
//    address part), with NoRegister standing in for a missing address part.
378static std::tuple<SDValue, SDValue>
380 SDLoc DL(Disc);
381 SDValue AddrDisc;
382 SDValue ConstDisc;
383
384 // If this is a blend, remember the constant and address discriminators.
385 // Otherwise, it's either a constant discriminator, or a non-blended
386 // address discriminator.
387 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
388 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
389 AddrDisc = Disc->getOperand(1);
390 ConstDisc = Disc->getOperand(2);
391 } else {
392 ConstDisc = Disc;
393 }
394
395 // If the constant discriminator (either the blend RHS, or the entire
396 // discriminator value) isn't a 16-bit constant, bail out, and let the
397 // discriminator be computed separately.
398 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
399 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
400 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
401
402 // If there's no address discriminator, use NoRegister, which we'll later
403 // replace with XZR, or directly use a Z variant of the inst. when available.
404 if (!AddrDisc)
405 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
406
407 return std::make_tuple(
408 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
409 AddrDisc);
410}
411
413 const AArch64Subtarget &STI)
414 : TargetLowering(TM, STI), Subtarget(&STI) {
415 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
416 // we have to make something up. Arbitrarily, choose ZeroOrOne.
418 // When comparing vectors the result sets the different elements in the
419 // vector to all-one or all-zero.
421
422 // Set up the register classes.
423 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
424 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
425
426 if (Subtarget->hasLS64()) {
427 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
428 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
430 }
431
432 if (Subtarget->hasFPARMv8()) {
433 addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
434 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
435 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
436 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
437 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
438 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
439 }
440
441 if (Subtarget->hasNEON()) {
442 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
443 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
444
445 addDRType(MVT::v2f32);
446 addDRType(MVT::v8i8);
447 addDRType(MVT::v4i16);
448 addDRType(MVT::v2i32);
449 addDRType(MVT::v1i64);
450 addDRType(MVT::v1f64);
451 addDRType(MVT::v4f16);
452 addDRType(MVT::v4bf16);
453
454 addQRType(MVT::v4f32);
455 addQRType(MVT::v2f64);
456 addQRType(MVT::v16i8);
457 addQRType(MVT::v8i16);
458 addQRType(MVT::v4i32);
459 addQRType(MVT::v2i64);
460 addQRType(MVT::v8f16);
461 addQRType(MVT::v8bf16);
462 }
463
464 if (Subtarget->isSVEorStreamingSVEAvailable()) {
465 // Add legal sve predicate types
466 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
467 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
468 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
469 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
470 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
471
472 // Add sve predicate as counter type
473 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
474
475 // Add legal sve data types
476 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
477 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
478 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
479 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
480
481 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
482 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
483 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
484 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
485 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
486 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
487
488 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
489 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
490 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
491
492 if (Subtarget->useSVEForFixedLengthVectors()) {
495 addRegisterClass(VT, &AArch64::ZPRRegClass);
496
499 addRegisterClass(VT, &AArch64::ZPRRegClass);
500 }
501 }
502
503 // Compute derived properties from the register classes
504 computeRegisterProperties(Subtarget->getRegisterInfo());
505
506 // Provide all sorts of operation actions
534 if (Subtarget->hasFPARMv8()) {
537 }
550
552
556
559
561
562 // Custom lowering hooks are needed for XOR
563 // to fold it into CSINC/CSINV.
566
569
570 // Virtually no operation on f128 is legal, but LLVM can't expand them when
571 // there's a valid register class, so we need custom operations in most cases.
596 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
597 // aren't handled.
598
599 // Lowering for many of the conversions is actually specified by the non-f128
600 // type. The LowerXXX function will be trivial when f128 isn't involved.
625 if (Subtarget->hasFPARMv8()) {
628 }
631 if (Subtarget->hasFPARMv8()) {
634 }
637
642
643 // Variable arguments.
648
649 // Variable-sized objects.
652
653 // Lowering Funnel Shifts to EXTR
658
660
661 // Constant pool entries
663
664 // BlockAddress
666
667 // AArch64 lacks both left-rotate and popcount instructions.
673 }
674
675 // AArch64 doesn't have i32 MULH{S|U}.
678
679 // AArch64 doesn't have {U|S}MUL_LOHI.
684
685 if (Subtarget->hasCSSC()) {
689
691
695
698
703
708 } else {
712
715
718 }
719
725 }
732
733 // Custom lower Add/Sub/Mul with overflow.
746
755
764 if (Subtarget->hasFullFP16()) {
767 } else {
770 }
771
772 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
786 setOperationAction(Op, MVT::f16, Promote);
787 setOperationAction(Op, MVT::v4f16, Expand);
788 setOperationAction(Op, MVT::v8f16, Expand);
789 setOperationAction(Op, MVT::bf16, Promote);
790 setOperationAction(Op, MVT::v4bf16, Expand);
791 setOperationAction(Op, MVT::v8bf16, Expand);
792 }
793
794 // Legalize fcanonicalize to circumvent default expansion
795 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
796 if (Subtarget->hasFullFP16()) {
798 }
799
800 // fpextend from f16 or bf16 to f32 is legal
805 // fpextend from bf16 to f64 needs to be split into two fpextends
808
809 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
810 for (auto Op : {
814 ISD::FADD,
815 ISD::FSUB,
816 ISD::FMUL,
817 ISD::FDIV,
818 ISD::FMA,
851 })
852 setOperationAction(Op, ScalarVT, Promote);
853
854 for (auto Op : {ISD::FNEG, ISD::FABS})
855 setOperationAction(Op, ScalarVT, Legal);
856
857 // Round-to-integer need custom lowering for fp16, as Promote doesn't work
858 // because the result type is integer.
862 setOperationAction(Op, ScalarVT, Custom);
863
864 // promote v4f16 to v4f32 when that is known to be safe.
865 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
866 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
867 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
868 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
869 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
870 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
871 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
872 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
873 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
874 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
875 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
876 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
877 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
878 setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
879
888
889 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
890 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
891 setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
892 setOperationPromotedToType(ISD::VECREDUCE_FADD, V8Narrow, MVT::v8f32);
893 setOperationPromotedToType(ISD::VECREDUCE_FMUL, V8Narrow, MVT::v8f32);
894
915 };
916
917 if (!Subtarget->hasFullFP16()) {
918 LegalizeNarrowFP(MVT::f16);
919 }
920 LegalizeNarrowFP(MVT::bf16);
923
924 // AArch64 has implementations of a lot of rounding-like FP operations.
925 // clang-format off
926 for (auto Op :
938 for (MVT Ty : {MVT::f32, MVT::f64})
940 if (Subtarget->hasFullFP16())
941 setOperationAction(Op, MVT::f16, Legal);
942 }
943 // clang-format on
944
945 // Basic strict FP operations are legal
948 for (MVT Ty : {MVT::f32, MVT::f64})
950 if (Subtarget->hasFullFP16())
951 setOperationAction(Op, MVT::f16, Legal);
952 }
953
955
961
963 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
966 } else {
969 }
972
973 // Generate outline atomics library calls only if LSE was not specified for
974 // subtarget
975 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
1001 }
1002
1003 if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
1008
1013
1018
1023
1028 }
1029
1030 if (Subtarget->hasLSE128()) {
1031 // Custom lowering because i128 is not legal. Must be replaced by 2x64
1032 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
1036 }
1037
1038 // 128-bit loads and stores can be done without expanding
1039 setOperationAction(ISD::LOAD, MVT::i128, Custom);
1041
1042 // Aligned 128-bit loads and stores are single-copy atomic according to the
1043 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
1044 if (Subtarget->hasLSE2()) {
1047 }
1048
1049 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
1050 // custom lowering, as there are no un-paired non-temporal stores and
1051 // legalization will break up 256 bit inputs.
1052 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
1053 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
1054 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
1055 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1056 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1057 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1058 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1059 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1060
1061 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1062 // custom lowering, as there are no un-paired non-temporal loads and
1063 // legalization will break up 256 bit inputs.
1064 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1065 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1066 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1067 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1068 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1069 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1070 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1071 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1072
1073 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1075
1076 // Issue __sincos_stret if available.
1079
1080 // Make floating-point constants legal for the large code model, so they don't
1081 // become loads from the constant pool.
1082 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1085 }
1086
1087 // AArch64 does not have floating-point extending loads, i1 sign-extending
1088 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1089 for (MVT VT : MVT::fp_valuetypes()) {
1090 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1091 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1092 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1093 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1094 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1095 }
1096 for (MVT VT : MVT::integer_valuetypes())
1097 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1098
1099 for (MVT WideVT : MVT::fp_valuetypes()) {
1100 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1101 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1102 setTruncStoreAction(WideVT, NarrowVT, Expand);
1103 }
1104 }
1105 }
1106
1107 if (Subtarget->hasFPARMv8()) {
1111 }
1112
1113 // Indexed loads and stores are supported.
1114 for (unsigned im = (unsigned)ISD::PRE_INC;
1116 setIndexedLoadAction(im, MVT::i8, Legal);
1117 setIndexedLoadAction(im, MVT::i16, Legal);
1118 setIndexedLoadAction(im, MVT::i32, Legal);
1119 setIndexedLoadAction(im, MVT::i64, Legal);
1120 setIndexedLoadAction(im, MVT::f64, Legal);
1121 setIndexedLoadAction(im, MVT::f32, Legal);
1122 setIndexedLoadAction(im, MVT::f16, Legal);
1123 setIndexedLoadAction(im, MVT::bf16, Legal);
1124 setIndexedStoreAction(im, MVT::i8, Legal);
1125 setIndexedStoreAction(im, MVT::i16, Legal);
1126 setIndexedStoreAction(im, MVT::i32, Legal);
1127 setIndexedStoreAction(im, MVT::i64, Legal);
1128 setIndexedStoreAction(im, MVT::f64, Legal);
1129 setIndexedStoreAction(im, MVT::f32, Legal);
1130 setIndexedStoreAction(im, MVT::f16, Legal);
1131 setIndexedStoreAction(im, MVT::bf16, Legal);
1132 }
1133
1134 // Trap.
1135 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1138
1139 // We combine OR nodes for ccmp operations.
1141 // Try to create BICs for vector ANDs.
1143
1144 // llvm.init.trampoline and llvm.adjust.trampoline
1147
1148 // Vector add and sub nodes may conceal a high-half opportunity.
1149 // Also, try to fold ADD into CSINC/CSINV.
1152
1155
1156 // Try and combine setcc/select_cc with csel and bool-vector bitcasts.
1159
1161
1169
1171
1173
1175
1179
1182
1184
1186
1188
1190
1196
1198
1202
1203 // In case of strict alignment, avoid an excessive number of byte wide stores.
1206 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1207
1211 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1212
1215 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1216
1219 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1220
1222
1224
1225 EnableExtLdPromotion = true;
1226
1227 // Set required alignment.
1229 // Set preferred alignments.
1230
1231 // Don't align loops on Windows. The SEH unwind info generation needs to
1232 // know the exact length of functions before the alignments have been
1233 // expanded.
1234 if (!Subtarget->isTargetWindows())
1238
1239 // Only change the limit for entries in a jump table if specified by
1240 // the sub target, but not at the command line.
1241 unsigned MaxJT = STI.getMaximumJumpTableSize();
1242 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1244
1246
1248
1250 if (Subtarget->hasSME())
1252
1253 if (Subtarget->isNeonAvailable()) {
1254 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1255 // silliness like this:
1256 // clang-format off
1257 for (auto Op :
1278 setOperationAction(Op, MVT::v1f64, Expand);
1279 // clang-format on
1280
1281 for (auto Op :
1286 setOperationAction(Op, MVT::v1i64, Expand);
1287
1288 // AArch64 doesn't have direct vector -> f32 conversion instructions for
1289 // elements smaller than i32, so promote the input to i32 first.
1290 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1291 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1292
1293 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
1294 // nor a direct i32 -> f16 vector conversion. Set it to custom, so the
1295 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1298 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1300
1301 if (Subtarget->hasFullFP16()) {
1304
1313 } else {
1314 // when AArch64 doesn't have fullfp16 support, promote the input
1315 // to i32 first.
1316 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1317 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1318 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1319 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1320 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1321 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1322 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1323 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1324 }
1325
1326 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1327 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1328 // CTLS (Count Leading Sign bits) - Legal for BHS types (8/16/32-bit
1329 // elements) No hardware support for 64-bit element vectors
1330 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1331 MVT::v4i32})
1339 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1344 }
1345
1346 // Custom handling for some quad-vector types to detect MULL.
1347 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1348 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1349 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1350 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1351 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1352 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1353
1354 // Saturates
1355 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1356 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1361 }
1362
1363 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1364 MVT::v4i32}) {
1371 }
1372
1373 // Vector reductions
1374 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1375 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1376 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1381
1383 }
1384 }
1385 if (Subtarget->hasFullFP16())
1387
1388 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1389 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1398 }
1403
1405 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1406 // Likewise, narrowing and extending vector loads/stores aren't handled
1407 // directly.
1410
1411 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1414 } else {
1417 }
1420
1421 if (VT == MVT::v4i16 || VT == MVT::v8i16 || VT == MVT::v2i32 ||
1422 VT == MVT::v4i32 || VT == MVT::v2i64)
1424 else
1426
1427 if (VT == MVT::v8i8 || VT == MVT::v16i8 || VT == MVT::v8i16 ||
1428 VT == MVT::v4i16 || VT == MVT::v2i32 || VT == MVT::v4i32)
1430 else
1432
1433 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1434 setTruncStoreAction(VT, InnerVT, Expand);
1435 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1436 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1437 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1438 }
1439 }
1440
1441 for (auto Op :
1447 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1449 if (Subtarget->hasFullFP16())
1450 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1452 }
1453
1454 // LRINT and LLRINT.
1455 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1456 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1458 if (Subtarget->hasFullFP16())
1459 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1461 }
1462
1463 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1464
1469
1473
1474 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1475 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1476 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1477 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1478 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1479 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1480 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1481 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1482 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1483 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1484 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1485 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1486 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1487 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1488 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1489 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1490 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1491 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1492
1493 // ADDP custom lowering
1494 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1496 // FADDP custom lowering
1497 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1499
1500 if (Subtarget->hasDotProd()) {
1501 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1503
1504 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
1505 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
1506 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v16i8, Custom);
1507 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1508
1509 if (Subtarget->hasMatMulInt8()) {
1511 MVT::v16i8, Legal);
1513 MVT::v16i8, Custom);
1514
1516 MVT::v8i8, Legal);
1517 }
1518 }
1519
1520 if (Subtarget->hasF16F32DOT() || Subtarget->hasFP16FML()) {
1522 MVT::v4f16, Legal);
1524 MVT::v8f16, Legal);
1525 }
1526
1527 if (Subtarget->hasBF16())
1529 MVT::v8bf16, Legal);
1530
1532 setOperationAction(ISD::CLMUL, {MVT::v8i8, MVT::v16i8}, Legal);
1533 if (Subtarget->hasAES()) {
1534 setOperationAction(ISD::CLMUL, {MVT::i16, MVT::i32, MVT::i64}, Custom);
1535 setOperationAction(ISD::CLMUL, {MVT::v1i64, MVT::v2i64}, Legal);
1536 }
1537
1538 } else /* !isNeonAvailable */ {
1540 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1542
1543 if (VT.is128BitVector() || VT.is64BitVector()) {
1547 Subtarget->isLittleEndian() ? Legal : Expand);
1548 }
1549 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1550 setTruncStoreAction(VT, InnerVT, Expand);
1551 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1552 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1553 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1554 }
1555 }
1556 }
1557
1558 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1562 }
1563
1564 if (Subtarget->hasSME()) {
1566 }
1567
1568 // FIXME: Move lowering for more nodes here if those are common between
1569 // SVE and SME.
1570 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1571 for (auto VT :
1572 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1577 }
1578 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1580 Custom);
1583 }
1584
1585 if (Subtarget->isSVEorStreamingSVEAvailable() &&
1586 (Subtarget->hasSVE2p1() || Subtarget->hasSME2()))
1588
1589 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1591
1592 for (auto VT : {MVT::v8f16, MVT::v4f32, MVT::v2f64})
1594 }
1595
1596 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1597 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1641
1647
1656
1661
1665
1666 if (!Subtarget->isLittleEndian())
1668
1669 if (Subtarget->hasSVE2() ||
1670 (Subtarget->hasSME() && Subtarget->isStreaming()))
1671 // For SLI/SRI.
1673 }
1674
1675 for (auto VT : {MVT::nxv4i32, MVT::nxv2i64}) {
1678 }
1679
1680 // Illegal unpacked integer vector types.
1681 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1684 }
1685
1686 // Type legalize unpacked bitcasts.
1687 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1689
1690 for (auto VT :
1691 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1692 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1694
1695 // Promote predicate as counter load/stores to standard predicates.
1696 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
1697 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
1698
1699 // Predicate as counter legalization actions.
1700 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
1701 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
1702
1703 for (auto VT :
1704 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1712
1716
1717 // There are no legal MVT::nxv16f## based types.
1718 if (VT != MVT::nxv16i1) {
1723 }
1724 }
1725
1726 // NEON doesn't support masked loads/stores, but SME and SVE do.
1727 for (auto VT :
1728 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1729 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1730 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1733 }
1734
1735 // Firstly, exclude all scalable vector extending loads/truncating stores,
1736 // including both integer and floating-point scalable vectors.
1738 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1739 setTruncStoreAction(VT, InnerVT, Expand);
1740 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1741 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1742 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1743 }
1744 }
1745
1746 // Then, selectively enable those which we directly support.
1747 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1748 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1749 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1750 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1751 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1752 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1753 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1754 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1755 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1756 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1757 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1758 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1759 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1760 }
1761
1762 // SVE supports truncating stores of 64 and 128-bit vectors
1763 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1764 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1765 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1766 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1767 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1768
1769 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1770 MVT::nxv4f32, MVT::nxv2f64}) {
1815
1838
1850 }
1851
1852 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1869 }
1870
1871 if (Subtarget->hasSVEB16B16() &&
1872 Subtarget->isNonStreamingSVEorSME2Available()) {
1873 // Note: Use SVE for bfloat16 operations when +sve-b16b16 is available.
1874 for (auto VT : {MVT::v4bf16, MVT::v8bf16, MVT::nxv2bf16, MVT::nxv4bf16,
1875 MVT::nxv8bf16}) {
1884 }
1885 }
1886
1887 for (auto Opcode :
1892 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1893 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1894 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1895 }
1896
1897 if (!Subtarget->hasSVEB16B16() ||
1898 !Subtarget->isNonStreamingSVEorSME2Available()) {
1899 for (MVT VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1900 MVT PromotedVT = VT.changeVectorElementType(MVT::f32);
1901 setOperationPromotedToType(ISD::FADD, VT, PromotedVT);
1902 setOperationPromotedToType(ISD::FMA, VT, PromotedVT);
1907 setOperationPromotedToType(ISD::FSUB, VT, PromotedVT);
1908
1909 if (VT != MVT::nxv2bf16 && Subtarget->hasBF16())
1911 else
1912 setOperationPromotedToType(ISD::FMUL, VT, PromotedVT);
1913 }
1914
1915 if (Subtarget->hasBF16() && Subtarget->isNeonAvailable())
1916 setOperationAction(ISD::FMUL, MVT::v8bf16, Custom);
1917 }
1918
1921
1922 // A number of operations like MULH and integer divides are not supported by
1923 // NEON but are available in SVE.
1924 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1925 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1932 }
1933
1934 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1935 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1936 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1937
1938 // With SVE2 we can try lowering these to pairwise operations (e.g. smaxp).
1939 if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable()) {
1944 }
1945
1946 // NOTE: Currently this has to happen after computeRegisterProperties rather
1947 // than the preferred option of combining it with the addRegisterClass call.
1948 if (Subtarget->useSVEForFixedLengthVectors()) {
1951 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1952 addTypeForFixedLengthSVE(VT);
1953 }
1956 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1957 addTypeForFixedLengthSVE(VT);
1958 }
1959
1960 // 64bit results can mean a bigger than NEON input.
1961 for (auto VT : {MVT::v8i8, MVT::v4i16})
1964
1965 // 128bit results imply a bigger than NEON input.
1966 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1968 for (auto VT : {MVT::v8f16, MVT::v4f32, MVT::v8bf16})
1970
1971 // These operations are not supported on NEON but SVE can do them.
1973 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1974 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1975 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1976 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1977 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1978 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1979 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1980 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1981 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1982 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1983 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1988
1989 // Int operations with no NEON support.
1990 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1991 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1997 }
1998
1999 // Use SVE for vectors with more than 2 elements.
2000 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
2002 }
2003
2005 MVT::nxv2i64);
2007 MVT::nxv2i64);
2009 MVT::nxv4i32);
2011 MVT::nxv4i32);
2013 MVT::nxv8i16);
2015 MVT::nxv8i16);
2017 MVT::nxv16i8);
2019 MVT::nxv16i8);
2020
2022
2023 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
2026 VT, Custom);
2027 }
2028
2029 // Handle partial reduction operations
2030 if (Subtarget->isSVEorStreamingSVEAvailable()) {
2031 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
2032 // Other pairs will default to 'Expand'.
2033 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2035 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
2036 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
2037
2038 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
2039
2040 if (Subtarget->hasMatMulInt8()) {
2042 MVT::nxv16i8, Legal);
2044 MVT::nxv16i8, Custom);
2045 }
2046
2047 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
2048 // Wide add types
2049 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv4i32, Legal);
2050 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
2051 setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
2052
2053 setOperationAction(ISD::CLMUL, {MVT::nxv16i8, MVT::nxv4i32}, Legal);
2054
2056 MVT::nxv8f16, Legal);
2057
2058 // We can use SVE2p1 fdot or SVE2 fmlalb/t to emulate the fixed-length
2059 // variant (unless NEON fdot is natively available).
2060 if (!Subtarget->isNeonAvailable() ||
2061 (!Subtarget->hasF16F32DOT() && !Subtarget->hasFP16FML())) {
2063 MVT::v8f16, Custom);
2065 MVT::v4f16, Custom);
2066 }
2067 }
2068
2069 if (Subtarget->hasBF16())
2071 MVT::nxv8bf16, Legal);
2072 }
2073
2074 if (Subtarget->hasSVEAES() &&
2075 (Subtarget->isSVEAvailable() || Subtarget->hasSSVE_AES()))
2076 setOperationAction(ISD::CLMUL, MVT::nxv2i64, Legal);
2077
2078 // Handle non-aliasing elements mask
2079 if (Subtarget->hasSVE2() ||
2080 (Subtarget->hasSME() && Subtarget->isStreaming())) {
2081 // FIXME: Support wider fixed-length types when msve-vector-bits is used.
2082 for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
2085 }
2086 for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
2089 }
2090 }
2091
2092 // Handle operations that are only available in non-streaming SVE mode.
2093 if (Subtarget->isSVEAvailable()) {
2094 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
2095 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
2096 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
2097 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
2098 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
2099 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
2100 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
2103 }
2104
2105 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
2106 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
2107 MVT::v2f32, MVT::v4f32, MVT::v2f64})
2109
2110 // We can lower types that have <vscale x {2|4}> elements to compact.
2111 for (auto VT :
2112 {MVT::nxv4i32, MVT::nxv2i64, MVT::nxv2f32, MVT::nxv4f32, MVT::nxv2f64})
2114
2115 // If we have SVE, we can use SVE logic for legal NEON vectors in the lowest
2116 // bits of the SVE register.
2117 for (auto VT : {MVT::v2i32, MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32,
2118 MVT::v2f64})
2120
2121 // Promote v4i16/f16 to v4i32/f32 as the SVE container for v4i16 is nxv8,
2122 // which is not supported for compact (with only +sve).
2123 setOperationPromotedToType(ISD::VECTOR_COMPRESS, MVT::v4bf16, MVT::v4i16);
2124 setOperationPromotedToType(ISD::VECTOR_COMPRESS, MVT::v4f16, MVT::v4i16);
2125 setOperationPromotedToType(ISD::VECTOR_COMPRESS, MVT::v4i16, MVT::v4i32);
2126
2127 for (auto VT : {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64,
2128 MVT::nxv2f32, MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16,
2129 MVT::nxv4i32, MVT::nxv4f32}) {
2130 // Use a custom lowering for masked stores that could be a supported
2131 // compressing store. Note: These types still use the normal (Legal)
2132 // lowering for non-compressing masked stores.
2134 }
2135
2136 // Histcnt is SVE2 only
2137 if (Subtarget->hasSVE2()) {
2139 Custom);
2141 Custom);
2142
2143 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2145 // Must be lowered to SVE instructions.
2146 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
2147 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
2148 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
2149 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
2150 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
2151 setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
2152 }
2153 }
2154
2155 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
2156 // Only required for llvm.aarch64.mops.memset.tag
2158 }
2159
2161
2162 if (Subtarget->hasSVE()) {
2167 }
2168
2169 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
2170
2171 IsStrictFPEnabled = true;
2173
2174 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2175 // it, but it's just a wrapper around ldexp.
2176 if (Subtarget->isTargetWindows()) {
2178 if (isOperationExpand(Op, MVT::f32))
2179 setOperationAction(Op, MVT::f32, Promote);
2180 }
2181
2182 // LegalizeDAG currently can't expand fp16/bf16 LDEXP/FREXP on targets where
2183 // i16 isn't legal.
2185 if (isOperationExpand(Op, MVT::f16))
2186 setOperationAction(Op, MVT::f16, Promote);
2187 if (isOperationExpand(Op, MVT::bf16))
2188 setOperationAction(Op, MVT::bf16, Promote);
2189 }
2190}
2191
2193 return static_cast<const AArch64TargetMachine &>(getTargetMachine());
2194}
2195
/// Configure the legalization actions of the NEON vector type \p VT.
///
/// \p VT must be a vector type (asserted). The code below promotes
/// floating-point vector loads/stores to PromoteTo, expands extending loads
/// that read \p VT from memory, and marks further operations Custom or Legal
/// depending on the element type of \p VT and on subtarget properties
/// (full FP16 support, endianness, D128).
2196void AArch64TargetLowering::addTypeForNEON(MVT VT) {
2197 assert(VT.isVector() && "VT should be a vector type");
2198
     // Floating-point vector loads and stores are promoted to PromoteTo.
2199 if (VT.isFloatingPoint()) {
2201 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
2202 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
2203 }
2204
2205 // Mark vector float intrinsics as expand.
2206 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
2225 }
2226
2227 // But we do support custom-lowering for FCOPYSIGN.
     // The 16-bit element types (bf16/f16 vectors) additionally require full
     // FP16 support on the subtarget.
2228 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2229 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2230 VT == MVT::v8f16) &&
2231 Subtarget->hasFullFP16()))
2233
2248
     // Extending loads from VT in memory to any value type are expanded.
2252 for (MVT InnerVT : MVT::all_valuetypes())
2253 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
2254
2255 // CNT supports only B element sizes, then use UADDLP to widen.
2256 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2258
2264
2265 for (unsigned Opcode :
2268 setOperationAction(Opcode, VT, Custom);
2269
2270 if (!VT.isFloatingPoint())
2272
2273 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2274 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2275 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2276 setOperationAction(Opcode, VT, Legal);
2277
2278 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2279 // NEON types.
     // bf16 vectors are excluded, and f16 vectors qualify only with full FP16.
2280 if (VT.isFloatingPoint() &&
2281 VT.getVectorElementType() != MVT::bf16 &&
2282 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2283 for (unsigned Opcode :
2289 setOperationAction(Opcode, VT, Legal);
2290
2291 // Strict fp extend and trunc are legal
2292 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2294 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2296
2297 // FIXME: We could potentially make use of the vector comparison instructions
2298 // for STRICT_FSETCC and STRICT_FSETCCS, but there's a number of
2299 // complications:
2300 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2301 // so we would need to expand when the condition code doesn't match the
2302 // kind of comparison.
2303 // * Some kinds of comparison require more than one FCMXY instruction so
2304 // would need to be expanded instead.
2305 // * The lowering of the non-strict versions involves target-specific ISD
2306 // nodes so we would likely need to add strict versions of all of them and
2307 // handle them appropriately.
2310
2311 // When little-endian we can use ordinary d and q register loads/stores for
2312 // vector types, but when big-endian we need to use structure load/store which
2313 // only allow post-index addressing.
2314 if (Subtarget->isLittleEndian()) {
     // Visit every indexed addressing mode from PRE_INC up to (excluding)
     // LAST_INDEXED_MODE.
2315 for (unsigned im = (unsigned)ISD::PRE_INC;
2316 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
2319 }
2320 } else {
2323 }
2324
2325 if (Subtarget->hasD128()) {
2328 }
2329
2330 if (VT.isInteger()) {
2331 // Let common code emit inverted variants of compares we do support.
2337 }
2338}
2339
2341 EVT OpVT) const {
2342 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2343 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2344 ResVT.getVectorElementType() != MVT::i1)
2345 return true;
2346
2347 // Only support illegal types if the result is scalable and min elements > 1.
2348 if (ResVT.getVectorMinNumElements() == 1 ||
2349 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2350 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2351 return true;
2352
2353 // 32 & 64 bit operands are supported. We can promote anything < 64 bits,
2354 // but anything larger should be expanded.
2355 if (OpVT.getFixedSizeInBits() > 64)
2356 return true;
2357
2358 return false;
2359}
2360
2362 if (!Subtarget->isSVEorStreamingSVEAvailable())
2363 return true;
2364
2365 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2366 // also support fixed-width predicates.
2367 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2368 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2369 VT != MVT::v4i1 && VT != MVT::v2i1;
2370}
2371
2373 unsigned SearchSize) const {
2374 // MATCH is SVE2 and only available in non-streaming mode.
2375 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2376 return true;
2377 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2378 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2379 return SearchSize != 8;
2380 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2381 return SearchSize != 8 && SearchSize != 16;
2382 return true;
2383}
2384
/// Configure the legalization actions of the fixed-length vector type \p VT
/// for lowering through SVE.
///
/// \p VT must be a fixed-length vector (asserted). Everything starts out as
/// expanded; individual truncating stores, extending loads, partial-reduce
/// MLA combinations and operations are then re-enabled below. 64-bit and
/// 128-bit vectors keep BITCAST/LOAD/STORE Legal (PreferNEON).
2385void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2386 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2387
2388 // By default everything must be expanded.
2389 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2391
2392 if (VT.isFloatingPoint()) {
2402 }
2403
     // NOTE(review): this looks like the initialiser of `Default` used below
     // (Expand for v1f64, Custom for every other type) - confirm against the
     // declaration.
2405 VT == MVT::v1f64 ? Expand : Custom;
2406
2407 // Mark integer truncating stores/extending loads as having custom lowering
2408 if (VT.isInteger()) {
     // Walk the narrower element types i8, i16, ... (doubling the element
     // width each step) until the element type of VT itself is reached.
2409 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2410 while (InnerVT != VT) {
2411 setTruncStoreAction(VT, InnerVT, Default);
2412 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2413 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2414 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2415 InnerVT = InnerVT.changeVectorElementType(
2416 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2417 }
2418 }
2419
2420 // Mark floating-point truncating stores/extending loads as having custom
2421 // lowering
2422 if (VT.isFloatingPoint()) {
     // Same walk as for integers, starting from f16 elements. Truncating
     // stores are always Custom here; extending loads use Default.
2423 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2424 while (InnerVT != VT) {
2425 setTruncStoreAction(VT, InnerVT, Custom);
2426 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2427 InnerVT = InnerVT.changeVectorElementType(
2429 }
2430 }
2431
     // PreferNEON: VT is a 64-bit or 128-bit vector.
     // PreferSVE: VT is neither of those and Subtarget->isSVEAvailable().
2432 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2433 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2434
2435 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2437 unsigned NumElts = VT.getVectorNumElements();
     // Pair the accumulator type VT with every narrower integer input type of
     // the same total bit width: the element count is scaled by the ratio of
     // the element widths (e.g. i64 x N accumulates i8 x 8N).
2438 if (VT.getVectorElementType() == MVT::i64) {
2439 setPartialReduceMLAAction(MLAOps, VT,
2440 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2441 setPartialReduceMLAAction(MLAOps, VT,
2442 MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
2443 setPartialReduceMLAAction(MLAOps, VT,
2444 MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
2445 } else if (VT.getVectorElementType() == MVT::i32) {
2446 setPartialReduceMLAAction(MLAOps, VT,
2447 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2448 setPartialReduceMLAAction(MLAOps, VT,
2449 MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
2450 } else if (VT.getVectorElementType() == MVT::i16) {
2451 setPartialReduceMLAAction(MLAOps, VT,
2452 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
2453 }
     // Additional i8-input combinations that are only registered when the
     // subtarget has the Int8 matrix-multiply feature.
2454 if (Subtarget->hasMatMulInt8()) {
2455 if (VT.getVectorElementType() == MVT::i32)
2457 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2458 else if (VT.getVectorElementType() == MVT::i64)
2460 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2461 }
2462
     // With SVE2p1, f32 accumulators are paired with f16 inputs of twice the
     // element count.
2463 if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) {
2465 MVT::getVectorVT(MVT::f16, NumElts * 2), Custom);
2466 }
2467
2468 // Lower fixed length vector operations to scalable equivalents.
     // BITCAST/LOAD/STORE stay Legal for NEON-sized vectors; gathers and
     // scatters are expanded unless PreferSVE holds.
2475 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2516 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2519 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2521 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2540 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2567}
2568
2569void AArch64TargetLowering::addDRType(MVT VT) {
2570 addRegisterClass(VT, &AArch64::FPR64RegClass);
2571 if (Subtarget->isNeonAvailable())
2572 addTypeForNEON(VT);
2573}
2574
2575void AArch64TargetLowering::addQRType(MVT VT) {
2576 addRegisterClass(VT, &AArch64::FPR128RegClass);
2577 if (Subtarget->isNeonAvailable())
2578 addTypeForNEON(VT);
2579}
2580
2582 LLVMContext &C, EVT VT) const {
2583 if (!VT.isVector())
2584 return MVT::i32;
2585 if (VT.isScalableVector())
2586 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2588}
2589
2590// isIntImmediate - This method tests to see if the node N is a constant
2591// operand. If so, Imm receives the constant's zero-extended value
// (C->getZExtValue()) and the function returns true; otherwise it returns
// false. No write to Imm is visible on the false path.
2592static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2594 Imm = C->getZExtValue();
2595 return true;
2596 }
2597 return false;
2598}
2599
2600bool isVectorizedBinOp(unsigned Opcode) {
2601 switch (Opcode) {
2602 case AArch64ISD::SQDMULH:
2603 return true;
2604 default:
2605 return false;
2606 }
2607}
2608
2609// isOpcWithIntImmediate - This method tests to see if the node is a specific
2610// opcode and that it has a immediate integer right operand.
2611// If so Imm will receive the value.
2612static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2613 uint64_t &Imm) {
2614 return N->getOpcode() == Opc &&
2615 isIntImmediate(N->getOperand(1).getNode(), Imm);
2616}
2617
/// Try to replace the immediate of the logical operation \p Op by a different
/// immediate that agrees with it on every demanded bit, using the freedom of
/// the non-demanded bits to obtain a shifted-mask pattern (replicated across
/// the register width).
///
/// \param Op       node being rewritten; its operand 0 is reused unchanged.
/// \param Size     width in bits of the immediate, used to build the masks.
/// \param Imm      the current immediate value.
/// \param Demanded bits of the result that must keep their current value.
/// \param NewOpc   machine opcode used for the replacement node when the new
///                 immediate is neither all-zeros nor all-ones.
/// \returns false when nothing was changed (early-outs below, or no suitable
///          immediate was found); otherwise the result of TLO.CombineTo.
2618static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2619 const APInt &Demanded,
2621 unsigned NewOpc) {
2622 uint64_t OldImm = Imm, NewImm, Enc;
     // Mask has the low Size bits set. OrigMask keeps that full-width value
     // because Mask is narrowed inside the search loop.
2623 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2624
2625 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2626 // bimm64.
2627 if (Imm == 0 || Imm == Mask ||
2629 return false;
2630
2631 unsigned EltSize = Size;
2632 uint64_t DemandedBits = Demanded.getZExtValue();
2633
2634 // Clear bits that are not demanded.
2635 Imm &= DemandedBits;
2636
2637 while (true) {
2638 // The goal here is to set the non-demanded bits in a way that minimizes
2639 // the number of switching between 0 and 1. In order to achieve this goal,
2640 // we set the non-demanded bits to the value of the preceding demanded bits.
2641 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2642 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2643 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2644 // The final result is 0b11000011.
2645 uint64_t NonDemandedBits = ~DemandedBits;
2646 uint64_t InvertedImm = ~Imm & DemandedBits;
2647 uint64_t RotatedImm =
2648 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2649 NonDemandedBits;
2650 uint64_t Sum = RotatedImm + NonDemandedBits;
2651 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2652 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2653 NewImm = (Imm | Ones) & Mask;
2654
2655 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2656 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2657 // we halve the element size and continue the search.
2658 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2659 break;
2660
2661 // We cannot shrink the element size any further if it is 2-bits.
2662 if (EltSize == 2)
2663 return false;
2664
     // Retry with an element half as wide; Mask is narrowed so that it again
     // covers exactly the low EltSize bits.
2665 EltSize /= 2;
2666 Mask >>= EltSize;
2667 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2668
2669 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2670 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2671 return false;
2672
2673 // Merge the upper and lower halves of Imm and DemandedBits.
2674 Imm |= Hi;
2675 DemandedBits |= DemandedBitsHi;
2676 }
2677
2678 ++NumOptimizedImms;
2679
2680 // Replicate the element across the register width.
2681 while (EltSize < Size) {
2682 NewImm |= NewImm << EltSize;
2683 EltSize *= 2;
2684 }
2685
     // OldImm is otherwise only read by the asserts below.
2686 (void)OldImm;
2687 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2688 "demanded bits should never be altered");
2689 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2690
2691 // Create the new constant immediate node.
2692 EVT VT = Op.getValueType();
2693 SDLoc DL(Op);
2694 SDValue New;
2695
2696 // If the new constant immediate is all-zeros or all-ones, let the target
2697 // independent DAG combine optimize this node.
2698 if (NewImm == 0 || NewImm == OrigMask) {
2699 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2700 TLO.DAG.getConstant(NewImm, DL, VT));
2701 // Otherwise, create a machine node so that target independent DAG combine
2702 // doesn't undo this optimization.
2703 } else {
2705 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2706 New = SDValue(
2707 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2708 }
2709
2710 return TLO.CombineTo(Op, New);
2711}
2712
2714 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2715 TargetLoweringOpt &TLO) const {
2716 // Delay this optimization to as late as possible.
2717 if (!TLO.LegalOps)
2718 return false;
2719
2721 return false;
2722
2723 EVT VT = Op.getValueType();
2724 if (VT.isVector())
2725 return false;
2726
2727 unsigned Size = VT.getSizeInBits();
2728
2729 if (Size != 32 && Size != 64)
2730 return false;
2731
2732 // Exit early if we demand all bits.
2733 if (DemandedBits.isAllOnes())
2734 return false;
2735
2736 unsigned NewOpc;
2737 switch (Op.getOpcode()) {
2738 default:
2739 return false;
2740 case ISD::AND:
2741 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2742 break;
2743 case ISD::OR:
2744 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2745 break;
2746 case ISD::XOR:
2747 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2748 break;
2749 }
2750 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2751 if (!C)
2752 return false;
2753 uint64_t Imm = C->getZExtValue();
2754 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2755}
2756
2757/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2758/// Mask are known to be either zero or one and return them in Known.
2760 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2761 const SelectionDAG &DAG, unsigned Depth) const {
2762 switch (Op.getOpcode()) {
2763 default:
2764 break;
2765 case AArch64ISD::DUP: {
2766 SDValue SrcOp = Op.getOperand(0);
2767 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2768 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2769 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2770 "Expected DUP implicit truncation");
2771 Known = Known.trunc(Op.getScalarValueSizeInBits());
2772 }
2773 break;
2774 }
2775 case AArch64ISD::CSEL: {
2776 KnownBits Known2;
2777 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2778 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2779 Known = Known.intersectWith(Known2);
2780 break;
2781 }
2782 case AArch64ISD::CSNEG:
2783 case AArch64ISD::CSINC:
2784 case AArch64ISD::CSINV: {
2785 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2786 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2787
2788 // The result is either:
2789 // CSINC: KnownOp0 or KnownOp1 + 1
2790 // CSINV: KnownOp0 or ~KnownOp1
2791 // CSNEG: KnownOp0 or KnownOp1 * -1
2792 if (Op.getOpcode() == AArch64ISD::CSINC)
2793 KnownOp1 = KnownBits::add(
2794 KnownOp1,
2795 KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
2796 else if (Op.getOpcode() == AArch64ISD::CSINV)
2797 std::swap(KnownOp1.Zero, KnownOp1.One);
2798 else if (Op.getOpcode() == AArch64ISD::CSNEG)
2799 KnownOp1 =
2801 Op.getScalarValueSizeInBits())));
2802
2803 Known = KnownOp0.intersectWith(KnownOp1);
2804 break;
2805 }
2806 case AArch64ISD::BICi: {
2807 // Compute the bit cleared value.
2808 APInt Mask =
2809 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2810 .trunc(Known.getBitWidth());
2811 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2812 Known &= KnownBits::makeConstant(Mask);
2813 break;
2814 }
2815 case AArch64ISD::VLSHR: {
2816 KnownBits Known2;
2817 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2818 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2819 Known = KnownBits::lshr(Known, Known2);
2820 break;
2821 }
2822 case AArch64ISD::VASHR: {
2823 KnownBits Known2;
2824 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2825 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2826 Known = KnownBits::ashr(Known, Known2);
2827 break;
2828 }
2829 case AArch64ISD::VSHL: {
2830 KnownBits Known2;
2831 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2832 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2833 Known = KnownBits::shl(Known, Known2);
2834 break;
2835 }
2836 case AArch64ISD::MOVI: {
2838 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2839 break;
2840 }
2841 case AArch64ISD::MOVIshift: {
2843 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)
2844 << Op->getConstantOperandVal(1)));
2845 break;
2846 }
2847 case AArch64ISD::MOVImsl: {
2848 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2850 Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
2851 break;
2852 }
2853 case AArch64ISD::MOVIedit: {
2855 Known.getBitWidth(),
2856 AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
2857 break;
2858 }
2859 case AArch64ISD::MVNIshift: {
2861 APInt(Known.getBitWidth(),
2862 ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
2863 /*isSigned*/ false, /*implicitTrunc*/ true));
2864 break;
2865 }
2866 case AArch64ISD::MVNImsl: {
2867 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2869 APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
2870 /*isSigned*/ false, /*implicitTrunc*/ true));
2871 break;
2872 }
2873 case AArch64ISD::LOADgot:
2874 case AArch64ISD::ADDlow: {
2875 if (!Subtarget->isTargetILP32())
2876 break;
2877 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2878 Known.Zero = APInt::getHighBitsSet(64, 32);
2879 break;
2880 }
2881 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2882 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2883 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2884 break;
2885 }
2887 Intrinsic::ID IntID =
2888 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2889 switch (IntID) {
2890 default: return;
2891 case Intrinsic::aarch64_ldaxr:
2892 case Intrinsic::aarch64_ldxr: {
2893 unsigned BitWidth = Known.getBitWidth();
2894 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2895 unsigned MemBits = VT.getScalarSizeInBits();
2896 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2897 return;
2898 }
2899 }
2900 break;
2901 }
2903 case ISD::INTRINSIC_VOID: {
2904 unsigned IntNo = Op.getConstantOperandVal(0);
2905 switch (IntNo) {
2906 default:
2907 break;
2908 case Intrinsic::aarch64_neon_uaddlv: {
2909 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2910 unsigned BitWidth = Known.getBitWidth();
2911 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2912 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2913 assert(BitWidth >= Bound && "Unexpected width!");
2915 Known.Zero |= Mask;
2916 }
2917 break;
2918 }
2919 case Intrinsic::aarch64_neon_umaxv:
2920 case Intrinsic::aarch64_neon_uminv: {
2921 // Figure out the datatype of the vector operand. The UMINV instruction
2922 // will zero extend the result, so we can mark as known zero all the
2923 // bits larger than the element datatype. 32-bit or larger doesn't need
2924 // this as those are legal types and will be handled by isel directly.
2925 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2926 unsigned BitWidth = Known.getBitWidth();
2927 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2928 assert(BitWidth >= 8 && "Unexpected width!");
2930 Known.Zero |= Mask;
2931 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2932 assert(BitWidth >= 16 && "Unexpected width!");
2934 Known.Zero |= Mask;
2935 }
2936 break;
2937 } break;
2938 }
2939 }
2940 }
2941}
2942
// NOTE(review): the line carrying this function's return type and name is
// missing from this listing; only the parameter list and body are visible.
// Reports a count of known sign bits of Op for the target opcodes handled
// below, and the conservative answer 1 for every other opcode.
2944 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2945 unsigned Depth) const {
2946 EVT VT = Op.getValueType();
2947 unsigned VTBits = VT.getScalarSizeInBits();
2948 unsigned Opcode = Op.getOpcode();
2949 switch (Opcode) {
2950 case AArch64ISD::FCMEQ:
2951 case AArch64ISD::FCMGE:
2952 case AArch64ISD::FCMGT:
2953 // Compares return either 0 or all-ones
2954 return VTBits;
2955 case AArch64ISD::VASHR: {
// The sign-bit count of operand 0 is increased by the constant in operand 1
// and the sum is clamped to the scalar width.
2956 unsigned Tmp =
2957 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2958 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2959 }
2960 }
2961
2962 return 1;
2963}
2964
// NOTE(review): the line carrying this function's return type and name is
// missing from this listing. The visible body ignores its unnamed EVT
// parameter and always returns MVT::i64.
2966 EVT) const {
2967 return MVT::i64;
2968}
2969
// NOTE(review): the line carrying this function's return type and name is
// missing from this listing.
// Returns whether a memory access of type VT with the given Alignment is
// permitted. When Fast is non-null and the access is permitted by the final
// path below, *Fast is also written to say whether the access is treated as
// fast. AddrSpace and Flags are not read in the visible body. The scalable
// vector early return does not write *Fast.
2971 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2972 unsigned *Fast) const {
2973
2974 // Allow SVE loads/stores where the alignment >= the size of the element type,
2975 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2976 // for stores that come from IR, only require element-size alignment (even if
2977 // unaligned accesses are disabled). Without this, these will be forced to
2978 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2979 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2980 if (VT.isScalableVector()) {
2981 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2982 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2983 return true;
2984 }
2985
2986 if (Subtarget->requiresStrictAlign())
2987 return false;
2988
2989 if (Fast) {
2990 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2991 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2992 // See comments in performSTORECombine() for more details about
2993 // these conditions.
2994
2995 // Code that uses clang vector extensions can mark that it
2996 // wants unaligned accesses to be treated as fast by
2997 // underspecifying alignment to be 1 or 2.
2998 Alignment <= 2 ||
2999
3000 // Disregard v2i64. Memcpy lowering produces those and splitting
3001 // them regresses performance on micro-benchmarks and olden/bh.
3002 VT == MVT::v2i64;
3003 }
3004 return true;
3005}
3006
3007// Same as above but handling LLTs instead.
// NOTE(review): the line carrying this function's return type and name is
// missing from this listing.
// Returns false when the subtarget requires strict alignment and true
// otherwise. When Fast is non-null, *Fast is set from the same conditions as
// the EVT form, using Ty.getSizeInBytes() and LLT::fixed_vector(2, 64).
// There is no scalable-vector early return here; AddrSpace and Flags are not
// read in the visible body.
3009 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
3010 unsigned *Fast) const {
3011 if (Subtarget->requiresStrictAlign())
3012 return false;
3013
3014 if (Fast) {
3015 // Some CPUs are fine with unaligned stores except for 128-bit ones.
3016 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
3017 Ty.getSizeInBytes() != 16 ||
3018 // See comments in performSTORECombine() for more details about
3019 // these conditions.
3020
3021 // Code that uses clang vector extensions can mark that it
3022 // wants unaligned accesses to be treated as fast by
3023 // underspecifying alignment to be 1 or 2.
3024 Alignment <= 2 ||
3025
3026 // Disregard v2i64. Memcpy lowering produces those and splitting
3027 // them regresses performance on micro-benchmarks and olden/bh.
3028 Ty == LLT::fixed_vector(2, 64);
3029 }
3030 return true;
3031}
3032
// NOTE(review): the line carrying this function's return type and name is
// missing from this listing.
// Thin forwarder: passes all three arguments unchanged to
// AArch64::createFastISel and returns its result.
3034 FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo,
3035 const LibcallLoweringInfo *libcallLowering) const {
3036 return AArch64::createFastISel(funcInfo, libInfo, libcallLowering);
3037}
3038
// NOTE(review): the lines carrying this function's return type, name and
// first parameter (the MachineInstr MI used below) are missing from this
// listing.
// Expands the pseudo MI into a conditional branch, two new blocks and a PHI,
// erases MI, and returns EndBB, which receives everything that followed MI in
// the original block.
3041 MachineBasicBlock *MBB) const {
3042 // We materialise the F128CSEL pseudo-instruction as some control flow and a
3043 // phi node:
3044
3045 // OrigBB:
3046 // [... previous instrs leading to comparison ...]
3047 // b.ne TrueBB
3048 // b EndBB
3049 // TrueBB:
3050 // ; Fallthrough
3051 // EndBB:
3052 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
3053
3054 MachineFunction *MF = MBB->getParent();
3055 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3056 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
3057 DebugLoc DL = MI.getDebugLoc();
3058 MachineFunction::iterator It = ++MBB->getIterator();
3059
// Pseudo operand layout as read here: 0 = destination, 1 = value if true,
// 2 = value if false, 3 = condition code immediate, 4 = an operand whose kill
// flag is consulted (presumably the NZCV use -- confirm).
3060 Register DestReg = MI.getOperand(0).getReg();
3061 Register IfTrueReg = MI.getOperand(1).getReg();
3062 Register IfFalseReg = MI.getOperand(2).getReg();
3063 unsigned CondCode = MI.getOperand(3).getImm();
3064 bool NZCVKilled = MI.getOperand(4).isKill();
3065
// Both new blocks are inserted directly after MBB, TrueBB first.
3066 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
3067 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
3068 MF->insert(It, TrueBB);
3069 MF->insert(It, EndBB);
3070
3071 // Transfer rest of current basic-block to EndBB
3072 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
3073 MBB->end());
// NOTE(review): listing line 3074 is missing here; MBB's original successors
// are not handed over to EndBB anywhere in the visible text -- confirm against
// upstream.
3075
3076 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
3077 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
3078 MBB->addSuccessor(TrueBB);
3079 MBB->addSuccessor(EndBB);
3080
3081 // TrueBB falls through to the end.
3082 TrueBB->addSuccessor(EndBB);
3083
// When the pseudo did not kill its operand 4, NZCV is recorded as live-in to
// both new blocks.
3084 if (!NZCVKilled) {
3085 TrueBB->addLiveIn(AArch64::NZCV);
3086 EndBB->addLiveIn(AArch64::NZCV);
3087 }
3088
3089 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
3090 .addReg(IfTrueReg)
3091 .addMBB(TrueBB)
3092 .addReg(IfFalseReg)
3093 .addMBB(MBB);
3094
3095 MI.eraseFromParent();
3096 return EndBB;
3097}
3098
3106
// NOTE(review): the lines carrying this function's return type, name and
// first parameter (the MachineInstr MI used below) are missing from this
// listing.
// Replaces the pseudo MI with the sequence produced by
// TII.probedStackAlloc for the register in operand 0, erases MI, and returns
// the parent block of NextInst.
3109 MachineBasicBlock *MBB) const {
3110 MachineFunction &MF = *MBB->getParent();
3111 MachineBasicBlock::iterator MBBI = MI.getIterator();
3112 const AArch64InstrInfo &TII =
3113 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
3114 Register TargetReg = MI.getOperand(0).getReg();
// NOTE(review): listing line 3115 is missing; it presumably declares NextInst
// and the next line is the initialiser that completes it -- confirm.
3116 TII.probedStackAlloc(MBBI, TargetReg, false);
3117
3118 MI.eraseFromParent();
3119 return NextInst->getParent();
3120}
3121
// NOTE(review): the lines carrying this function's return type, name and
// first parameter (the MachineInstr MI used below) are missing from this
// listing.
// Expands the pseudo MI into: RDVL #1, ADDSVL with immediate -1 on that
// value, then CBZ on the result to PassBB; the fall-through block TrapBB
// holds a BRK #1. The remainder of the original block moves to PassBB, which
// is returned.
3124 MachineBasicBlock *MBB) const {
3125 MachineFunction *MF = MBB->getParent();
3126 MachineRegisterInfo &MRI = MF->getRegInfo();
3127
3128 const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass;
3129 const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass;
3130
3131 Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR);
3132 Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src
3133 Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR);
3134 Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst
3135
3136 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3137 DebugLoc DL = MI.getDebugLoc();
3138
3139 // RDVL requires GPR64, ADDSVL requires GPR64sp
3140 // We need to insert COPY instructions, these will later be removed by the
3141 // RegisterCoalescer
3142 BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1);
3143 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp)
3144 .addReg(RegVL_GPR);
3145
3146 BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp)
3147 .addReg(RegVL_GPRsp)
3148 .addImm(-1);
3149 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR)
3150 .addReg(RegSVL_GPRsp);
3151
// Both new blocks are inserted directly after MBB, TrapBB first.
3152 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
3153 MachineFunction::iterator It = ++MBB->getIterator();
3154 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
3155 MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
3156 MF->insert(It, TrapBB);
3157 MF->insert(It, PassBB);
3158
3159 // Continue if vector lengths match
3160 BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
3161 .addReg(RegSVL_GPR)
3162 .addMBB(PassBB);
3163
3164 // Transfer rest of current BB to PassBB
3165 PassBB->splice(PassBB->begin(), MBB,
3166 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
// NOTE(review): listing line 3167 is missing here; MBB's original successors
// are not handed over to PassBB anywhere in the visible text -- confirm
// against upstream.
3168
3169 // Trap if vector lengths mismatch
3170 BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
3171
3172 MBB->addSuccessor(TrapBB);
3173 MBB->addSuccessor(PassBB);
3174
3175 MI.eraseFromParent();
3176 return PassBB;
3177}
3178
// Replaces the tile-load pseudo MI with the real instruction Opc, inserted
// before MI in BB. The pseudo's operand 0 is an immediate tile index that is
// added to BaseReg to form the defined tile register; operands 1-5 are copied
// across unchanged. MI is erased and BB is returned.
// NOTE(review): the line with the return type and the line declaring the MI
// parameter are missing from this listing.
3180AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
3182 MachineBasicBlock *BB) const {
3183 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3184 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3185
3186 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3187 MIB.add(MI.getOperand(1)); // slice index register
3188 MIB.add(MI.getOperand(2)); // slice index offset
3189 MIB.add(MI.getOperand(3)); // pg
3190 MIB.add(MI.getOperand(4)); // base
3191 MIB.add(MI.getOperand(5)); // offset
3192
3193 MI.eraseFromParent(); // The pseudo is gone now.
3194 return BB;
3195}
3196
// NOTE(review): the signature lines (return type, name, parameters MI and BB)
// and listing line 3200, which presumably declares MIB, are missing from this
// listing.
// Replaces the pseudo MI with an LDR_ZA that defines ZA. The pseudo's
// operand 1 is emitted twice: once as the vector select offset and once as
// the memory offset. MI is erased and BB is returned.
3199 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3201 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3202
3203 MIB.addReg(AArch64::ZA, RegState::Define);
3204 MIB.add(MI.getOperand(0)); // Vector select register
3205 MIB.add(MI.getOperand(1)); // Vector select offset
3206 MIB.add(MI.getOperand(2)); // Base
3207 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3208
3209 MI.eraseFromParent(); // The pseudo is gone now.
3210 return BB;
3211}
3212
// NOTE(review): the leading signature lines (return type, name, parameters MI
// and BB) and listing line 3218, which presumably declares MIB, are missing
// from this listing.
// Replaces the pseudo MI with instruction Opcode. Operand 0's register is
// added as a def when Op0IsDef is true and as a plain use otherwise; all
// remaining operands are copied in order. MI is erased and BB is returned.
3215 unsigned Opcode,
3216 bool Op0IsDef) const {
3217 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3219
3220 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3221 .addReg(MI.getOperand(0).getReg(), getDefRegState(Op0IsDef));
3222 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3223 MIB.add(MI.getOperand(I));
3224
3225 MI.eraseFromParent(); // The pseudo is gone now.
3226 return BB;
3227}
3228
// Replaces the ZA pseudo MI with the real instruction Opc, inserted before MI
// in BB, then erases MI and returns BB.
// BaseReg selects the form: AArch64::ZA means the whole array is used, any
// other value is the first register of a tile group and the pseudo carries an
// immediate tile index that is added to it. In both forms the ZA or tile
// register is added twice, first as a def and then as a use.
// NOTE(review): the line with the return type and the line declaring the MI
// parameter are missing from this listing.
3230AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3232 MachineBasicBlock *BB) const {
3233 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3234 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
// Index of the next pseudo operand that has not yet been consumed.
3235 unsigned StartIdx = 0;
3236
3237 bool HasTile = BaseReg != AArch64::ZA;
// For a tiled form, a register in operand 0 is treated as an output ZPR that
// precedes the tile index.
3238 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3239 if (HasZPROut) {
3240 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3241 ++StartIdx;
3242 }
3243 if (HasTile) {
3244 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3245 RegState::Define); // Output ZA Tile
3246 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3247 StartIdx++;
3248 } else {
3249 // Avoids all instructions with mnemonic za.<sz>[Reg, Imm,
3250 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3251 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3252 ++StartIdx;
3253 }
3254 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3255 }
// Everything not consumed above is copied across unchanged.
3256 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3257 MIB.add(MI.getOperand(I));
3258
3259 MI.eraseFromParent(); // The pseudo is gone now.
3260 return BB;
3261}
3262
// NOTE(review): the signature lines (return type, name, parameters MI and BB)
// and listing line 3266, which presumably declares MIB, are missing from this
// listing.
// Replaces the pseudo MI with ZERO_M carrying the same mask operand, and adds
// an implicit def of AArch64::ZAD0 + I for every set bit I (0-7) of the mask.
// MI is erased and BB is returned.
3265 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3267 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3268 MIB.add(MI.getOperand(0)); // Mask
3269
3270 unsigned Mask = MI.getOperand(0).getImm();
3271 for (unsigned I = 0; I < 8; I++) {
3272 if (Mask & (1 << I))
3273 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3274 }
3275
3276 MI.eraseFromParent(); // The pseudo is gone now.
3277 return BB;
3278}
3279
// NOTE(review): the lines carrying this function's return type, name and
// first parameter (the MachineInstr MI used below) are missing from this
// listing.
// Materialises the pseudo's result register (operand 0) in one of three ways:
//  - the result has no uses: emit nothing;
//  - the subtarget has SME: read SVCR with MRS;
//  - otherwise: call the RTLIB::SMEABI_SME_STATE libcall with BL and copy X0.
// MI is erased in every case and BB is returned.
3282 MachineBasicBlock *BB) const {
3283 MachineFunction *MF = BB->getParent();
3284 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3285 const DebugLoc &DL = MI.getDebugLoc();
3286 Register ResultReg = MI.getOperand(0).getReg();
3287 if (MF->getRegInfo().use_empty(ResultReg)) {
3288 // Nothing to do. Pseudo erased below.
3289 } else if (Subtarget->hasSME()) {
3290 BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg)
3291 .addImm(AArch64SysReg::SVCR)
3292 .addReg(AArch64::VG, RegState::Implicit);
3293 } else {
3294 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
3295 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3296 BuildMI(*BB, MI, DL, TII->get(AArch64::BL))
// NOTE(review): listing line 3297 is missing here; it presumably adds the
// call target for LC to the BL -- confirm against upstream.
3298 .addReg(AArch64::X0, RegState::ImplicitDefine)
3299 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3300 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg)
3301 .addReg(AArch64::X0);
3302 }
3303 MI.eraseFromParent();
3304 return BB;
3305}
3306
3307// Helper function to find the instruction that defined a virtual register.
3308// If unable to find such instruction, returns nullptr.
// Walks backwards through COPY and SUBREG_TO_REG definitions, following their
// operand 1. Returns the first defining instruction that is neither of those,
// or the COPY itself when it copies from a physical register. nullptr is
// returned only when the register being followed is not virtual (on entry, or
// after stepping through a SUBREG_TO_REG).
// NOTE(review): the first signature line (return type, name, first
// parameter -- MRI is used below) and listing line 3312, which presumably
// declares DefMI, are missing from this listing.
3310 Register Reg) {
3311 while (Reg.isVirtual()) {
3313 assert(DefMI && "Virtual register definition not found");
3314 unsigned Opcode = DefMI->getOpcode();
3315
3316 if (Opcode == AArch64::COPY) {
3317 Reg = DefMI->getOperand(1).getReg();
3318 // Vreg is defined by copying from physreg.
3319 if (Reg.isPhysical())
3320 return DefMI;
3321 continue;
3322 }
3323 if (Opcode == AArch64::SUBREG_TO_REG) {
3324 Reg = DefMI->getOperand(1).getReg();
3325 continue;
3326 }
3327
3328 return DefMI;
3329 }
3330 return nullptr;
3331}
3332
// NOTE(review): the leading signature lines (return type, name, and the
// parameters MI, BB and IntDiscOp used below) are missing from this listing.
// Rewrites the two discriminator operands of MI in place. IntDiscOp must hold
// 0 on entry. The defining instruction of the address discriminator is
// inspected and, when it matches one of the patterns below, the value is
// split into an integer part and a (possibly absent) register part. The
// register part is then copied into AddrDiscRC if its class differs.
3335 MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
3336 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3337 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3338 const DebugLoc &DL = MI.getDebugLoc();
3339
3340 Register AddrDisc = AddrDiscOp.getReg();
3341 int64_t IntDisc = IntDiscOp.getImm();
3342 assert(IntDisc == 0 && "Blend components are already expanded");
3343
3344 const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
3345 if (DiscMI) {
3346 switch (DiscMI->getOpcode()) {
3347 case AArch64::MOVKXi:
3348 // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
3349 // #imm should be an immediate and not a global symbol, for example.
3350 if (DiscMI->getOperand(2).isImm() &&
3351 DiscMI->getOperand(3).getImm() == 48) {
3352 AddrDisc = DiscMI->getOperand(1).getReg();
3353 IntDisc = DiscMI->getOperand(2).getImm();
3354 }
3355 break;
3356 case AArch64::MOVi32imm:
3357 case AArch64::MOVi64imm:
3358 // Small immediate integer constant passed via VReg.
3359 if (DiscMI->getOperand(1).isImm() &&
3360 isUInt<16>(DiscMI->getOperand(1).getImm())) {
3361 AddrDisc = AArch64::NoRegister;
3362 IntDisc = DiscMI->getOperand(1).getImm();
3363 }
3364 break;
3365 }
3366 }
3367
3368 // For uniformity, always use NoRegister, as XZR is not necessarily contained
3369 // in the requested register class.
3370 if (AddrDisc == AArch64::XZR)
3371 AddrDisc = AArch64::NoRegister;
3372
3373 // Make sure AddrDisc operand respects the register class imposed by MI.
3374 if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
3375 Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
3376 BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
3377 AddrDisc = TmpReg;
3378 }
3379
3380 AddrDiscOp.setReg(AddrDisc);
3381 IntDiscOp.setImm(IntDisc);
3382}
3383
// NOTE(review): the line carrying this function's return type and name is
// missing from this listing.
// Dispatcher for pseudo-instructions that need custom insertion. SME pseudos
// that AArch64::getSMEPseudoMap maps to a real opcode are handled first,
// keyed on the matrix-type bits of the pseudo's TSFlags. Everything else goes
// through the opcode switch. An opcode with no case hits llvm_unreachable.
3385 MachineInstr &MI, MachineBasicBlock *BB) const {
3386
3387 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3388 if (SMEOrigInstr != -1) {
3389 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3390 uint64_t SMEMatrixType =
3391 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
// NOTE(review): the case labels of this switch (listing lines 3393, 3395,
// 3397, 3399, 3401 and 3403) are missing; only the return statements remain.
// If no case matches, control falls through to the opcode switch below.
3392 switch (SMEMatrixType) {
3394 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3396 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3398 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3400 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3402 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3404 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3405 }
3406 }
3407
3408 switch (MI.getOpcode()) {
3409 default:
3410#ifndef NDEBUG
3411 MI.dump();
3412#endif
3413 llvm_unreachable("Unexpected instruction for custom inserter!");
3414 case AArch64::EntryPStateSM:
3415 return EmitEntryPStateSM(MI, BB);
3416 case AArch64::F128CSEL:
3417 return EmitF128CSEL(MI, BB);
3418 case TargetOpcode::STATEPOINT:
3419 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3420 // while bl call instruction (where statepoint will be lowered at the end)
3421 // has implicit def. This def is early-clobber as it will be set at
3422 // the moment of the call and earlier than any use is read.
3423 // Add this implicit dead def here as a workaround.
3424 MI.addOperand(*MI.getMF(),
// NOTE(review): listing line 3425 is missing here; it presumably opens the
// call that builds the register operand described by the arguments below.
3426 AArch64::LR, /*isDef*/ true,
3427 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3428 /*isUndef*/ false, /*isEarlyClobber*/ true));
3429 [[fallthrough]];
3430 case TargetOpcode::STACKMAP:
3431 case TargetOpcode::PATCHPOINT:
3432 return emitPatchPoint(MI, BB);
3433
// These two are returned untouched: nothing is inserted here.
3434 case TargetOpcode::PATCHABLE_EVENT_CALL:
3435 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3436 return BB;
3437
3438 case AArch64::CATCHRET:
3439 return EmitLoweredCatchRet(MI, BB);
3440
3441 case AArch64::PROBED_STACKALLOC_DYN:
3442 return EmitDynamicProbedAlloc(MI, BB);
3443
3444 case AArch64::CHECK_MATCHING_VL_PSEUDO:
3445 return EmitCheckMatchingVL(MI, BB);
3446
// Tile loads: the pseudo's _H/_V and element-size suffixes select the real
// opcode, and the element size selects the first tile register of the group.
3447 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3448 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3449 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3450 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3451 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3452 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3453 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3454 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3455 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3456 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3457 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3458 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3459 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3460 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3461 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3462 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3463 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3464 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3465 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3466 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3467 case AArch64::LDR_ZA_PSEUDO:
3468 return EmitFill(MI, BB);
3469 case AArch64::LDR_TX_PSEUDO:
3470 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3471 case AArch64::STR_TX_PSEUDO:
3472 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3473 case AArch64::ZERO_M_PSEUDO:
3474 return EmitZero(MI, BB);
3475 case AArch64::ZERO_T_PSEUDO:
3476 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3477 case AArch64::MOVT_TIZ_PSEUDO:
3478 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3479
// PAC is not replaced; only its discriminator operands (3 and 4) are
// rewritten in place.
3480 case AArch64::PAC:
3481 fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
3482 &AArch64::GPR64noipRegClass);
3483 return BB;
3484 }
3485}
3486
3487//===----------------------------------------------------------------------===//
3488// AArch64 Lowering private implementation.
3489//===----------------------------------------------------------------------===//
3490
3491//===----------------------------------------------------------------------===//
3492// Lowering Code
3493//===----------------------------------------------------------------------===//
3494
3495// Forward declarations of SVE fixed length lowering helpers
3500 SelectionDAG &DAG);
3503 EVT VT);
3505
3506/// isZerosVector - Check whether SDNode N is a zero-filled vector.
// Any chain of BITCAST nodes is looked through first. After that, the visible
// checks accept an AArch64ISD::DUP whose operand 0 is a null integer or null
// floating-point constant.
3507static bool isZerosVector(const SDNode *N) {
3508 // Look through a bit convert.
3509 while (N->getOpcode() == ISD::BITCAST)
3510 N = N->getOperand(0).getNode();
3511
// NOTE(review): listing line 3512 is missing here; it presumably holds the
// condition guarding the return below -- confirm against upstream.
3513 return true;
3514
3515 if (N->getOpcode() != AArch64ISD::DUP)
3516 return false;
3517
3518 auto Opnd0 = N->getOperand(0);
3519 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3520}
3521
// Returns true when V is accepted by isOneOrOneSplat, or when V is an
// AArch64ISD::DUP whose operand 0 is the constant one.
3522static bool isOneVector(SDValue V) {
3523 return isOneOrOneSplat(V) ||
3524 (V.getOpcode() == AArch64ISD::DUP && isOneConstant(V.getOperand(0)));
3525}
3526
3527/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3528/// CC
// Signed comparisons map to GT/LE (and the two missing cases), unsigned ones
// to HI/HS/LO/LS, equality to EQ/NE. Any other code hits llvm_unreachable.
// NOTE(review): the first signature line (return type, name, the CC
// parameter) is missing from this listing, so how the RHS parameter is used
// cannot be seen here.
3530 SDValue RHS = {}) {
3531 switch (CC) {
3532 default:
3533 llvm_unreachable("Unknown condition code!");
3534 case ISD::SETNE:
3535 return AArch64CC::NE;
3536 case ISD::SETEQ:
3537 return AArch64CC::EQ;
3538 case ISD::SETGT:
3539 return AArch64CC::GT;
3540 case ISD::SETGE:
// NOTE(review): listing lines 3541 and 3543 (the results for SETGE and SETLT)
// are missing; as shown, both labels appear to fall into the SETLE return,
// which is an artifact of the listing rather than the intended mapping.
3542 case ISD::SETLT:
3544 case ISD::SETLE:
3545 return AArch64CC::LE;
3546 case ISD::SETUGT:
3547 return AArch64CC::HI;
3548 case ISD::SETUGE:
3549 return AArch64CC::HS;
3550 case ISD::SETULT:
3551 return AArch64CC::LO;
3552 case ISD::SETULE:
3553 return AArch64CC::LS;
3554 }
3555}
3556
3557/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
// Writes the result through CondCode. CondCode2 is set to AArch64CC::AL
// unless a second condition is needed, which is the case only for SETONE
// (MI, GT) and SETUEQ (EQ, VS). Codes with no case hit llvm_unreachable.
// NOTE(review): the first signature line (return type, name, the CC
// parameter) is missing from this listing.
3559 AArch64CC::CondCode &CondCode,
3560 AArch64CC::CondCode &CondCode2) {
3561 CondCode2 = AArch64CC::AL;
3562 switch (CC) {
3563 default:
3564 llvm_unreachable("Unknown FP condition!");
3565 case ISD::SETEQ:
3566 case ISD::SETOEQ:
3567 CondCode = AArch64CC::EQ;
3568 break;
3569 case ISD::SETGT:
3570 case ISD::SETOGT:
3571 CondCode = AArch64CC::GT;
3572 break;
3573 case ISD::SETGE:
3574 case ISD::SETOGE:
3575 CondCode = AArch64CC::GE;
3576 break;
3577 case ISD::SETOLT:
3578 CondCode = AArch64CC::MI;
3579 break;
3580 case ISD::SETOLE:
3581 CondCode = AArch64CC::LS;
3582 break;
3583 case ISD::SETONE:
3584 CondCode = AArch64CC::MI;
3585 CondCode2 = AArch64CC::GT;
3586 break;
3587 case ISD::SETO:
3588 CondCode = AArch64CC::VC;
3589 break;
3590 case ISD::SETUO:
3591 CondCode = AArch64CC::VS;
3592 break;
3593 case ISD::SETUEQ:
3594 CondCode = AArch64CC::EQ;
3595 CondCode2 = AArch64CC::VS;
3596 break;
3597 case ISD::SETUGT:
3598 CondCode = AArch64CC::HI;
3599 break;
3600 case ISD::SETUGE:
3601 CondCode = AArch64CC::PL;
3602 break;
3603 case ISD::SETLT:
3604 case ISD::SETULT:
3605 CondCode = AArch64CC::LT;
3606 break;
3607 case ISD::SETLE:
3608 case ISD::SETULE:
3609 CondCode = AArch64CC::LE;
3610 break;
3611 case ISD::SETNE:
3612 case ISD::SETUNE:
3613 CondCode = AArch64CC::NE;
3614 break;
3615 }
3616}
3617
3618/// Convert a DAG fp condition code to an AArch64 CC.
3619/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3620/// should be AND'ed instead of OR'ed.
// Only SETONE and SETUEQ produce a second condition; every other code is
// delegated to changeFPCCToAArch64CC, which must leave CondCode2 as AL.
// NOTE(review): the first signature line (return type, name, the CC
// parameter) is missing from this listing.
3622 AArch64CC::CondCode &CondCode,
3623 AArch64CC::CondCode &CondCode2) {
3624 CondCode2 = AArch64CC::AL;
3625 switch (CC) {
3626 default:
3627 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3628 assert(CondCode2 == AArch64CC::AL);
3629 break;
3630 case ISD::SETONE:
3631 // (a one b)
3632 // == ((a olt b) || (a ogt b))
3633 // == ((a ord b) && (a une b))
3634 CondCode = AArch64CC::VC;
3635 CondCode2 = AArch64CC::NE;
3636 break;
3637 case ISD::SETUEQ:
3638 // (a ueq b)
3639 // == ((a uno b) || (a oeq b))
3640 // == ((a ule b) && (a uge b))
3641 CondCode = AArch64CC::PL;
3642 CondCode2 = AArch64CC::LE;
3643 break;
3644 }
3645}
3646
3647/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3648/// CC usable with the vector instructions. Fewer operations are available
3649/// without a real NZCV register, so we have to use less efficient combinations
3650/// to get the same effect.
// Invert is written on every path: false by default, true for SETUO and for
// the unordered codes listed in the last case group, whose result is computed
// from the inverse condition.
// NOTE(review): the first signature line (return type, name, the CC
// parameter) is missing from this listing.
3652 AArch64CC::CondCode &CondCode,
3653 AArch64CC::CondCode &CondCode2,
3654 bool &Invert) {
3655 Invert = false;
3656 switch (CC) {
3657 default:
3658 // Mostly the scalar mappings work fine.
3659 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3660 break;
3661 case ISD::SETUO:
3662 Invert = true;
3663 [[fallthrough]];
3664 case ISD::SETO:
3665 CondCode = AArch64CC::MI;
3666 CondCode2 = AArch64CC::GE;
3667 break;
// SETLE and SETLT are given the same codes that changeFPCCToAArch64CC assigns
// to SETOLE and SETOLT, rather than its own SETLE/SETLT results.
3668 case ISD::SETLE:
3669 CondCode = AArch64CC::LS;
3670 CondCode2 = AArch64CC::AL;
3671 break;
3672 case ISD::SETLT:
3673 CondCode = AArch64CC::MI;
3674 CondCode2 = AArch64CC::AL;
3675 break;
3676 case ISD::SETUEQ:
3677 case ISD::SETULT:
3678 case ISD::SETULE:
3679 case ISD::SETUGT:
3680 case ISD::SETUGE:
3681 // All of the compare-mask comparisons are ordered, but we can switch
3682 // between the two by a double inversion. E.g. ULE == !OGT.
3683 Invert = true;
3684 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3685 CondCode, CondCode2);
3686 break;
3687 }
3688}
3689
3690/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
// Returns CC as a DAG constant of type CondCodeVT with an empty SDLoc.
// NOTE(review): the signature line (return type, name, parameters DAG and CC)
// is missing from this listing.
3692 // TODO: Should be TargetConstant (need to s/imm/timm in patterns).
3693 return DAG.getConstant(CC, SDLoc(), CondCodeVT);
3694}
3695
// NOTE(review): the signature line (return type, name, parameter C) is
// missing from this listing.
// C is legal when it fits in the low 12 bits, or when its low 12 bits are
// zero and nothing is set at bit 24 or above. The verdict is also printed
// under LLVM_DEBUG.
3697 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3698 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3699 LLVM_DEBUG(dbgs() << "Is imm " << C
3700 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3701 return IsLegal;
3702}
3703
// NOTE(review): the signature line (return type, name, parameter C) is
// missing from this listing.
// Tests the absolute value of C with isLegalArithImmed.
3705 // Works for negative immediates too, as it can be written as an ADDS
3706 // instruction with a negated immediate.
3707 return isLegalArithImmed(C.abs().getZExtValue());
3708}
3709
// NOTE(review): the signature line (return type, name, parameter C) and
// listing line 3712, which presumably declares the Insn container, are
// missing from this listing.
// Returns how many entries AArch64_IMM::expandMOVImm places in Insn when
// asked to expand the value of C with a bit size of 32.
3711 uint64_t Imm = C.getZExtValue();
3713 AArch64_IMM::expandMOVImm(Imm, 32, Insn);
3714 return Insn.size();
3715}
3716
// NOTE(review): the signature line (return type, name, parameters Op and DAG)
// is missing from this listing.
// Returns true when Op carries the no-signed-wrap flag, or when the known
// bits of Op's operand 1 show that its smallest possible signed value is not
// the minimum signed value.
3718 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3719 if (Op->getFlags().hasNoSignedWrap())
3720 return true;
3721
3722 // We can still figure out if the second operand is safe to use
3723 // in a CMN instruction by checking if it is known to be not the minimum
3724 // signed value. If it is not, then we can safely use CMN.
3725 // Note: We can eventually remove this check and simply rely on
3726 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3727 // consistently sets them appropriately when making said nodes.
3728
3729 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
3730 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3731}
3732
3733// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
3734// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3735// can be set differently by this operation. It comes down to whether
3736// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3737// everything is fine. If not then the optimization is wrong. Thus general
3738// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3739//
3740// So, finally, the only LLVM-native comparisons that don't mention C or V
3741// are the ones that aren't unsigned comparisons. They're the only ones we can
3742// safely use CMN for in the absence of information about op2.
// As implemented: Op must be (sub 0, x). Equality comparisons always qualify,
// unsigned ones only when x is known to be non-zero, and signed ones only
// when isSafeSignedCMN accepts Op.
// NOTE(review): the signature line (return type, name, parameters Op, CC and
// DAG) is missing from this listing.
3744 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3745 (isIntEqualitySetCC(CC) ||
3746 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3747 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
3748}
3749
// NOTE(review): the first signature line (return type, name, and the
// parameters LHS, RHS and DL used below) is missing from this listing.
// Builds a chained floating-point compare node producing {FlagsVT,
// MVT::Other}: STRICT_FCMPE when IsSignaling is true, STRICT_FCMP otherwise.
// f128 operands are not accepted.
3751 SelectionDAG &DAG, SDValue Chain,
3752 bool IsSignaling) {
3753 EVT VT = LHS.getValueType();
3754 assert(VT != MVT::f128);
3755
3756 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3757
// f16 without full FP16 support, and bf16, are extended to f32 first. The
// two extends are threaded on the chain (LHS, then RHS) and the compare is
// chained after the RHS extend.
3758 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3759 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3760 {Chain, LHS});
3761 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3762 {LHS.getValue(1), RHS});
3763 Chain = RHS.getValue(1);
3764 }
3765 unsigned Opcode =
3766 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3767 return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
3768}
3769
// NOTE(review): the first signature line (return type, name, and the
// parameters LHS, RHS and CC used below) is missing from this listing.
// Returns the flags value of a comparison of LHS with RHS under CC.
// Floating-point operands produce an FCMP (after extending f16 without full
// FP16 support, or bf16, to f32). Integer operands produce the flags result
// of SUBS, ADDS or ANDS as selected below.
3771 const SDLoc &DL, SelectionDAG &DAG) {
3772 EVT VT = LHS.getValueType();
3773 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3774
3775 if (VT.isFloatingPoint()) {
3776 assert(VT != MVT::f128);
3777 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3778 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3779 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3780 }
3781 return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
3782 }
3783
3784 // The CMP instruction is just an alias for SUBS, and representing it as
3785 // SUBS means that it's possible to get CSE with subtract operations.
3786 // A later phase can perform the optimization of setting the destination
3787 // register to WZR/XZR if it ends up being unused.
3788 unsigned Opcode = AArch64ISD::SUBS;
3789
3790 if (isCMN(RHS, CC, DAG)) {
3791 // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
3792 Opcode = AArch64ISD::ADDS;
3793 RHS = RHS.getOperand(1);
3794 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3795 isIntEqualitySetCC(CC)) {
3796 // As we are looking for EQ/NE compares, the operands can be commuted ; can
3797 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
3798 Opcode = AArch64ISD::ADDS;
3799 LHS = LHS.getOperand(1);
3800 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3801 if (LHS.getOpcode() == ISD::AND) {
3802 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3803 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3804 // of the signed comparisons.
3805 const SDValue ANDSNode =
3806 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
3807 LHS.getOperand(0), LHS.getOperand(1));
3808 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3809 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3810 return ANDSNode.getValue(1);
3811 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3812 // Use result of ANDS
3813 return LHS.getValue(1);
3814 }
3815 }
3816
// Result 1 of the SUBS/ADDS node is its flags output; result 0 (the
// arithmetic value) is left unused here.
3817 return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
3818 .getValue(1);
3819}
3820
3821/// \defgroup AArch64CCMP CMP;CCMP matching
3822///
3823/// These functions deal with the formation of CMP;CCMP;... sequences.
3824/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3825/// a comparison. They set the NZCV flags to a predefined value if their
3826/// predicate is false. This allows to express arbitrary conjunctions, for
3827/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3828/// expressed as:
3829/// cmp A
3830/// ccmp B, inv(CB), CA
3831/// check for CB flags
3832///
3833/// This naturally lets us implement chains of AND operations with SETCC
3834/// operands. And we can even implement some other situations by transforming
3835/// them:
3836/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3837/// negating the flags used in a CCMP/FCCMP operation.
3838/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3839/// by negating the flags we test for afterwards. i.e.
3840/// NEG (CMP CCMP CCMP ...) can be implemented.
3841/// - Note that we can only ever negate all previously processed results.
3842/// What we can not implement by flipping the flags to test is a negation
3843/// of two sub-trees (because the negation affects all sub-trees emitted so
3844/// far, so the 2nd sub-tree we emit would also affect the first).
3845/// With those tools we can implement some OR operations:
3846/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3847/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3848/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3849/// elimination rules from earlier to implement the whole thing as a
3850/// CCMP/FCCMP chain.
3851///
3852/// As a complete example:
3853/// or (or (setCA (cmp A)) (setCB (cmp B)))
3854/// (and (setCC (cmp C)) (setCD (cmp D)))
3855/// can be reassociated to:
3856/// or (and (setCC (cmp C)) (setCD (cmp D)))
3857/// (or (setCA (cmp A)) (setCB (cmp B)))
3858/// can be transformed to:
3859/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3860/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3861/// which can be implemented as:
3862/// cmp C
3863/// ccmp D, inv(CD), CC
3864/// ccmp A, CA, inv(CD)
3865/// ccmp B, CB, inv(CA)
3866/// check for CB flags
3867///
3868/// A counterexample is "or (and A B) (and C D)" which translates to
3869/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3870/// can only implement 1 of the inner (not) operations, but not both!
3871/// @{
3872
3873/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
///
/// The opcode is chosen from the operands:
/// - floating-point operands use FCCMP; f16 (without full FP16 support) and
///   bf16 operands are first extended to f32;
/// - a constant RHS in the range [-31, -1] is replaced by its absolute value
///   and compared with CCMN;
/// - a RHS accepted by isCMN(), or (for integer EQ/NE only) a LHS of the form
///   (sub 0, x), is compared with CCMN on the un-negated operand;
/// - everything else uses CCMP.
/// \p CCOp is the flags value produced by the previous comparison in the chain
/// and becomes the last operand of the new node.
/// NOTE(review): the first signature line (function name, LHS, RHS) and the
/// declarations of Predicate and InvOutCC are not present in this listing;
/// InvOutCC is presumably the inverse of \p OutCC -- confirm against upstream.
3875 ISD::CondCode CC, SDValue CCOp,
3877 AArch64CC::CondCode OutCC,
3878 const SDLoc &DL, SelectionDAG &DAG) {
// Opcode == 0 means "no special form matched yet"; it falls back to CCMP below.
3879 unsigned Opcode = 0;
3880 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3881
3882 if (LHS.getValueType().isFloatingPoint()) {
3883 assert(LHS.getValueType() != MVT::f128);
3884 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3885 LHS.getValueType() == MVT::bf16) {
3886 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3887 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3888 }
3889 Opcode = AArch64ISD::FCCMP;
3890 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3891 APInt Imm = Const->getAPIntValue();
// Small negative immediates are compared by adding their absolute value.
3892 if (Imm.isNegative() && Imm.sgt(-32)) {
3893 Opcode = AArch64ISD::CCMN;
3894 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3895 }
3896 } else if (isCMN(RHS, CC, DAG)) {
3897 Opcode = AArch64ISD::CCMN;
3898 RHS = RHS.getOperand(1);
3899 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3900 isIntEqualitySetCC(CC)) {
3901 // As we are looking for EQ/NE compares, the operands can be commuted ; can
3902 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction ?
3903 Opcode = AArch64ISD::CCMN;
3904 LHS = LHS.getOperand(1);
3905 }
3906 if (Opcode == 0)
3907 Opcode = AArch64ISD::CCMP;
3908
3909 SDValue Condition = getCondCode(DAG, Predicate);
// NZCV is the flags immediate the instruction sets when Predicate is false.
3911 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3912 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3913 return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
3914}
3915
3916/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3917/// expressed as a conjunction. See \ref AArch64CCMP.
/// Every node of the tree must have exactly one use, and SETCC leaves must not
/// compare f128 values. The three output parameters are only written on the
/// paths that return true.
3918/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3919/// changing the conditions on the SETCC tests.
3920/// (this means we can call emitConjunctionRec() with
3921/// Negate==true on this sub-tree)
3922/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3923/// cannot do the negation naturally. We are required to
3924/// emit the subtree first in this case.
3925/// \param PreferFirst Set to true if processing this subtree first may
3926/// result in more efficient code.
3927/// \param WillNegate Is true if we are called when the result of this
3928/// subexpression must be negated. This happens when the
3929/// outer expression is an OR. We can use this fact to know
3930/// that we have a double negation (or (or ...) ...) that
3931/// can be implemented for free.
/// \param Depth Current recursion depth; AND/OR nodes deeper than 6 levels
/// are rejected.
3932static bool canEmitConjunction(SelectionDAG &DAG, const SDValue Val,
3933 bool &CanNegate, bool &MustBeFirst,
3934 bool &PreferFirst, bool WillNegate,
3935 unsigned Depth = 0) {
3936 if (!Val.hasOneUse())
3937 return false;
3938 unsigned Opcode = Val->getOpcode();
3939 if (Opcode == ISD::SETCC) {
3940 EVT VT = Val->getOperand(0).getValueType();
3941 if (VT == MVT::f128)
3942 return false;
3943 CanNegate = true;
3944 MustBeFirst = false;
3945 // Designate this operation as a preferred first operation if the result
3946 // of a SUB operation can be reused.
3947 PreferFirst = DAG.doesNodeExist(ISD::SUB, DAG.getVTList(VT),
3948 {Val->getOperand(0), Val->getOperand(1)});
3949 return true;
3950 }
3951 // Protect against exponential runtime and stack overflow.
3952 if (Depth > 6)
3953 return false;
3954 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3955 bool IsOR = Opcode == ISD::OR;
3956 SDValue O0 = Val->getOperand(0);
3957 SDValue O1 = Val->getOperand(1);
3958 bool CanNegateL;
3959 bool MustBeFirstL;
3960 bool PreferFirstL;
3961 if (!canEmitConjunction(DAG, O0, CanNegateL, MustBeFirstL, PreferFirstL,
3962 IsOR, Depth + 1))
3963 return false;
3964 bool CanNegateR;
3965 bool MustBeFirstR;
3966 bool PreferFirstR;
3967 if (!canEmitConjunction(DAG, O1, CanNegateR, MustBeFirstR, PreferFirstR,
3968 IsOR, Depth + 1))
3969 return false;
3970
// Only one sub-tree can be emitted first, so both requiring it is a conflict.
3971 if (MustBeFirstL && MustBeFirstR)
3972 return false;
3973
3974 if (IsOR) {
3975 // For an OR expression we need to be able to naturally negate at least
3976 // one side or we cannot do the transformation at all.
3977 if (!CanNegateL && !CanNegateR)
3978 return false;
3979 // If the result of the OR will be negated and we can naturally negate
3980 // the leaves, then this sub-tree as a whole negates naturally.
3981 CanNegate = WillNegate && CanNegateL && CanNegateR;
3982 // If we cannot naturally negate the whole sub-tree, then this must be
3983 // emitted first.
3984 MustBeFirst = !CanNegate;
3985 } else {
3986 assert(Opcode == ISD::AND && "Must be OR or AND");
3987 // We cannot naturally negate an AND operation.
3988 CanNegate = false;
3989 MustBeFirst = MustBeFirstL || MustBeFirstR;
3990 }
3991 PreferFirst = PreferFirstL || PreferFirstR;
3992 return true;
3993 }
3994 return false;
3995}
3996
3997/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3998/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3999/// Tries to transform the given i1 producing node @p Val to a series of compare
4000/// and conditional compare operations. @returns an NZCV flags producing node
4001/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
4002/// transformation was not possible.
4003/// \p Negate is true if we want this sub-tree being negated just by changing
4004/// SETCC conditions.
/// \p CCOp is the flags value of the comparisons emitted so far; when it is
/// null the leaf is emitted as a plain comparison, otherwise as a conditional
/// comparison that is evaluated under Predicate.
/// NOTE(review): the signature lines naming the function, \p Val and Predicate
/// are not present in this listing.
4006 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
4008 // We're at a tree leaf, produce a conditional comparison operation.
4009 unsigned Opcode = Val->getOpcode();
4010 if (Opcode == ISD::SETCC) {
4011 SDValue LHS = Val->getOperand(0);
4012 SDValue RHS = Val->getOperand(1);
4013 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
4014 bool isInteger = LHS.getValueType().isInteger();
4015 if (Negate)
4016 CC = getSetCCInverse(CC, LHS.getValueType());
4017 SDLoc DL(Val);
4018 // Determine OutCC and handle FP special case.
4019 if (isInteger) {
4020 OutCC = changeIntCCToAArch64CC(CC, RHS);
4021 } else {
4022 assert(LHS.getValueType().isFloatingPoint());
4023 AArch64CC::CondCode ExtraCC;
4024 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
4025 // Some floating point conditions can't be tested with a single condition
4026 // code. Construct an additional comparison in this case.
4027 if (ExtraCC != AArch64CC::AL) {
4028 SDValue ExtraCmp;
4029 if (!CCOp.getNode())
4030 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
4031 else
4032 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
4033 ExtraCC, DL, DAG);
// The extra comparison becomes the chain head for the comparison emitted below.
4034 CCOp = ExtraCmp;
4035 Predicate = ExtraCC;
4036 }
4037 }
4038
4039 // Produce a normal comparison if we are first in the chain
4040 if (!CCOp)
4041 return emitComparison(LHS, RHS, CC, DL, DAG);
4042 // Otherwise produce a ccmp.
4043 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
4044 DAG);
4045 }
4046 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
4047
4048 bool IsOR = Opcode == ISD::OR;
4049
4050 SDValue LHS = Val->getOperand(0);
4051 bool CanNegateL;
4052 bool MustBeFirstL;
4053 bool PreferFirstL;
4054 bool ValidL = canEmitConjunction(DAG, LHS, CanNegateL, MustBeFirstL,
4055 PreferFirstL, IsOR);
4056 assert(ValidL && "Valid conjunction/disjunction tree");
4057 (void)ValidL;
4058
4059 SDValue RHS = Val->getOperand(1);
4060 bool CanNegateR;
4061 bool MustBeFirstR;
4062 bool PreferFirstR;
4063 bool ValidR = canEmitConjunction(DAG, RHS, CanNegateR, MustBeFirstR,
4064 PreferFirstR, IsOR);
4065 assert(ValidR && "Valid conjunction/disjunction tree");
4066 (void)ValidR;
4067
4068 bool ShouldFirstL = PreferFirstL && !PreferFirstR && !MustBeFirstR;
4069
4070 // Swap sub-tree that must or should come first to the right side.
// (The right sub-tree is the one emitted first below.)
4071 if (MustBeFirstL || ShouldFirstL) {
4072 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4073 std::swap(LHS, RHS);
4074 std::swap(CanNegateL, CanNegateR);
4075 std::swap(MustBeFirstL, MustBeFirstR);
4076 }
4077
4078 bool NegateR;
4079 bool NegateAfterR;
4080 bool NegateL;
4081 bool NegateAfterAll;
4082 if (Opcode == ISD::OR) {
4083 // Swap the sub-tree that we can negate naturally to the left.
4084 if (!CanNegateL) {
4085 assert(CanNegateR && "at least one side must be negatable");
4086 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4087 assert(!Negate);
4088 std::swap(LHS, RHS);
4089 NegateR = false;
4090 NegateAfterR = true;
4091 } else {
4092 // Negate the right sub-tree if possible, otherwise negate its result.
4093 NegateR = CanNegateR;
4094 NegateAfterR = !CanNegateR;
4095 }
4096 NegateL = true;
4097 NegateAfterAll = !Negate;
4098 } else {
4099 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
4100 assert(!Negate && "Valid conjunction/disjunction tree");
4101
4102 NegateL = false;
4103 NegateR = false;
4104 NegateAfterR = false;
4105 NegateAfterAll = false;
4106 }
4107
4108 // Emit sub-trees.
// The right sub-tree is emitted first; its flags and condition code then
// feed the conditional comparisons of the left sub-tree.
4109 AArch64CC::CondCode RHSCC;
4110 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
4111 if (NegateAfterR)
4112 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
4113 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
4114 if (NegateAfterAll)
4115 OutCC = AArch64CC::getInvertedCondCode(OutCC);
4116 return CmpL;
4117}
4118
4119/// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
4120/// In some cases this is even possible with OR operations in the expression.
4121/// See \ref AArch64CCMP.
4122/// \see emitConjunctionRec().
/// \returns the flags-producing node and sets \p OutCC to the condition to
/// test, or a null SDValue when canEmitConjunction() rejects \p Val.
4124 AArch64CC::CondCode &OutCC) {
// Only the yes/no answer of canEmitConjunction() is needed here.
4125 bool DummyCanNegate;
4126 bool DummyMustBeFirst;
4127 bool DummyPreferFirst;
4128 if (!canEmitConjunction(DAG, Val, DummyCanNegate, DummyMustBeFirst,
4129 DummyPreferFirst, false))
4130 return SDValue();
4131
// Start the chain with no previous comparison and an "always" predicate.
4132 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
4133}
4134
4135/// @}
4136
4137/// Returns how profitable it is to fold a comparison's operand's shift and/or
4138/// extension operations.
/// The score is:
/// - 0 if \p Op has more than one use or nothing can be folded;
/// - 1 if \p Op is a supported extend (SIGN_EXTEND_INREG, or an AND with the
///   constant mask 0xFF, 0xFFFF or 0xFFFFFFFF);
/// - 2 if \p Op is a SHL/SRL/SRA by a constant of at most 4 applied to a
///   supported extend, 1 if that constant is larger;
/// - 1 if \p Op is a SHL/SRL/SRA of an i32 by a constant <= 31 or of an i64 by
///   a constant <= 63.
4140 auto isSupportedExtend = [&](SDValue V) {
4141 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
4142 return true;
4143
4144 if (V.getOpcode() == ISD::AND)
4145 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
4146 uint64_t Mask = MaskCst->getZExtValue();
4147 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
4148 }
4149
4150 return false;
4151 };
4152
4153 if (!Op.hasOneUse())
4154 return 0;
4155
4156 if (isSupportedExtend(Op))
4157 return 1;
4158
4159 unsigned Opc = Op.getOpcode();
4160 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4161 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4162 uint64_t Shift = ShiftCst->getZExtValue();
4163 if (isSupportedExtend(Op.getOperand(0)))
4164 return (Shift <= 4) ? 2 : 1;
4165 EVT VT = Op.getValueType();
4166 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4167 return 1;
4168 }
4169
4170 return 0;
4171}
4172
4173// emitComparison() converts comparison with one or negative one to comparison
4174// with 0. Note that this only works for signed comparisons because of how ANDS
4175// works.
// Returns true, after rewriting CC in place, when LHS is an AND/ANDS node and
// the comparison against the constant C is one of:
//   x <  1  ->  x <= 0        x >= 1  ->  x >  0
//   x <= -1 ->  x <  0        x >  -1 ->  x >= 0
// The caller is expected to replace the constant by zero itself.
// NOTE(review): the signature line is not present in this listing.
4177 // Only works for ANDS and AND.
4178 if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
4179 return false;
4180
4181 if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
4182 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4183 return true;
4184 }
4185
4186 if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
4187 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4188 return true;
4189 }
4190
4191 return false;
4192}
4193
// Build the flag-setting comparison for (LHS CC RHS) and return the flags
// value; the AArch64 condition code to test on those flags is returned through
// AArch64cc. The steps are:
//  1. adjust a constant RHS (and CC) so that it becomes zero or a cheaper
//     immediate;
//  2. swap the operands when the LHS has the more profitable shift/extend to
//     fold;
//  3. for EQ/NE against a constant, try a sign-extended-load + CMN form and a
//     CCMP conjunction chain;
//  4. otherwise fall back to emitComparison().
// NOTE(review): the first signature line and a few statements of this
// function are not present in this listing.
4195 SDValue &AArch64cc, SelectionDAG &DAG,
4196 const SDLoc &DL) {
4197 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4198 EVT VT = RHS.getValueType();
4199 APInt C = RHSC->getAPIntValue();
4200 // shouldBeAdjustedToZero is a special case to better fold with
4201 // emitComparison().
4202 if (shouldBeAdjustedToZero(LHS, C, CC)) {
4203 // Adjust the constant to zero.
4204 // CC has already been adjusted.
4205 RHS = DAG.getConstant(0, DL, VT);
4206 } else if (!isLegalCmpImmed(C)) {
4207 unsigned NumImmForC = numberOfInstrToLoadImm(C);
4208 // Constant does not fit, try adjusting it by one?
// Each case below moves the constant by one and switches between the strict
// and non-strict form of the condition, but only if the new constant is a
// legal immediate or takes fewer instructions to materialize.
4209 switch (CC) {
4210 default:
4211 break;
4212 case ISD::SETLT:
4213 case ISD::SETGE:
4214 if (!C.isMinSignedValue()) {
4215 APInt CMinusOne = C - 1;
4216 if (isLegalCmpImmed(CMinusOne) ||
4217 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4218 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4219 RHS = DAG.getConstant(CMinusOne, DL, VT);
4220 }
4221 }
4222 break;
4223 case ISD::SETULT:
4224 case ISD::SETUGE: {
4225 // C cannot be 0 here: this branch only handles illegal immediates.
4226 assert(!C.isZero() && "C should not be zero here");
4227 APInt CMinusOne = C - 1;
4228 if (isLegalCmpImmed(CMinusOne) ||
4229 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4230 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4231 RHS = DAG.getConstant(CMinusOne, DL, VT);
4232 }
4233 break;
4234 }
4235 case ISD::SETLE:
4236 case ISD::SETGT:
4237 if (!C.isMaxSignedValue()) {
4238 APInt CPlusOne = C + 1;
4239 if (isLegalCmpImmed(CPlusOne) ||
4240 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4241 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4242 RHS = DAG.getConstant(CPlusOne, DL, VT);
4243 }
4244 }
4245 break;
4246 case ISD::SETULE:
4247 case ISD::SETUGT: {
4248 if (!C.isAllOnes()) {
4249 APInt CPlusOne = C + 1;
4250 if (isLegalCmpImmed(CPlusOne) ||
4251 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4252 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4253 RHS = DAG.getConstant(CPlusOne, DL, VT);
4254 }
4255 }
4256 break;
4257 }
4258 }
4259 }
4260 }
4261
4262 // Comparisons are canonicalized so that the RHS operand is simpler than the
4263 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4264 // can fold some shift+extend operations on the RHS operand, so swap the
4265 // operands if that can be done.
4266 //
4267 // For example:
4268 // lsl w13, w11, #1
4269 // cmp w13, w12
4270 // can be turned into:
4271 // cmp w12, w11, lsl #1
4272 if (!isa<ConstantSDNode>(RHS) || !isLegalCmpImmed(RHS->getAsAPIntVal())) {
4273 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4274 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4275 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4276 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4277
// An operand that can become a CMN counts as one extra point of profit.
4278 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4279 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4280 std::swap(LHS, RHS);
4282 }
4283 }
4284
4285 SDValue Cmp;
4287 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4289
4290 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4291 // For the i8 operand, the largest immediate is 255, so this can be easily
4292 // encoded in the compare instruction. For the i16 operand, however, the
4293 // largest immediate cannot be encoded in the compare.
4294 // Therefore, use a sign extending load and cmn to avoid materializing the
4295 // -1 constant. For example,
4296 // movz w1, #65535
4297 // ldrh w0, [x0, #0]
4298 // cmp w0, w1
4299 // >
4300 // ldrsh w0, [x0, #0]
4301 // cmn w0, #1
4302 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4303 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4304 // ensure both the LHS and RHS are truly zero extended and to make sure the
4305 // transformation is profitable.
4306 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4307 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4308 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4309 LHS.getNode()->hasNUsesOfValue(1, 0)) {
// Reinterpret the low 16 bits of the constant as a signed value.
4310 int16_t ValueofRHS = RHS->getAsZExtVal();
4311 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4312 SDValue SExt =
4313 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, LHS.getValueType(), LHS,
4314 DAG.getValueType(MVT::i16));
4315 Cmp = emitComparison(
4316 SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
4317 DL, DAG);
4319 }
4320 }
4321
4322 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4323 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4324 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4326 }
4327 }
4328 }
4329
4330 if (!Cmp) {
4331 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
4333 }
4334 AArch64cc = getCondCode(DAG, AArch64CC);
4335 return Cmp;
4336}
4337
// Emit the AArch64 nodes for the overflow-checking arithmetic operation Op
// (SADDO, UADDO, SSUBO, USUBO, SMULO or UMULO on i32 or i64).
// Returns the pair (arithmetic result, flags value) and sets CC to the
// condition code that holds on those flags when the operation overflowed:
// VS for signed add/sub, HS for unsigned add, LO for unsigned sub and NE for
// the multiplies.
// NOTE(review): the signature line naming the function and its parameters is
// not present in this listing.
4338static std::pair<SDValue, SDValue>
4340 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4341 "Unsupported value type");
4342 SDValue Value, Overflow;
4343 SDLoc DL(Op);
4344 SDValue LHS = Op.getOperand(0);
4345 SDValue RHS = Op.getOperand(1);
// Opc stays 0 for the multiplies, which build Value/Overflow themselves.
4346 unsigned Opc = 0;
4347 switch (Op.getOpcode()) {
4348 default:
4349 llvm_unreachable("Unknown overflow instruction!");
4350 case ISD::SADDO:
4351 Opc = AArch64ISD::ADDS;
4352 CC = AArch64CC::VS;
4353 break;
4354 case ISD::UADDO:
4355 Opc = AArch64ISD::ADDS;
4356 CC = AArch64CC::HS;
4357 break;
4358 case ISD::SSUBO:
4359 Opc = AArch64ISD::SUBS;
4360 CC = AArch64CC::VS;
4361 break;
4362 case ISD::USUBO:
4363 Opc = AArch64ISD::SUBS;
4364 CC = AArch64CC::LO;
4365 break;
4366 // Multiply needs a little bit extra work.
4367 case ISD::SMULO:
4368 case ISD::UMULO: {
4369 CC = AArch64CC::NE;
4370 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4371 if (Op.getValueType() == MVT::i32) {
4372 // Extend to 64-bits, then perform a 64-bit multiply.
4373 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4374 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4375 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4376 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4377 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4378
4379 // Check that the result fits into a 32-bit integer.
4380 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4381 if (IsSigned) {
4382 // cmp xreg, wreg, sxtw
4383 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4384 Overflow =
4385 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4386 } else {
4387 // tst xreg, #0xffffffff00000000
4388 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4389 Overflow =
4390 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4391 }
4392 break;
4393 }
4394 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4395 // For the 64 bit multiply
4396 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4397 if (IsSigned) {
// Signed: the high half must equal the sign-extension of the low half.
4398 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4399 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4400 DAG.getConstant(63, DL, MVT::i64));
4401 // It is important that LowerBits is last, otherwise the arithmetic
4402 // shift will not be folded into the compare (SUBS).
4403 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4404 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4405 .getValue(1);
4406 } else {
// Unsigned: any non-zero bit in the high half means overflow.
4407 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4408 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4409 Overflow =
4410 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4411 DAG.getConstant(0, DL, MVT::i64),
4412 UpperBits).getValue(1);
4413 }
4414 break;
4415 }
4416 } // switch (...)
4417
4418 if (Opc) {
4419 SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
4420
4421 // Emit the AArch64 operation with overflow check.
4422 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4423 Overflow = Value.getValue(1);
4424 }
4425 return std::make_pair(Value, Overflow);
4426}
4427
// Custom lowering for XOR. Fixed-length vectors that should use SVE are
// forwarded to LowerToScalableOp(). For scalars two folds are attempted:
// inverting the overflow bit of an overflow-checking operation, and folding
// (xor x, (select_cc a, b, cc, 0, -1)) into a CSEL of x and ~x. When neither
// applies, Op is returned unchanged.
// NOTE(review): a few statements of the overflow fold are not present in this
// listing.
4428SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4429 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4430 !Subtarget->isNeonAvailable()))
4431 return LowerToScalableOp(Op, DAG);
4432
4433 SDValue Sel = Op.getOperand(0);
4434 SDValue Other = Op.getOperand(1);
4435 SDLoc DL(Sel);
4436
4437 // If the operand is an overflow checking operation, invert the condition
4438 // code and kill the Not operation. I.e., transform:
4439 // (xor (overflow_op_bool, 1))
4440 // -->
4441 // (csel 1, 0, invert(cc), overflow_op_bool)
4442 // ... which later gets transformed to just a cset instruction with an
4443 // inverted condition code, rather than a cset + eor sequence.
4445 // Only lower legal XALUO ops.
4447 return SDValue();
4448
4449 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4450 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4452 SDValue Value, Overflow;
4453 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4454 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4455 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
4456 CCVal, Overflow);
4457 }
4458 // If neither operand is a SELECT_CC, give up.
// Otherwise make Sel the SELECT_CC operand and Other the remaining one.
4459 if (Sel.getOpcode() != ISD::SELECT_CC)
4460 std::swap(Sel, Other);
4461 if (Sel.getOpcode() != ISD::SELECT_CC)
4462 return Op;
4463
4464 // The folding we want to perform is:
4465 // (xor x, (select_cc a, b, cc, 0, -1) )
4466 // -->
4467 // (csel x, (xor x, -1), cc ...)
4468 //
4469 // The latter will get matched to a CSINV instruction.
4470
4471 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4472 SDValue LHS = Sel.getOperand(0);
4473 SDValue RHS = Sel.getOperand(1);
4474 SDValue TVal = Sel.getOperand(2);
4475 SDValue FVal = Sel.getOperand(3);
4476
4477 // FIXME: This could be generalized to non-integer comparisons.
4478 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4479 return Op;
4480
4481 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4482 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4483
4484 // The values aren't constants, this isn't the pattern we're looking for.
4485 if (!CFVal || !CTVal)
4486 return Op;
4487
4488 // We can commute the SELECT_CC by inverting the condition. This
4489 // might be needed to make this fit into a CSINV pattern.
4490 if (CTVal->isAllOnes() && CFVal->isZero()) {
4491 std::swap(TVal, FVal);
4492 std::swap(CTVal, CFVal);
4493 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4494 }
4495
4496 // If the constants line up, perform the transform!
4497 if (CTVal->isZero() && CFVal->isAllOnes()) {
4498 SDValue CCVal;
4499 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
4500
// xor with 0 leaves Other unchanged; xor with -1 yields its bitwise NOT.
4501 FVal = Other;
4502 TVal = DAG.getNode(ISD::XOR, DL, Other.getValueType(), Other,
4503 DAG.getAllOnesConstant(DL, Other.getValueType()));
4504
4505 return DAG.getNode(AArch64ISD::CSEL, DL, Sel.getValueType(), FVal, TVal,
4506 CCVal, Cmp);
4507 }
4508
4509 return Op;
4510}
4511
4512// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4513// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4514// sets 'C' bit to 0.
// This is done with a flag-setting subtract: (Value - 1) when Invert is false
// and (0 - Value) when Invert is true. Only the flags result is returned.
// NOTE(review): the signature line is not present in this listing.
4516 SDLoc DL(Value);
4517 EVT VT = Value.getValueType();
4518 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4519 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4520 SDValue Cmp =
4521 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
4522 return Cmp.getValue(1);
4523}
4524
4525// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4526// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
// The result is a CSEL between the constants 1 and 0 of type VT, keyed on the
// flags value Glue, which must be result number 1 of its node.
// NOTE(review): the signature line and the definition of Cond are not present
// in this listing.
4528 bool Invert) {
4529 assert(Glue.getResNo() == 1);
4530 SDLoc DL(Glue);
4531 SDValue Zero = DAG.getConstant(0, DL, VT);
4532 SDValue One = DAG.getConstant(1, DL, VT);
4534 SDValue CC = getCondCode(DAG, Cond);
4535 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4536}
4537
4538// Value is 1 if 'V' bit of NZCV is 1, else 0
// The result is a CSEL between the constants 1 and 0 of type VT, keyed on the
// flags value Glue, which must be result number 1 of its node.
// NOTE(review): the signature line and the definition of CC are not present
// in this listing.
4540 assert(Glue.getResNo() == 1);
4541 SDLoc DL(Glue);
4542 SDValue Zero = DAG.getConstant(0, DL, VT);
4543 SDValue One = DAG.getConstant(1, DL, VT);
4545 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4546}
4547
4548// This lowering is inefficient, but it will get cleaned up by
4549// `foldOverflowCheck`
// Lowers an add/sub with carry-in and carry/overflow-out to the flag-setting
// node Opcode. Operand 2 (the incoming carry value) is first converted to the
// C flag, and the outgoing flag is converted back to a value: the V flag when
// IsSigned, otherwise the C flag. The carry is inverted in both directions
// when Opcode is SBCS. Returns a null SDValue unless the result is i32 or i64.
// NOTE(review): the first signature line is not present in this listing.
4551 unsigned Opcode, bool IsSigned) {
4552 EVT VT0 = Op.getValue(0).getValueType();
4553 EVT VT1 = Op.getValue(1).getValueType();
4554
4555 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4556 return SDValue();
4557
4558 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4559 SDValue OpLHS = Op.getOperand(0);
4560 SDValue OpRHS = Op.getOperand(1);
4561 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4562
4563 SDLoc DL(Op);
4564
4565 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
4566 OpRHS, OpCarryIn);
4567
4568 SDValue OutFlag =
4569 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4570 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4571
4572 return DAG.getMergeValues({Sum, OutFlag}, DL);
4573}
4574
// Lower a scalar integer intrinsic node to Opcode operating on floating-point
// types of the same width: each i32/i64 operand is bitcast to f32/f64, the
// node is built with the matching FP result type, and the result is bitcast
// back to the original integer type.
// Returns a null SDValue when the result type is a vector.
// If LastOperandIsImm is true the last operand is passed through unchanged.
// NOTE(review): the declaration of NewOps is not present in this listing.
4575static SDValue lowerIntNeonIntrinsic(SDValue Op, unsigned Opcode,
4576 SelectionDAG &DAG,
4577 bool LastOperandIsImm = false) {
4578 if (Op.getValueType().isVector())
4579 return SDValue();
4580
4581 SDLoc DL(Op);
4583 const unsigned NumOperands = Op.getNumOperands();
4584 auto getFloatVT = [](EVT VT) {
4585 assert((VT == MVT::i32 || VT == MVT::i64) && "Unexpected VT");
4586 return VT == MVT::i32 ? MVT::f32 : MVT::f64;
4587 };
4588 auto bitcastToFloat = [&](SDValue Val) {
4589 return DAG.getBitcast(getFloatVT(Val.getValueType()), Val);
4590 };
4591
4592 // Skip first operand as it is intrinsic ID.
4593 for (unsigned I = 1; I < NumOperands; ++I) {
4594 SDValue Val = Op.getOperand(I);
4595 const bool KeepInt = LastOperandIsImm && (I == NumOperands - 1);
4596 NewOps.push_back(KeepInt ? Val : bitcastToFloat(Val));
4597 }
4598 EVT OrigVT = Op.getValueType();
4599 SDValue OpNode = DAG.getNode(Opcode, DL, getFloatVT(OrigVT), NewOps);
4600 return DAG.getBitcast(OrigVT, OpNode);
4601}
4602
// Lower an overflow-checking arithmetic node to the pair (result, overflow
// bit): the arithmetic and flags come from getAArch64XALUOOp(), and the flags
// are turned into an i32 0/1 value with a CSEL.
// NOTE(review): the signature line and the declaration of CC are not present
// in this listing.
4604 // Let legalize expand this if it isn't a legal type yet.
4605 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4606 return SDValue();
4607
4608 SDLoc DL(Op);
4610 // The actual operation that sets the overflow or carry flag.
4611 SDValue Value, Overflow;
4612 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4613
4614 // We use 0 and 1 as false and true values.
4615 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4616 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4617
4618 // We use an inverted condition, because the conditional select is inverted
4619 // too. This will allow it to be selected to a single instruction:
4620 // CSINC Wd, WZR, WZR, invert(cond).
4621 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4622 Overflow =
4623 DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
4624
4625 return DAG.getMergeValues({Value, Overflow}, DL);
4626}
4627
4628// Prefetch operands are:
4629// 1: Address to prefetch
4630// 2: bool isWrite
4631// 3: int locality (0 = no locality ... 3 = extreme locality)
4632// 4: bool isDataCache
// Operand 0 is forwarded unchanged as the first operand of the new node.
// The other operands are packed into a 5-bit immediate:
//   bit 4    = isWrite
//   bit 3    = 1 for a non-data prefetch (!isDataCache)
//   bits 2:1 = cache level, computed as 3 - locality (0 when locality is 0)
//   bit 0    = 1 when locality is 0 (streaming prefetch)
// NOTE(review): the signature line is not present in this listing.
4634 SDLoc DL(Op);
4635 unsigned IsWrite = Op.getConstantOperandVal(2);
4636 unsigned Locality = Op.getConstantOperandVal(3);
4637 unsigned IsData = Op.getConstantOperandVal(4);
4638
4639 bool IsStream = !Locality;
4640 // When the locality number is set
4641 if (Locality) {
4642 // The front-end should have filtered out the out-of-range values
4643 assert(Locality <= 3 && "Prefetch locality out-of-range");
4644 // The locality degree is the opposite of the cache speed.
4645 // Put the number the other way around.
4646 // The encoding starts at 0 for level 1
4647 Locality = 3 - Locality;
4648 }
4649
4650 // Build the mask value encoding the expected behavior.
4651 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4652 (!IsData << 3) | // IsDataCache bit
4653 (Locality << 1) | // Cache level bits
4654 (unsigned)IsStream; // Stream bit
4655 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4656 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4657 Op.getOperand(1));
4658}
4659
4660// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z is
4661// a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of SUBS
4662// (AND X Y) Z which produces a better opt with EmitComparison
// The rewrite only fires when Y and Z are both constants and the AND has a
// single use. It is valid because (X & Y) <u 2^k holds exactly when (X & Y)
// has no bit set at position k or above. CC, LHS and RHS are updated in place.
// NOTE(review): the first signature line and the definition of RHSConst are
// not present in this listing.
4664 SelectionDAG &DAG, const SDLoc DL) {
4665 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4666 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4668 if (LHSConstOp && RHSConst) {
4669 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4670 uint64_t RHSConstant = RHSConst->getZExtValue();
4671 if (isPowerOf2_64(RHSConstant)) {
// Drop the mask bits below the power of two; they cannot affect the result.
4672 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4673 LHS =
4674 DAG.getNode(ISD::AND, DL, LHS.getValueType(), LHS.getOperand(0),
4675 DAG.getConstant(NewMaskValue, DL, LHS.getValueType()));
4676 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4677 CC = ISD::SETEQ;
4678 }
4679 }
4680 }
4681}
4682
// Custom lowering for FP_EXTEND / STRICT_FP_EXTEND.
// - Scalable vectors: nxv2bf16 -> nxv2f64 is split into two extends through
//   nxv2f32; everything else becomes a predicated FP_EXTEND_MERGE_PASSTHRU.
// - Fixed-length vectors that should use SVE go to
//   LowerFixedLengthFPExtendToSVE().
// - f64 results: f16 and f32 sources are returned unchanged, bf16 sources are
//   split into bf16 -> f32 -> f64 (threading the chain for the strict form),
//   and any other source returns a null SDValue.
// - Any other result type is asserted to be f128 and returns a null SDValue.
4683SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4684 SelectionDAG &DAG) const {
4685 EVT VT = Op.getValueType();
4686 if (VT.isScalableVector()) {
4687 SDValue SrcVal = Op.getOperand(0);
4688
4689 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4690 // Break conversion in two with the first part converting to f32 and the
4691 // second using native f32->VT instructions.
4692 SDLoc DL(Op);
4693 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4694 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4695 }
4696
4697 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4698 }
4699
4700 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4701 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4702
// Strict nodes carry the chain as operand 0, so the value is operand 1.
4703 bool IsStrict = Op->isStrictFPOpcode();
4704 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4705 EVT Op0VT = Op0.getValueType();
4706 if (VT == MVT::f64) {
4707 // f16->f64 and f32->f64 extends are legal as they are.
4708 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4709 return Op;
4710 // Split bf16->f64 extends into two fpextends.
4711 if (Op0VT == MVT::bf16 && IsStrict) {
4712 SDValue Ext1 =
4713 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4714 {Op0, Op.getOperand(0)});
4715 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4716 {Ext1, Ext1.getValue(1)});
4717 }
4718 if (Op0VT == MVT::bf16)
4719 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4720 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4721 return SDValue();
4722 }
4723
4724 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4725 return SDValue();
4726}
4727
// Custom lowering for FP_ROUND and its strict variant.
//
// Operand layout as read below: a strict node carries its chain in operand 0,
// so the source value and the truncation flag sit one position later than in
// the non-strict node. Trunc is set when the flag operand equals 1; the manual
// bf16 expansions below then skip the rounding-bias step.
//
// Returns Op when the node is kept as is, a replacement value when it is
// rewritten here, and an empty SDValue() on the remaining paths.
4728SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4729 SelectionDAG &DAG) const {
4730 EVT VT = Op.getValueType();
4731 bool IsStrict = Op->isStrictFPOpcode();
4732 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4733 EVT SrcVT = SrcVal.getValueType();
4734 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4735 SDNodeFlags Flags = Op->getFlags();
4736
4737 if (VT.isScalableVector()) {
4738 // Let common code split the operation.
4739 if (SrcVT == MVT::nxv8f32)
4740 return Op;
4741
4742 if (VT.getScalarType() != MVT::bf16)
4743 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4744
// From here on the result element type is bf16. The conversion is done on
// the f32 bit pattern viewed as nxv4i32; the bf16 value is its upper 16 bits.
4745 SDLoc DL(Op);
4746 constexpr EVT I32 = MVT::nxv4i32;
4747 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4748
// NaN stays empty unless a quieted copy of the input has to be selected at
// the end; Narrow holds the integer bit pattern being rounded.
4749 SDValue NaN;
4750 SDValue Narrow;
4751
4752 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4753 if (Subtarget->hasBF16())
4754 return LowerToPredicatedOp(Op, DAG,
4755 AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4756
4757 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4758
4759 // Set the quiet bit.
4760 if (!DAG.isKnownNeverSNaN(SrcVal) && !Flags.hasNoNaNs())
4761 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4762 } else if (SrcVT == MVT::nxv2f64 &&
4763 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4764 // Round to float without introducing rounding errors and try again.
4765 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4766 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4767 Pg, SrcVal, DAG.getPOISON(MVT::nxv2f32));
4768
// NOTE(review): embedded line 4769 is missing from this listing; NewOps is
// used below without a visible declaration - confirm against the original.
// The operands rebuilt here are: chain (strict only), the f32 intermediate,
// and the original truncation-flag operand.
4770 if (IsStrict)
4771 NewOps.push_back(Op.getOperand(0));
4772 NewOps.push_back(Narrow);
4773 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4774 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4775 } else
4776 return SDValue();
4777
// Round to nearest, ties to even, on the upper 16 bits: add 0x7fff plus the
// current value of bit 16 (the lowest bit that is kept).
4778 if (!Trunc) {
4779 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4780 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4781 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4782 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4783 }
4784
4785 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4786 // 0x80000000.
4787 if (NaN) {
// SrcVal compared unordered with itself is true exactly for NaN lanes; the
// predicate is reinterpreted to match the nxv4i32 working type.
4788 EVT I1 = I32.changeElementType(*DAG.getContext(), MVT::i1);
4789 EVT CondVT = VT.changeElementType(*DAG.getContext(), MVT::i1);
4790 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4791 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4792 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4793 }
4794
4795 // Now that we have rounded, shift the bits into position.
4796 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4797 return getSVESafeBitCast(VT, Narrow, DAG);
4798 }
4799
4800 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4801 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4802
4803 // Expand cases where the result type is BF16 but we don't have hardware
4804 // instructions to lower it.
4805 if (VT.getScalarType() == MVT::bf16 &&
4806 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4807 Subtarget->hasBF16())) {
4808 SDLoc DL(Op);
4809 SDValue Narrow = SrcVal;
4810 SDValue NaN;
4811 EVT I32 = SrcVT.changeElementType(*DAG.getContext(), MVT::i32);
4812 EVT F32 = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
4813 if (SrcVT.getScalarType() == MVT::f32) {
// Query the SNaN property on the floating-point value before it is
// replaced by its integer bit pattern.
4814 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4815 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4816 if (!NeverSNaN) {
4817 // Set the quiet bit.
4818 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow,
4819 DAG.getConstant(0x400000, DL, I32));
4820 }
4821 } else if (SrcVT.getScalarType() == MVT::f64) {
// f64 sources are first narrowed to f32 with FCVTXN; no NaN fix-up value is
// prepared on this path.
4822 Narrow = DAG.getNode(AArch64ISD::FCVTXN, DL, F32, Narrow);
4823 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4824 } else {
4825 return SDValue();
4826 }
// Same round-to-nearest-even bias as in the scalable path above.
4827 if (!Trunc) {
4828 SDValue One = DAG.getConstant(1, DL, I32);
4829 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4830 DAG.getShiftAmountConstant(16, I32, DL));
4831 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, One);
4832 SDValue RoundingBias =
4833 DAG.getNode(ISD::ADD, DL, I32, DAG.getConstant(0x7fff, DL, I32), Lsb);
4834 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4835 }
4836
4837 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4838 // 0x80000000.
4839 if (NaN) {
4840 SDValue IsNaN = DAG.getSetCC(
4841 DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4842 SrcVal, SrcVal, ISD::SETUO);
4843 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4844 }
4845
4846 // Now that we have rounded, shift the bits into position.
4847 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4848 DAG.getShiftAmountConstant(16, I32, DL));
// Vector results: truncate each lane to i16 and reinterpret as the bf16
// vector type.
4849 if (VT.isVector()) {
4850 EVT I16 = I32.changeVectorElementType(*DAG.getContext(), MVT::i16);
4851 Narrow = DAG.getNode(ISD::TRUNCATE, DL, I16, Narrow);
4852 return DAG.getNode(ISD::BITCAST, DL, VT, Narrow);
4853 }
// Scalar results: view the bits as f32 again and take the hsub subregister.
// Strict nodes also return the incoming chain.
4854 Narrow = DAG.getNode(ISD::BITCAST, DL, F32, Narrow);
4855 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Narrow);
4856 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, DL)
4857 : Result;
4858 }
4859
4860 if (SrcVT != MVT::f128) {
4861 // Expand cases where the input is a vector bigger than NEON.
// NOTE(review): embedded line 4862 is missing from this listing; it
// presumably holds the condition guarding the return below, otherwise the
// `return Op` that follows would be unreachable - confirm against the
// original.
4863 return SDValue();
4864
4865 // It's legal except when f128 is involved
4866 return Op;
4867 }
4868
4869 return SDValue();
4870}
4871
// Custom lowering for vector FP_TO_SINT / FP_TO_UINT and their strict
// variants (strict nodes are not supported for scalable vectors, see the
// assert below).
//
// For strict nodes operand 0 is the chain and operand 1 the source value;
// rebuilt strict nodes pass the chain produced by the preceding node first.
//
// The cases are tried in this order: f16/bf16 promotion to f32, scalable
// vectors, fixed-length vectors handled through SVE, result narrower than
// source, result wider than source, single-element vectors, and finally
// same-size conversions which are returned unchanged.
4872SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4873 SelectionDAG &DAG) const {
4874 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4875 // Any additional optimization in this function should be recorded
4876 // in the cost tables.
4877 bool IsStrict = Op->isStrictFPOpcode();
4878 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4879 EVT VT = Op.getValueType();
4880
4881 assert(!(IsStrict && VT.isScalableVector()) &&
4882 "Unimplemented SVE support for STRICT_FP_to_INT!");
4883
4884 // f16 conversions are promoted to f32 when full fp16 is not supported.
// bf16 sources are always promoted. The f32 vector type is derived from the
// result type VT, i.e. it has VT's element count.
4885 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4886 InVT.getVectorElementType() == MVT::bf16) {
4887 EVT NewVT = VT.changeElementType(*DAG.getContext(), MVT::f32);
4888 SDLoc DL(Op);
4889 if (IsStrict) {
4890 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {NewVT, MVT::Other},
4891 {Op.getOperand(0), Op.getOperand(1)});
4892 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4893 {Ext.getValue(1), Ext.getValue(0)});
4894 }
4895 return DAG.getNode(
4896 Op.getOpcode(), DL, Op.getValueType(),
4897 DAG.getNode(ISD::FP_EXTEND, DL, NewVT, Op.getOperand(0)));
4898 }
4899
4900 if (VT.isScalableVector()) {
// Predicate (i1) results: convert to the promoted integer type and compare
// the result against zero.
4901 if (VT.getVectorElementType() == MVT::i1) {
4902 SDLoc DL(Op);
4903 EVT CvtVT = getPromotedVTForPredicate(VT);
4904 SDValue Cvt = DAG.getNode(Op.getOpcode(), DL, CvtVT, Op.getOperand(0));
4905 SDValue Zero = DAG.getConstant(0, DL, CvtVT);
4906 return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
4907 }
4908
4909 // Let common code split the operation.
4910 if (InVT == MVT::nxv8f32)
4911 return Op;
4912
4913 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4914 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4915 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4916 return LowerToPredicatedOp(Op, DAG, Opcode);
4917 }
4918
4919 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4920 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4921 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4922
4923 uint64_t VTSize = VT.getFixedSizeInBits();
4924 uint64_t InVTSize = InVT.getFixedSizeInBits();
// Result narrower than source: convert at the source width, then truncate.
4925 if (VTSize < InVTSize) {
4926 SDLoc DL(Op);
4927 if (IsStrict) {
// NOTE(review): embedded line 4928 is missing from this listing. The
// non-strict path below converts to InVT.changeVectorElementTypeToInteger();
// the missing line presumably updates InVT the same way - confirm against
// the original.
4929 SDValue Cv = DAG.getNode(Op.getOpcode(), DL, {InVT, MVT::Other},
4930 {Op.getOperand(0), Op.getOperand(1)});
4931 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4932 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, DL);
4933 }
4934 SDValue Cv =
4935 DAG.getNode(Op.getOpcode(), DL, InVT.changeVectorElementTypeToInteger(),
4936 Op.getOperand(0));
4937 return DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4938 }
4939
// Result wider than source: extend the floating-point input first, then
// convert.
4940 if (VTSize > InVTSize) {
4941 SDLoc DL(Op);
// NOTE(review): embedded lines 4943-4944 (the initializer of ExtVT) are
// missing from this listing - confirm against the original.
4942 MVT ExtVT =
4945 if (IsStrict) {
4946 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {ExtVT, MVT::Other},
4947 {Op.getOperand(0), Op.getOperand(1)});
4948 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4949 {Ext.getValue(1), Ext.getValue(0)});
4950 }
4951 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, Op.getOperand(0));
4952 return DAG.getNode(Op.getOpcode(), DL, VT, Ext);
4953 }
4954
4955 // Use a scalar operation for conversions between single-element vectors of
4956 // the same size.
4957 if (InVT.getVectorNumElements() == 1) {
4958 SDLoc DL(Op);
// NOTE(review): embedded line 4960 (the opcode and leading arguments of
// this getNode call) is missing from this listing; the visible arguments
// are the source vector and index 0 - confirm against the original.
4959 SDValue Extract = DAG.getNode(
4961 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, DL, MVT::i64));
4962 EVT ScalarVT = VT.getScalarType();
4963 if (IsStrict)
4964 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
4965 {Op.getOperand(0), Extract});
4966 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
4967 }
4968
4969 // The size-changing cases were rewritten above; the remaining same-size
// conversions are returned unchanged.
4970 return Op;
4971}
4972
4973SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4974 SelectionDAG &DAG) const {
4975 bool IsStrict = Op->isStrictFPOpcode();
4976 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4977
4978 if (SrcVal.getValueType().isVector())
4979 return LowerVectorFP_TO_INT(Op, DAG);
4980
4981 // f16 conversions are promoted to f32 when full fp16 is not supported.
4982 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4983 SrcVal.getValueType() == MVT::bf16) {
4984 SDLoc DL(Op);
4985 if (IsStrict) {
4986 SDValue Ext =
4987 DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
4988 {Op.getOperand(0), SrcVal});
4989 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
4990 {Ext.getValue(1), Ext.getValue(0)});
4991 }
4992 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
4993 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, SrcVal));
4994 }
4995
4996 if (SrcVal.getValueType() != MVT::f128) {
4997 // It's legal except when f128 is involved
4998 return Op;
4999 }
5000
5001 return SDValue();
5002}
5003
// Custom lowering for vector FP_TO_SINT_SAT / FP_TO_UINT_SAT.
//
// Operand 0 is the source vector and operand 1 a VTSDNode carrying the
// saturation type. Scalable results and unsupported source element types
// yield an empty SDValue. Otherwise the node is either re-emitted directly
// (source, result and saturation widths all equal) or built as a conversion
// at the source element width followed by integer clamping and a truncate.
5004SDValue
5005AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
5006 SelectionDAG &DAG) const {
5007 // AArch64 FP-to-int conversions saturate to the destination element size, so
5008 // we can lower common saturating conversions to simple instructions.
5009 SDValue SrcVal = Op.getOperand(0);
5010 EVT SrcVT = SrcVal.getValueType();
5011 EVT DstVT = Op.getValueType();
5012 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5013
5014 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
5015 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
5016 uint64_t SatWidth = SatVT.getScalarSizeInBits();
5017 assert(SatWidth <= DstElementWidth &&
5018 "Saturation width cannot exceed result width");
5019
5020 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
5021 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
5022 // types, so this is hard to reach.
5023 if (DstVT.isScalableVector())
5024 return SDValue();
5025
5026 EVT SrcElementVT = SrcVT.getVectorElementType();
5027
5028 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
// f16 is also promoted when the result elements are wider than 16 bits, and
// bf16 is always promoted. SrcVal2 stays empty unless the promoted vector
// had to be split in two; it then holds the second half.
5029 SDLoc DL(Op);
5030 SDValue SrcVal2;
5031 if ((SrcElementVT == MVT::f16 &&
5032 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
5033 SrcElementVT == MVT::bf16) {
5034 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
5035 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
5036 // If we are extending to a v8f32, split into two v4f32 to produce legal
5037 // types.
5038 if (F32VT.getSizeInBits() > 128) {
5039 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
5040 F32VT = F32VT.getHalfNumVectorElementsVT();
5041 }
5042 SrcVT = F32VT;
5043 SrcElementVT = MVT::f32;
5044 SrcElementWidth = 32;
5045 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
5046 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
5047 return SDValue();
5048
5049 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
5050 // width and produce a fcvtzu.
5051 if (SatWidth == 64 && SrcElementWidth < 64) {
5052 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
5053 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
5054 SrcVT = F64VT;
5055 SrcElementVT = MVT::f64;
5056 SrcElementWidth = 64;
5057 }
5058 // Cases that we can emit directly.
5059 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
5060 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5061 DAG.getValueType(DstVT.getScalarType()));
5062 if (SrcVal2) {
5063 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
5064 DAG.getValueType(DstVT.getScalarType()));
5065 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
5066 }
5067 return Res;
5068 }
5069
5070 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5071 // result. This is only valid if the legal cvt is larger than the saturate
5072 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
5073 // (at least until sqxtn is selected).
5074 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
5075 return SDValue();
5076
// Convert at the source element width, once per half when the input was
// split.
5077 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
5078 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
5079 DAG.getValueType(IntVT.getScalarType()));
5080 SDValue NativeCvt2 =
5081 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
5082 DAG.getValueType(IntVT.getScalarType()))
5083 : SDValue();
5084 SDValue Sat, Sat2;
5085 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
// MinC is the operand of SMIN, i.e. the upper bound (largest signed value
// of SatWidth bits); MaxC is the operand of SMAX, i.e. the lower bound.
5086 SDValue MinC = DAG.getConstant(
5087 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
5088 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
5089 SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
5090 SDValue MaxC = DAG.getConstant(
5091 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
5092 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
5093 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
5094 } else {
// Unsigned: only an upper bound (all ones in SatWidth bits) is applied.
5095 SDValue MinC = DAG.getConstant(
5096 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
5097 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
5098 Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
5099 }
5100
// Rejoin the two halves before the final truncate.
// NOTE(review): embedded line 5103 (the result type argument of the
// CONCAT_VECTORS node) is missing from this listing - confirm against the
// original.
5101 if (SrcVal2)
5102 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
5104 Sat, Sat2);
5105
5106 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5107}
5108
5109SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
5110 SelectionDAG &DAG) const {
5111 // AArch64 FP-to-int conversions saturate to the destination register size, so
5112 // we can lower common saturating conversions to simple instructions.
5113 SDValue SrcVal = Op.getOperand(0);
5114 EVT SrcVT = SrcVal.getValueType();
5115
5116 if (SrcVT.isVector())
5117 return LowerVectorFP_TO_INT_SAT(Op, DAG);
5118
5119 EVT DstVT = Op.getValueType();
5120 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5121 uint64_t SatWidth = SatVT.getScalarSizeInBits();
5122 uint64_t DstWidth = DstVT.getScalarSizeInBits();
5123 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
5124
5125 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
5126 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
5127 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
5128 SrcVT = MVT::f32;
5129 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
5130 SrcVT != MVT::bf16)
5131 return SDValue();
5132
5133 SDLoc DL(Op);
5134 // Cases that we can emit directly.
5135 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
5136 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
5137 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
5138 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5139 DAG.getValueType(DstVT));
5140
5141 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5142 // result. This is only valid if the legal cvt is larger than the saturate
5143 // width.
5144 if (DstWidth < SatWidth)
5145 return SDValue();
5146
5147 if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
5148 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5149 SDValue CVTf32 =
5150 DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
5151 SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
5152 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
5153 DAG.getValueType(SatVT));
5154 }
5155 SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
5156 return DAG.getBitcast(DstVT, CVTf32);
5157 }
5158
5159 SDValue NativeCvt =
5160 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
5161 SDValue Sat;
5162 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5163 SDValue MinC = DAG.getConstant(
5164 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
5165 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
5166 SDValue MaxC = DAG.getConstant(
5167 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
5168 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
5169 } else {
5170 SDValue MinC = DAG.getConstant(
5171 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
5172 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
5173 }
5174
5175 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5176}
5177
// Lowers a vector round-to-integer node (result type VT is asserted to be a
// vector) as FRINT on the source followed by FP_TO_SINT_SAT to VT.
5178SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
5179 SelectionDAG &DAG) const {
5180 EVT VT = Op.getValueType();
5181 SDValue Src = Op.getOperand(0);
5182 SDLoc DL(Op);
5183
5184 assert(VT.isVector() && "Expected vector type");
5185
// CastVT has the shape of the result type VT with the source's element type.
5186 EVT CastVT = VT.changeVectorElementType(
5187 *DAG.getContext(), Src.getValueType().getVectorElementType());
5188
5189 // Round the floating-point value into a floating-point register with the
5190 // current rounding mode.
5191 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
5192
5193 // Truncate the rounded floating point to an integer.
// NOTE(review): embedded line 5195 (the remaining argument(s) of this call,
// presumably the saturation type operand) is missing from this listing -
// confirm against the original.
5194 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
5196}
5197
// Custom lowering for vector SINT_TO_FP / UINT_TO_FP and their strict
// variants (strict nodes are not supported for scalable vectors, see the
// assert below).
//
// For strict nodes operand 0 is the chain and operand 1 the integer input;
// rebuilt strict nodes pass the chain produced by the preceding node first.
//
// The cases are tried in this order: scalable i1 inputs, bf16 results,
// other scalable vectors, fixed-length vectors handled through SVE, result
// narrower than input, result wider than input, single-element vectors, and
// finally same-size conversions which are returned unchanged.
5198SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5199 SelectionDAG &DAG) const {
5200 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5201 // Any additional optimization in this function should be recorded
5202 // in the cost tables.
5203 bool IsStrict = Op->isStrictFPOpcode();
5204 EVT VT = Op.getValueType();
5205 SDLoc DL(Op);
5206 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5207 EVT InVT = In.getValueType();
5208 unsigned Opc = Op.getOpcode();
5209 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5210
5211 assert(!(IsStrict && VT.isScalableVector()) &&
5212 "Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");
5213
5214 // NOTE: i1->bf16 does not require promotion to f32.
// A predicate input selects between two constants: 0.0 for false lanes and
// -1.0 (signed) or 1.0 (unsigned) for true lanes.
5215 if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
5216 SDValue FalseVal = DAG.getConstantFP(0.0, DL, VT);
5217 SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, DL, VT)
5218 : DAG.getConstantFP(1.0, DL, VT);
5219 return DAG.getNode(ISD::VSELECT, DL, VT, In, TrueVal, FalseVal);
5220 }
5221
5222 // Promote bf16 conversions to f32.
// The conversion is done to f32 and the result rounded to bf16 with a
// truncation-flag operand of 0.
5223 if (VT.getVectorElementType() == MVT::bf16) {
5224 EVT F32 = VT.changeElementType(*DAG.getContext(), MVT::f32);
5225 if (IsStrict) {
5226 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {F32, MVT::Other},
5227 {Op.getOperand(0), In});
5228 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5229 {Op.getValueType(), MVT::Other},
5230 {Val.getValue(1), Val.getValue(0),
5231 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5232 }
5233 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5234 DAG.getNode(Op.getOpcode(), DL, F32, In),
5235 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5236 }
5237
5238 if (VT.isScalableVector()) {
5239 // Let common code split the operation.
5240 if (VT == MVT::nxv8f32)
5241 return Op;
5242
5243 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5244 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5245 return LowerToPredicatedOp(Op, DAG, Opcode);
5246 }
5247
5248 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5249 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5250 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5251
5252 uint64_t VTSize = VT.getFixedSizeInBits();
5253 uint64_t InVTSize = InVT.getFixedSizeInBits();
5254 if (VTSize < InVTSize) {
5255 // AArch64 doesn't have a direct vector instruction to convert
5256 // fixed point to floating point AND narrow it at the same time.
5257 // Additional rounding when the target is f32/f64 causes double
5258 // rounding issues. Conversion to f16 is fine due to narrow width.
5259 bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
5260 bool IsTargetf16 = false;
// Look through a single-use CONCAT_VECTORS whose single user is an FP_ROUND
// to f16 elements: the value is then ultimately rounded to f16.
5261 if (Op.hasOneUse() &&
5262 Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
5263 // Some vector types are split during legalization into half, followed by
5264 // concatenation, followed by rounding to the original vector type. If we
5265 // end up resolving to f16 type, we shouldn't worry about rounding errors.
5266 SDNode *U = *Op->user_begin();
5267 if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
5268 EVT TmpVT = U->user_begin()->getValueType(0);
5269 if (TmpVT.getScalarType() == MVT::f16)
5270 IsTargetf16 = true;
5271 }
5272 }
5273
// A narrowing conversion to f32 that does not end up as f16 is unrolled
// into scalar operations; for strict nodes an empty SDValue is returned
// instead.
5274 if (IsTargetf32 && !IsTargetf16) {
5275 return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
5276 }
5277
// Otherwise convert to CastVT first and round the result down to VT.
// NOTE(review): embedded line 5279 (the start of CastVT's initializer) is
// missing from this listing - confirm against the original.
5278 MVT CastVT =
5280 InVT.getVectorNumElements());
5281 if (IsStrict) {
5282 In = DAG.getNode(Opc, DL, {CastVT, MVT::Other}, {Op.getOperand(0), In});
5283 return DAG.getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
5284 {In.getValue(1), In.getValue(0),
5285 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5286 }
5287 In = DAG.getNode(Opc, DL, CastVT, In);
5288 return DAG.getNode(ISD::FP_ROUND, DL, VT, In,
5289 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5290 }
5291
// Result wider than input: sign- or zero-extend the integer input to the
// result's element width, then convert.
5292 if (VTSize > InVTSize) {
5293 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5294 EVT CastVT = VT.changeVectorElementTypeToInteger();
5295 In = DAG.getNode(CastOpc, DL, CastVT, In);
5296 if (IsStrict)
5297 return DAG.getNode(Opc, DL, {VT, MVT::Other}, {Op.getOperand(0), In});
5298 return DAG.getNode(Opc, DL, VT, In);
5299 }
5300
5301 // Use a scalar operation for conversions between single-element vectors of
5302 // the same size.
5303 if (VT.getVectorNumElements() == 1) {
// NOTE(review): embedded line 5305 (the start of Extract's initializer) is
// missing from this listing; only the index-0 constant argument is visible
// - confirm against the original.
5304 SDValue Extract =
5306 DAG.getConstant(0, DL, MVT::i64));
5307 EVT ScalarVT = VT.getScalarType();
5308 if (IsStrict)
5309 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
5310 {Op.getOperand(0), Extract});
5311 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
5312 }
5313
// Remaining same-size conversions are returned unchanged.
5314 return Op;
5315}
5316
// Custom lowering for SINT_TO_FP / UINT_TO_FP and their strict variants.
//
// Vector results are delegated to LowerVectorINT_TO_FP. For scalars:
//  - bf16 results are produced through f32 or f64 depending on how many
//    significant bits the input can have, or through a sticky-bit sequence
//    for i64 inputs that may need more than 53 bits;
//  - f16 results are produced through f32 when full FP16 is unavailable;
//  - i128 inputs and f128 results yield an empty SDValue;
//  - everything else is returned unchanged.
5317SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5318 SelectionDAG &DAG) const {
5319 if (Op.getValueType().isVector())
5320 return LowerVectorINT_TO_FP(Op, DAG);
5321
5322 bool IsStrict = Op->isStrictFPOpcode();
5323 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5324
5325 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5326 Op->getOpcode() == ISD::SINT_TO_FP;
5327
// Converts SrcVal to PromoteVT with the original opcode and then rounds the
// result to the node's result type (truncation-flag operand 0). For strict
// nodes the chain is threaded through both nodes.
5328 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5329 SDLoc DL(Op);
5330 if (IsStrict) {
5331 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {PromoteVT, MVT::Other},
5332 {Op.getOperand(0), SrcVal});
5333 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5334 {Op.getValueType(), MVT::Other},
5335 {Val.getValue(1), Val.getValue(0),
5336 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5337 }
5338 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5339 DAG.getNode(Op.getOpcode(), DL, PromoteVT, SrcVal),
5340 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5341 };
5342
5343 if (Op.getValueType() == MVT::bf16) {
// Upper bound on the number of bits needed to represent the input, computed
// from sign-bit or known-bits analysis.
5344 unsigned MaxWidth = IsSigned
5345 ? DAG.ComputeMaxSignificantBits(SrcVal)
5346 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
5347 // bf16 conversions are promoted to f32 when the input needs at most 24
// bits (e.g. when converting from i16); presumably because f32 then holds
// the value exactly.
5348 if (MaxWidth <= 24) {
5349 return IntToFpViaPromotion(MVT::f32);
5350 }
5351
5352 // bf16 conversions are promoted to f64 when the input needs at most 53
// bits (e.g. when converting from i32); presumably because f64 then holds
// the value exactly.
5353 if (MaxWidth <= 53) {
5354 return IntToFpViaPromotion(MVT::f64);
5355 }
5356
5357 // We need to be careful about i64 -> bf16.
5358 // Consider an i32 22216703.
5359 // This number cannot be represented exactly as an f32 and so a itofp will
5360 // turn it into 22216704.0 fptrunc to bf16 will turn this into 22282240.0
5361 // However, the correct bf16 was supposed to be 22151168.0
5362 // We need to use sticky rounding to get this correct.
5363 if (SrcVal.getValueType() == MVT::i64) {
5364 SDLoc DL(Op);
5365 // This algorithm is equivalent to the following:
5366 // uint64_t SrcHi = SrcVal & ~0xfffull;
5367 // uint64_t SrcLo = SrcVal & 0xfffull;
5368 // uint64_t Highest = SrcVal >> 53;
5369 // bool HasHighest = Highest != 0;
5370 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5371 // double Rounded = static_cast<double>(ToRound);
5372 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5373 // uint64_t HasLo = SrcLo != 0;
5374 // bool NeedsAdjustment = HasHighest & HasLo;
5375 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5376 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5377 // return static_cast<__bf16>(Adjusted);
5378 //
5379 // Essentially, what happens is that SrcVal either fits perfectly in a
5380 // double-precision value or it is too big. If it is sufficiently small,
5381 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5382 // ensure that u64 -> double has no rounding error by only using the 52
5383 // MSB of the input. The low order bits will get merged into a sticky bit
5384 // which will avoid issues incurred by double rounding.
5385
5386 // Signed conversion is more or less like so:
5387 // copysign((__bf16)abs(SrcVal), SrcVal)
// SignBit stays empty for unsigned conversions; for signed ones it holds
// bit 63 of the original input and SrcVal is replaced by its absolute
// value.
5388 SDValue SignBit;
5389 if (IsSigned) {
5390 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5391 DAG.getConstant(1ull << 63, DL, MVT::i64));
5392 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5393 }
5394 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5395 DAG.getConstant(~0xfffull, DL, MVT::i64));
5396 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5397 DAG.getConstant(0xfffull, DL, MVT::i64));
5398 SDValue Highest =
5399 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5400 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5401 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
// ToRound = (Highest != 0) ? SrcHi : SrcVal.
5402 SDValue ToRound =
5403 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5404 SDValue Rounded =
5405 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5406 {Op.getOperand(0), ToRound})
5407 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5408
5409 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
// Re-apply the sign of the original input to the double's bit pattern.
5410 if (SignBit) {
5411 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5412 }
5413
5414 SDValue HasHighest = DAG.getSetCC(
5415 DL,
5416 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5417 Highest, Zero64, ISD::SETNE);
5418
5419 SDValue HasLo = DAG.getSetCC(
5420 DL,
5421 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5422 SrcLo, Zero64, ISD::SETNE);
5423
// The sticky bit is needed only when low bits were masked off (HasHighest)
// and at least one of them was set (HasLo).
5424 SDValue NeedsAdjustment =
5425 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5426 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5427
5428 SDValue AdjustedBits =
5429 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5430 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
// NOTE(review): embedded line 5433 (the opcode and location arguments of
// the strict getNode call) is missing from this listing; by analogy with
// IntToFpViaPromotion above it is presumably a STRICT_FP_ROUND - confirm
// against the original.
5431 return IsStrict
5432 ? DAG.getNode(
5434 {Op.getValueType(), MVT::Other},
5435 {Rounded.getValue(1), Adjusted,
5436 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5437 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5438 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5439 }
5440 }
5441
5442 // f16 conversions are promoted to f32 when full fp16 is not supported.
5443 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5444 return IntToFpViaPromotion(MVT::f32);
5445 }
5446
5447 // i128 conversions are libcalls.
5448 if (SrcVal.getValueType() == MVT::i128)
5449 return SDValue();
5450
5451 // Other conversions are legal, unless it's to the completely software-based
5452 // fp128.
5453 if (Op.getValueType() != MVT::f128)
5454 return Op;
5455 return SDValue();
5456}
5457
// Forward declaration: LowerBITCAST below calls this helper, whose definition
// is not part of this section of the file.
5458static MVT getSVEContainerType(EVT ContentTy);
5459
5460SDValue
5461AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
5462 SelectionDAG &DAG) const {
5463 assert((Subtarget->hasSVE2() ||
5464 (Subtarget->hasSME() && Subtarget->isStreaming())) &&
5465 "Lowering loop_dependence_raw_mask or loop_dependence_war_mask "
5466 "requires SVE or SME");
5467
5468 SDLoc DL(Op);
5469 EVT VT = Op.getValueType();
5470 unsigned LaneOffset = Op.getConstantOperandVal(3);
5471 unsigned NumElements = VT.getVectorMinNumElements();
5472 uint64_t EltSizeInBytes = Op.getConstantOperandVal(2);
5473 EVT AddrTy = Op->getOperand(0).getValueType();
5474
5475 // Lane offsets and other element sizes are not supported by whilewr/rw.
5476 if (LaneOffset != 0 || !is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes))
5477 return SDValue();
5478
5479 EVT EltVT = MVT::getIntegerVT(EltSizeInBytes * 8);
5480 EVT PredVT =
5481 getPackedSVEVectorVT(EltVT).changeElementType(*DAG.getContext(), MVT::i1);
5482
5483 if (PredVT == VT) {
5484 // Legal whilewr/rw (lowered by tablegen matcher).
5485 if (AddrTy == MVT::i64)
5486 return Op;
5487
5488 // Almost legal whilewr/rw (addresses must be promoted to i64).
5489 assert(AddrTy == MVT::i32 && "Only expected i32 to be legal!");
5490 return DAG.getNode(
5491 Op.getOpcode(), DL, VT,
5492 DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Op->getOperand(0)),
5493 DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Op->getOperand(1)),
5494 DAG.getConstant(EltSizeInBytes, DL, MVT::i64), Op->getOperand(3));
5495 }
5496
5497 // Expand if this mask needs splitting (this will produce a whilelo).
5498 if (NumElements > PredVT.getVectorMinNumElements())
5499 return SDValue();
5500
5501 SDValue Mask =
5502 DAG.getNode(Op.getOpcode(), DL, PredVT, to_vector(Op->op_values()));
5503
5504 if (VT.isFixedLengthVector()) {
5505 EVT WidePredVT =
5506 PredVT.changeElementType(*DAG.getContext(), VT.getScalarType());
5507 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, WidePredVT, Mask);
5508 return convertFromScalableVector(DAG, VT, MaskAsInt);
5509 }
5510
5511 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Mask,
5512 DAG.getConstant(0, DL, MVT::i64));
5513}
5514
// Custom lowering for BITCAST.
//
// Scalable result types: illegal integer inputs are any-extended to their
// SVE container type and then cast; legal inputs with the same element count
// are returned unchanged; other packed results go through getSVESafeBitCast;
// unpacked results with a differing element count yield an empty SDValue.
//
// Non-scalable results: only f16/bf16 results are handled. An i16 input (or
// a 16-bit vector, first cast to i16) is any-extended to i32, reinterpreted
// as f32, and the hsub subregister is extracted.
5515SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5516 SelectionDAG &DAG) const {
5517 EVT OpVT = Op.getValueType();
5518 EVT ArgVT = Op.getOperand(0).getValueType();
5519
// NOTE(review): embedded line 5520 is missing from this listing; it
// presumably holds the condition guarding the return below, otherwise the
// rest of the function would be unreachable - confirm against the original.
5521 return LowerFixedLengthBitcastToSVE(Op, DAG);
5522
5523 if (OpVT.isScalableVector()) {
5524 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5525
5526 // Handle type legalisation first.
5527 if (!isTypeLegal(ArgVT)) {
5528 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5529 "Expected int->fp bitcast!");
5530
5531 // Bitcasting between unpacked vector types of different element counts is
5532 // not a NOP because the live elements are laid out differently.
5533 // 01234567
5534 // e.g. nxv2i32 = XX??XX??
5535 // nxv4f16 = X?X?X?X?
5536 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5537 return SDValue();
5538
5539 SDValue ExtResult =
5540 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
5541 Op.getOperand(0));
5542 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5543 }
5544
5545 // Bitcasts between legal types with the same element count are legal.
5546 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5547 return Op;
5548
5549 // getSVESafeBitCast does not support casting between unpacked types.
5550 if (!isPackedVectorType(OpVT, DAG))
5551 return SDValue();
5552
5553 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5554 }
5555
5556 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5557 return SDValue();
5558
5559 // Bitcasts between f16 and bf16 are legal.
5560 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5561 return Op;
5562
5563 SDValue Src = Op.getOperand(0);
5564 SDLoc DL(Op);
// A 16-bit vector input is first reinterpreted as a scalar i16.
5565 if (ArgVT.isVector() && ArgVT.getSizeInBits() == 16) {
5566 Src = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Src);
5567 ArgVT = MVT::i16;
5568 }
5569
5570 assert(ArgVT == MVT::i16);
5571
// Widen to i32, view the bits as f32, and take the hsub subregister as the
// f16/bf16 result.
5572 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
5573 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5574 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5575}
5576
5577// Returns lane if Op extracts from a two-element vector and lane is constant
5578// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5579static std::optional<uint64_t>
// NOTE(review): listing line 5580 (function name and parameter list) is
// missing from this extract; the body below uses a value named Op.
5581 SDNode *OpNode = Op.getNode();
// Anything other than an EXTRACT_VECTOR_ELT yields std::nullopt.
5582 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5583 return std::nullopt;
5584
// VT is the type of the vector being extracted from (operand 0).
5585 EVT VT = OpNode->getOperand(0).getValueType();
// NOTE(review): listing line 5586 is missing from this extract; it presumably
// defines C as the constant lane operand -- confirm.
// Require a fixed-length vector with exactly two elements and a non-null C.
5587 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5588 return std::nullopt;
5589
5590 return C->getZExtValue();
5591}
5592
5594 bool isSigned) {
// NOTE(review): listing line 5593 (start of the signature) is missing from
// this extract. Returns true only when N is a BUILD_VECTOR whose elements all
// pass the half-width range check below.
5595 EVT VT = N.getValueType();
5596
5597 if (N.getOpcode() != ISD::BUILD_VECTOR)
5598 return false;
5599
5600 for (const SDValue &Elt : N->op_values()) {
// NOTE(review): listing line 5601 is missing from this extract; it presumably
// opens an `if` that binds C to the element when it is a constant -- confirm.
5602 unsigned EltSize = VT.getScalarSizeInBits();
5603 unsigned HalfSize = EltSize / 2;
// The constant must be representable in half the element width, as a signed
// value when isSigned is set and as an unsigned value otherwise.
5604 if (isSigned) {
5605 if (!isIntN(HalfSize, C->getSExtValue()))
5606 return false;
5607 } else {
5608 if (!isUIntN(HalfSize, C->getZExtValue()))
5609 return false;
5610 }
5611 continue;
5612 }
// Reached for an element that did not enter the block above.
5613 return false;
5614 }
5615
5616 return true;
5617}
5618
// NOTE(review): listing line 5619 (the signature) is missing from this
// extract. The visible body truncates the 128-bit vector N to HalfVT.
5620 EVT VT = N.getValueType();
5621 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5622 EVT HalfVT = EVT::getVectorVT(
5623 *DAG.getContext(),
// NOTE(review): listing lines 5624-5625 (the remaining getVectorVT arguments)
// are missing from this extract, so the exact element type and count of
// HalfVT cannot be read from here.
5626 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5627}
5628
// NOTE(review): listing line 5629 (the signature) is missing from this
// extract. True when N is a SIGN_EXTEND or ANY_EXTEND node, or when
// isExtendedBUILD_VECTOR accepts N with its signed flag set.
5630 return N.getOpcode() == ISD::SIGN_EXTEND ||
5631 N.getOpcode() == ISD::ANY_EXTEND ||
5632 isExtendedBUILD_VECTOR(N, DAG, true);
5633}
5634
// NOTE(review): listing line 5635 (the signature) is missing from this
// extract. True when N is a ZERO_EXTEND or ANY_EXTEND node, or when
// isExtendedBUILD_VECTOR accepts N with its signed flag clear.
5636 return N.getOpcode() == ISD::ZERO_EXTEND ||
5637 N.getOpcode() == ISD::ANY_EXTEND ||
5638 isExtendedBUILD_VECTOR(N, DAG, false);
5639}
5640
// NOTE(review): listing line 5641 (the signature) is missing from this
// extract. True when N is an ADD or SUB whose two operands each have a single
// use and both satisfy isSignExtended.
5642 unsigned Opcode = N.getOpcode();
5643 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5644 SDValue N0 = N.getOperand(0);
5645 SDValue N1 = N.getOperand(1);
5646 return N0->hasOneUse() && N1->hasOneUse() &&
5647 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5648 }
5649 return false;
5650}
5651
// NOTE(review): listing line 5652 (the signature) is missing from this
// extract. True when N is an ADD or SUB whose two operands each have a single
// use and both satisfy isZeroExtended.
5653 unsigned Opcode = N.getOpcode();
5654 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5655 SDValue N0 = N.getOperand(0);
5656 SDValue N1 = N.getOperand(1);
5657 return N0->hasOneUse() && N1->hasOneUse() &&
5658 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5659 }
5660 return false;
5661}
5662
5663SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5664 SelectionDAG &DAG) const {
5665 // The rounding mode is in bits 23:22 of the FPSCR.
5666 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
5667 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
5668 // so that the shift + and get folded into a bitfield extract.
5669 SDLoc DL(Op);
5670
5671 SDValue Chain = Op.getOperand(0);
5672 SDValue FPCR_64 =
5673 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other},
5674 {Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL,
5675 MVT::i64)});
5676 Chain = FPCR_64.getValue(1);
5677 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64);
5678 SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32,
5679 DAG.getConstant(1U << 22, DL, MVT::i32));
5680 SDValue RMODE = DAG.getNode(ISD::SRL, DL, MVT::i32, FltRounds,
5681 DAG.getConstant(22, DL, MVT::i32));
5682 SDValue AND = DAG.getNode(ISD::AND, DL, MVT::i32, RMODE,
5683 DAG.getConstant(3, DL, MVT::i32));
5684 return DAG.getMergeValues({AND, Chain}, DL);
5685}
5686
5687SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5688 SelectionDAG &DAG) const {
5689 SDLoc DL(Op);
5690 SDValue Chain = Op->getOperand(0);
5691 SDValue RMValue = Op->getOperand(1);
5692
5693 // The rounding mode is in bits 23:22 of the FPCR.
5694 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5695 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5696 // ((arg - 1) & 3) << 22).
5697 //
5698 // The argument of llvm.set.rounding must be within the segment [0, 3], so
5699 // NearestTiesToAway (4) is not handled here. It is responsibility of the code
5700 // generated llvm.set.rounding to ensure this condition.
5701
5702 // Calculate new value of FPCR[23:22].
5703 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5704 DAG.getConstant(1, DL, MVT::i32));
5705 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5706 DAG.getConstant(0x3, DL, MVT::i32));
5707 RMValue =
5708 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5709 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5710 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5711
5712 // Get current value of FPCR.
5713 SDValue Ops[] = {
5714 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5715 SDValue FPCR =
5716 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5717 Chain = FPCR.getValue(1);
5718 FPCR = FPCR.getValue(0);
5719
5720 // Put new rounding mode into FPSCR[23:22].
5721 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5722 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5723 DAG.getConstant(RMMask, DL, MVT::i64));
5724 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5725 SDValue Ops2[] = {
5726 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5727 FPCR};
5728 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5729}
5730
5731SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5732 SelectionDAG &DAG) const {
5733 SDLoc DL(Op);
5734 SDValue Chain = Op->getOperand(0);
5735
5736 // Get current value of FPCR.
5737 SDValue Ops[] = {
5738 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5739 SDValue FPCR =
5740 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5741 Chain = FPCR.getValue(1);
5742 FPCR = FPCR.getValue(0);
5743
5744 // Truncate FPCR to 32 bits.
5745 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5746
5747 return DAG.getMergeValues({Result, Chain}, DL);
5748}
5749
5750SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5751 SelectionDAG &DAG) const {
5752 SDLoc DL(Op);
5753 SDValue Chain = Op->getOperand(0);
5754 SDValue Mode = Op->getOperand(1);
5755
5756 // Extend the specified value to 64 bits.
5757 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5758
5759 // Set new value of FPCR.
5760 SDValue Ops2[] = {
5761 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5762 FPCR};
5763 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5764}
5765
// Lowers a RESET_FPMODE node: reads FPCR, ANDs it with a mask and writes the
// result back. Returns the chain of the write.
5766SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5767 SelectionDAG &DAG) const {
5768 SDLoc DL(Op);
5769 SDValue Chain = Op->getOperand(0);
5770
5771 // Get current value of FPCR.
5772 SDValue Ops[] = {
5773 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5774 SDValue FPCR =
5775 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
// Result 1 of the read is its output chain; result 0 is the register value.
5776 Chain = FPCR.getValue(1);
5777 FPCR = FPCR.getValue(0);
5778
5779 // Clear bits that are not reserved.
5780 SDValue FPSCRMasked = DAG.getNode(
5781 ISD::AND, DL, MVT::i64, FPCR,
// NOTE(review): listing line 5782 (the mask operand of the AND) is missing
// from this extract, so the exact mask cannot be read from here. The local is
// named FPSCRMasked although the register accessed is FPCR.
5783
5784 // Set new value of FPCR.
5785 SDValue Ops2[] = {
5786 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5787 FPSCRMasked};
5788 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5789}
5790
5791static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5792 SDLoc DL, bool &IsMLA) {
5793 bool IsN0SExt = isSignExtended(N0, DAG);
5794 bool IsN1SExt = isSignExtended(N1, DAG);
5795 if (IsN0SExt && IsN1SExt)
5796 return AArch64ISD::SMULL;
5797
5798 bool IsN0ZExt = isZeroExtended(N0, DAG);
5799 bool IsN1ZExt = isZeroExtended(N1, DAG);
5800
5801 if (IsN0ZExt && IsN1ZExt)
5802 return AArch64ISD::UMULL;
5803
5804 // Select UMULL if we can replace the other operand with an extend.
5805 EVT VT = N0.getValueType();
5806 unsigned EltSize = VT.getScalarSizeInBits();
5807 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5808 if (IsN0ZExt || IsN1ZExt) {
5809 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5810 return AArch64ISD::UMULL;
5811 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5812 DAG.MaskedValueIsZero(N1, Mask)) {
5813 // For v2i64 we look more aggressively at both operands being zero, to avoid
5814 // scalarization.
5815 return AArch64ISD::UMULL;
5816 }
5817
5818 if (IsN0SExt || IsN1SExt) {
5819 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5820 return AArch64ISD::SMULL;
5821 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5822 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5823 return AArch64ISD::SMULL;
5824 }
5825
5826 if (!IsN1SExt && !IsN1ZExt)
5827 return 0;
5828
5829 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5830 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5831 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5832 IsMLA = true;
5833 return AArch64ISD::SMULL;
5834 }
5835 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5836 IsMLA = true;
5837 return AArch64ISD::UMULL;
5838 }
5839 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5840 std::swap(N0, N1);
5841 IsMLA = true;
5842 return AArch64ISD::UMULL;
5843 }
5844 return 0;
5845}
5846
5847// Transform mul<v2i64, splat(const)> into a SHL and ADD/SUB
5848// (1) multiply-by-(power-of-2 +/- 1) into shift and add/sub.
5849// mul x, (2^N + 1) --> add (shl x, N), x
5850// mul x, (2^N - 1) --> sub (shl x, N), x
5851// Examples: x * 33 --> (x << 5) + x
5852// x * 15 --> (x << 4) - x
5853// x * -33 --> -((x << 5) + x)
5854// x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
5855// (2) multiply-by-(power-of-2 +/- power-of-2) into shifts and add/sub.
5856// mul x, (2^N + 2^M) --> (add (shl x, N), (shl x, M))
5857// mul x, (2^N - 2^M) --> (sub (shl x, N), (shl x, M))
5858// Examples: x * 0x8800 --> (x << 15) + (x << 11)
5859// x * 0xf800 --> (x << 16) - (x << 11)
5860// x * -0x8800 --> -((x << 15) + (x << 11))
5861// x * -0xf800 --> -((x << 16) - (x << 11)) ; (x << 11) - (x << 16)
// NOTE(review): listing line 5862 (the signature) is missing from this
// extract. The body rewrites a multiply of operand 0 by a constant splat
// (operand 1) as shift(s) plus one add/sub, or returns an empty SDValue().
5863 const SDNode *Operand = N->getOperand(1).getNode();
5864 APInt SplatValue;
5865
5866 // Not a constant splat so should just stay as a multiplication operation
// A zero splat value is rejected here as well (getBoolValue() is false).
5867 if (!ISD::isConstantSplatVector(Operand, SplatValue) ||
5868 !SplatValue.getBoolValue())
5869 return SDValue();
5870
// Work on the magnitude; the sign is re-applied at the end by negating.
5871 bool IsNegative = SplatValue.isNegative();
5872 SplatValue = SplatValue.abs();
5873 // Placeholder for MathOp
5874 unsigned MathOp = ISD::DELETED_NODE;
5875 unsigned TZeros = SplatValue.countr_zero();
5876
5877 // Shift the splat value by all the zeros, this won't affect the parity
5878 // this will help us find the first and second multiple to use.
5879 SplatValue.lshrInPlace(TZeros);
5880
// After stripping trailing zeros: 2^n + 1 selects ADD, 2^n - 1 selects SUB.
5881 if ((SplatValue - 1).isPowerOf2())
5882 MathOp = ISD::ADD;
5883 else if ((SplatValue + 1).isPowerOf2())
5884 MathOp = ISD::SUB;
5885
5886 // If the constant is not (2^n + 1) or (2^n - 1), it would require
5887 // more than one addition/subtraction. For v2i64, the cost of
5888 // multiple vector adds/shifts often exceeds the cost of
5889 // scalarization (moving to GPRs to use a single MUL).
5890 if (MathOp != ISD::DELETED_NODE) {
5891 SDLoc DL(N);
5892 EVT VT = N->getValueType(0);
5893 SDValue LHS = N->getOperand(0);
5894
// The large shift is n plus the trailing zeros that were stripped above.
5895 unsigned ShiftAmt = MathOp == ISD::ADD ? (SplatValue - 1).logBase2()
5896 : (SplatValue + 1).logBase2();
5897 ShiftAmt += TZeros;
5898
5899 SDValue Shl =
5900 DAG.getNode(ISD::SHL, DL, VT, LHS, DAG.getConstant(ShiftAmt, DL, VT));
5901
// The other term is LHS shifted by the stripped zeros, or LHS itself when
// there were none.
5902 SDValue NewLHS = TZeros ? DAG.getNode(ISD::SHL, DL, VT, LHS,
5903 DAG.getConstant(TZeros, DL, VT))
5904 : LHS;
5905 SDValue Combined = DAG.getNode(MathOp, DL, VT, Shl, NewLHS);
5906 if (IsNegative)
5907 Combined = DAG.getNegative(Combined, DL, VT);
5908 return Combined;
5909 }
5910 return SDValue();
5911}
5912
// Custom lowering for a vector multiply. Returns Op when the multiply is
// left as-is, an empty SDValue() to fall through to expansion, a predicated
// multiply, a shift/add sequence, or an S/UMULL-based replacement.
5913SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5914 EVT VT = Op.getValueType();
5915
// Scalable vectors, and fixed-length vectors handled via SVE, become a
// predicated multiply.
5916 bool OverrideNEON = !Subtarget->isNeonAvailable();
5917 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5918 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5919
5920 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5921 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5922 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5923 "unexpected type for custom-lowering ISD::MUL");
5924 SDValue N0 = Op.getOperand(0);
5925 SDValue N1 = Op.getOperand(1);
5926 bool isMLA = false;
// OVT remembers the original result type; VT may be replaced below by the
// type of the vectors the operands were extracted from.
5927 EVT OVT = VT;
5928 if (VT.is64BitVector()) {
5929 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5930 isNullConstant(N0.getOperand(1)) &&
// NOTE(review): listing lines 5931-5932 and 5934 are missing from this
// extract, so the full condition cannot be read from here.
5933 isNullConstant(N1.getOperand(1)) &&
5935 N0 = N0.getOperand(0);
5936 N1 = N1.getOperand(0);
5937 VT = N0.getValueType();
5938 } else {
5939 if (VT == MVT::v1i64) {
5940 if (Subtarget->hasSVE())
5941 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5942 // Fall through to expand this. It is not legal.
5943 return SDValue();
5944 } else
5945 // Other vector multiplications are legal.
5946 return Op;
5947 }
5948 }
5949
5950 SDLoc DL(Op);
// selectUmullSmull returns 0 when no widening multiply applies; it may also
// swap N0/N1 and set isMLA.
5951 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5952
5953 if (!NewOpc) {
5954 if (VT.getVectorElementType() == MVT::i64) {
5955 // If SVE is available then i64 vector multiplications can also be made
5956 // legal.
5957 if (Subtarget->hasSVE())
5958 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5959 // Try to optimize the mul to a shift left and add instead of scalarizing.
5960 if (SDValue ShlAdd = convertMulToShlAdd(Op.getNode(), DAG))
5961 return ShlAdd;
5962 // Fall through to expanding as the mul is not legal.
5963 return SDValue();
5964 } else
5965 // Other vector multiplications are legal.
5966 return Op;
5967 }
5968
5969 // Legalize to a S/UMULL instruction
5970 SDValue Op0;
5971 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5972 if (!isMLA) {
5973 Op0 = skipExtensionForVectorMULL(N0, DAG);
// NOTE(review): listing line 5974 (the start of an assert, judging by the
// message string below) is missing from this extract.
5975 Op1.getValueType().is64BitVector() &&
5976 "unexpected types for extended operands to VMULL");
// The widening multiply is built in VT; the result is subvector 0 in OVT.
5977 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5978 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5979 DAG.getConstant(0, DL, MVT::i64));
5980 }
5981 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5982 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5983 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
// NOTE(review): listing lines 5984-5985 are missing from this extract; they
// presumably define N00 and N01 used below -- confirm.
5986 EVT Op1VT = Op1.getValueType();
5987 return DAG.getNode(
// NOTE(review): listing line 5988 (the first arguments of this getNode call)
// is missing from this extract.
5989 DAG.getNode(N0.getOpcode(), DL, VT,
5990 DAG.getNode(NewOpc, DL, VT,
5991 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5992 DAG.getNode(NewOpc, DL, VT,
5993 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5994 DAG.getConstant(0, DL, MVT::i64));
5995}
5996
5997static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5998 int Pattern) {
5999 if (Pattern == AArch64SVEPredPattern::all)
6000 return DAG.getConstant(1, DL, VT);
6001
6002 // When the number of active elements of a pattern matches the scalable vector
6003 // length, we can upgrade the pattern to ALL and emit a splat instead.
6004 if (unsigned PatNumElts = getNumElementsFromSVEPredPattern(Pattern)) {
6005 const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
6006 unsigned NumElts = VT.getVectorMinNumElements();
6007 unsigned VScale = Subtarget.getSVEVectorSizeInBits() / 128;
6008 if (PatNumElts == (NumElts * VScale))
6009 return DAG.getConstant(1, DL, VT);
6010 }
6011
6012 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
6013 DAG.getTargetConstant(Pattern, DL, MVT::i32));
6014}
6015
6017 bool IsSigned, bool IsEqual) {
// NOTE(review): listing line 6016 (start of the signature) is missing from
// this extract. The body tries to replace a node N that has two constant
// bound operands with a constant predicate; it returns an empty SDValue()
// when it cannot.
// For INTRINSIC_WO_CHAIN nodes operand 0 is skipped, so the two bounds are
// operands 1 and 2 instead of 0 and 1.
6018 unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
6019 unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
6020
6021 if (!N->getValueType(0).isScalableVector() ||
6022 !isa<ConstantSDNode>(N->getOperand(Op1)))
6023 return SDValue();
6024
6025 SDLoc DL(N);
6026 APInt Y = N->getConstantOperandAPInt(Op1);
6027
6028 // When the second operand is the maximum value, comparisons that include
6029 // equality can never fail and thus we can return an all active predicate.
6030 if (IsEqual)
6031 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
6032 return DAG.getConstant(1, DL, N->getValueType(0));
6033
6034 if (!isa<ConstantSDNode>(N->getOperand(Op0)))
6035 return SDValue();
6036
6037 APInt X = N->getConstantOperandAPInt(Op0);
6038
// Active lane count is Y - X, plus one when the comparison includes
// equality; give up on overflow in either step.
6039 bool Overflow;
6040 APInt NumActiveElems =
6041 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
6042
6043 if (Overflow)
6044 return SDValue();
6045
6046 if (IsEqual) {
6047 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
6048 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
6049 : NumActiveElems.uadd_ov(One, Overflow);
6050 if (Overflow)
6051 return SDValue();
6052 }
6053
6054 std::optional<unsigned> PredPattern =
// NOTE(review): listing lines 6055 and 6057 (the initialisers of PredPattern
// and MinSVEVectorSize) are missing from this extract.
6056 unsigned MinSVEVectorSize = std::max(
6058 unsigned ElementSize = 128 / N->getValueType(0).getVectorMinNumElements();
// Only use a fixed pattern when the lane count fits in
// MinSVEVectorSize / ElementSize lanes.
6059 if (PredPattern != std::nullopt &&
6060 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
6061 return getPTrue(DAG, DL, N->getValueType(0), *PredPattern);
6062
6063 return SDValue();
6064}
6065
6066// Match get.active.lane.mask(0, cttz.elts(x)) -> brkb(x)
6067// Match get.active.lane.mask(0, add(cttz.elts(x), 1)) -> brka(x)
// NOTE(review): listing line 6068 (the signature) is missing from this
// extract. The body returns a brka/brkb intrinsic node reinterpreted to N's
// result type, or an empty SDValue() when the pattern does not match.
6069 SDLoc DL(N);
6070 EVT VT = N->getValueType(0);
6071 // Lower bound must be 0.
6072 if (!isZeroOrZeroSplat(N->getOperand(0)))
6073 return SDValue();
6074
6075 SDValue Upper = N->getOperand(1);
6076
6077 // Default to brkb, switch to brka if we find a +1.
6078 unsigned BrkID = Intrinsic::aarch64_sve_brkb_z;
6079 if (Upper->getOpcode() == ISD::ADD && isOneOrOneSplat(Upper.getOperand(1))) {
6080 Upper = Upper.getOperand(0);
6081 BrkID = Intrinsic::aarch64_sve_brka_z;
6082 }
6083
6084 // We're looking for an upper bound based on CTTZ_ELTS; this would be selected
6085 // as a cntp(brk(Pg, Mask)), but if we're just going to make a whilelo based
6086 // on that then we just need the brk.
// The CTTZ_ELTS node's first operand must also have the result type VT.
6087 if (Upper.getOpcode() != AArch64ISD::CTTZ_ELTS || !VT.isScalableVector() ||
6088 Upper.getOperand(0).getValueType() != VT)
6089 return SDValue();
6090
6091 SDValue Pg = Upper->getOperand(0);
6092 SDValue Mask = Upper->getOperand(1);
6093
6094 // brk{a,b} only support .b forms, so cast to make sure all our p regs match.
6095 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
6096 SDValue MaskR =
6097 DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Mask);
6098 SDValue ID = DAG.getTargetConstant(BrkID, DL, MVT::i64);
6099 SDValue Brk =
6100 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv16i1, ID, Pg, MaskR);
// Cast the nxv16i1 result back to the requested predicate type.
6101 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Brk);
6102}
6103
6104// Returns a safe bitcast between two scalable vector predicates, where
6105// any newly created lanes from a widening bitcast are defined as zero.
// NOTE(review): listing line 6106 (the signature) is missing from this
// extract. The body casts the predicate Op to predicate type VT.
6107 SDLoc DL(Op);
6108 EVT InVT = Op.getValueType();
6109
6110 assert(InVT.getVectorElementType() == MVT::i1 &&
6111 VT.getVectorElementType() == MVT::i1 &&
6112 "Expected a predicate-to-predicate bitcast");
// NOTE(review): listing line 6113 (the start of a second assert, judging by
// the message string below) is missing from this extract.
6114 InVT.isScalableVector() &&
6115 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
6116 "Only expect to cast between legal scalable predicate types!");
6117
6118 // Return the operand if the cast isn't changing type,
6119 if (InVT == VT)
6120 return Op;
6121
6122 // Look through casts to <vscale x 16 x i1> when their input has more lanes
6123 // than VT. This will increase the chances of removing casts that introduce
6124 // new lanes, which have to be explicitly zero'd.
6125 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
6126 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
6127 Op.getOperand(1).getValueType().bitsGT(VT))
6128 Op = Op.getOperand(1);
6129
6130 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
6131
6132 // We only have to zero the lanes if new lanes are being defined, e.g. when
6133 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
6134 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
6135 // we can return here.
6136 if (InVT.bitsGT(VT))
6137 return Reinterpret;
6138
6139 // Check if the other lanes are already known to be zeroed by
6140 // construction.
// NOTE(review): listing line 6141 (the condition guarding this return) is
// missing from this extract.
6142 return Reinterpret;
6143
6144 // Zero the newly introduced lanes.
// The mask is a constant 1 of the input type reinterpreted as VT, then ANDed
// with the reinterpreted value.
6145 SDValue Mask = DAG.getConstant(1, DL, InVT);
6146 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
6147 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
6148}
6149
// Emits a call to the RTLIB::SMEABI_SME_STATE libcall and returns an i64
// node holding operand 0 of the call's result ANDed with 1 (the PSTATE.SM
// bit, per the inline comment below).
// NOTE(review): the VT parameter is not used in the visible body, and the
// chain produced by the call (CallResult.second) is not returned -- confirm
// that callers do not need either.
6150SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
6151 SDValue Chain, SDLoc DL,
6152 EVT VT) const {
6153 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
6154 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
6155 SDValue Callee =
6156 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
// The libcall's return type is a struct of two 64-bit integers.
6157 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
6158 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
6159 TargetLowering::CallLoweringInfo CLI(DAG);
// NOTE(review): listing line 6160 is missing from this extract; it presumably
// declares Args, which is moved into the call below -- confirm.
6161 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
6162 DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy, Callee,
6163 std::move(Args));
6164 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
6165 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
6166 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
6167 Mask);
6168}
6169
6170// Lower an SME LDR/STR ZA intrinsic
6171// Case 1: If the vector number (vecnum) is an immediate in range, it gets
6172// folded into the instruction
6173// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
6174// Case 2: If the vecnum is not an immediate, then it is used to modify the base
6175// and tile slice registers
6176// ldr(%tileslice, %ptr, %vecnum)
6177// ->
6178// %svl = rdsvl
6179// %ptr2 = %ptr + %svl * %vecnum
6180// %tileslice2 = %tileslice + %vecnum
6181// ldr [%tileslice2, 0], [%ptr2, 0]
6182// Case 3: If the vecnum is an immediate out of range, then the same is done as
6183// case 2, but the base and slice registers are modified by
6184// vecnum - (vecnum % 16), a multiple of 16, and the remainder (vecnum % 16) is
6185// folded into the instruction. This means that successive loads and stores that
6186// are offset from each other can share the same base and slice register updates.
6187// ldr(%tileslice, %ptr, 22)
6188// ldr(%tileslice, %ptr, 23)
6189// ->
6190// %svl = rdsvl
6191// %ptr2 = %ptr + %svl * 16
6192// %tileslice2 = %tileslice + 16
6193// ldr [%tileslice2, 6], [%ptr2, 6]
6194// ldr [%tileslice2, 7], [%ptr2, 7]
6195// Case 4: If the vecnum is an add of an immediate, then the non-immediate
6196// operand and the immediate can be folded into the instruction, like case 2.
6197// ldr(%tileslice, %ptr, %vecnum + 7)
6198// ldr(%tileslice, %ptr, %vecnum + 8)
6199// ->
6200// %svl = rdsvl
6201// %ptr2 = %ptr + %svl * %vecnum
6202// %tileslice2 = %tileslice + %vecnum
6203// ldr [%tileslice2, 7], [%ptr2, 7]
6204// ldr [%tileslice2, 8], [%ptr2, 8]
6205// Case 5: The vecnum being an add of an immediate out of range is also handled,
6206// in which case the same remainder logic as case 3 is used.
// NOTE(review): listing line 6207 (the signature) is missing from this
// extract. The body builds an SME_ZA_LDR or SME_ZA_STR node (selected by
// IsLoad) from the intrinsic node N.
6208 SDLoc DL(N);
6209
// Operand 0 of N is the chain (used at the end); operands 2-4 are the tile
// slice, the base pointer and the vector number.
6210 SDValue TileSlice = N->getOperand(2);
6211 SDValue Base = N->getOperand(3);
6212 SDValue VecNum = N->getOperand(4);
// The vector number is split into a constant part and an optional variable
// part; by default it is treated as entirely variable.
6213 int32_t ConstAddend = 0;
6214 SDValue VarAddend = VecNum;
6215
6216 // If the vnum is an add of an immediate, we can fold it into the instruction
6217 if (VecNum.getOpcode() == ISD::ADD &&
6218 isa<ConstantSDNode>(VecNum.getOperand(1))) {
6219 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
6220 VarAddend = VecNum.getOperand(0);
6221 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
// A plain constant vnum has no variable part at all.
6222 ConstAddend = ImmNode->getSExtValue();
6223 VarAddend = SDValue();
6224 }
6225
// ImmAddend (ConstAddend % 16) becomes the instruction immediate; whatever
// is left of the constant, if non-zero, is added to the variable part.
6226 int32_t ImmAddend = ConstAddend % 16;
6227 if (int32_t C = (ConstAddend - ImmAddend)) {
6228 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
6229 VarAddend = VarAddend
6230 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
6231 : CVal;
6232 }
6233
6234 if (VarAddend) {
6235 // Get the vector length that will be multiplied by vnum
6236 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6237 DAG.getConstant(1, DL, MVT::i32));
6238
6239 // Multiply SVL and vnum then add it to the base
6240 SDValue Mul = DAG.getNode(
6241 ISD::MUL, DL, MVT::i64,
6242 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
6243 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
6244 // Just add vnum to the tileslice
6245 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
6246 }
6247
// Operands of the new node: chain, tile slice, base, immediate offset.
6248 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
6249 DL, MVT::Other,
6250 {/*Chain=*/N.getOperand(0), TileSlice, Base,
6251 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
6252}
6253
// NOTE(review): listing line 6254 (the signature) is missing from this
// extract. The body lowers a node with three operands (two vectors and a
// mask, operands 1-3 of Op) to the aarch64_sve_match intrinsic.
6255 SDLoc DL(Op);
6256 SDValue ID =
6257 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, DL, MVT::i64);
6258
6259 auto Op1 = Op.getOperand(1);
6260 auto Op2 = Op.getOperand(2);
6261 auto Mask = Op.getOperand(3);
6262
6263 EVT Op1VT = Op1.getValueType();
6264 EVT Op2VT = Op2.getValueType();
6265 EVT ResVT = Op.getValueType();
6266
6267 assert((Op1VT.getVectorElementType() == MVT::i8 ||
6268 Op1VT.getVectorElementType() == MVT::i16) &&
6269 "Expected 8-bit or 16-bit characters.");
6270
6271 // Scalable vector type used to wrap operands.
6272 // A single container is enough for both operands because ultimately the
6273 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
6274 EVT OpContainerVT = Op1VT.isScalableVector()
6275 ? Op1VT
// NOTE(review): listing line 6276 (the else-arm of this conditional, used for
// a fixed-length Op1VT) is missing from this extract.
6277
6278 if (Op2VT.is128BitVector()) {
6279 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
6280 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
6281 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
6282 if (ResVT.isScalableVector())
6283 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, DL, OpContainerVT, Op2,
6284 DAG.getTargetConstant(0, DL, MVT::i64));
6285 } else {
6286 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
// Treat all of Op2 as one integer of its full bit width, splat that integer
// across a packed SVE vector, then reinterpret it as the container type.
6287 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
6288 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
6289 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
6290 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
6291 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op2IntVT, Op2,
6292 DAG.getConstant(0, DL, MVT::i64));
6293 Op2 = DAG.getSplatVector(Op2PromotedVT, DL, Op2);
6294 Op2 = DAG.getBitcast(OpContainerVT, Op2);
6295 }
6296
6297 // If the result is scalable, we just need to carry out the MATCH.
6298 if (ResVT.isScalableVector())
6299 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResVT, ID, Mask, Op1, Op2);
6300
6301 // If the result is fixed, we can still use MATCH but we need to wrap the
6302 // first operand and the mask in scalable vectors before doing so.
6303
6304 // Wrap the operands.
6305 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
// The mask is sign-extended to Op1's type before being converted to a
// scalable predicate.
6306 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, Op1VT, Mask);
6307 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6308
6309 // Carry out the match.
6310 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Mask.getValueType(),
6311 ID, Mask, Op1, Op2);
6312
6313 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
6314 // (v16i8/v8i8).
6315 Match = DAG.getNode(ISD::SIGN_EXTEND, DL, OpContainerVT, Match);
6316 Match = convertFromScalableVector(DAG, Op1VT, Match);
6317 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Match);
6318}
6319
6320SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6321 SelectionDAG &DAG) const {
6322 unsigned IntNo = Op.getConstantOperandVal(1);
6323 SDLoc DL(Op);
6324 switch (IntNo) {
6325 default:
6326 return SDValue(); // Don't custom lower most intrinsics.
6327 case Intrinsic::aarch64_prefetch: {
6328 SDValue Chain = Op.getOperand(0);
6329 SDValue Addr = Op.getOperand(2);
6330
6331 unsigned IsWrite = Op.getConstantOperandVal(3);
6332 unsigned Locality = Op.getConstantOperandVal(4);
6333 unsigned IsStream = Op.getConstantOperandVal(5);
6334 unsigned IsData = Op.getConstantOperandVal(6);
6335 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
6336 (!IsData << 3) | // IsDataCache bit
6337 (Locality << 1) | // Cache level bits
6338 (unsigned)IsStream; // Stream bit
6339
6340 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
6341 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
6342 }
6343 case Intrinsic::aarch64_range_prefetch: {
6344 SDValue Chain = Op.getOperand(0);
6345 SDValue Addr = Op.getOperand(2);
6346
6347 unsigned IsWrite = Op.getConstantOperandVal(3);
6348 unsigned IsStream = Op.getConstantOperandVal(4);
6349 unsigned PrfOp = (IsStream << 2) | IsWrite;
6350
6351 SDValue Metadata = Op.getOperand(5);
6352 return DAG.getNode(AArch64ISD::RANGE_PREFETCH, DL, MVT::Other, Chain,
6353 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr,
6354 Metadata);
6355 }
6356 case Intrinsic::aarch64_prefetch_ir:
6357 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other,
6358 Op.getOperand(0), // Chain
6359 DAG.getTargetConstant(24, DL, MVT::i32), // Rt
6360 Op.getOperand(2)); // Addr
6361 case Intrinsic::aarch64_sme_str:
6362 case Intrinsic::aarch64_sme_ldr: {
6363 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
6364 }
6365 case Intrinsic::aarch64_sme_za_enable:
6366 return DAG.getNode(
6367 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6368 Op->getOperand(0), // Chain
6369 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6370 case Intrinsic::aarch64_sme_za_disable:
6371 return DAG.getNode(
6372 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6373 Op->getOperand(0), // Chain
6374 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6375 }
6376}
6377
6378SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
6379 SelectionDAG &DAG) const {
6380 unsigned IntNo = Op.getConstantOperandVal(1);
6381 SDLoc DL(Op);
6382 switch (IntNo) {
6383 default:
6384 return SDValue(); // Don't custom lower most intrinsics.
6385 case Intrinsic::aarch64_mops_memset_tag: {
6386 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
6387 SDValue Chain = Node->getChain();
6388 SDValue Dst = Op.getOperand(2);
6389 SDValue Val = Op.getOperand(3);
6390 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
6391 SDValue Size = Op.getOperand(4);
6392 auto Alignment = Node->getMemOperand()->getAlign();
6393 bool IsVol = Node->isVolatile();
6394 auto DstPtrInfo = Node->getPointerInfo();
6395
6396 const auto &SDI =
6397 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
6398 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
6399 Chain, Dst, Val, Size, Alignment, IsVol,
6400 DstPtrInfo, MachinePointerInfo{});
6401
6402 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6403 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6404 // LowerOperationWrapper will complain that the number of results has
6405 // changed.
6406 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6407 }
6408 }
6409}
6410
/// Custom lowering hook for INTRINSIC_WO_CHAIN nodes.
///
/// The intrinsic ID is read from operand 0 of \p Op and the intrinsic's own
/// arguments are read starting at operand 1. Each handled intrinsic is
/// rewritten into a generic ISD node, an AArch64ISD node, or handed to a
/// dedicated helper. Intrinsics not listed in the switch yield an empty
/// SDValue.
///
/// \param Op  The intrinsic node to lower.
/// \param DAG The DAG in which replacement nodes are created.
/// \returns The replacement value, or an empty SDValue when no custom lowering
///          applies.
6411SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6412 SelectionDAG &DAG) const {
6413 unsigned IntNo = Op.getConstantOperandVal(0);
6414 SDLoc DL(Op);
6415 switch (IntNo) {
6416 default: return SDValue(); // Don't custom lower most intrinsics.
6417 case Intrinsic::thread_pointer: {
6418 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6419 return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
6420 }
// The whilewr_* / whilerw_* intrinsics map to LOOP_DEPENDENCE_{WAR,RAW}_MASK.
// The _b/_h/_s/_d variants differ only in the third operand (1, 2, 4, 8),
// presumably the element size in bytes -- TODO confirm against the ISD node
// definition. The fourth operand is always the constant 0.
6421 case Intrinsic::aarch64_sve_whilewr_b:
6422 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6423 Op.getOperand(1), Op.getOperand(2),
6424 DAG.getConstant(1, DL, MVT::i64),
6425 DAG.getConstant(0, DL, MVT::i64));
6426 case Intrinsic::aarch64_sve_whilewr_h:
6427 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6428 Op.getOperand(1), Op.getOperand(2),
6429 DAG.getConstant(2, DL, MVT::i64),
6430 DAG.getConstant(0, DL, MVT::i64));
6431 case Intrinsic::aarch64_sve_whilewr_s:
6432 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6433 Op.getOperand(1), Op.getOperand(2),
6434 DAG.getConstant(4, DL, MVT::i64),
6435 DAG.getConstant(0, DL, MVT::i64));
6436 case Intrinsic::aarch64_sve_whilewr_d:
6437 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6438 Op.getOperand(1), Op.getOperand(2),
6439 DAG.getConstant(8, DL, MVT::i64),
6440 DAG.getConstant(0, DL, MVT::i64));
6441 case Intrinsic::aarch64_sve_whilerw_b:
6442 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6443 Op.getOperand(1), Op.getOperand(2),
6444 DAG.getConstant(1, DL, MVT::i64),
6445 DAG.getConstant(0, DL, MVT::i64));
6446 case Intrinsic::aarch64_sve_whilerw_h:
6447 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6448 Op.getOperand(1), Op.getOperand(2),
6449 DAG.getConstant(2, DL, MVT::i64),
6450 DAG.getConstant(0, DL, MVT::i64));
6451 case Intrinsic::aarch64_sve_whilerw_s:
6452 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6453 Op.getOperand(1), Op.getOperand(2),
6454 DAG.getConstant(4, DL, MVT::i64),
6455 DAG.getConstant(0, DL, MVT::i64));
6456 case Intrinsic::aarch64_sve_whilerw_d:
6457 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6458 Op.getOperand(1), Op.getOperand(2),
6459 DAG.getConstant(8, DL, MVT::i64),
6460 DAG.getConstant(0, DL, MVT::i64));
// NEON abs: a scalar i64 is wrapped into v1i64, ISD::ABS is applied there and
// lane 0 is extracted again; legal integer vectors use ISD::ABS directly; any
// other type is a fatal error.
6461 case Intrinsic::aarch64_neon_abs: {
6462 EVT Ty = Op.getValueType();
6463 if (Ty == MVT::i64) {
6464 SDValue Result =
6465 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op.getOperand(1));
6466 Result = DAG.getNode(ISD::ABS, DL, MVT::v1i64, Result);
6467 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Result,
6468 DAG.getConstant(0, DL, MVT::i64));
6469 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6470 return DAG.getNode(ISD::ABS, DL, Ty, Op.getOperand(1));
6471 } else {
6472 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6473 }
6474 }
6475 case Intrinsic::aarch64_neon_pmull64: {
6476 SDValue LHS = Op.getOperand(1);
6477 SDValue RHS = Op.getOperand(2);
6478
// NOTE(review): the initializer expressions of LHSLane and RHSLane are not
// visible in this listing (the lines following each '=' are missing). From the
// asserts below, each is either empty or a lane index of 0 or 1.
6479 std::optional<uint64_t> LHSLane =
6481 std::optional<uint64_t> RHSLane =
6483
6484 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6485 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6486
6487 // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
6488 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6489 // which ISel recognizes better. For example, generate a ldr into d*
6490 // registers as opposed to a GPR load followed by a fmov.
6491 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6492 std::optional<uint64_t> OtherLane,
6493 const SDLoc &DL,
6494 SelectionDAG &DAG) -> SDValue {
6495 // If the operand is an higher half itself, rewrite it to
6496 // extract_high_v2i64; this way aarch64_neon_pmull64 could
6497 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6498 if (NLane == 1)
6499 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6500 N.getOperand(0), DAG.getConstant(1, DL, MVT::i64));
6501
6502 // Operand N is not a higher half but the other operand is.
6503 if (OtherLane == 1) {
6504 // If this operand is a lower half, rewrite it to
6505 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6506 // align lanes of two operands. A roundtrip sequence (to move from lane
6507 // 1 to lane 0) is like this:
6508 // mov x8, v0.d[1]
6509 // fmov d0, x8
6510 if (NLane == 0)
6511 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6512 DAG.getNode(AArch64ISD::DUPLANE64, DL, MVT::v2i64,
6513 N.getOperand(0),
6514 DAG.getConstant(0, DL, MVT::i64)),
6515 DAG.getConstant(1, DL, MVT::i64));
6516
6517 // Otherwise just dup from main to all lanes.
6518 return DAG.getNode(AArch64ISD::DUP, DL, MVT::v1i64, N);
6519 }
6520
6521 // Neither operand is an extract of higher half, so codegen may just use
6522 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
6523 assert(N.getValueType() == MVT::i64 &&
6524 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6525 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, N);
6526 };
6527
6528 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
6529 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);
6530
6531 return DAG.getNode(AArch64ISD::PMULL, DL, Op.getValueType(), LHS, RHS);
6532 }
// NEON integer min/max map one-to-one onto the generic ISD min/max nodes.
6533 case Intrinsic::aarch64_neon_smax:
6534 return DAG.getNode(ISD::SMAX, DL, Op.getValueType(), Op.getOperand(1),
6535 Op.getOperand(2));
6536 case Intrinsic::aarch64_neon_umax:
6537 return DAG.getNode(ISD::UMAX, DL, Op.getValueType(), Op.getOperand(1),
6538 Op.getOperand(2));
6539 case Intrinsic::aarch64_neon_smin:
6540 return DAG.getNode(ISD::SMIN, DL, Op.getValueType(), Op.getOperand(1),
6541 Op.getOperand(2));
6542 case Intrinsic::aarch64_neon_umin:
6543 return DAG.getNode(ISD::UMIN, DL, Op.getValueType(), Op.getOperand(1),
6544 Op.getOperand(2));
// Scalar saturating narrows: for an i32 result the same intrinsic (operand 0
// is the intrinsic ID) is re-emitted with an f32 result and its argument
// bitcast to f64, and the result is bitcast back to i32. An f32 result is not
// custom lowered (empty SDValue).
6545 case Intrinsic::aarch64_neon_scalar_sqxtn:
6546 case Intrinsic::aarch64_neon_scalar_sqxtun:
6547 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6548 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6549 if (Op.getValueType() == MVT::i32)
6550 return DAG.getNode(ISD::BITCAST, DL, MVT::i32,
6551 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
6552 Op.getOperand(0),
6553 DAG.getNode(ISD::BITCAST, DL, MVT::f64,
6554 Op.getOperand(1))));
6555 return SDValue();
6556 }
6557 case Intrinsic::aarch64_neon_sqxtn:
6558 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6559 Op.getOperand(1));
6560 case Intrinsic::aarch64_neon_sqxtun:
6561 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6562 Op.getOperand(1));
6563 case Intrinsic::aarch64_neon_uqxtn:
6564 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6565 Op.getOperand(1));
// Saturating shift-right-narrow family: vector results are expressed as a
// saturating truncate of a shift node built in the source operand's type;
// non-vector results are passed to lowerIntNeonIntrinsic with the last operand
// flagged as an immediate.
6566 case Intrinsic::aarch64_neon_sqshrn:
6567 if (Op.getValueType().isVector())
6568 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6569 DAG.getNode(AArch64ISD::VASHR, DL,
6570 Op.getOperand(1).getValueType(),
6571 Op.getOperand(1), Op.getOperand(2)));
6572 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHRN, DAG,
6573 /*LastOperandIsImm=*/true);
6574 case Intrinsic::aarch64_neon_sqshrun:
6575 if (Op.getValueType().isVector())
6576 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6577 DAG.getNode(AArch64ISD::VASHR, DL,
6578 Op.getOperand(1).getValueType(),
6579 Op.getOperand(1), Op.getOperand(2)));
6580 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHRUN, DAG,
6581 /*LastOperandIsImm=*/true);
6582 case Intrinsic::aarch64_neon_uqshrn:
6583 if (Op.getValueType().isVector())
6584 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6585 DAG.getNode(AArch64ISD::VLSHR, DL,
6586 Op.getOperand(1).getValueType(),
6587 Op.getOperand(1), Op.getOperand(2)));
6588 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSHRN, DAG,
6589 /*LastOperandIsImm=*/true);
6590 case Intrinsic::aarch64_neon_sqrshrn:
6591 if (Op.getValueType().isVector())
6592 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6593 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6594 Op.getOperand(1).getValueType(),
6595 Op.getOperand(1), Op.getOperand(2)));
6596 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHRN, DAG,
6597 /*LastOperandIsImm=*/true);
6598 case Intrinsic::aarch64_neon_sqrshrun:
6599 if (Op.getValueType().isVector())
6600 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6601 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6602 Op.getOperand(1).getValueType(),
6603 Op.getOperand(1), Op.getOperand(2)));
6604 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHRUN, DAG,
6605 /*LastOperandIsImm=*/true);
6606 case Intrinsic::aarch64_neon_uqrshrn:
6607 if (Op.getValueType().isVector())
6608 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6609 DAG.getNode(AArch64ISD::URSHR_I, DL,
6610 Op.getOperand(1).getValueType(),
6611 Op.getOperand(1), Op.getOperand(2)));
6612 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQRSHRN, DAG,
6613 /*LastOperandIsImm=*/true);
// The following NEON saturating operations are forwarded to
// lowerIntNeonIntrinsic with the matching AArch64ISD opcode.
6614 case Intrinsic::aarch64_neon_sqdmulh:
6615 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQDMULH, DAG);
6616 case Intrinsic::aarch64_neon_sqrdmulh:
6617 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMULH, DAG);
6618 case Intrinsic::aarch64_neon_sqrdmlah:
6619 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMLAH, DAG);
6620 case Intrinsic::aarch64_neon_sqrdmlsh:
6621 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMLSH, DAG);
6622 case Intrinsic::aarch64_neon_sqrshl:
6623 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHL, DAG);
6624 case Intrinsic::aarch64_neon_sqshl:
6625 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHL, DAG);
6626 case Intrinsic::aarch64_neon_uqrshl:
6627 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQRSHL, DAG);
6628 case Intrinsic::aarch64_neon_uqshl:
6629 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSHL, DAG);
// Saturating add/sub: vector results use the generic ISD *ADDSAT / *SUBSAT
// nodes; non-vector results go through lowerIntNeonIntrinsic.
6630 case Intrinsic::aarch64_neon_sqadd:
6631 if (Op.getValueType().isVector())
6632 return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6633 Op.getOperand(2));
6634 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQADD, DAG);
6635
6636 case Intrinsic::aarch64_neon_sqsub:
6637 if (Op.getValueType().isVector())
6638 return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6639 Op.getOperand(2));
6640 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSUB, DAG);
6641
6642 case Intrinsic::aarch64_neon_uqadd:
6643 if (Op.getValueType().isVector())
6644 return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6645 Op.getOperand(2));
6646 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQADD, DAG);
6647 case Intrinsic::aarch64_neon_suqadd:
6648 return lowerIntNeonIntrinsic(Op, AArch64ISD::SUQADD, DAG);
6649 case Intrinsic::aarch64_neon_usqadd:
6650 return lowerIntNeonIntrinsic(Op, AArch64ISD::USQADD, DAG);
6651 case Intrinsic::aarch64_neon_uqsub:
6652 if (Op.getValueType().isVector())
6653 return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6654 Op.getOperand(2));
6655 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSUB, DAG);
6656 case Intrinsic::aarch64_neon_sqdmulls_scalar:
6657 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQDMULL, DAG);
6658 case Intrinsic::aarch64_neon_sqabs:
6659 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQABS, DAG);
6660 case Intrinsic::aarch64_neon_sqneg:
6661 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQNEG, DAG);
// The SVE while* intrinsics below are delegated to optimizeIncrementingWhile;
// only the signedness and "or equal" flags differ between them.
6662 case Intrinsic::aarch64_sve_whilelt:
6663 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6664 /*IsEqual=*/false);
6665 case Intrinsic::aarch64_sve_whilels:
6666 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/false,
6667 /*IsEqual=*/true);
6668 case Intrinsic::aarch64_sve_whilele:
6669 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6670 /*IsEqual=*/true);
// The following SVE intrinsics map directly onto an AArch64ISD node with the
// intrinsic arguments passed through in the same order.
6671 case Intrinsic::aarch64_sve_sunpkhi:
6672 return DAG.getNode(AArch64ISD::SUNPKHI, DL, Op.getValueType(),
6673 Op.getOperand(1));
6674 case Intrinsic::aarch64_sve_sunpklo:
6675 return DAG.getNode(AArch64ISD::SUNPKLO, DL, Op.getValueType(),
6676 Op.getOperand(1));
6677 case Intrinsic::aarch64_sve_uunpkhi:
6678 return DAG.getNode(AArch64ISD::UUNPKHI, DL, Op.getValueType(),
6679 Op.getOperand(1));
6680 case Intrinsic::aarch64_sve_uunpklo:
6681 return DAG.getNode(AArch64ISD::UUNPKLO, DL, Op.getValueType(),
6682 Op.getOperand(1));
6683 case Intrinsic::aarch64_sve_clasta_n:
6684 return DAG.getNode(AArch64ISD::CLASTA_N, DL, Op.getValueType(),
6685 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6686 case Intrinsic::aarch64_sve_clastb_n:
6687 return DAG.getNode(AArch64ISD::CLASTB_N, DL, Op.getValueType(),
6688 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6689 case Intrinsic::aarch64_sve_lasta:
6690 return DAG.getNode(AArch64ISD::LASTA, DL, Op.getValueType(),
6691 Op.getOperand(1), Op.getOperand(2));
6692 case Intrinsic::aarch64_sve_lastb:
6693 return DAG.getNode(AArch64ISD::LASTB, DL, Op.getValueType(),
6694 Op.getOperand(1), Op.getOperand(2));
6695 case Intrinsic::aarch64_sve_tbl:
6696 return DAG.getNode(AArch64ISD::TBL, DL, Op.getValueType(), Op.getOperand(1),
6697 Op.getOperand(2));
6698 case Intrinsic::aarch64_sve_trn1:
6699 return DAG.getNode(AArch64ISD::TRN1, DL, Op.getValueType(),
6700 Op.getOperand(1), Op.getOperand(2));
6701 case Intrinsic::aarch64_sve_trn2:
6702 return DAG.getNode(AArch64ISD::TRN2, DL, Op.getValueType(),
6703 Op.getOperand(1), Op.getOperand(2));
6704 case Intrinsic::aarch64_sve_uzp1:
6705 return DAG.getNode(AArch64ISD::UZP1, DL, Op.getValueType(),
6706 Op.getOperand(1), Op.getOperand(2));
6707 case Intrinsic::aarch64_sve_uzp2:
6708 return DAG.getNode(AArch64ISD::UZP2, DL, Op.getValueType(),
6709 Op.getOperand(1), Op.getOperand(2));
6710 case Intrinsic::aarch64_sve_zip1:
6711 return DAG.getNode(AArch64ISD::ZIP1, DL, Op.getValueType(),
6712 Op.getOperand(1), Op.getOperand(2));
6713 case Intrinsic::aarch64_sve_zip2:
6714 return DAG.getNode(AArch64ISD::ZIP2, DL, Op.getValueType(),
6715 Op.getOperand(1), Op.getOperand(2));
6716 case Intrinsic::aarch64_sve_splice:
6717 return DAG.getNode(AArch64ISD::SPLICE, DL, Op.getValueType(),
6718 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6719 case Intrinsic::aarch64_sve_ptrue:
6720 return getPTrue(DAG, DL, Op.getValueType(), Op.getConstantOperandVal(1));
// For the *_MERGE_PASSTHRU nodes in this function the intrinsic's arguments are
// reordered: intrinsic operands 2 and 3 come first and intrinsic operand 1 is
// passed last (presumably the passthru value -- TODO confirm against the node
// definitions).
6721 case Intrinsic::aarch64_sve_clz:
6722 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(),
6723 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
// cntsd: take the RDSVL node with constant operand 1 and shift it right by 3
// (i.e. divide by 8), marking the shift as exact.
6724 case Intrinsic::aarch64_sme_cntsd: {
6725 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6726 DAG.getConstant(1, DL, MVT::i32));
6727 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
6728 DAG.getConstant(3, DL, MVT::i32), SDNodeFlags::Exact);
6729 }
6730 case Intrinsic::aarch64_sve_cnt: {
6731 SDValue Data = Op.getOperand(3);
6732 // CTPOP only supports integer operands.
6733 if (Data.getValueType().isFloatingPoint())
6734 Data = DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Data);
6735 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, Op.getValueType(),
6736 Op.getOperand(2), Data, Op.getOperand(1));
6737 }
6738 case Intrinsic::aarch64_sve_dupq_lane:
6739 return LowerDUPQLane(Op, DAG);
// svbool conversions: aarch64svcount values are converted with a plain
// BITCAST; everything else goes through getSVEPredicateBitCast.
6740 case Intrinsic::aarch64_sve_convert_from_svbool:
6741 if (Op.getValueType() == MVT::aarch64svcount)
6742 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Op.getOperand(1));
6743 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6744 case Intrinsic::aarch64_sve_convert_to_svbool:
6745 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6746 return DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, Op.getOperand(1));
6747 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6748 case Intrinsic::aarch64_sve_fneg:
6749 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6750 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6751 case Intrinsic::aarch64_sve_frintp:
6752 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, Op.getValueType(),
6753 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6754 case Intrinsic::aarch64_sve_frintm:
6755 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, Op.getValueType(),
6756 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6757 case Intrinsic::aarch64_sve_frinti:
6758 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
6759 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6760 Op.getOperand(1));
6761 case Intrinsic::aarch64_sve_frintx:
6762 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, DL, Op.getValueType(),
6763 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6764 case Intrinsic::aarch64_sve_frint32x:
6765 return DAG.getNode(AArch64ISD::FRINT32_MERGE_PASSTHRU, DL,
6766 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6767 Op.getOperand(1));
6768 case Intrinsic::aarch64_sve_frint64x:
6769 return DAG.getNode(AArch64ISD::FRINT64_MERGE_PASSTHRU, DL,
6770 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6771 Op.getOperand(1));
6772 case Intrinsic::aarch64_sve_frinta:
6773 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, DL, Op.getValueType(),
6774 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6775 case Intrinsic::aarch64_sve_frintn:
6776 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
6777 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6778 Op.getOperand(1));
6779 case Intrinsic::aarch64_sve_frintz:
6780 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, Op.getValueType(),
6781 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6782 case Intrinsic::aarch64_sve_frint32z:
6783 return DAG.getNode(AArch64ISD::FTRUNC32_MERGE_PASSTHRU, DL,
6784 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6785 Op.getOperand(1));
6786 case Intrinsic::aarch64_sve_frint64z:
6787 return DAG.getNode(AArch64ISD::FTRUNC64_MERGE_PASSTHRU, DL,
6788 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6789 Op.getOperand(1));
6790 case Intrinsic::aarch64_sve_ucvtf:
6791 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
6792 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6793 Op.getOperand(1));
6794 case Intrinsic::aarch64_sve_scvtf:
6795 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
6796 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6797 Op.getOperand(1));
6798 case Intrinsic::aarch64_sve_fcvtzu:
6799 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, Op.getValueType(),
6800 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6801 case Intrinsic::aarch64_sve_fcvtzs:
6802 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, Op.getValueType(),
6803 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6804 case Intrinsic::aarch64_sve_fsqrt:
6805 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, Op.getValueType(),
6806 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6807 case Intrinsic::aarch64_sve_frecpx:
6808 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, Op.getValueType(),
6809 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6810 case Intrinsic::aarch64_sve_frecpe_x:
6811 return DAG.getNode(AArch64ISD::FRECPE, DL, Op.getValueType(),
6812 Op.getOperand(1));
6813 case Intrinsic::aarch64_sve_frecps_x:
6814 return DAG.getNode(AArch64ISD::FRECPS, DL, Op.getValueType(),
6815 Op.getOperand(1), Op.getOperand(2));
6816 case Intrinsic::aarch64_sve_frsqrte_x:
6817 return DAG.getNode(AArch64ISD::FRSQRTE, DL, Op.getValueType(),
6818 Op.getOperand(1));
6819 case Intrinsic::aarch64_sve_frsqrts_x:
6820 return DAG.getNode(AArch64ISD::FRSQRTS, DL, Op.getValueType(),
6821 Op.getOperand(1), Op.getOperand(2));
6822 case Intrinsic::aarch64_sve_fabs:
6823 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6824 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6825 case Intrinsic::aarch64_sve_abs:
6826 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6827 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6828 case Intrinsic::aarch64_sve_neg:
6829 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6830 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
// insr: an i8 or i16 scalar is any-extended to i32 before the INSR node is
// built; other scalar types are used unchanged.
6831 case Intrinsic::aarch64_sve_insr: {
6832 SDValue Scalar = Op.getOperand(2);
6833 EVT ScalarTy = Scalar.getValueType();
6834 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6835 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
6836
6837 return DAG.getNode(AArch64ISD::INSR, DL, Op.getValueType(),
6838 Op.getOperand(1), Scalar);
6839 }
6840 case Intrinsic::aarch64_sve_rbit:
6841 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
6842 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6843 Op.getOperand(1));
6844 case Intrinsic::aarch64_sve_revb:
6845 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, Op.getValueType(),
6846 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6847 case Intrinsic::aarch64_sve_revh:
6848 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, DL, Op.getValueType(),
6849 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6850 case Intrinsic::aarch64_sve_revw:
6851 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, DL, Op.getValueType(),
6852 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6853 case Intrinsic::aarch64_sve_revd:
6854 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, Op.getValueType(),
6855 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
// sxt{b,h,w} / uxt{b,h,w}: in-register sign/zero extension. The extra
// value-type operand is the result vector type with its element type replaced
// by i8, i16 or i32 respectively.
6856 case Intrinsic::aarch64_sve_sxtb:
6857 return DAG.getNode(
6858 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6859 Op.getOperand(2), Op.getOperand(3),
6860 DAG.getValueType(Op.getValueType().changeVectorElementType(
6861 *DAG.getContext(), MVT::i8)),
6862 Op.getOperand(1));
6863 case Intrinsic::aarch64_sve_sxth:
6864 return DAG.getNode(
6865 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6866 Op.getOperand(2), Op.getOperand(3),
6867 DAG.getValueType(Op.getValueType().changeVectorElementType(
6868 *DAG.getContext(), MVT::i16)),
6869 Op.getOperand(1));
6870 case Intrinsic::aarch64_sve_sxtw:
6871 return DAG.getNode(
6872 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6873 Op.getOperand(2), Op.getOperand(3),
6874 DAG.getValueType(Op.getValueType().changeVectorElementType(
6875 *DAG.getContext(), MVT::i32)),
6876 Op.getOperand(1));
6877 case Intrinsic::aarch64_sve_uxtb:
6878 return DAG.getNode(
6879 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6880 Op.getOperand(2), Op.getOperand(3),
6881 DAG.getValueType(Op.getValueType().changeVectorElementType(
6882 *DAG.getContext(), MVT::i8)),
6883 Op.getOperand(1));
6884 case Intrinsic::aarch64_sve_uxth:
6885 return DAG.getNode(
6886 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6887 Op.getOperand(2), Op.getOperand(3),
6888 DAG.getValueType(Op.getValueType().changeVectorElementType(
6889 *DAG.getContext(), MVT::i16)),
6890 Op.getOperand(1));
6891 case Intrinsic::aarch64_sve_uxtw:
6892 return DAG.getNode(
6893 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6894 Op.getOperand(2), Op.getOperand(3),
6895 DAG.getValueType(Op.getValueType().changeVectorElementType(
6896 *DAG.getContext(), MVT::i32)),
6897 Op.getOperand(1));
// localaddress: read the register reported by getLocalAddressRegister for this
// machine function, chained to the DAG entry node.
6898 case Intrinsic::localaddress: {
6899 const auto &MF = DAG.getMachineFunction();
6900 const auto *RegInfo = Subtarget->getRegisterInfo();
6901 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6902 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg,
6903 Op.getSimpleValueType());
6904 }
6905
6906 case Intrinsic::eh_recoverfp: {
6907 // FIXME: This needs to be implemented to correctly handle highly aligned
6908 // stack objects. For now we simply return the incoming FP. Refer D53541
6909 // for more details.
6910 SDValue FnOp = Op.getOperand(1);
6911 SDValue IncomingFPOp = Op.getOperand(2);
6912 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6913 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
// NOTE(review): the statement that consumes the message string below (the
// line directly after 'if (!Fn)') is not visible in this listing.
6914 if (!Fn)
6916 "llvm.eh.recoverfp must take a function as the first argument");
6917 return IncomingFPOp;
6918 }
// Shift-and-insert: the NEON and SVE right/left variants share one lowering to
// AArch64ISD::VSRI / VSLI. Note that the fatal-error text names only
// aarch64_neon_vsli although it is reachable from all four intrinsics.
6919 case Intrinsic::aarch64_neon_vsri:
6920 case Intrinsic::aarch64_neon_vsli:
6921 case Intrinsic::aarch64_sve_sri:
6922 case Intrinsic::aarch64_sve_sli: {
6923 EVT Ty = Op.getValueType();
6924
6925 if (!Ty.isVector())
6926 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6927
6928 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6929
6930 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6931 IntNo == Intrinsic::aarch64_sve_sri;
6932 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6933 return DAG.getNode(Opcode, DL, Ty, Op.getOperand(1), Op.getOperand(2),
6934 Op.getOperand(3));
6935 }
6936
// Halving adds map to the generic averaging nodes: the rounding variants
// (srhadd/urhadd) use AVGCEIL*, the others AVGFLOOR*; the S/U suffix follows
// the signedness of the intrinsic.
6937 case Intrinsic::aarch64_neon_srhadd:
6938 case Intrinsic::aarch64_neon_urhadd:
6939 case Intrinsic::aarch64_neon_shadd:
6940 case Intrinsic::aarch64_neon_uhadd: {
6941 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6942 IntNo == Intrinsic::aarch64_neon_shadd);
6943 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6944 IntNo == Intrinsic::aarch64_neon_urhadd);
6945 unsigned Opcode = IsSignedAdd
6946 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6947 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6948 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6949 Op.getOperand(2));
6950 }
6951 case Intrinsic::aarch64_neon_saddlp:
6952 case Intrinsic::aarch64_neon_uaddlp: {
6953 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6954 ? AArch64ISD::UADDLP
6955 : AArch64ISD::SADDLP;
6956 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1));
6957 }
// Dot products: NEON and SVE variants share the AArch64ISD::{U,S}DOT and
// AArch64ISD::USDOT nodes, with the three arguments passed through in order.
6958 case Intrinsic::aarch64_neon_sdot:
6959 case Intrinsic::aarch64_neon_udot:
6960 case Intrinsic::aarch64_sve_sdot:
6961 case Intrinsic::aarch64_sve_udot: {
6962 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6963 IntNo == Intrinsic::aarch64_sve_udot)
6964 ? AArch64ISD::UDOT
6965 : AArch64ISD::SDOT;
6966 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6967 Op.getOperand(2), Op.getOperand(3));
6968 }
6969 case Intrinsic::aarch64_neon_usdot:
6970 case Intrinsic::aarch64_sve_usdot: {
6971 return DAG.getNode(AArch64ISD::USDOT, DL, Op.getValueType(),
6972 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6973 }
// Across-lanes widening add: the {U,S}ADDLV node is built with a vector result
// type (v4i32 for an i32 result, v2i64 otherwise) and the scalar result is
// then extracted from lane 0.
6974 case Intrinsic::aarch64_neon_saddlv:
6975 case Intrinsic::aarch64_neon_uaddlv: {
6976 EVT OpVT = Op.getOperand(1).getValueType();
6977 EVT ResVT = Op.getValueType();
6978 assert(
6979 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6980 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6981 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6982 "Unexpected aarch64_neon_u/saddlv type");
6983 (void)OpVT;
6984 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6985 SDValue ADDLV = DAG.getNode(
6986 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6987 : AArch64ISD::SADDLV,
6988 DL, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6989 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6990 ISD::EXTRACT_VECTOR_ELT, DL, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6991 ADDLV, DAG.getConstant(0, DL, MVT::i64));
6992 return EXTRACT_VEC_ELT;
6993 }
6994 case Intrinsic::experimental_vector_match: {
6995 return LowerVectorMatch(Op, DAG);
6996 }
// Scalar cls/cls64: ISD::CTLS is computed in the operand's own type and the
// result is then truncated to the intrinsic's result type.
6997 case Intrinsic::aarch64_cls:
6998 case Intrinsic::aarch64_cls64: {
6999 SDValue Res = DAG.getNode(ISD::CTLS, DL, Op.getOperand(1).getValueType(),
7000 Op.getOperand(1));
7001 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Res);
7002 }
7003 case Intrinsic::aarch64_neon_cls: {
7004 // Lower NEON CLS intrinsic to ISD::CTLS
7005 return DAG.getNode(ISD::CTLS, DL, Op.getValueType(), Op.getOperand(1));
7006 }
7007 case Intrinsic::aarch64_sve_pmul:
7008 case Intrinsic::aarch64_neon_pmul:
7009 return DAG.getNode(ISD::CLMUL, DL, Op.getValueType(), Op.getOperand(1),
7010 Op.getOperand(2));
7011 }
7012}
7013
7014bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
7015 if (VT.getVectorElementType() == MVT::i8 ||
7016 VT.getVectorElementType() == MVT::i16) {
7017 EltTy = MVT::i32;
7018 return true;
7019 }
7020 return false;
7021}
7022
7023bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
7024 EVT DataVT) const {
7025 const EVT IndexVT = Extend.getOperand(0).getValueType();
7026 // SVE only supports implicit extension of 32-bit indices.
7027 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
7028 return false;
7029
7030 // Indices cannot be smaller than the main data type.
7031 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
7032 return false;
7033
7034 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
7035 // element container type, which would violate the previous clause.
7036 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
7037}
7038
7039/// Helper function to check if a small vector load can be optimized.
/// Returns true only when all of the following hold: NEON is available on the
/// subtarget, the load \c LD is not volatile, its memory type is one of v2i8,
/// v4i8 or v2i16, and, if the subtarget requires strict alignment, the load's
/// alignment is at least the store size of that memory type.
// NOTE(review): the first line of this function's signature is not visible in
// this listing; only the trailing Subtarget parameter is shown.
7041 const AArch64Subtarget &Subtarget) {
7042 if (!Subtarget.isNeonAvailable())
7043 return false;
7044 if (LD->isVolatile())
7045 return false;
7046
// Only these three small fixed-width memory types qualify.
7047 EVT MemVT = LD->getMemoryVT();
7048 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && MemVT != MVT::v2i16)
7049 return false;
7050
// The required alignment is the store size of the memory type in bytes; it is
// only enforced on subtargets that require strict alignment.
7051 Align Alignment = LD->getAlign();
7052 Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue());
7053 if (Subtarget.requiresStrictAlign() && Alignment < RequiredAlignment)
7054 return false;
7055
7056 return true;
7057}
7058
7059bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
7060 EVT ExtVT = ExtVal.getValueType();
7061 // Small, illegal vectors can be extended inreg.
7062 if (auto *Load = dyn_cast<LoadSDNode>(ExtVal.getOperand(0))) {
7063 if (ExtVT.isFixedLengthVector() && ExtVT.getStoreSizeInBits() <= 128 &&
7064 isEligibleForSmallVectorLoadOpt(Load, *Subtarget))
7065 return true;
7066 }
7067 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
7068 return false;
7069
7070 // It may be worth creating extending masked loads if there are multiple
7071 // masked loads using the same predicate. That way we'll end up creating
7072 // extending masked loads that may then get split by the legaliser. This
7073 // results in just one set of predicate unpacks at the start, instead of
7074 // multiple sets of vector unpacks after each load.
7075 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
7076 if (!isLoadLegalOrCustom(ExtVT, Ld->getValueType(0), Ld->getAlign(),
7077 Ld->getAddressSpace(), ISD::ZEXTLOAD, false)) {
7078 // Disable extending masked loads for fixed-width for now, since the code
7079 // quality doesn't look great.
7080 if (!ExtVT.isScalableVector())
7081 return false;
7082
7083 unsigned NumExtMaskedLoads = 0;
7084 for (auto *U : Ld->getMask()->users())
7085 if (isa<MaskedLoadSDNode>(U))
7086 NumExtMaskedLoads++;
7087
7088 if (NumExtMaskedLoads <= 1)
7089 return false;
7090 }
7091 }
7092
7093 EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
7094 return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
7095 PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
7096}
7097
7098unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
7099 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
7100 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
7101 AArch64ISD::GLD1_MERGE_ZERO},
7102 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
7103 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
7104 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
7105 AArch64ISD::GLD1_MERGE_ZERO},
7106 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
7107 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
7108 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
7109 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
7110 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
7111 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
7112 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
7113 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
7114 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
7115 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
7116 };
7117 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
7118 return AddrModes.find(Key)->second;
7119}
7120
7121unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
7122 switch (Opcode) {
7123 default:
7124 llvm_unreachable("unimplemented opcode");
7125 return Opcode;
7126 case AArch64ISD::GLD1_MERGE_ZERO:
7127 return AArch64ISD::GLD1S_MERGE_ZERO;
7128 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
7129 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
7130 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
7131 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
7132 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
7133 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
7134 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
7135 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
7136 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
7137 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
7138 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
7139 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
7140 }
7141}
7142
// Custom lowering for masked gathers. Handled in order:
//  1. A passthru that is neither undef nor zero is peeled off: the gather is
//     re-emitted with an undef passthru and the original passthru is merged
//     back with a select on the mask.
//  2. A scaled index whose scale differs from the memory element's store size
//     is pre-shifted and the gather re-emitted with a scale of 1.
//  3. A fixed-length gather is promoted to i32/i64 elements, converted to the
//     scalable container type, emitted as a scalable gather, and the result is
//     truncated/bitcast back to the original type.
// Anything else is returned unchanged as legal.
7143SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
7144 SelectionDAG &DAG) const {
7145 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
7146
7147 SDLoc DL(Op);
7148 SDValue Chain = MGT->getChain();
7149 SDValue PassThru = MGT->getPassThru();
7150 SDValue Mask = MGT->getMask();
7151 SDValue BasePtr = MGT->getBasePtr();
7152 SDValue Index = MGT->getIndex();
7153 SDValue Scale = MGT->getScale();
7154 EVT VT = Op.getValueType();
7155 EVT MemVT = MGT->getMemoryVT();
7156 ISD::LoadExtType ExtType = MGT->getExtensionType();
7157 ISD::MemIndexType IndexType = MGT->getIndexType();
7158
7159 // SVE supports zero (and so undef) passthrough values only, everything else
7160 // must be handled manually by an explicit select on the load's output.
7161 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
7162 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
7163 SDValue Load =
7164 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
7165 MGT->getMemOperand(), IndexType, ExtType);
7166 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
// Return both the merged data and the new gather's chain result.
7167 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
7168 }
7169
7170 bool IsScaled = MGT->isIndexScaled();
7171 bool IsSigned = MGT->isIndexSigned();
7172
7173 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
7174 // must be calculated before hand.
7175 uint64_t ScaleVal = Scale->getAsZExtVal();
7176 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
7177 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
7178 EVT IndexVT = Index.getValueType();
// NOTE(review): ScaleVal is a uint64_t validated with isPowerOf2_64, yet the
// shift amount is computed with Log2_32; Log2_64 would match - confirm.
7179 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
7180 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
7181 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
7182
7183 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
7184 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
7185 MGT->getMemOperand(), IndexType, ExtType);
7186 }
7187
7188 // Lower fixed length gather to a scalable equivalent.
7189 if (VT.isFixedLengthVector()) {
7190 assert(Subtarget->useSVEForFixedLengthVectors() &&
7191 "Cannot lower when not using SVE for fixed vectors!");
7192
7193 // NOTE: Handle floating-point as if integer then bitcast the result.
7194 EVT DataVT = VT.changeVectorElementTypeToInteger();
7195 MemVT = MemVT.changeVectorElementTypeToInteger();
7196
7197 // Find the smallest integer fixed length vector we can use for the gather.
// i32 elements by default; i64 as soon as data, index or mask uses i64.
7198 EVT PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i32);
7199 if (DataVT.getVectorElementType() == MVT::i64 ||
7200 Index.getValueType().getVectorElementType() == MVT::i64 ||
7201 Mask.getValueType().getVectorElementType() == MVT::i64)
7202 PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i64);
7203
7204 // Promote vector operands except for passthrough, which we know is either
7205 // undef or zero, and thus best constructed directly.
7206 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7207 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
7208 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
7209
7210 // A promoted result type forces the need for an extending load.
7211 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
7212 ExtType = ISD::EXTLOAD;
7213
7214 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
7215
7216 // Convert fixed length vector operands to scalable.
7217 MemVT = ContainerVT.changeVectorElementType(*DAG.getContext(),
7218 MemVT.getVectorElementType());
7219 Index = convertToScalableVector(DAG, ContainerVT, Index);
// NOTE(review): listing line 7220 is absent from this extract; presumably it
// converts Mask to its scalable form - confirm against the full source.
7221 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
7222 : DAG.getConstant(0, DL, ContainerVT);
7223
7224 // Emit equivalent scalable vector gather.
7225 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
7226 SDValue Load =
7227 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
7228 Ops, MGT->getMemOperand(), IndexType, ExtType);
7229
7230 // Extract fixed length data then convert to the required result type.
7231 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
7232 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
7233 if (VT.isFloatingPoint())
7234 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
7235
7236 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7237 }
7238
7239 // Everything else is legal.
7240 return Op;
7241}
7242
// Custom lowering for masked scatters. Mirrors the gather lowering:
//  1. A scaled index whose scale differs from the memory element's store size
//     is pre-shifted and the scatter re-emitted with a scale of 1.
//  2. A fixed-length scatter is treated as integer, promoted to i32/i64
//     elements, converted to the scalable container type and re-emitted as a
//     (possibly truncating) scalable scatter.
// Anything else is returned unchanged as legal.
7243SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
7244 SelectionDAG &DAG) const {
7245 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
7246
7247 SDLoc DL(Op);
7248 SDValue Chain = MSC->getChain();
7249 SDValue StoreVal = MSC->getValue();
7250 SDValue Mask = MSC->getMask();
7251 SDValue BasePtr = MSC->getBasePtr();
7252 SDValue Index = MSC->getIndex();
7253 SDValue Scale = MSC->getScale();
7254 EVT VT = StoreVal.getValueType();
7255 EVT MemVT = MSC->getMemoryVT();
7256 ISD::MemIndexType IndexType = MSC->getIndexType();
7257 bool Truncating = MSC->isTruncatingStore();
7258
7259 bool IsScaled = MSC->isIndexScaled();
7260 bool IsSigned = MSC->isIndexSigned();
7261
7262 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
7263 // must be calculated before hand.
7264 uint64_t ScaleVal = Scale->getAsZExtVal();
7265 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
7266 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
7267 EVT IndexVT = Index.getValueType();
// NOTE(review): ScaleVal is a uint64_t validated with isPowerOf2_64, yet the
// shift amount is computed with Log2_32; Log2_64 would match - confirm.
7268 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
7269 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
7270 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
7271
7272 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
7273 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
7274 MSC->getMemOperand(), IndexType, Truncating);
7275 }
7276
7277 // Lower fixed length scatter to a scalable equivalent.
7278 if (VT.isFixedLengthVector()) {
7279 assert(Subtarget->useSVEForFixedLengthVectors() &&
7280 "Cannot lower when not using SVE for fixed vectors!");
7281
7282 // Once bitcast we treat floating-point scatters as if integer.
7283 if (VT.isFloatingPoint()) {
// NOTE(review): listing line 7284 is absent from this extract; presumably it
// switches VT to its integer-element equivalent before the bitcast below -
// confirm against the full source.
7285 MemVT = MemVT.changeVectorElementTypeToInteger();
7286 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
7287 }
7288
7289 // Find the smallest integer fixed length vector we can use for the scatter.
// i32 elements by default; i64 as soon as data, index or mask uses i64.
7290 EVT PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i32);
7291 if (VT.getVectorElementType() == MVT::i64 ||
7292 Index.getValueType().getVectorElementType() == MVT::i64 ||
7293 Mask.getValueType().getVectorElementType() == MVT::i64)
7294 PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i64);
7295
7296 // Promote vector operands.
7297 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7298 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
7299 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
7300 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
7301
7302 // A promoted value type forces the need for a truncating store.
7303 if (PromotedVT != VT)
7304 Truncating = true;
7305
7306 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
7307
7308 // Convert fixed length vector operands to scalable.
7309 MemVT = ContainerVT.changeVectorElementType(*DAG.getContext(),
7310 MemVT.getVectorElementType());
7311 Index = convertToScalableVector(DAG, ContainerVT, Index);
// NOTE(review): listing line 7312 is absent from this extract; presumably it
// converts Mask to its scalable form - confirm against the full source.
7313 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
7314
7315 // Emit equivalent scalable vector scatter.
7316 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
7317 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
7318 MSC->getMemOperand(), IndexType, Truncating);
7319 }
7320
7321 // Everything else is legal.
7322 return Op;
7323}
7324
// Custom lowering for masked loads.
//  - Fixed-length vectors that should use SVE are forwarded to
//    LowerFixedLengthVectorMLoadToSVE.
//  - Non-expanding loads with an undef or zero passthru are returned as-is;
//    other passthru values are merged in with a select on the mask.
//  - Expanding loads return SDValue() when the subtarget feature checks below
//    fail; otherwise the first cntp(Mask) lanes are loaded under an
//    active-lane mask and the aarch64_sve_expand intrinsic is applied using
//    the original mask.
7325SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
7326 SDLoc DL(Op);
7327 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
7328 assert(LoadNode && "Expected custom lowering of a masked load node");
7329 EVT VT = Op->getValueType(0);
7330
7331 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
7332 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
7333
7334 SDValue PassThru = LoadNode->getPassThru();
7335 SDValue Mask = LoadNode->getMask();
7336
7337 if (!LoadNode->isExpandingLoad()) {
7338 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
7339 return Op;
7340
// NOTE(review): listing line 7341 is absent from this extract; it must
// introduce `Load` (used below) with the arguments that follow - presumably a
// masked load built with an undef passthru. Confirm against the full source.
7342 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
7343 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
7344 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
7345 LoadNode->getExtensionType());
7346
// Merge the real passthru back in for the inactive lanes.
7347 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
7348 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7349 }
7350
7351 // Return if EXPAND instruction is not available.
7352 if ((!Subtarget->isSVEAvailable() || !Subtarget->hasSVE2p2()) &&
7353 (!Subtarget->isSVEorStreamingSVEAvailable() || !Subtarget->hasSME2p2()))
7354 return SDValue();
7355
7356 // Create mask using the number of active lanes in the predicate.
7357 SDValue CntActive = DAG.getNode(
7358 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7359 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask,
7360 Mask);
7361
7362 SDValue ActiveMask =
7363 DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, Mask->getValueType(0),
7364 DAG.getConstant(0, DL, MVT::i64), CntActive);
7365
7366 // Contiguous load of elements using the active lane mask above.
// NOTE(review): listing line 7367 is absent from this extract; it must
// introduce `Load` (used below) with the arguments that follow.
7368 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
7369 LoadNode->getOffset(), ActiveMask, DAG.getUNDEF(VT),
7370 LoadNode->getMemoryVT(), LoadNode->getMemOperand(),
7371 LoadNode->getAddressingMode(), LoadNode->getExtensionType());
7372
7373 // Expand instruction copies the low-numbered elements to active elements
7374 // in the original predicate and zeros all other lanes.
7375 SDValue Result = DAG.getNode(
// NOTE(review): listing line 7376 (the leading getNode arguments) is absent
// from this extract.
7377 DAG.getTargetConstant(Intrinsic::aarch64_sve_expand, DL, MVT::i64), Mask,
7378 Load);
7379
7380 // Copy the passthrough value unless zero/undef.
7381 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode()))
7382 Result = DAG.getSelect(DL, VT, Mask, Result, PassThru);
7383
7384 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7385}
7386
7387// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
// Only the combination VT == v4i16 / MemVT == v4i8 is supported (asserted
// below). Returns the replacement store node.
// NOTE(review): listing line 7388, which carries the function name and the
// leading parameters (used below as DL and ST), is absent from this extract.
7389 EVT VT, EVT MemVT,
7390 SelectionDAG &DAG) {
7391 assert(VT.isVector() && "VT should be a vector type");
7392 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
7393
7394 SDValue Value = ST->getValue();
7395
7396 // First widen the promoted v4i16 to v8i16, truncate that to v8i8, and
7397 // extract the word lane that holds the v4i8 subvector. This lowers the
7398 // store to:
7399 //
7400 // xtn v0.8b, v0.8h
7401 // str s0, [x0]
7402
// The upper half of the widened vector is never stored, so poison is used.
7403 SDValue Poison = DAG.getPOISON(MVT::i16);
7404 SDValue PoisonVec =
7405 DAG.getBuildVector(MVT::v4i16, DL, {Poison, Poison, Poison, Poison});
7406
7407 SDValue TruncExt =
7408 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Value, PoisonVec);
7409 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
7410
// Reinterpret as v2i32 so the four truncated bytes form element 0.
7411 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
7412 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
7413 Trunc, DAG.getConstant(0, DL, MVT::i64));
7414
7415 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
7416 ST->getBasePtr(), ST->getMemOperand());
7417}
7418
// Body of the address-space-cast lowering.
// A cast out of PTR32_SPTR is sign-extended, a cast out of PTR32_UPTR is
// zero-extended, a cast into either 32-bit pointer space is extended/truncated
// to DestVT and then zero-extended in register, and any other cast returns the
// source unchanged.
// NOTE(review): listing lines 7419 (the signature) and 7424 (which must define
// `N`, used below) are absent from this extract.
7420 SDLoc DL(Op);
7421 SDValue Src = Op.getOperand(0);
7422 MVT DestVT = Op.getSimpleValueType();
7423 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7425
7426 unsigned SrcAS = N->getSrcAddressSpace();
7427 unsigned DestAS = N->getDestAddressSpace();
7428 assert(SrcAS != DestAS &&
7429 "addrspacecast must be between different address spaces");
7430 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
7431 TLI.getTargetMachine().getPointerSize(DestAS) &&
7432 "addrspacecast must be between different ptr sizes");
// TLI is referenced only inside the assert above.
7433 (void)TLI;
7434
7435 if (SrcAS == ARM64AS::PTR32_SPTR) {
7436 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, Src,
7437 DAG.getTargetConstant(0, DL, DestVT));
7438 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
7439 return DAG.getNode(ISD::ZERO_EXTEND, DL, DestVT, Src,
7440 DAG.getTargetConstant(0, DL, DestVT));
7441 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
7442 (DestAS == ARM64AS::PTR32_UPTR)) {
7443 SDValue Ext = DAG.getAnyExtOrTrunc(Src, DL, DestVT);
7444 SDValue Trunc = DAG.getZeroExtendInReg(Ext, DL, DestVT);
7445 return Trunc;
7446 } else {
7447 return Src;
7448 }
7449}
7450
7451// Coordinated with STNP handling in
7452// `llvm/lib/Target/AArch64/AArch64InstrInfo.td` and
7453// `LowerNTStore`
7454static bool isLegalNTStore(Type *DataType, Align Alignment,
7455 const DataLayout &DL) {
7456 // Currently we only support NT stores lowering for little-endian targets.
7457 if (!DL.isLittleEndian())
7458 return false;
7459
7460 // The backend can lower to STNPWi in this case
7461 if (DataType->isIntegerTy(64))
7462 return true;
7463
7464 auto *DataTypeTy = dyn_cast<FixedVectorType>(DataType);
7465 if (!DataTypeTy)
7466 return false;
7467
7468 // Check fixed vector legality
7469 unsigned NumElements = DataTypeTy->getNumElements();
7470 unsigned EltSizeBits = DataTypeTy->getElementType()->getScalarSizeInBits();
7471
7472 // Currently only power-of-2 vectors are supported
7473 if (!isPowerOf2_64(NumElements) || !isPowerOf2_64(EltSizeBits))
7474 return false;
7475
7476 unsigned TotalSizeBits = DataTypeTy->getPrimitiveSizeInBits().getFixedValue();
7477
7478 // The backend can lower to STNPSi or STNPDi in this case
7479 // via `llvm/lib/Target/AArch64/AArch64InstrInfo.td`
7480 if (TotalSizeBits == 64u || TotalSizeBits == 128u)
7481 return true;
7482
7483 // The backend can lower to STNPQi in this case via `LowerNTStore`
7484 if (TotalSizeBits == 256u && (EltSizeBits == 8u || EltSizeBits == 16u ||
7485 EltSizeBits == 32u || EltSizeBits == 64u))
7486 return true;
7487
7488 return false;
7489}
7490
7491// Lower non-temporal stores that would otherwise be broken by legalization.
7492//
7493// Coordinated with STNP constraints in
7494// `llvm/lib/Target/AArch64/AArch64InstrInfo.td` and
7495// `isLegalNTStore`
//
// Returns an AArch64ISD::STNP node for a 256-bit vector store with 8/16/32/64
// bit elements, or an empty SDValue when the store is left to the default
// lowering (optimizing for size, big-endian target, or any other type).
7496static SDValue LowerNTStore(StoreSDNode *StoreNode, EVT VT, EVT MemVT,
7497 const SDLoc &DL, SelectionDAG &DAG) {
7498 assert(StoreNode && "Expected a store operation");
7499 assert(StoreNode->isNonTemporal() && "Expected a non-temporal store");
7500
7501 // Currently, STNP lowering can only either keep or increase code size, thus
7502 // we predicate it to not apply when optimizing for code size.
7503 if (DAG.shouldOptForSize())
7504 return SDValue();
7505
7506 // Currently we only support NT stores lowering for little-endian targets.
7507 if (!DAG.getDataLayout().isLittleEndian())
7508 return SDValue();
7509
7510 if (VT.isVector()) {
7511 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
7512 // the custom lowering, as there are no un-paired non-temporal stores and
7513 // legalization will break up 256 bit inputs.
// NOTE(review): listing line 7514 is absent from this extract; it must define
// `EC` (presumably the vector element count) used below.
7515 if (VT.isVector() && MemVT.getSizeInBits() == 256u && EC.isKnownEven() &&
7516 (MemVT.getScalarSizeInBits() == 8u ||
7517 MemVT.getScalarSizeInBits() == 16u ||
7518 MemVT.getScalarSizeInBits() == 32u ||
7519 MemVT.getScalarSizeInBits() == 64u)) {
// Split the stored value into halves starting at element 0 and element
// EC/2. NOTE(review): listing lines 7521-7522 and 7525-7526 (the leading
// arguments of the two node constructions) are absent from this extract.
7520 SDValue Lo =
7523 StoreNode->getValue(), DAG.getConstant(0, DL, MVT::i64));
7524 SDValue Hi =
7527 StoreNode->getValue(),
7528 DAG.getConstant(EC.getKnownMinValue() / 2, DL, MVT::i64));
// Both halves are passed to STNP as v2i64.
7529 SDValue Result = DAG.getMemIntrinsicNode(
7530 AArch64ISD::STNP, DL, DAG.getVTList(MVT::Other),
7531 {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
7532 DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
7533 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
// NOTE(review): listing line 7534 (the start of this consistency assert) is
// absent from this extract.
7535 StoreNode->getAlign(), DAG.getDataLayout()) &&
7536 "Lowering should be consistent with legality");
7537 return Result;
7538 }
7539 }
7540 return SDValue();
7541}
7542
7543// Custom lowering for any store, vector or scalar and/or default or with
7544// a truncate operations. Currently only custom lower truncate operation
7545// from vector v4i16 to v4i8 or volatile stores of i128.
//
// As written, the body below handles, in order: non-temporal stores (via
// LowerNTStore), fixed-length vector stores routed to SVE, misaligned vector
// stores that are not allowed (scalarized), truncating v4i16 -> v4i8 stores,
// volatile i128 stores (via LowerStore128), and i64x8 stores (split into
// eight i64 stores). Every other store returns an empty SDValue.
7546SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
7547 SelectionDAG &DAG) const {
7548 SDLoc Dl(Op);
7549 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
7550 assert (StoreNode && "Can only custom lower store nodes");
7551
7552 SDValue Value = StoreNode->getValue();
7553
7554 EVT VT = Value.getValueType();
7555 EVT MemVT = StoreNode->getMemoryVT();
7556
// Try STNP first; an empty result falls through to the generic paths.
7557 if (StoreNode->isNonTemporal()) {
7558 if (auto MaybeSTNP = LowerNTStore(StoreNode, VT, MemVT, Dl, DAG))
7559 return MaybeSTNP;
7560 }
7561
7562 if (VT.isVector()) {
// NOTE(review): listing line 7563 (the start of this condition) is absent
// from this extract.
7564 VT,
7565 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
7566 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
7567
// Under-aligned vector stores that the target disallows are scalarized.
7568 unsigned AS = StoreNode->getAddressSpace();
7569 Align Alignment = StoreNode->getAlign();
7570 if (Alignment < MemVT.getStoreSize() &&
7571 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
7572 StoreNode->getMemOperand()->getFlags(),
7573 nullptr)) {
7574 return scalarizeVectorStore(StoreNode, DAG);
7575 }
7576
7577 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
7578 MemVT == MVT::v4i8) {
7579 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
7580 }
7581 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
7582 return LowerStore128(Op, DAG);
7583 } else if (MemVT == MVT::i64x8) {
7584 SDValue Value = StoreNode->getValue();
7585 assert(Value->getValueType(0) == MVT::i64x8);
7586 SDValue Chain = StoreNode->getChain();
7587 SDValue Base = StoreNode->getBasePtr();
7588 EVT PtrVT = Base.getValueType();
// Store the eight i64 parts at Base + 0, 8, ..., 56, chaining each store
// after the previous one.
7589 for (unsigned i = 0; i < 8; i++) {
7590 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64, Value,
7591 DAG.getConstant(i, Dl, MVT::i32));
7592 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
7593 DAG.getConstant(i * 8, Dl, PtrVT));
7594 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
7595 StoreNode->getBaseAlign());
7596 }
7597 return Chain;
7598 }
7599
7600 return SDValue();
7601}
7602
7603/// Lower atomic or volatile 128-bit stores to a single STP instruction.
///
/// The i128 value is split into two i64 halves (swapped on big-endian
/// targets). AArch64ISD::STILP is emitted instead of AArch64ISD::STP when
/// IsStoreRelease is set.
7604SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
7605 SelectionDAG &DAG) const {
7606 MemSDNode *StoreNode = cast<MemSDNode>(Op);
7607 assert(StoreNode->getMemoryVT() == MVT::i128);
7608 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
7609
// NOTE(review): listing line 7611 (the initializer of IsStoreRelease) and
// lines 7615-7616 (the remainder of the assert below) are absent from this
// extract.
7610 bool IsStoreRelease =
7612 if (StoreNode->isAtomic())
7613 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
7614 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
7617
// The stored value is operand 1 for ISD::STORE / ISD::ATOMIC_STORE nodes and
// operand 2 for any other node kind reaching this function.
7618 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
7619 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
7620 ? StoreNode->getOperand(1)
7621 : StoreNode->getOperand(2);
7622 SDLoc DL(Op);
7623 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
7624 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
7625 if (DAG.getDataLayout().isBigEndian())
7626 std::swap(StoreValue.first, StoreValue.second);
// NOTE(review): listing line 7627 (which introduces `Result`) is absent from
// this extract.
7628 Opcode, DL, DAG.getVTList(MVT::Other),
7629 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
7630 StoreNode->getBasePtr()},
7631 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7632 return Result;
7633}
7634
7635/// Helper function to optimize loads of extended small vectors.
7636/// These patterns would otherwise get scalarized into inefficient sequences.
///
/// Returns an empty SDValue when the load is not eligible (see
/// isEligibleForSmallVectorLoadOpt) or is not an extending load. Otherwise the
/// memory is loaded as one FP scalar, placed in a 128-bit vector, bitcast to
/// integer lanes and widened step by step; the merged {value, chain} pair is
/// returned.
/// NOTE(review): listing line 7637, which carries the function name and the
/// parameters (used below as Load and DAG), is absent from this extract.
7638 const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
7639 if (!isEligibleForSmallVectorLoadOpt(Load, Subtarget))
7640 return SDValue();
7641
7642 EVT MemVT = Load->getMemoryVT();
7643 EVT ResVT = Load->getValueType(0);
7644 unsigned NumElts = ResVT.getVectorNumElements();
7645 unsigned DstEltBits = ResVT.getScalarSizeInBits();
7646 unsigned SrcEltBits = MemVT.getScalarSizeInBits();
7647
// Any-extending loads are treated as zero-extending.
7648 unsigned ExtOpcode;
7649 switch (Load->getExtensionType()) {
7650 case ISD::EXTLOAD:
7651 case ISD::ZEXTLOAD:
7652 ExtOpcode = ISD::ZERO_EXTEND;
7653 break;
7654 case ISD::SEXTLOAD:
7655 ExtOpcode = ISD::SIGN_EXTEND;
7656 break;
7657 case ISD::NON_EXTLOAD:
7658 return SDValue();
7659 }
7660
7661 SDLoc DL(Load);
7662 SDValue Chain = Load->getChain();
7663 SDValue BasePtr = Load->getBasePtr();
7664 const MachinePointerInfo &PtrInfo = Load->getPointerInfo();
7665 Align Alignment = Load->getAlign();
7666
7667 // Load the data as an FP scalar to avoid issues with integer loads.
7668 unsigned LoadBits = MemVT.getStoreSizeInBits();
7669 MVT ScalarLoadType = MVT::getFloatingPointVT(LoadBits);
7670 SDValue ScalarLoad =
7671 DAG.getLoad(ScalarLoadType, DL, Chain, BasePtr, PtrInfo, Alignment);
7672
// Place the scalar in a 128-bit vector and view it as SrcEltBits-wide integer
// lanes.
7673 MVT ScalarToVecTy = MVT::getVectorVT(ScalarLoadType, 128 / LoadBits);
7674 SDValue ScalarToVec =
7675 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarToVecTy, ScalarLoad);
7676 MVT BitcastTy =
7677 MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits), 128 / SrcEltBits);
7678 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, BitcastTy, ScalarToVec);
7679
// Double the element width each iteration. When the current value is already
// 128 bits or wider, first keep only its low half so the extended result has
// the same total size.
7680 SDValue Res = Bitcast;
7681 unsigned CurrentEltBits = Res.getValueType().getScalarSizeInBits();
7682 unsigned CurrentNumElts = Res.getValueType().getVectorNumElements();
7683 while (CurrentEltBits < DstEltBits) {
7684 if (Res.getValueSizeInBits() >= 128) {
7685 CurrentNumElts = CurrentNumElts / 2;
7686 MVT ExtractVT =
7687 MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
7688 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Res,
7689 DAG.getConstant(0, DL, MVT::i64));
7690 }
7691 CurrentEltBits = CurrentEltBits * 2;
7692 MVT ExtVT =
7693 MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
7694 Res = DAG.getNode(ExtOpcode, DL, ExtVT, Res);
7695 }
7696
// Trim to the requested element count, taking the lanes from index 0.
7697 if (CurrentNumElts != NumElts) {
7698 MVT FinalVT = MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), NumElts);
7699 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FinalVT, Res,
7700 DAG.getConstant(0, DL, MVT::i64));
7701 }
7702
7703 return DAG.getMergeValues({Res, ScalarLoad.getValue(1)}, DL);
7704}
7705
// Custom lowering for loads. First tries the small extended-vector load
// optimization; then splits an i64x8 load into eight chained i64 loads that
// are recombined with LS64_BUILD. Returns an empty SDValue for all other
// loads.
7706SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
7707 SelectionDAG &DAG) const {
7708 SDLoc DL(Op);
7709 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
7710 assert(LoadNode && "Expected custom lowering of a load node");
7711
7712 if (SDValue Result = tryLowerSmallVectorExtLoad(LoadNode, DAG))
7713 return Result;
7714
7715 if (LoadNode->getMemoryVT() == MVT::i64x8) {
// NOTE(review): listing line 7716 is absent from this extract; it must declare
// `Ops`, the container the loaded parts are pushed into below.
7717 SDValue Base = LoadNode->getBasePtr();
7718 SDValue Chain = LoadNode->getChain();
7719 EVT PtrVT = Base.getValueType();
// Load the eight i64 parts from Base + 0, 8, ..., 56; each load is chained
// after the previous one.
7720 for (unsigned i = 0; i < 8; i++) {
7721 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7722 DAG.getConstant(i * 8, DL, PtrVT));
7723 SDValue Part =
7724 DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
7725 LoadNode->getBaseAlign());
7726 Ops.push_back(Part);
7727 Chain = SDValue(Part.getNode(), 1);
7728 }
7729 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7730 return DAG.getMergeValues({Loaded, Chain}, DL);
7731 }
7732
7733 return SDValue();
7734}
7735
7736SDValue AArch64TargetLowering::LowerFixedLengthVectorCompressToSVE(
7737 SDValue Op, SelectionDAG &DAG) const {
7738 SDLoc DL(Op);
7739 EVT VT = Op.getValueType();
7740
7741 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
7742 SDValue Vec = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
7743 SDValue Mask = convertFixedMaskToScalableVector(Op.getOperand(1), DAG);
7744 SDValue Passthru =
7745 convertToScalableVector(DAG, ContainerVT, Op.getOperand(2));
7746
7747 SDValue Result =
7748 DAG.getNode(ISD::VECTOR_COMPRESS, DL, ContainerVT, Vec, Mask, Passthru);
7749 return convertFromScalableVector(DAG, VT, Result);
7750}
7751
// Custom lowering for VECTOR_COMPRESS. Returns an empty SDValue when SVE is
// not available, forwards fixed-length vectors to
// LowerFixedLengthVectorCompressToSVE, and otherwise uses the
// aarch64_sve_compact intrinsic, selecting the passthru into the lanes past
// the compacted elements when required.
7752SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7753 SelectionDAG &DAG) const {
7754 EVT VT = Op.getValueType();
7755 if (!Subtarget->isSVEAvailable())
7756 return SDValue();
7757
7758 if (VT.isFixedLengthVector())
7759 return LowerFixedLengthVectorCompressToSVE(Op, DAG);
7760
7761 SDLoc DL(Op);
7762 SDValue Vec = Op.getOperand(0);
7763 SDValue Mask = Op.getOperand(1);
7764 SDValue Passthru = Op.getOperand(2);
7765 EVT MaskVT = Mask.getValueType();
7766
7767 SDValue Compressed = DAG.getNode(
// NOTE(review): listing line 7768 (the leading getNode arguments) is absent
// from this extract.
7769 DAG.getTargetConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask,
7770 Vec);
7771
7772 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
// NOTE(review): listing line 7774 (the second half of this condition) is
// absent from this extract; presumably it is the all-zeros passthru check.
7773 if (Passthru.isUndef() ||
7775 return Compressed;
7776
// Count the active lanes and build a mask covering exactly that many leading
// lanes, i.e. the lanes that hold compacted data.
7777 SDValue CntActive = DAG.getNode(
7778 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7779 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask,
7780 Mask);
7781
7782 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7783 SDValue CompressedMask =
7784 DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT, Zero, CntActive);
7785
// Compacted data in the leading lanes, passthru everywhere else.
7786 return DAG.getNode(ISD::VSELECT, DL, VT, CompressedMask, Compressed,
7787 Passthru);
7788}
7789
7790// Generate SUBS and CSEL for integer abs.
7791SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7792 MVT VT = Op.getSimpleValueType();
7793
7794 if (VT.isVector())
7795 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7796
7797 SDLoc DL(Op);
7798 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, VT);
7799
7800 // Generate SUBS & CSEL.
7801 SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7802 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7803 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7804 getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
7805}
7806
// Body of the conditional-branch lowering: if the condition can be emitted as
// a conjunction (emitConjunction returns a non-empty compare), branch on the
// resulting condition code with AArch64ISD::BRCOND; otherwise return an empty
// SDValue.
// NOTE(review): listing lines 7807 (the signature) and 7812 (which must
// declare `CC`, written by emitConjunction below) are absent from this
// extract.
7808 SDValue Chain = Op.getOperand(0);
7809 SDValue Cond = Op.getOperand(1);
7810 SDValue Dest = Op.getOperand(2);
7811
7813 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7814 SDLoc DL(Op);
7815 SDValue CCVal = getCondCode(DAG, CC);
7816 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
7817 Cmp);
7818 }
7819
7820 return SDValue();
7821}
7822
7823// Treat FSHR with constant shifts as legal operation, otherwise it is expanded
7824// FSHL is converted to FSHR before deciding what to do with it
//
// Returns an empty SDValue when the shift amount is not a constant.
// NOTE(review): listing line 7825 (the signature) is absent from this extract.
7826 SDValue Shifts = Op.getOperand(2);
7827 // Check if the shift amount is a constant and normalise to [0, SrcBitLen)
7828 // If opcode is FSHL, convert it to FSHR
7829 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7830 SDLoc DL(Op);
7831 MVT VT = Op.getSimpleValueType();
7832 unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
7833
7834 if (Op.getOpcode() == ISD::FSHL) {
// A zero-amount FSHL yields its first operand.
7835 if (NewShiftNo == 0)
7836 return Op.getOperand(0);
7837
// FSHL by N is rewritten as FSHR by (bit width - N).
7838 NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
7839 return DAG.getNode(
7840 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7841 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7842 }
7843
7844 if (Op.getOpcode() == ISD::FSHR) {
// A zero-amount FSHR yields its second operand.
7845 if (NewShiftNo == 0)
7846 return Op.getOperand(1);
7847
// The amount was already in range: keep the node unchanged.
7848 if (ShiftNo->getZExtValue() == NewShiftNo)
7849 return Op;
7850
7851 // Rewrite using the normalised shift amount.
7852 return DAG.getNode(
7853 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7854 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7855 }
7856 }
7857
7858 return SDValue();
7859}
7860
// Body of a scalar lowering that routes through the aarch64_sve_fscale
// intrinsic: the value and exponent are inserted into lane 0 of poison
// scalable vectors, the intrinsic is applied, and lane 0 is extracted. bf16
// and f16 inputs are extended to f32 first and rounded back at the end; types
// other than bf16/f16/f32/f64 return an empty SDValue.
// NOTE(review): listing lines 7861 (the signature) and 7894 (the leading
// arguments of the FScale node) are absent from this extract.
7862 SDValue X = Op.getOperand(0);
7863 EVT XScalarTy = X.getValueType();
7864 SDValue Exp = Op.getOperand(1);
7865
7866 SDLoc DL(Op);
7867 EVT XVT, ExpVT;
7868 switch (Op.getSimpleValueType().SimpleTy) {
7869 default:
7870 return SDValue();
7871 case MVT::bf16:
7872 case MVT::f16:
7873 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7874 [[fallthrough]];
7875 case MVT::f32:
7876 XVT = MVT::nxv4f32;
7877 ExpVT = MVT::nxv4i32;
7878 break;
7879 case MVT::f64:
7880 XVT = MVT::nxv2f64;
7881 ExpVT = MVT::nxv2i64;
// The exponent is widened to i64 to match the nxv2i64 exponent vector.
7882 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7883 break;
7884 }
7885
7886 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7887 SDValue VX =
7888 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getPOISON(XVT), X, Zero);
7889 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7890 DAG.getPOISON(ExpVT), Exp, Zero);
// Predicate operand: the constant 1 in the i1 vector type matching XVT.
7891 SDValue VPg = DAG.getConstant(
7892 1, DL, XVT.changeVectorElementType(*DAG.getContext(), MVT::i1));
7893 SDValue FScale = DAG.getNode(
7895 DAG.getTargetConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), VPg,
7896 VX, VExp);
7897 SDValue Final =
7898 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
// Round back to the original type if X was extended above.
7899 if (X.getValueType() != XScalarTy)
7900 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7901 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7902 return Final;
7903}
7904
7905SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7906 SelectionDAG &DAG) const {
7907 return Op.getOperand(0);
7908}
7909
// Lower INIT_TRAMPOLINE by writing the trampoline into memory:
//   offset  0: 32-bit word 0x58000080 | NestReg
//   offset  4: 32-bit word 0x580000b0 | FptrReg
//   offset  8: 32-bit word 0xd61f0220
//   offset 16: the 'nest' parameter value
//   offset 24: the nested function pointer
// The five stores are joined with a TokenFactor and followed by a CLEAR_CACHE
// over [Trmp, Trmp + 12).
7910SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7911 SelectionDAG &DAG) const {
7912 SDValue Chain = Op.getOperand(0);
7913 SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
7914 SDValue FPtr = Op.getOperand(2); // nested function
7915 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7916
7917 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7918
7919 // ldr NestReg, .+16
7920 // ldr x17, .+20
7921 // br x17
7922 // .word 0
7923 // .nest: .qword nest
7924 // .fptr: .qword fptr
7925 SDValue OutChains[5];
7926
// The register receiving the 'nest' value depends on the calling convention
// of the function passed as operand 5.
7927 const Function *Func =
7928 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7929 CallingConv::ID CC = Func->getCallingConv();
7930 unsigned NestReg;
7931
7932 switch (CC) {
7933 default:
7934 NestReg = 0x0f; // X15
7935 break;
// NOTE(review): listing line 7936 (the case label selecting X4) is absent
// from this extract.
7937 // Must be kept in sync with AArch64CallingConv.td
7938 NestReg = 0x04; // X4
7939 break;
7940 }
7941
7942 const char FptrReg = 0x11; // X17
7943
7944 SDValue Addr = Trmp;
7945
// All five stores hang off the incoming Chain independently; they are ordered
// only by the TokenFactor below.
7946 SDLoc DL(Op);
7947 OutChains[0] = DAG.getStore(
7948 Chain, DL, DAG.getConstant(0x58000080u | NestReg, DL, MVT::i32), Addr,
7949 MachinePointerInfo(TrmpAddr));
7950
7951 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7952 DAG.getConstant(4, DL, MVT::i64));
7953 OutChains[1] = DAG.getStore(
7954 Chain, DL, DAG.getConstant(0x580000b0u | FptrReg, DL, MVT::i32), Addr,
7955 MachinePointerInfo(TrmpAddr, 4));
7956
7957 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7958 DAG.getConstant(8, DL, MVT::i64));
7959 OutChains[2] =
7960 DAG.getStore(Chain, DL, DAG.getConstant(0xd61f0220u, DL, MVT::i32), Addr,
7961 MachinePointerInfo(TrmpAddr, 8));
7962
7963 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7964 DAG.getConstant(16, DL, MVT::i64));
7965 OutChains[3] =
7966 DAG.getStore(Chain, DL, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
7967
7968 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7969 DAG.getConstant(24, DL, MVT::i64));
7970 OutChains[4] =
7971 DAG.getStore(Chain, DL, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
7972
7973 SDValue StoreToken = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
7974
// Trmp + 12 is the end of the three instruction words written above.
7975 SDValue EndOfTrmp = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7976 DAG.getConstant(12, DL, MVT::i64));
7977
7978 // Call clear cache on the trampoline instructions.
7979 return DAG.getNode(ISD::CLEAR_CACHE, DL, MVT::Other, StoreToken, Trmp,
7980 EndOfTrmp);
7981}
7982
7983SDValue AArch64TargetLowering::LowerFMUL(SDValue Op, SelectionDAG &DAG) const {
7984 SDLoc DL(Op);
7985 EVT VT = Op.getValueType();
7986 if (VT.getScalarType() != MVT::bf16 ||
7987 (Subtarget->hasSVEB16B16() &&
7988 Subtarget->isNonStreamingSVEorSME2Available()))
7989 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7990
7991 assert(Subtarget->hasBF16() && "Expected +bf16 for custom FMUL lowering");
7992 assert((VT == MVT::nxv4bf16 || VT == MVT::nxv8bf16 || VT == MVT::v8bf16) &&
7993 "Unexpected FMUL VT");
7994
7995 auto MakeGetIntrinsic = [&](Intrinsic::ID IID) {
7996 return [&, IID](EVT VT, auto... Ops) {
7997 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
7998 DAG.getConstant(IID, DL, MVT::i32), Ops...);
7999 };
8000 };
8001
8002 auto Reinterpret = [&](SDValue Value, EVT VT) {
8003 EVT SrcVT = Value.getValueType();
8004 if (VT == SrcVT)
8005 return Value;
8006 if (SrcVT.isFixedLengthVector())
8007 return convertToScalableVector(DAG, VT, Value);
8008 if (VT.isFixedLengthVector())
8009 return convertFromScalableVector(DAG, VT, Value);
8010 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Value);
8011 };
8012
8013 bool UseSVEBFMLAL = VT.isScalableVector();
8014 auto FCVT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvt_bf16f32_v2);
8015 auto FCVTNT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2);
8016
8017 // Note: The NEON BFMLAL[BT] reads even/odd lanes like the SVE variant.
8018 // This does not match BFCVTN[2], so we use SVE to convert back to bf16.
8019 auto BFMLALB =
8020 MakeGetIntrinsic(UseSVEBFMLAL ? Intrinsic::aarch64_sve_bfmlalb
8021 : Intrinsic::aarch64_neon_bfmlalb);
8022 auto BFMLALT =
8023 MakeGetIntrinsic(UseSVEBFMLAL ? Intrinsic::aarch64_sve_bfmlalt
8024 : Intrinsic::aarch64_neon_bfmlalt);
8025
8026 EVT AccVT = UseSVEBFMLAL ? MVT::nxv4f32 : MVT::v4f32;
8027 bool IgnoreZeroSign = DAG.canIgnoreSignBitOfZero(Op);
8028 SDValue Zero = DAG.getConstantFP(IgnoreZeroSign ? +0.0F : -0.0F, DL, AccVT);
8029 SDValue Pg = getPredicateForVector(DAG, DL, AccVT);
8030
8031 // Lower bf16 FMUL as a pair (VT == [nx]v8bf16) of BFMLAL top/bottom
8032 // instructions. These result in two f32 vectors, which can be converted back
8033 // to bf16 with FCVT and FCVTNT.
8034 SDValue LHS = Op.getOperand(0);
8035 SDValue RHS = Op.getOperand(1);
8036
8037 // All SVE intrinsics expect to operate on full bf16 vector types.
8038 if (UseSVEBFMLAL) {
8039 LHS = Reinterpret(LHS, MVT::nxv8bf16);
8040 RHS = Reinterpret(RHS, MVT::nxv8bf16);
8041 }
8042
8043 SDValue BottomF32 = Reinterpret(BFMLALB(AccVT, Zero, LHS, RHS), MVT::nxv4f32);
8044 SDValue BottomBF16 =
8045 FCVT(MVT::nxv8bf16, DAG.getPOISON(MVT::nxv8bf16), Pg, BottomF32);
8046 // Note: nxv4bf16 only uses even lanes.
8047 if (VT == MVT::nxv4bf16)
8048 return Reinterpret(BottomBF16, VT);
8049
8050 SDValue TopF32 = Reinterpret(BFMLALT(AccVT, Zero, LHS, RHS), MVT::nxv4f32);
8051 SDValue TopBF16 = FCVTNT(MVT::nxv8bf16, BottomBF16, Pg, TopF32);
8052 return Reinterpret(TopBF16, VT);
8053}
8054
8055SDValue AArch64TargetLowering::LowerFMA(SDValue Op, SelectionDAG &DAG) const {
8056 SDValue OpA = Op->getOperand(0);
8057 SDValue OpB = Op->getOperand(1);
8058 SDValue OpC = Op->getOperand(2);
8059 EVT VT = Op.getValueType();
8060 SDLoc DL(Op);
8061
8062 assert(VT.isVector() && "Scalar fma lowering should be handled by patterns");
8063
8064 // Bail early if we're definitely not looking to merge FNEGs into the FMA.
8065 if (VT != MVT::v8f16 && VT != MVT::v4f32 && VT != MVT::v2f64)
8066 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
8067
8068 if (OpC.getOpcode() != ISD::FNEG)
8069 return useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())
8070 ? LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED)
8071 : Op; // Fallback to NEON lowering.
8072
8073 // Convert FMA/FNEG nodes to SVE to enable the following patterns:
8074 // fma(a, b, neg(c)) -> fnmls(a, b, c)
8075 // fma(neg(a), b, neg(c)) -> fnmla(a, b, c)
8076 // fma(a, neg(b), neg(c)) -> fnmla(a, b, c)
8077 SDValue Pg = getPredicateForVector(DAG, DL, VT);
8078 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
8079
8080 auto ConvertToScalableFnegMt = [&](SDValue Op) {
8081 if (Op.getOpcode() == ISD::FNEG)
8082 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
8083 return convertToScalableVector(DAG, ContainerVT, Op);
8084 };
8085
8086 OpA = ConvertToScalableFnegMt(OpA);
8087 OpB = ConvertToScalableFnegMt(OpB);
8088 OpC = ConvertToScalableFnegMt(OpC);
8089
8090 SDValue ScalableRes =
8091 DAG.getNode(AArch64ISD::FMA_PRED, DL, ContainerVT, Pg, OpA, OpB, OpC);
8092 return convertFromScalableVector(DAG, VT, ScalableRes);
8093}
8094
// NOTE(review): the function header (original line 8095) is missing from this
// listing. Presumably this is the body of the CLMUL lowering that the opcode
// dispatcher calls as LowerCLMUL(Op, DAG) -- confirm against upstream.
//
// Lowers a scalar CLMUL of type i8/i16/i32/i64 through a 64-bit vector CLMUL:
// both operands are placed in a vector of 64 / bitwidth elements, the vector
// CLMUL is done on v8i8 (for i8) or v1i64 (all other widths), and element 0 of
// the result is extracted and truncated back to the scalar type if needed.
8096 EVT VT = Op.getValueType();
8097 assert(
8098 (VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) &&
8099 "Unexpected Type");
8100 SDLoc DL(Op);
// VecVT: 64-bit wide vector with VT-sized elements (v8i8, v4i16, v2i32, v1i64).
8101 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 64 / VT.getSizeInBits());
8102 EVT CLMULTy = VT == MVT::i8 ? MVT::v8i8 : MVT::v1i64;
// Results narrower than 64 bits are extracted as i32 and truncated below.
8103 EVT ExtractTy = VT == MVT::i64 ? MVT::i64 : MVT::i32;
8104 SDValue VecOp0 =
8105 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op.getOperand(0));
8106 SDValue VecOp1 =
8107 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op.getOperand(1));
8108
// For i16/i32 the operand vectors are reinterpreted as v1i64 before the CLMUL.
8109 if (VecVT != CLMULTy) {
8110 VecOp0 = DAG.getNode(ISD::BITCAST, DL, CLMULTy, VecOp0);
8111 VecOp1 = DAG.getNode(ISD::BITCAST, DL, CLMULTy, VecOp1);
8112 }
8113 SDValue CLMUL = DAG.getNode(ISD::CLMUL, DL, CLMULTy, VecOp0, VecOp1);
// View the result as v2i32 so that element 0 can be extracted as an i32.
8114 if (ExtractTy == MVT::i32)
8115 CLMUL = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, CLMUL);
8116 SDValue ExtractVecElt =
8117 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy, CLMUL,
8118 DAG.getTargetConstant(0, DL, MVT::i64));
8119 if (ExtractTy != VT)
8120 ExtractVecElt = DAG.getNode(ISD::TRUNCATE, DL, VT, ExtractVecElt);
8121 return ExtractVecElt;
8122}
8123
// NOTE(review): the first header line (original line 8124) is missing from
// this listing; presumably this is AArch64TargetLowering::LowerOperation --
// confirm against upstream.
//
// Central dispatcher for custom lowering: switches on Op.getOpcode() and
// forwards to the matching Lower* helper, or builds the replacement inline
// for a few small cases. Opcodes without a case hit llvm_unreachable.
//
// NOTE(review): the listing also drops many original lines inside the switch
// (visible as gaps in the embedded numbering, e.g. 8133-8134, 8140, 8142).
// Most appear to be `case` labels, so a number of `return` statements below
// show up without their label. Restore them from upstream before relying on
// this text.
8125 SelectionDAG &DAG) const {
8126 LLVM_DEBUG(dbgs() << "Custom lowering: ");
8127 LLVM_DEBUG(Op.dump());
8128
8129 switch (Op.getOpcode()) {
8130 default:
8131 llvm_unreachable("unimplemented operand");
8132 return SDValue();
8135 return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
8136 case ISD::BITCAST:
8137 return LowerBITCAST(Op, DAG);
8138 case ISD::GlobalAddress:
8139 return LowerGlobalAddress(Op, DAG);
8141 return LowerGlobalTLSAddress(Op, DAG);
8143 return LowerPtrAuthGlobalAddress(Op, DAG);
8145 return LowerADJUST_TRAMPOLINE(Op, DAG);
8147 return LowerINIT_TRAMPOLINE(Op, DAG);
8148 case ISD::SETCC:
8149 case ISD::STRICT_FSETCC:
8151 return LowerSETCC(Op, DAG);
8152 case ISD::SETCCCARRY:
8153 return LowerSETCCCARRY(Op, DAG);
8154 case ISD::BRCOND:
8155 return LowerBRCOND(Op, DAG);
8156 case ISD::BR_CC:
8157 return LowerBR_CC(Op, DAG);
8158 case ISD::SELECT:
8159 return LowerSELECT(Op, DAG);
8160 case ISD::SELECT_CC:
8161 return LowerSELECT_CC(Op, DAG);
8162 case ISD::JumpTable:
8163 return LowerJumpTable(Op, DAG);
8164 case ISD::BR_JT:
8165 return LowerBR_JT(Op, DAG);
8166 case ISD::BRIND:
8167 return LowerBRIND(Op, DAG);
8168 case ISD::ConstantPool:
8169 return LowerConstantPool(Op, DAG);
8170 case ISD::BlockAddress:
8171 return LowerBlockAddress(Op, DAG);
8172 case ISD::VASTART:
8173 return LowerVASTART(Op, DAG);
8174 case ISD::VACOPY:
8175 return LowerVACOPY(Op, DAG);
8176 case ISD::VAARG:
8177 return LowerVAARG(Op, DAG);
// Carry-propagating add/sub: ADCS for additions, SBCS for subtractions; the
// trailing flag selects the signed variant.
8178 case ISD::UADDO_CARRY:
8179 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
8180 case ISD::USUBO_CARRY:
8181 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
8182 case ISD::SADDO_CARRY:
8183 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
8184 case ISD::SSUBO_CARRY:
8185 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
8186 case ISD::SADDO:
8187 case ISD::UADDO:
8188 case ISD::SSUBO:
8189 case ISD::USUBO:
8190 case ISD::SMULO:
8191 case ISD::UMULO:
8192 return LowerXALUO(Op, DAG);
// Floating-point arithmetic and rounding: forwarded to the predicated
// (*_PRED / *_MERGE_PASSTHRU) node lowering, except FMUL and FMA which have
// dedicated helpers.
8193 case ISD::FADD:
8194 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
8195 case ISD::FSUB:
8196 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
8197 case ISD::FMUL:
8198 return LowerFMUL(Op, DAG);
8199 case ISD::FMA:
8200 return LowerFMA(Op, DAG);
8201 case ISD::FDIV:
8202 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
8203 case ISD::FNEG:
8204 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
8205 case ISD::FCEIL:
8206 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
8207 case ISD::FFLOOR:
8208 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
8209 case ISD::FNEARBYINT:
8210 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
8211 case ISD::FRINT:
8212 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
8213 case ISD::FROUND:
8214 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
8215 case ISD::FROUNDEVEN:
8216 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
8217 case ISD::FTRUNC:
8218 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
8219 case ISD::FSQRT:
8220 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
8221 case ISD::FABS:
8222 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
8223 case ISD::FP_ROUND:
8225 return LowerFP_ROUND(Op, DAG);
8226 case ISD::FP_EXTEND:
8228 return LowerFP_EXTEND(Op, DAG);
8229 case ISD::FRAMEADDR:
8230 return LowerFRAMEADDR(Op, DAG);
8231 case ISD::SPONENTRY:
8232 return LowerSPONENTRY(Op, DAG);
8233 case ISD::RETURNADDR:
8234 return LowerRETURNADDR(Op, DAG);
8236 return LowerADDROFRETURNADDR(Op, DAG);
8238 return LowerCONCAT_VECTORS(Op, DAG);
8240 return LowerINSERT_VECTOR_ELT(Op, DAG);
8242 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
8243 case ISD::BUILD_VECTOR:
8244 return LowerBUILD_VECTOR(Op, DAG);
8247 return LowerEXTEND_VECTOR_INREG(Op, DAG);
8249 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
8251 return LowerVECTOR_SHUFFLE(Op, DAG);
8252 case ISD::SPLAT_VECTOR:
8253 return LowerSPLAT_VECTOR(Op, DAG);
8255 return LowerEXTRACT_SUBVECTOR(Op, DAG);
8257 return LowerINSERT_SUBVECTOR(Op, DAG);
8258 case ISD::MASKED_SDIV:
8259 case ISD::MASKED_UDIV:
8260 case ISD::SDIV:
8261 case ISD::UDIV:
8262 return LowerDIV(Op, DAG);
8263 case ISD::SMIN:
8264 case ISD::UMIN:
8265 case ISD::SMAX:
8266 case ISD::UMAX:
8267 return LowerMinMax(Op, DAG);
8268 case ISD::SRA:
8269 case ISD::SRL:
8270 case ISD::SHL:
8271 return LowerVectorSRA_SRL_SHL(Op, DAG);
8272 case ISD::SHL_PARTS:
8273 case ISD::SRL_PARTS:
8274 case ISD::SRA_PARTS:
8275 return LowerShiftParts(Op, DAG);
8276 case ISD::CTPOP:
8277 case ISD::PARITY:
8278 return LowerCTPOP_PARITY(Op, DAG);
8279 case ISD::FCOPYSIGN:
8280 return LowerFCOPYSIGN(Op, DAG);
8281 case ISD::OR:
8282 return LowerVectorOR(Op, DAG);
8283 case ISD::XOR:
8284 return LowerXOR(Op, DAG);
8285 case ISD::PREFETCH:
8286 return LowerPREFETCH(Op, DAG);
8287 case ISD::SINT_TO_FP:
8288 case ISD::UINT_TO_FP:
8291 return LowerINT_TO_FP(Op, DAG);
8292 case ISD::FP_TO_SINT:
8293 case ISD::FP_TO_UINT:
8296 return LowerFP_TO_INT(Op, DAG);
8299 return LowerFP_TO_INT_SAT(Op, DAG);
8300 case ISD::GET_ROUNDING:
8301 return LowerGET_ROUNDING(Op, DAG);
8302 case ISD::SET_ROUNDING:
8303 return LowerSET_ROUNDING(Op, DAG);
8304 case ISD::GET_FPMODE:
8305 return LowerGET_FPMODE(Op, DAG);
8306 case ISD::SET_FPMODE:
8307 return LowerSET_FPMODE(Op, DAG);
8308 case ISD::RESET_FPMODE:
8309 return LowerRESET_FPMODE(Op, DAG);
8310 case ISD::MUL:
8311 return LowerMUL(Op, DAG);
8312 case ISD::MULHS:
8313 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
8314 case ISD::MULHU:
8315 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
8317 return LowerINTRINSIC_W_CHAIN(Op, DAG);
8319 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
8321 return LowerINTRINSIC_VOID(Op, DAG);
// Only 128-bit atomic stores get custom lowering; for any other memory type an
// empty SDValue is returned.
8322 case ISD::ATOMIC_STORE:
8323 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
8324 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
8325 return LowerStore128(Op, DAG);
8326 }
8327 return SDValue();
8328 case ISD::STORE:
8329 return LowerSTORE(Op, DAG);
8330 case ISD::MSTORE:
8331 return LowerMSTORE(Op, DAG);
8332 case ISD::MGATHER:
8333 return LowerMGATHER(Op, DAG);
8334 case ISD::MSCATTER:
8335 return LowerMSCATTER(Op, DAG);
8337 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
8338 case ISD::VECREDUCE_ADD:
8339 case ISD::VECREDUCE_AND:
8340 case ISD::VECREDUCE_OR:
8341 case ISD::VECREDUCE_XOR:
8351 return LowerVECREDUCE(Op, DAG);
8352 case ISD::VECREDUCE_MUL:
8354 return LowerVECREDUCE_MUL(Op, DAG);
8356 return LowerATOMIC_LOAD_AND(Op, DAG);
8358 return LowerDYNAMIC_STACKALLOC(Op, DAG);
8359 case ISD::VSCALE:
8360 return LowerVSCALE(Op, DAG);
8362 return LowerVECTOR_COMPRESS(Op, DAG);
8363 case ISD::ANY_EXTEND:
8364 case ISD::SIGN_EXTEND:
8365 case ISD::ZERO_EXTEND:
8366 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
8367 case ISD::ADDRSPACECAST:
8368 return LowerADDRSPACECAST(Op, DAG);
// NOTE(review): the case label that opens this braced block (original line
// 8369) is missing from the listing.
8370 // Only custom lower when ExtraVT has a legal byte based element type.
8371 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
8372 EVT ExtraEltVT = ExtraVT.getVectorElementType();
8373 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
8374 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
8375 return SDValue();
8376
8377 return LowerToPredicatedOp(Op, DAG,
8378 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
8379 }
8380 case ISD::TRUNCATE:
8381 return LowerTRUNCATE(Op, DAG);
8382 case ISD::MLOAD:
8383 return LowerMLOAD(Op, DAG);
// Fixed-length vector loads that should use SVE take the SVE path; everything
// else goes to LowerLOAD.
8384 case ISD::LOAD:
8385 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
8386 !Subtarget->isNeonAvailable()))
8387 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
8388 return LowerLOAD(Op, DAG);
8389 case ISD::ADD:
8390 case ISD::AND:
8391 case ISD::SUB:
8392 return LowerToScalableOp(Op, DAG);
8393 case ISD::FMAXIMUM:
8394 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
8395 case ISD::FMAXNUM:
8396 case ISD::FMAXNUM_IEEE:
8397 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
8398 case ISD::FMINIMUM:
8399 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
8400 case ISD::FMINNUM:
8401 case ISD::FMINNUM_IEEE:
8402 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
8403 case ISD::VSELECT:
8404 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
8405 case ISD::ABS:
8406 return LowerABS(Op, DAG);
8407 case ISD::ABDS:
8408 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
8409 case ISD::ABDU:
8410 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
8411 case ISD::AVGFLOORS:
8412 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
8413 case ISD::AVGFLOORU:
8414 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
8415 case ISD::AVGCEILS:
8416 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
8417 case ISD::AVGCEILU:
8418 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
8419 case ISD::BITREVERSE:
8420 return LowerBitreverse(Op, DAG);
8421 case ISD::BSWAP:
8422 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
8423 case ISD::CTLZ:
8424 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
8425 case ISD::CTTZ:
8426 return LowerCTTZ(Op, DAG);
8429 return LowerVECTOR_SPLICE(Op, DAG);
8431 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
8433 return LowerVECTOR_INTERLEAVE(Op, DAG);
8435 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
// Vector LRINT/LLRINT have their own helper; scalar ones fall through to the
// f16/bf16 handling shared with LROUND/LLROUND.
8436 case ISD::LRINT:
8437 case ISD::LLRINT:
8438 if (Op.getValueType().isVector())
8439 return LowerVectorXRINT(Op, DAG);
8440 [[fallthrough]];
8441 case ISD::LROUND:
8442 case ISD::LLROUND: {
// The f16/bf16 source is extended to f32 and the same opcode is re-emitted on
// the extended value.
8443 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
8444 Op.getOperand(0).getValueType() == MVT::bf16) &&
8445 "Expected custom lowering of rounding operations only for f16");
8446 SDLoc DL(Op);
8447 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
8448 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
8449 }
8450 case ISD::STRICT_LROUND:
8452 case ISD::STRICT_LRINT:
8453 case ISD::STRICT_LLRINT: {
// Strict variant of the above: operand 0 is the chain and operand 1 the value.
// The extend is chained, and its output chain (result 1) feeds the re-emitted
// rounding node.
8454 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
8455 Op.getOperand(1).getValueType() == MVT::bf16) &&
8456 "Expected custom lowering of rounding operations only for f16");
8457 SDLoc DL(Op);
8458 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
8459 {Op.getOperand(0), Op.getOperand(1)});
8460 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
8461 {Ext.getValue(1), Ext.getValue(0)});
8462 }
8463 case ISD::WRITE_REGISTER: {
8464 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
8465 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
8466 SDLoc DL(Op);
8467
8468 SDValue Chain = Op.getOperand(0);
8469 SDValue SysRegName = Op.getOperand(1);
// Split the 128-bit value into two i64 halves for the MSRR node.
8470 std::pair<SDValue, SDValue> Pair =
8471 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
8472
8473 // chain = MSRR(chain, sysregname, lo, hi)
8474 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
8475 SysRegName, Pair.first, Pair.second);
8476
8477 return Result;
8478 }
8479 case ISD::FSHL:
8480 case ISD::FSHR:
8481 return LowerFunnelShift(Op, DAG);
8482 case ISD::FLDEXP:
8483 return LowerFLDEXP(Op, DAG);
8485 return LowerVECTOR_HISTOGRAM(Op, DAG);
8490 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
8491 case ISD::CLMUL:
8492 return LowerCLMUL(Op, DAG);
8493 case ISD::FCANONICALIZE:
8494 return LowerFCANONICALIZE(Op, DAG);
8495 case ISD::CTTZ_ELTS:
// NOTE(review): original line 8496 is missing here; the closing brace at 8514
// suggests it carried another case label and the opening brace.
8497 SDLoc DL(Op);
8498 SDValue CttzOp = Op.getOperand(0);
8499 EVT VT = CttzOp.getValueType();
8500 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
8501
8502 if (VT.isFixedLengthVector()) {
8503 // We can use SVE instructions to lower this intrinsic by first creating
8504 // an SVE predicate register mask from the fixed-width vector.
8505 VT = getTypeToTransformTo(*DAG.getContext(), VT);
8506 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, CttzOp);
8507 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
8508 }
8509
// The target node yields an i64 count, which is then zero-extended or
// truncated to the type the original node produces.
8510 SDValue Pg = getPredicateForVector(DAG, DL, VT);
8511 SDValue NewCttzElts =
8512 DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, Pg, CttzOp);
8513 return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
8514 }
8515 }
8516}
8517
// NOTE(review): the function header (original line 8518) is missing from this
// listing; presumably AArch64TargetLowering::mergeStoresAfterLegalization --
// confirm against upstream.
// Returns true exactly when the subtarget does not use SVE for fixed-length
// vectors.
8519 return !Subtarget->useSVEForFixedLengthVectors();
8520}
8521
// NOTE(review): the first header line (original line 8522) is missing from
// this listing; presumably AArch64TargetLowering::useSVEForFixedLengthVectorVT,
// which other functions in this file call with (VT, OverrideNEON) -- confirm.
//
// Decides whether the fixed-length vector type VT should be lowered with SVE.
// Returns false for non-fixed-length or non-simple types and for element types
// outside i8/i16/i32/i64/f16/f32/f64. With OverrideNEON set, 64- and 128-bit
// vectors qualify whenever SVE or streaming SVE is available. Otherwise the
// type must be wider than 128 bits, SVE for fixed-length vectors must be
// enabled, the type must fit in the minimum SVE vector size, and the element
// count must be a power of two.
8523 EVT VT, bool OverrideNEON) const {
8524 if (!VT.isFixedLengthVector() || !VT.isSimple())
8525 return false;
8526
8527 // Don't use SVE for vectors we cannot scalarize if required.
8528 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
8529 // Fixed length predicates should be promoted to i8.
8530 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
8531 case MVT::i1:
8532 default:
8533 return false;
8534 case MVT::i8:
8535 case MVT::i16:
8536 case MVT::i32:
8537 case MVT::i64:
8538 case MVT::f16:
8539 case MVT::f32:
8540 case MVT::f64:
8541 break;
8542 }
8543
8544 // NEON-sized vectors can be emulated using SVE instructions.
8545 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
8546 return Subtarget->isSVEorStreamingSVEAvailable();
8547
8548 // Ensure NEON MVTs only belong to a single register class.
8549 if (VT.getFixedSizeInBits() <= 128)
8550 return false;
8551
8552 // Ensure wider than NEON code generation is enabled.
8553 if (!Subtarget->useSVEForFixedLengthVectors())
8554 return false;
8555
8556 // Don't use SVE for types that don't fit.
8557 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
8558 return false;
8559
8560 // TODO: Perhaps an artificial restriction, but worth having whilst getting
8561 // the base fixed length SVE support in place.
8562 if (!VT.isPow2VectorType())
8563 return false;
8564
8565 return true;
8566}
8567
8568//===----------------------------------------------------------------------===//
8569// Calling Convention Implementation
8570//===----------------------------------------------------------------------===//
8571
// Extracts an intrinsic ID from node N: reads constant operand 0 and returns
// it when it is below Intrinsic::num_intrinsics.
// NOTE(review): original lines 8576-8577 and 8581 are missing from this
// listing, so the `default` result, the case label guarding the braced block
// and the fall-back return for out-of-range IDs are not visible here. Restore
// them from upstream.
8572static unsigned getIntrinsicID(const SDNode *N) {
8573 unsigned Opcode = N->getOpcode();
8574 switch (Opcode) {
8575 default:
8578 unsigned IID = N->getConstantOperandVal(0);
8579 if (IID < Intrinsic::num_intrinsics)
8580 return IID;
8582 }
8583 }
8584}
8585
// NOTE(review): the first header line (original line 8586) is missing from
// this listing; presumably AArch64TargetLowering::isReassocProfitable taking
// (SelectionDAG &, SDValue N0, SDValue N1) -- confirm against upstream.
//
// Returns false when N0 has more than one use. When N1 is a umull/smull
// (either as a NEON intrinsic or as an AArch64ISD node), returns true only if
// N0 is not an ISD::ADD. Returns true in every other case.
8587 SDValue N1) const {
8588 if (!N0.hasOneUse())
8589 return false;
8590
8591 unsigned IID = getIntrinsicID(N1.getNode());
8592 // Avoid reassociating expressions that can be lowered to smlal/umlal.
8593 if (IID == Intrinsic::aarch64_neon_umull ||
8594 N1.getOpcode() == AArch64ISD::UMULL ||
8595 IID == Intrinsic::aarch64_neon_smull ||
8596 N1.getOpcode() == AArch64ISD::SMULL)
8597 return N0.getOpcode() != ISD::ADD;
8598
8599 return true;
8600}
8601
8602/// Selects the correct CCAssignFn for a given CallingConvention value.
// NOTE(review): this listing drops many original lines of this function
// (gaps in the embedded numbering: 8603, 8610, 8616, 8620-8622, 8624,
// 8630-8631, 8640, 8644-8645, 8648, 8650-8656, 8658-8661). The header, several
// case labels and several return statements are therefore not visible;
// presumably this is AArch64TargetLowering::CCAssignFnForCall -- confirm and
// restore from upstream before relying on this text.
//
// What is visible: unsupported conventions are a fatal usage error, and GHC
// maps to CC_AArch64_GHC. The C-like group picks by target: Windows gets
// CC_AArch64_Win64PCS when not vararg, non-Darwin gets CC_AArch64_AAPCS, and
// Darwin gets CC_AArch64_DarwinPCS when not vararg.
8604 bool IsVarArg) const {
8605 switch (CC) {
8606 default:
8607 reportFatalUsageError("unsupported calling convention");
8608 case CallingConv::GHC:
8609 return CC_AArch64_GHC;
8611 // The VarArg implementation makes assumptions about register
8612 // argument passing that do not hold for preserve_none, so we
8613 // instead fall back to C argument passing.
8614 // The non-vararg case is handled in the CC function itself.
8615 if (!IsVarArg)
8617 [[fallthrough]];
8618 case CallingConv::C:
8619 case CallingConv::Fast:
8623 case CallingConv::Swift:
8625 case CallingConv::Tail:
8626 case CallingConv::GRAAL:
8627 if (Subtarget->isTargetWindows()) {
8628 if (IsVarArg) {
8629 if (Subtarget->isWindowsArm64EC())
8632 }
8633 return CC_AArch64_Win64PCS;
8634 }
8635 if (!Subtarget->isTargetDarwin())
8636 return CC_AArch64_AAPCS;
8637 if (!IsVarArg)
8638 return CC_AArch64_DarwinPCS;
8639 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
8641 case CallingConv::Win64:
8642 if (IsVarArg) {
8643 if (Subtarget->isWindowsArm64EC())
8646 }
8647 return CC_AArch64_Win64PCS;
8649 if (Subtarget->isWindowsArm64EC())
8657 return CC_AArch64_AAPCS;
8662 }
8663}
8664
// Selects the CCAssignFn used for return values; the default is
// RetCC_AArch64_AAPCS.
// NOTE(review): original lines 8666, 8670-8672 and 8674 are missing from this
// listing (the rest of the header, the case label(s) and the Arm64EC-specific
// return). Presumably AArch64TargetLowering::CCAssignFnForReturn -- confirm
// and restore from upstream.
8665CCAssignFn *
8667 switch (CC) {
8668 default:
8669 return RetCC_AArch64_AAPCS;
8673 if (Subtarget->isWindowsArm64EC())
8675 return RetCC_AArch64_AAPCS;
8676 }
8677}
8678
8679static bool isPassedInFPR(EVT VT) {
8680 return VT.isFixedLengthVector() ||
8681 (VT.isFloatingPoint() && !VT.isScalableVector());
8682}
8683
// Emits the streaming-mode change needed at the entry of an exception-handling
// pad and returns the updated chain.
//
// Chain must be the function's EntryToken (asserted); its second result is
// used as the glue. If the enclosing function has a streaming interface or
// body, streaming mode is enabled unconditionally (AArch64SME::Always). If it
// only has a streaming-compatible interface, a second changeStreamingMode call
// is made whose final argument is not visible in this listing. Otherwise the
// chain is returned unchanged.
8684SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
8685 SelectionDAG &DAG) const {
8686 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8687 SDValue Glue = Chain.getValue(1);
8688
8689 MachineFunction &MF = DAG.getMachineFunction();
8690 auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>();
8691
8692 SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs();
8693
8694 // The following conditions are true on entry to an exception handler:
8695 // - PSTATE.SM is 0.
8696 // - PSTATE.ZA is 0.
8697 // - TPIDR2_EL0 is null.
8698 // See:
8699 // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
8700 //
8701 // Therefore, if the function that contains this exception handler is a
8702 // streaming[-compatible] function, we must re-enable streaming mode.
8703 //
8704 // These mode changes are usually optimized away in catch blocks as they
8705 // occur before the __cxa_begin_catch (which is a non-streaming function),
8706 // but are necessary in some cases (such as for cleanups).
8707 //
8708 // Additionally, if the function has ZA or ZT0 state, we must restore it.
8709
8710 // [COND_]SMSTART SM
8711 if (SMEFnAttrs.hasStreamingInterfaceOrBody())
8712 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
8713 /*Glue*/ Glue, AArch64SME::Always);
8714 else if (SMEFnAttrs.hasStreamingCompatibleInterface())
8715 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
// NOTE(review): original line 8716 (the final argument and the closing of the
// call above) is missing from this listing -- restore it from upstream.
8717 return Chain;
8718}
8719
8720SDValue AArch64TargetLowering::LowerFormalArguments(
8721 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
8722 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
8723 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
8724 MachineFunction &MF = DAG.getMachineFunction();
8725 const Function &F = MF.getFunction();
8726 MachineFrameInfo &MFI = MF.getFrameInfo();
8727 bool IsWin64 =
8728 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8729 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
8730 (isVarArg && Subtarget->isWindowsArm64EC());
8731 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8732
8734 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
8736 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
8737 FuncInfo->setIsSVECC(true);
8738
8739 // Assign locations to all of the incoming arguments.
8741 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
8742
8743 // At this point, Ins[].VT may already be promoted to i32. To correctly
8744 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
8745 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
8746 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
8747 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
8748 // LocVT.
8749 unsigned NumArgs = Ins.size();
8750 Function::const_arg_iterator CurOrigArg = F.arg_begin();
8751 unsigned CurArgIdx = 0;
8752 bool UseVarArgCC = false;
8753 if (IsWin64)
8754 UseVarArgCC = isVarArg;
8755
8756 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
8757
8758 for (unsigned i = 0; i != NumArgs; ++i) {
8759 MVT ValVT = Ins[i].VT;
8760 if (Ins[i].isOrigArg()) {
8761 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
8762 CurArgIdx = Ins[i].getOrigArgIndex();
8763
8764 // Get type of the original argument.
8765 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
8766 /*AllowUnknown*/ true);
8767 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
8768 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8769 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8770 ValVT = MVT::i8;
8771 else if (ActualMVT == MVT::i16)
8772 ValVT = MVT::i16;
8773 }
8774 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags,
8775 Ins[i].OrigTy, CCInfo);
8776 assert(!Res && "Call operand has unhandled type");
8777 (void)Res;
8778 }
8779
8780 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
8781 bool IsLocallyStreaming =
8782 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
8783 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8784 SDValue Glue = Chain.getValue(1);
8785
8786 unsigned ExtraArgLocs = 0;
8787 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
8788 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8789
8790 if (Ins[i].Flags.isByVal()) {
8791 // Byval is used for HFAs in the PCS, but the system should work in a
8792 // non-compliant manner for larger structs.
8793 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8794 int Size = Ins[i].Flags.getByValSize();
8795 unsigned NumRegs = (Size + 7) / 8;
8796
8797 // FIXME: This works on big-endian for composite byvals, which are the common
8798 // case. It should also work for fundamental types too.
8799 unsigned FrameIdx =
8800 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
8801 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
8802 InVals.push_back(FrameIdxN);
8803
8804 continue;
8805 }
8806
8807 if (Ins[i].Flags.isSwiftAsync())
8808 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
8809
8810 SDValue ArgValue;
8811 if (VA.isRegLoc()) {
8812 // Arguments stored in registers.
8813 EVT RegVT = VA.getLocVT();
8814 const TargetRegisterClass *RC;
8815
8816 if (RegVT == MVT::i32)
8817 RC = &AArch64::GPR32RegClass;
8818 else if (RegVT == MVT::i64)
8819 RC = &AArch64::GPR64RegClass;
8820 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
8821 RC = &AArch64::FPR16RegClass;
8822 else if (RegVT == MVT::f32)
8823 RC = &AArch64::FPR32RegClass;
8824 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
8825 RC = &AArch64::FPR64RegClass;
8826 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
8827 RC = &AArch64::FPR128RegClass;
8828 else if (RegVT.isScalableVector() &&
8829 RegVT.getVectorElementType() == MVT::i1) {
8830 FuncInfo->setIsSVECC(true);
8831 RC = &AArch64::PPRRegClass;
8832 } else if (RegVT == MVT::aarch64svcount) {
8833 FuncInfo->setIsSVECC(true);
8834 RC = &AArch64::PPRRegClass;
8835 } else if (RegVT.isScalableVector()) {
8836 FuncInfo->setIsSVECC(true);
8837 RC = &AArch64::ZPRRegClass;
8838 } else
8839 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
8840
8841 // Transform the arguments in physical registers into virtual ones.
8842 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
8843
8844 if (IsLocallyStreaming) {
8845 // LocallyStreamingFunctions must insert the SMSTART in the correct
8846 // position, so we use Glue to ensure no instructions can be scheduled
8847 // between the chain of:
8848 // t0: ch,glue = EntryNode
8849 // t1: res,ch,glue = CopyFromReg
8850 // ...
8851 // tn: res,ch,glue = CopyFromReg t(n-1), ..
8852 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
8853 // ^^^^^^
8854 // This will be the new Chain/Root node.
8855 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
8856 Glue = ArgValue.getValue(2);
8857 if (isPassedInFPR(ArgValue.getValueType())) {
8858 ArgValue =
8859 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8860 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
8861 {ArgValue, Glue});
8862 Glue = ArgValue.getValue(1);
8863 }
8864 } else
8865 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
8866
8867 // If this is an 8, 16 or 32-bit value, it is really passed promoted
8868 // to 64 bits. Insert an assert[sz]ext to capture this, then
8869 // truncate to the right size.
8870 switch (VA.getLocInfo()) {
8871 default:
8872 llvm_unreachable("Unknown loc info!");
8873 case CCValAssign::Full:
8874 break;
8876 assert(
8877 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8878 "Indirect arguments should be scalable on most subtargets");
8879 break;
8880 case CCValAssign::BCvt:
8881 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
8882 break;
8883 case CCValAssign::AExt:
8884 case CCValAssign::SExt:
8885 case CCValAssign::ZExt:
8886 break;
8888 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
8889 DAG.getConstant(32, DL, RegVT));
8890 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
8891 break;
8892 }
8893 } else { // VA.isRegLoc()
8894 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8895 unsigned ArgOffset = VA.getLocMemOffset();
8896 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8897 ? VA.getLocVT().getSizeInBits()
8898 : VA.getValVT().getSizeInBits()) / 8;
8899
8900 uint32_t BEAlign = 0;
8901 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8902 !Ins[i].Flags.isInConsecutiveRegs())
8903 BEAlign = 8 - ArgSize;
8904
8905 SDValue FIN;
8906 MachinePointerInfo PtrInfo;
8907 if (StackViaX4) {
8908 // In both the ARM64EC varargs convention and the thunk convention,
8909 // arguments on the stack are accessed relative to x4, not sp. In
8910 // the thunk convention, there's an additional offset of 32 bytes
8911 // to account for the shadow store.
8912 unsigned ObjOffset = ArgOffset + BEAlign;
8913 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8914 ObjOffset += 32;
8915 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8916 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8917 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8918 DAG.getConstant(ObjOffset, DL, MVT::i64));
8920 } else {
8921 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8922
8923 // Create load nodes to retrieve arguments from the stack.
8924 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8925 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8926 }
8927
8928 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
8930 MVT MemVT = VA.getValVT();
8931
8932 switch (VA.getLocInfo()) {
8933 default:
8934 break;
8935 case CCValAssign::Trunc:
8936 case CCValAssign::BCvt:
8937 MemVT = VA.getLocVT();
8938 break;
8940 assert(
8941 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8942 "Indirect arguments should be scalable on most subtargets");
8943 MemVT = VA.getLocVT();
8944 break;
8945 case CCValAssign::SExt:
8946 ExtType = ISD::SEXTLOAD;
8947 break;
8948 case CCValAssign::ZExt:
8949 ExtType = ISD::ZEXTLOAD;
8950 break;
8951 case CCValAssign::AExt:
8952 ExtType = ISD::EXTLOAD;
8953 break;
8954 }
8955
8956 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8957 MemVT);
8958 }
8959
8960 if (VA.getLocInfo() == CCValAssign::Indirect) {
8961 assert((VA.getValVT().isScalableVT() ||
8962 Subtarget->isWindowsArm64EC()) &&
8963 "Indirect arguments should be scalable on most subtargets");
8964
8965 TypeSize PartSize = VA.getValVT().getStoreSize();
8966 unsigned NumParts = 1;
8967 if (Ins[i].Flags.isInConsecutiveRegs()) {
8968 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8969 ++NumParts;
8970 }
8971
8972 MVT PartLoad = VA.getValVT();
8973 SDValue Ptr = ArgValue;
8974
8975 // Ensure we generate all loads for each tuple part, whilst updating the
8976 // pointer after each load correctly using vscale.
8977 while (NumParts > 0) {
8978 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8979 InVals.push_back(ArgValue);
8980 NumParts--;
8981 if (NumParts > 0) {
8982 SDValue BytesIncrement =
8983 DAG.getTypeSize(DL, Ptr.getValueType(), PartSize);
8984 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8985 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8986 ExtraArgLocs++;
8987 i++;
8988 }
8989 }
8990 } else {
8991 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8992 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8993 ArgValue, DAG.getValueType(MVT::i32));
8994
8995 // i1 arguments are zero-extended to i8 by the caller. Emit a
8996 // hint to reflect this.
8997 if (Ins[i].isOrigArg()) {
8998 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8999 if (OrigArg->getType()->isIntegerTy(1)) {
9000 if (!Ins[i].Flags.isZExt()) {
9001 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
9002 ArgValue.getValueType(), ArgValue);
9003 }
9004 }
9005 }
9006
9007 InVals.push_back(ArgValue);
9008 }
9009 }
9010 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
9011
9012 if (Attrs.hasStreamingCompatibleInterface()) {
9013 SDValue EntryPStateSM =
9014 DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
9015 DAG.getVTList(MVT::i64, MVT::Other), {Chain});
9016
9017 // Copy the value to a virtual register, and save that in FuncInfo.
9018 Register EntryPStateSMReg =
9019 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
9020 Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
9021 EntryPStateSM);
9022 FuncInfo->setPStateSMReg(EntryPStateSMReg);
9023 }
9024
9025 // Insert the SMSTART if this is a locally streaming function and
9026 // make sure it is Glued to the last CopyFromReg value.
9027 if (IsLocallyStreaming) {
9028 if (Attrs.hasStreamingCompatibleInterface())
9029 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
9031 else
9032 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
9034
9035 // Ensure that the SMSTART happens after the CopyWithChain such that its
9036 // chain result is used.
9037 for (unsigned I=0; I<InVals.size(); ++I) {
9040 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
9041 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
9042 InVals[I].getValueType());
9043 }
9044 }
9045
9046 // varargs
9047 if (isVarArg) {
9049 if (!Subtarget->isTargetDarwin() || IsWin64) {
9050 // The AAPCS variadic function ABI is identical to the non-variadic
9051 // one. As a result there may be more arguments in registers and we
9052 // should save them for future reference.
9053 // Win64 variadic functions also pass arguments in registers, but all
9054 // float arguments are passed in integer registers.
9055 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
9056 }
9057
9058 // This will point to the next argument passed via stack.
9059 unsigned VarArgsOffset = CCInfo.getStackSize();
9060 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
9061 VarArgsOffset =
9062 alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
9063 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
9064 FuncInfo->setVarArgsStackIndex(
9065 MFI.CreateFixedObject(4, VarArgsOffset, true));
9066 }
9067
9068 if (MFI.hasMustTailInVarArgFunc()) {
9069 SmallVector<MVT, 2> RegParmTypes;
9070 RegParmTypes.push_back(MVT::i64);
9071 RegParmTypes.push_back(MVT::f128);
9072 // Compute the set of forwarded registers. The rest are scratch.
9073 SmallVectorImpl<ForwardedRegister> &Forwards =
9074 FuncInfo->getForwardedMustTailRegParms();
9075 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
9077
9078 // Conservatively forward X8, since it might be used for aggregate return.
9079 if (!CCInfo.isAllocated(AArch64::X8)) {
9080 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
9081 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
9082 }
9083 }
9084 }
9085
9086 // On Windows, InReg pointers must be returned, so record the pointer in a
9087 // virtual register at the start of the function so it can be returned in the
9088 // epilogue.
9089 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
9090 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
9091 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
9092 Ins[I].Flags.isInReg()) &&
9093 Ins[I].Flags.isSRet()) {
9094 assert(!FuncInfo->getSRetReturnReg());
9095
9096 MVT PtrTy = getPointerTy(DAG.getDataLayout());
9097 Register Reg =
9099 FuncInfo->setSRetReturnReg(Reg);
9100
9101 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
9102 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
9103 break;
9104 }
9105 }
9106 }
9107
9108 unsigned StackArgSize = CCInfo.getStackSize();
9109 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9110 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
9111 // This is a non-standard ABI so by fiat I say we're allowed to make full
9112 // use of the stack area to be popped, which must be aligned to 16 bytes in
9113 // any case:
9114 StackArgSize = alignTo(StackArgSize, 16);
9115
9116 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
9117 // a multiple of 16.
9118 FuncInfo->setArgumentStackToRestore(StackArgSize);
9119
9120 // This realignment carries over to the available bytes below. Our own
9121 // callers will guarantee the space is free by giving an aligned value to
9122 // CALLSEQ_START.
9123 }
9124 // Even if we're not expected to free up the space, it's useful to know how
9125 // much is there while considering tail calls (because we can reuse it).
9126 FuncInfo->setBytesInStackArgArea(StackArgSize);
9127
9128 if (Subtarget->hasCustomCallingConv())
9129 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
9130
9131 if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
9132 SDValue Size;
9133 if (Attrs.hasZAState()) {
9134 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9135 DAG.getConstant(1, DL, MVT::i32));
9136 Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
9137 } else if (Attrs.hasAgnosticZAInterface()) {
9138 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
9139 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
9140
9141 SDValue Callee =
9142 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
9143 auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
9144 TargetLowering::CallLoweringInfo CLI(DAG);
9145 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
9146 DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy, Callee,
9147 {});
9148 std::tie(Size, Chain) = LowerCallTo(CLI);
9149 }
9150 if (Size) {
9151 SDValue Buffer = DAG.getNode(
9152 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
9153 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
9154 Chain = Buffer.getValue(1);
9155
9156 Register BufferPtr =
9157 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
9158 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
9159 Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
9160 DAG.getVTList(MVT::Other), Chain);
9161 FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
9162 MFI.CreateVariableSizedObject(Align(16), nullptr);
9163 }
9164 }
9165
9166 if (CallConv == CallingConv::PreserveNone) {
9167 for (const ISD::InputArg &I : Ins) {
9168 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
9169 I.Flags.isSwiftAsync()) {
9170 MachineFunction &MF = DAG.getMachineFunction();
9171 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9172 MF.getFunction(),
9173 "Swift attributes can't be used with preserve_none",
9174 DL.getDebugLoc()));
9175 break;
9176 }
9177 }
9178 }
9179
9180 return Chain;
9181}
9182
/// Store the argument registers that follow the last one allocated in
/// \p CCInfo into stack save areas and record those areas in
/// AArch64FunctionInfo (setVarArgsGPRIndex/Size, setVarArgsFPRIndex/Size).
///
/// \param CCInfo calling-convention state; queried with getFirstUnallocated()
///               to find the first register to save.
/// \param DAG    DAG in which the CopyFromReg/store nodes are created.
/// \param DL     debug location attached to the created nodes.
/// \param Chain  in/out: if any store was emitted it is replaced by a
///               TokenFactor over all of them.
///
/// NOTE(review): the listing's embedded line numbers skip 9195, 9197, 9236
/// and 9248. `MemOps`, `GPRArgRegs` and `FPRArgRegs` are used below without a
/// visible declaration, and the first arm of the conditional inside the GPR
/// store is not visible either — confirm against the real source.
9183void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
9184 SelectionDAG &DAG,
9185 const SDLoc &DL,
9186 SDValue &Chain) const {
9187 MachineFunction &MF = DAG.getMachineFunction();
9188 MachineFrameInfo &MFI = MF.getFrameInfo();
9189 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9190 auto PtrVT = getPointerTy(DAG.getDataLayout());
9191 Function &F = MF.getFunction();
9192 bool IsWin64 =
9193 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
9194
9196
9198 unsigned NumGPRArgRegs = GPRArgRegs.size();
9199 if (Subtarget->isWindowsArm64EC()) {
9200 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
9201 // functions.
9202 NumGPRArgRegs = 4;
9203 }
9204 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
9205
// Each saved GPR occupies 8 bytes of the save area.
9206 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
9207 int GPRIdx = 0;
9208 if (GPRSaveSize != 0) {
9209 if (IsWin64) {
// Win64: the save area is a fixed object at a negative offset; a second
// fixed object pads it out when the size is not a multiple of 16.
9210 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
9211 if (GPRSaveSize & 15)
9212 // The extra size here, if triggered, will always be 8.
9213 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
9214 } else
9215 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
9216
9217 SDValue FIN;
9218 if (Subtarget->isWindowsArm64EC()) {
9219 // With the Arm64EC ABI, we reserve the save area as usual, but we
9220 // compute its address relative to x4. For a normal AArch64->AArch64
9221 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
9222 // different address.
9223 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
9224 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
9225 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
9226 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
9227 } else {
9228 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
9229 }
9230
// One store per remaining GPR; FIN advances by 8 bytes after each store.
9231 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
9232 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
9233 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
9234 SDValue Store =
9235 DAG.getStore(Val.getValue(1), DL, Val, FIN,
9237 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
9238 : MachinePointerInfo::getStack(MF, i * 8));
9239 MemOps.push_back(Store);
9240 FIN =
9241 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
9242 }
9243 }
// Recorded even when nothing was saved (index 0, size 0).
9244 FuncInfo->setVarArgsGPRIndex(GPRIdx);
9245 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
9246
// The FP/SIMD save area is only built when FP is available and the
// convention is not Win64. Each register takes a 16-byte slot.
9247 if (Subtarget->hasFPARMv8() && !IsWin64) {
9249 const unsigned NumFPRArgRegs = FPRArgRegs.size();
9250 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
9251
9252 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
9253 int FPRIdx = 0;
9254 if (FPRSaveSize != 0) {
9255 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
9256
9257 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
9258
9259 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
9260 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
9261 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
9262
9263 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
9264 MachinePointerInfo::getStack(MF, i * 16));
9265 MemOps.push_back(Store);
9266 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
9267 DAG.getConstant(16, DL, PtrVT));
9268 }
9269 }
9270 FuncInfo->setVarArgsFPRIndex(FPRIdx);
9271 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
9272 }
9273
// Merge every emitted store into a single chain for the caller.
9274 if (!MemOps.empty()) {
9275 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
9276 }
9277}
9278
9279/// LowerCallResult - Lower the result values of a call into the
9280/// appropriate copies out of appropriate physical registers.
///
/// One value is appended to \p InVals per entry of \p RVLocs. The returned
/// SDValue is the chain after the last CopyFromReg that was created.
/// When \p isThisReturn is set, \p ThisVal is pushed for location 0 instead
/// of emitting a copy. When \p RequiresSMChange is set, values for which
/// isPassedInFPR() holds are wrapped in a COALESCER_BARRIER node.
///
/// NOTE(review): the listing's embedded numbering skips 9319, so the case
/// label that introduces the shift-right-by-32 below is not visible here —
/// confirm against the real source.
9281SDValue AArch64TargetLowering::LowerCallResult(
9282 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
9283 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
9284 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
9285 SDValue ThisVal, bool RequiresSMChange) const {
// Cache of physreg -> already-created copy, keyed by VA.getLocReg().
9286 DenseMap<unsigned, SDValue> CopiedRegs;
9287 // Copy all of the result registers out of their specified physreg.
9288 for (unsigned i = 0; i != RVLocs.size(); ++i) {
9289 CCValAssign VA = RVLocs[i];
9290
9291 // Pass 'this' value directly from the argument to return value, to avoid
9292 // reg unit interference
9293 if (i == 0 && isThisReturn) {
9294 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
9295 "unexpected return calling convention register assignment");
9296 InVals.push_back(ThisVal);
9297 continue;
9298 }
9299
9300 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
9301 // allows one use of a physreg per block.
9302 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
9303 if (!Val) {
9304 Val =
9305 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
// Thread the copy's chain (result 1) and glue (result 2) to the next copy.
9306 Chain = Val.getValue(1);
9307 InGlue = Val.getValue(2);
9308 CopiedRegs[VA.getLocReg()] = Val;
9309 }
9310
// Convert from the location type to the value type as the LocInfo demands.
9311 switch (VA.getLocInfo()) {
9312 default:
9313 llvm_unreachable("Unknown loc info!");
9314 case CCValAssign::Full:
9315 break;
9316 case CCValAssign::BCvt:
9317 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
9318 break;
9320 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
9321 DAG.getConstant(32, DL, VA.getLocVT()));
9322 [[fallthrough]];
9323 case CCValAssign::AExt:
9324 [[fallthrough]];
9325 case CCValAssign::ZExt:
9326 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
9327 break;
9328 }
9329
9330 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
9331 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9332 DAG.getVTList(Val.getValueType(), MVT::Glue), Val);
9333
9334 InVals.push_back(Val);
9335 }
9336
9337 return Chain;
9338}
9339
9340/// Return true if the calling convention is one that we can guarantee TCO for.
9341static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
9342 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
9344}
9345
9346/// Return true if we might ever do TCO for calls with this calling convention.
///
/// The visible case labels (C, Swift, Tail, Fast) return true and every
/// convention that reaches `default` returns false.
///
/// NOTE(review): the listing's embedded numbering skips 9347 (the line that
/// should carry the signature), 9350-9353 and 9355, so additional case labels
/// are probably missing from this view — confirm against the real source.
9348 switch (CC) {
9349 case CallingConv::C:
9354 case CallingConv::Swift:
9356 case CallingConv::Tail:
9357 case CallingConv::Fast:
9358 return true;
9359 default:
9360 return false;
9361 }
9362}
9363
9364/// Return true if the call convention supports varargs
9365/// Currently only those that pass varargs like the C
9366/// calling convention does are eligible
9367/// Calling conventions listed in this function must also
9368/// be properly handled in AArch64Subtarget::isCallingConvWin64
///
/// NOTE(review): the listing's embedded numbering skips 9369 (the line that
/// should carry the signature), 9372 and 9376. The case label that the "SVE
/// vector call" comment below refers to is therefore not visible — confirm
/// against the real source.
9370 switch (CC) {
9371 case CallingConv::C:
9373 // SVE vector call is only partially supported, but it should
9374 // support named arguments being passed. Any arguments being passed
9375 // as varargs, are still unsupported.
9377 return true;
9378 default:
9379 return false;
9380 }
9381}
9382
// Assign a location to every outgoing argument in CLI.Outs by invoking the
// per-argument CCAssignFn on CCInfo. Nothing is returned; the results are
// recorded through CCInfo.
//
// NOTE(review): the listing's embedded numbering skips 9383 and 9385, so the
// function name, return type and the declarations of `TLI` and `CLI` are not
// visible here. The name above is taken from the call in
// isEligibleForTailCallOptimization — confirm against the real source.
9384 const AArch64Subtarget *Subtarget,
9386 CCState &CCInfo) {
9387 const SelectionDAG &DAG = CLI.DAG;
9388 CallingConv::ID CalleeCC = CLI.CallConv;
9389 bool IsVarArg = CLI.IsVarArg;
9390 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9391 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
9392
9393 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
9394 // for the shadow store.
9395 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
9396 CCInfo.AllocateStack(32, Align(16));
9397
9398 unsigned NumArgs = Outs.size();
9399 for (unsigned i = 0; i != NumArgs; ++i) {
9400 MVT ArgVT = Outs[i].VT;
9401 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
9402
// Decide whether this argument is assigned with the vararg flavour of the
// calling convention.
9403 bool UseVarArgCC = false;
9404 if (IsVarArg) {
9405 // On Windows, the fixed arguments in a vararg call are passed in GPRs
9406 // too, so use the vararg CC to force them to integer registers.
9407 if (IsCalleeWin64) {
9408 UseVarArgCC = true;
9409 } else {
9410 UseVarArgCC = ArgFlags.isVarArg();
9411 }
9412 }
9413
9414 if (!UseVarArgCC) {
9415 // Get type of the original argument.
9416 EVT ActualVT =
9417 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
9418 /*AllowUnknown*/ true);
// Fall back to the already-legalized ArgVT when the IR type has no simple MVT.
9419 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
9420 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
9421 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
9422 ArgVT = MVT::i8;
9423 else if (ActualMVT == MVT::i16)
9424 ArgVT = MVT::i16;
9425 }
9426
9427 // FIXME: CCAssignFnForCall should be called once, for the call and not per
9428 // argument. This logic should exactly mirror LowerFormalArguments.
9429 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
9430 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
9431 Outs[i].OrigTy, CCInfo);
// A true result from the assign function is treated as "unhandled type".
9432 assert(!Res && "Call operand has unhandled type");
// Silences the unused-variable warning when asserts are compiled out.
9433 (void)Res;
9434 }
9435}
9436
// Build the SMECallAttrs (caller/callee SME attribute pair) for a call:
//  - from the call instruction itself when CLI.CB is set;
//  - otherwise, when the callee is an external symbol, from the caller's
//    attributes plus attributes looked up by symbol name via RTLCI.
//
// NOTE(review): the listing's embedded numbering skips 9438, 9439 and 9444,
// so the function name, its parameter list (`Caller`, `RTLCI`, `CLI`) and the
// final fallback return are not visible here. The name above is taken from
// the call in isEligibleForTailCallOptimization — confirm against the real
// source.
9437static SMECallAttrs
9440 if (CLI.CB)
9441 return SMECallAttrs(*CLI.CB, &RTLCI);
9442 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9443 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), RTLCI));
9445}
9446
/// Decide whether the call described by \p CLI may be lowered as a tail call.
///
/// The function is a sequence of independent checks; it returns false as soon
/// as one of them fails and true only if all of them pass. When
/// canGuaranteeTCO() holds for the callee convention, the answer is simply
/// whether caller and callee conventions match.
///
/// NOTE(review): the listing's embedded numbering skips 9480 and 9569. The
/// statement guarded by the SVE-signature `if` and the declaration of
/// `ArgLocs` are therefore not visible here — confirm against the real source.
9447bool AArch64TargetLowering::isEligibleForTailCallOptimization(
9448 const CallLoweringInfo &CLI) const {
9449 CallingConv::ID CalleeCC = CLI.CallConv;
9450 if (!mayTailCallThisCC(CalleeCC))
9451 return false;
9452
9453 SDValue Callee = CLI.Callee;
9454 bool IsVarArg = CLI.IsVarArg;
9455 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9456 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9457 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9458 const SelectionDAG &DAG = CLI.DAG;
9459 MachineFunction &MF = DAG.getMachineFunction();
9460 const Function &CallerF = MF.getFunction();
9461 CallingConv::ID CallerCC = CallerF.getCallingConv();
9462
9463 // SME Streaming functions are not eligible for TCO as they may require
9464 // the streaming mode or ZA/ZT0 to be restored after returning from the call.
9465 SMECallAttrs CallAttrs =
9466 getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);
9467 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
9468 CallAttrs.requiresPreservingAllZAState() ||
9469 CallAttrs.requiresPreservingZT0() ||
9470 CallAttrs.caller().hasStreamingBody() || CallAttrs.caller().isNewZA() ||
9471 CallAttrs.caller().isNewZT0())
9472 return false;
9473
9474 // Functions using the C or Fast calling convention that have an SVE signature
9475 // preserve more registers and should assume the SVE_VectorCall CC.
9476 // The check for matching callee-saved regs will determine whether it is
9477 // eligible for TCO.
9478 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
9479 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
9481
9482 bool CCMatch = CallerCC == CalleeCC;
9483
9484 // When using the Windows calling convention on a non-windows OS, we want
9485 // to back up and restore X18 in such functions; we can't do a tail call
9486 // from those functions.
9487 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
9488 CalleeCC != CallingConv::Win64)
9489 return false;
9490
9491 // Byval parameters hand the function a pointer directly into the stack area
9492 // we want to reuse during a tail call. Working around this *is* possible (see
9493 // X86) but less efficient and uglier in LowerCall.
9494 for (Function::const_arg_iterator i = CallerF.arg_begin(),
9495 e = CallerF.arg_end();
9496 i != e; ++i) {
9497 if (i->hasByValAttr())
9498 return false;
9499
9500 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
9501 // In this case, it is necessary to save X0/X1 in the callee and return it
9502 // in X0. Tail call opt may interfere with this, so we disable tail call
9503 // opt when the caller has an "inreg" attribute -- except if the callee
9504 // also has that attribute on the same argument, and the same value is
9505 // passed.
9506 if (i->hasInRegAttr()) {
9507 unsigned ArgIdx = i - CallerF.arg_begin();
// Without a call instruction, or with too few call arguments, the
// "same argument" comparison below cannot be made.
9508 if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
9509 return false;
9510 AttributeSet Attrs = CLI.CB->getParamAttributes(ArgIdx);
9511 if (!Attrs.hasAttribute(Attribute::InReg) ||
9512 !Attrs.hasAttribute(Attribute::StructRet) || !i->hasStructRetAttr() ||
9513 CLI.CB->getArgOperand(ArgIdx) != i) {
9514 return false;
9515 }
9516 }
9517 }
9518
// For conventions with guaranteed TCO only the convention match matters;
// none of the sibcall checks below are run.
9519 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
9520 return CCMatch;
9521
9522 // Externally-defined functions with weak linkage should not be
9523 // tail-called on AArch64 when the OS does not support dynamic
9524 // pre-emption of symbols, as the AAELF spec requires normal calls
9525 // to undefined weak functions to be replaced with a NOP or jump to the
9526 // next instruction. The behaviour of branch instructions in this
9527 // situation (as used for tail calls) is implementation-defined, so we
9528 // cannot rely on the linker replacing the tail call with a return.
9529 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9530 const GlobalValue *GV = G->getGlobal();
9531 const Triple &TT = getTargetMachine().getTargetTriple();
9532 if (GV->hasExternalWeakLinkage() &&
9533 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
9534 return false;
9535 }
9536
9537 // Now we search for cases where we can use a tail call without changing the
9538 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
9539 // concept.
9540
9541 // I want anyone implementing a new calling convention to think long and hard
9542 // about this assert.
9543 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
9544 report_fatal_error("Unsupported variadic calling convention");
9545
9546 LLVMContext &C = *DAG.getContext();
9547 // Check that the call results are passed in the same way.
9548 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
9549 CCAssignFnForCall(CalleeCC, IsVarArg),
9550 CCAssignFnForCall(CallerCC, IsVarArg)))
9551 return false;
9552 // The callee has to preserve all registers the caller needs to preserve.
9553 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9554 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
9555 if (!CCMatch) {
9556 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
9557 if (Subtarget->hasCustomCallingConv()) {
9558 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
9559 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
9560 }
9561 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
9562 return false;
9563 }
9564
9565 // Nothing more to check if the callee is taking no arguments
9566 if (Outs.empty())
9567 return true;
9568
9570 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
9571
9572 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9573
9574 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
9575 // When we are musttail, additional checks have been done and we can safely ignore this check
9576 // At least two cases here: if caller is fastcc then we can't have any
9577 // memory arguments (we'd be expected to clean up the stack afterwards). If
9578 // caller is C then we could potentially use its argument area.
9579
9580 // FIXME: for now we take the most conservative of these in both cases:
9581 // disallow all variadic memory operands.
9582 for (const CCValAssign &ArgLoc : ArgLocs)
9583 if (!ArgLoc.isRegLoc())
9584 return false;
9585 }
9586
9587 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9588
9589 // If any of the arguments is passed indirectly, it must be SVE, so the
9590 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
9591 // allocate space on the stack. That is why we determine this explicitly here
9592 // the call cannot be a tailcall.
9593 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
9594 assert((A.getLocInfo() != CCValAssign::Indirect ||
9595 A.getValVT().isScalableVector() ||
9596 Subtarget->isWindowsArm64EC()) &&
9597 "Expected value to be scalable");
9598 return A.getLocInfo() == CCValAssign::Indirect;
9599 }))
9600 return false;
9601
9602 // If the stack arguments for this call do not fit into our own save area then
9603 // the call cannot be made tail.
9604 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
9605 return false;
9606
// Final check, delegated to parametersInCSRMatch() with the caller's
// preserved-register mask and the outgoing argument values.
9607 const MachineRegisterInfo &MRI = MF.getRegInfo();
9608 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
9609 return false;
9610
9611 return true;
9612}
9613
9614SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
9615 SelectionDAG &DAG,
9616 MachineFrameInfo &MFI,
9617 int ClobberedFI) const {
9618 SmallVector<SDValue, 8> ArgChains;
9619 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
9620 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
9621
9622 // Include the original chain at the beginning of the list. When this is
9623 // used by target LowerCall hooks, this helps legalize find the
9624 // CALLSEQ_BEGIN node.
9625 ArgChains.push_back(Chain);
9626
9627 // Add a chain value for each stack argument corresponding
9628 for (SDNode *U : DAG.getEntryNode().getNode()->users())
9629 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
9630 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
9631 if (FI->getIndex() < 0) {
9632 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
9633 int64_t InLastByte = InFirstByte;
9634 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
9635
9636 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
9637 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
9638 ArgChains.push_back(SDValue(L, 1));
9639 }
9640
9641 // Build a tokenfactor for all the chains.
9642 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
9643}
9644
9645bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
9646 bool TailCallOpt) const {
9647 return (CallCC == CallingConv::Fast && TailCallOpt) ||
9648 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
9649}
9650
9651// Check if the value is zero-extended from i1 to i8
9652static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
9653 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
9654 if (SizeInBits < 8)
9655 return false;
9656
9657 APInt RequiredZero(SizeInBits, 0xFE);
9658 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
9659 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
9660 return ZExtBool;
9661}
9662
9663void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9664 SDNode *Node) const {
9665 // Live-in physreg copies that are glued to SMSTART are applied as
9666 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
9667 // register allocator to pass call args in callee saved regs, without extra
9668 // copies to avoid these fake clobbers of actually-preserved GPRs.
9669 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
9670 MI.getOpcode() == AArch64::MSRpstatePseudo) {
9671 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
9672 if (MachineOperand &MO = MI.getOperand(I);
9673 MO.isReg() && MO.isImplicit() && MO.isDef() &&
9674 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
9675 AArch64::GPR64RegClass.contains(MO.getReg())))
9676 MI.removeOperand(I);
9677
9678 // The SVE vector length can change when entering/leaving streaming mode.
9679 // FPMR is set to 0 when entering/leaving streaming mode.
9680 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
9681 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
9682 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9683 /*IsImplicit=*/true));
9684 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
9685 /*IsImplicit=*/true));
9686 MI.addOperand(MachineOperand::CreateReg(AArch64::FPMR, /*IsDef=*/true,
9687 /*IsImplicit=*/true));
9688 }
9689 }
9690
9691 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
9692 // have nothing to do with VG, were it not that they are used to materialise a
9693 // frame-address. If they contain a frame-index to a scalable vector, this
9694 // will likely require an ADDVL instruction to materialise the address, thus
9695 // reading VG.
9696 const MachineFunction &MF = *MI.getMF();
9697 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
9698 (MI.getOpcode() == AArch64::ADDXri ||
9699 MI.getOpcode() == AArch64::SUBXri)) {
9700 const MachineOperand &MO = MI.getOperand(1);
9701 if (MO.isFI() && MF.getFrameInfo().hasScalableStackID(MO.getIndex()))
9702 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9703 /*IsImplicit=*/true));
9704 }
9705}
9706
// Emit the node that switches streaming mode on (Enable == true) or off and
// return it; result 0 is the chain and result 1 is glue.
//
//  - Chain / InGlue: incoming chain and optional glue for the new node.
//  - Condition: when not AArch64SME::Always, the conditional opcodes
//    (COND_SMSTART / COND_SMSTOP) are used and the saved PStateSM register
//    is passed as an extra operand.
//  - InsertVectorLengthCheck: adds a CHECK_MATCHING_VL node before an
//    SMSTART, or after an SMSTOP (in which case that check node is returned).
//
// NOTE(review): the listing's embedded numbering skips 9707, 9710 and 9711,
// so the return type, the qualified function name and the declaration of
// `FuncInfo` are not visible here. The name above is taken from the calls in
// LowerFormalArguments — confirm against the real source.
9708 SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
9709 unsigned Condition, bool InsertVectorLengthCheck) const {
9712 FuncInfo->setHasStreamingModeChanges(true);
9713
// Helper: build a CHECK_MATCHING_VL node on the given chain, with the glue
// appended as an operand only when one is provided.
9714 auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
9715 SmallVector<SDValue, 2> Ops = {Chain};
9716 if (InGlue)
9717 Ops.push_back(InGlue);
9718 return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
9719 DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9720 };
9721
9722 if (InsertVectorLengthCheck && Enable) {
9723 // Non-streaming -> Streaming
9724 // Insert vector length check before smstart
9725 SDValue CheckVL = GetCheckVL(Chain, InGlue);
9726 Chain = CheckVL.getValue(0);
9727 InGlue = CheckVL.getValue(1);
9728 }
9729
// Operand list so far: chain, then the SVCR field selector (always SVCRSM).
9730 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9731 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
9732 SDValue MSROp =
9733 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
9734 SmallVector<SDValue> Ops = {Chain, MSROp};
9735 unsigned Opcode;
9736 if (Condition != AArch64SME::Always) {
9737 Register PStateReg = FuncInfo->getPStateSMReg();
9738 assert(PStateReg.isValid() && "PStateSM Register is invalid");
9739 SDValue PStateSM =
9740 DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
9741 // Use chain and glue from the CopyFromReg.
9742 Ops[0] = PStateSM.getValue(1);
9743 InGlue = PStateSM.getValue(2);
9744 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
9745 Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
9746 Ops.push_back(ConditionOp);
9747 Ops.push_back(PStateSM);
9748 } else {
9749 Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
9750 }
// The register mask follows the mode-specific operands; glue, if any, is last.
9751 Ops.push_back(RegMask);
9752
9753 if (InGlue)
9754 Ops.push_back(InGlue);
9755
9756 SDValue SMChange =
9757 DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9758
9759 if (!InsertVectorLengthCheck || Enable)
9760 return SMChange;
9761
9762 // Streaming -> Non-streaming
9763 // Insert vector length check after smstop since we cannot read VL
9764 // in streaming mode
9765 return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
9766}
9767
// Picks the AArch64SME condition under which the streaming mode is toggled for
// a call described by CallAttrs.
//
// The toggle is unconditional (AArch64SME::Always) unless the caller has a
// streaming-compatible interface and no streaming body. Otherwise the result
// depends on whether the callee has a non-streaming or a streaming interface;
// any other combination is treated as unreachable.
//
// NOTE(review): this listing skips source lines 9768-9769 (the signature) and
// 9774 / 9776 (the values returned by the two callee checks below) -- confirm
// the returned conditions against the full file.
9770 if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
9771 CallAttrs.caller().hasStreamingBody())
9772 return AArch64SME::Always;
9773 if (CallAttrs.callee().hasNonStreamingInterface())
9775 if (CallAttrs.callee().hasStreamingInterface())
9777
9778 llvm_unreachable("Unsupported attributes");
9779}
9780
9781 /// Check whether a stack argument requires lowering in a tail call.
///
/// Returns false only when \p Arg (after looking through bit-preserving
/// wrapper nodes) is a load whose base pointer is a frame index referring to an
/// immutable object located at \p CallOffset, and whose loaded size matches the
/// size of the value type in \p VA. In every other case, including arguments
/// flagged zext/sext, the function returns true.
///
/// NOTE(review): this listing skips source line 9782, which presumably holds
/// the start of the signature (including the MF parameter used below).
9783 const CCValAssign &VA, SDValue Arg,
9784 ISD::ArgFlagsTy Flags, int CallOffset) {
9785 // FIXME: We should be able to handle this case, but it's not clear how to.
9786 if (Flags.isZExt() || Flags.isSExt())
9787 return true;
9788
9789 for (;;) {
9790 // Look through nodes that don't alter the bits of the incoming value.
9791 unsigned Op = Arg.getOpcode();
9792 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
9793 Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
9794 Arg = Arg.getOperand(0);
9795 continue;
9796 }
9797 break;
9798 }
9799
9800 // If the argument is a load from the same immutable stack slot, we can reuse
9801 // it.
9802 if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
9803 if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
9804 const MachineFrameInfo &MFI = MF.getFrameInfo();
9805 int FI = FINode->getIndex();
// The slot must be immutable, sit at exactly the offset the callee expects,
// and hold a value of the same size as the one being passed.
9806 if (!MFI.isImmutableObjectIndex(FI))
9807 return true;
9808 if (CallOffset != MFI.getObjectOffset(FI))
9809 return true;
9810 uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
9811 if (SizeInBits != VA.getValVT().getSizeInBits())
9812 return true;
9813 return false;
9814 }
9815 }
9816
// Anything that is not a load from a frame index must be lowered.
9817 return true;
9818}
9819
9820 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
9821 /// and add input and output parameter nodes.
///
/// If CLI.IsTailCall is set, eligibility is re-checked and the flag may be
/// cleared; a call site marked musttail that cannot be tail-called is a fatal
/// error. For tail calls the TC_RETURN-style node is returned directly.
/// Otherwise the call results are copied out through LowerCallResult (filling
/// \p InVals) and the resulting value is returned.
///
/// When the call requires a streaming-mode change, the mode is toggled before
/// the argument copies and toggled back after the results have been copied.
//
// NOTE(review): this listing drops several source lines (visible as gaps in
// the embedded numbering, e.g. 9853, 9870, 9888, 9893, 10456); consult the
// full file before relying on the exact statement sequence.
9822SDValue
9823AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
9824 SmallVectorImpl<SDValue> &InVals) const {
9825 SelectionDAG &DAG = CLI.DAG;
9826 SDLoc &DL = CLI.DL;
9827 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9828 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9829 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9830 SDValue Chain = CLI.Chain;
9831 SDValue Callee = CLI.Callee;
9832 bool &IsTailCall = CLI.IsTailCall;
9833 CallingConv::ID &CallConv = CLI.CallConv;
9834 bool IsVarArg = CLI.IsVarArg;
9835 const CallBase *CB = CLI.CB;
9836
9837 MachineFunction &MF = DAG.getMachineFunction();
9838 MachineFunction::CallSiteInfo CSInfo;
9839 bool IsThisReturn = false;
9840
9841 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9842 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9843 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
9844 bool IsSibCall = false;
9845 bool GuardWithBTI = false;
9846
9847 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
9848 !Subtarget->noBTIAtReturnTwice()) {
9849 GuardWithBTI = FuncInfo->branchTargetEnforcement();
9850 }
9851
9852 // Analyze operands of the call, assigning locations to each operand.
9854 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
9855
9856 if (IsVarArg) {
9857 unsigned NumArgs = Outs.size();
9858
9859 for (unsigned i = 0; i != NumArgs; ++i) {
9860 if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
9861 report_fatal_error("Passing SVE types to variadic functions is "
9862 "currently not supported");
9863 }
9864 }
9865
9866 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9867
9868 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9869 // Assign locations to each value returned by this call.
9871 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
9872 *DAG.getContext());
9873 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
9874
9875 // Set type id for call site info.
9876 setTypeIdForCallsiteInfo(CB, MF, CSInfo);
9877
9878 // Check callee args/returns for SVE registers and set calling convention
9879 // accordingly.
9880 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
9881 auto HasSVERegLoc = [](CCValAssign &Loc) {
9882 if (!Loc.isRegLoc())
9883 return false;
9884 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
9885 AArch64::PPRRegClass.contains(Loc.getLocReg());
9886 };
9887 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
9889 }
9890
9891 // Determine whether we need any streaming mode changes.
9892 SMECallAttrs CallAttrs =
9894
// Select the marker node (if any) that is glued to CALLSEQ_START further down
// to describe how this call interacts with ZA/ZT0 state.
9895 std::optional<unsigned> ZAMarkerNode;
9896 if (CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState())
9897 ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
9898 else if (CallAttrs.requiresPreservingZT0())
9899 ZAMarkerNode = AArch64ISD::REQUIRES_ZT0_SAVE;
9900 else if (CallAttrs.caller().hasZAState() || CallAttrs.caller().hasZT0State())
9901 ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
9902
9903 if (IsTailCall) {
9904 // Check if it's really possible to do a tail call.
9905 IsTailCall = isEligibleForTailCallOptimization(CLI);
9906
9907 // A sibling call is one where we're under the usual C ABI and not planning
9908 // to change that but can still do a tail call:
9909 if (!ZAMarkerNode && !TailCallOpt && IsTailCall &&
9910 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
9911 IsSibCall = true;
9912
9913 if (IsTailCall)
9914 ++NumTailCalls;
9915 }
9916
9917 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9918 report_fatal_error("failed to perform tail call elimination on a call "
9919 "site marked musttail");
9920
9921 // Get a count of how many bytes are to be pushed on the stack.
9922 unsigned NumBytes = CCInfo.getStackSize();
9923
9924 if (IsSibCall) {
9925 // Since we're not changing the ABI to make this a tail call, the memory
9926 // operands are already available in the caller's incoming argument space.
9927 NumBytes = 0;
9928 }
9929
9930 // FPDiff is the byte offset of the call's argument area from the callee's.
9931 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9932 // by this amount for a tail call. In a sibling call it must be 0 because the
9933 // caller will deallocate the entire stack and the callee still expects its
9934 // arguments to begin at SP+0. Completely unused for non-tail calls.
9935 int FPDiff = 0;
9936
9937 if (IsTailCall && !IsSibCall) {
9938 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9939
9940 // Since callee will pop argument stack as a tail call, we must keep the
9941 // popped size 16-byte aligned.
9942 NumBytes = alignTo(NumBytes, 16);
9943
9944 // FPDiff will be negative if this tail call requires more space than we
9945 // would automatically have in our incoming argument space. Positive if we
9946 // can actually shrink the stack.
9947 FPDiff = NumReusableBytes - NumBytes;
9948
9949 // Update the required reserved area if this is the tail call requiring the
9950 // most argument stack space.
9951 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9952 FuncInfo->setTailCallReservedStack(-FPDiff);
9953
9954 // The stack pointer must be 16-byte aligned at all times it's used for a
9955 // memory operation, which in practice means at *all* times and in
9956 // particular across call boundaries. Therefore our own arguments started at
9957 // a 16-byte aligned SP and the delta applied for the tail call should
9958 // satisfy the same constraint.
9959 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9960 }
9961
// Appends "call from '<caller>' to '<callee>'" to an optimization remark,
// falling back to "unknown callee" when the callee cannot be named.
9962 auto DescribeCallsite =
9963 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9964 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9965 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9966 R << ore::NV("Callee", ES->getSymbol());
9967 else if (CLI.CB && CLI.CB->getCalledFunction())
9968 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9969 else
9970 R << "unknown callee";
9971 R << "'";
9972 return R;
9973 };
9974
9975 bool RequiresSMChange = CallAttrs.requiresSMChange();
9976 if (RequiresSMChange) {
9977 OptimizationRemarkEmitter ORE(&MF.getFunction());
9978 ORE.emit([&]() {
9979 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9980 CLI.CB)
9981 : OptimizationRemarkAnalysis("sme", "SMETransition",
9982 &MF.getFunction());
9983 DescribeCallsite(R) << " requires a streaming mode transition";
9984 return R;
9985 });
9986 }
9987
9988 // Adjust the stack pointer for the new arguments... and mark ZA uses.
9989 // These operations are automatically eliminated by the prolog/epilog pass
9990 assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START");
9991 if (!IsSibCall) {
9992 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9993 if (ZAMarkerNode) {
9994 // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to, simply
9995 // using a chain can result in incorrect scheduling. The markers refer to
9996 // the position just before the CALLSEQ_START (though occur after as
9997 // CALLSEQ_START lacks in-glue).
9998 Chain =
9999 DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other, MVT::Glue),
10000 {Chain, Chain.getValue(1)});
10001 }
10002 }
10003
10004 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
10006
10008 SmallSet<unsigned, 8> RegsUsed;
10009 SmallVector<SDValue, 8> MemOpChains;
10010 auto PtrVT = getPointerTy(DAG.getDataLayout());
10011
// For variadic musttail calls, pass along the registers recorded as forwarded
// musttail parameters in the function info.
10012 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
10013 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
10014 for (const auto &F : Forwards) {
10015 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
10016 RegsToPass.emplace_back(F.PReg, Val);
10017 }
10018 }
10019
10020 // Walk the register/memloc assignments, inserting copies/loads.
// ExtraArgLocs counts the outgoing values that were consumed as additional
// parts of an indirectly-passed tuple, so that ArgLocs stays in step with i.
10021 unsigned ExtraArgLocs = 0;
10022 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
10023 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
10024 SDValue Arg = OutVals[i];
10025 ISD::ArgFlagsTy Flags = Outs[i].Flags;
10026
10027 // Promote the value if needed.
10028 switch (VA.getLocInfo()) {
10029 default:
10030 llvm_unreachable("Unknown loc info!");
10031 case CCValAssign::Full:
10032 break;
10033 case CCValAssign::SExt:
10034 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
10035 break;
10036 case CCValAssign::ZExt:
10037 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
10038 break;
10039 case CCValAssign::AExt:
10040 if (Outs[i].ArgVT == MVT::i1) {
10041 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
10042 //
10043 // Check if we actually have to do this, because the value may
10044 // already be zero-extended.
10045 //
10046 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
10047 // and rely on DAGCombiner to fold this, because the following
10048 // (anyext i32) is combined with (zext i8) in DAG.getNode:
10049 //
10050 // (ext (zext x)) -> (zext x)
10051 //
10052 // This will give us (zext i32), which we cannot remove, so
10053 // try to check this beforehand.
10054 if (!checkZExtBool(Arg, DAG)) {
10055 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
10056 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
10057 }
10058 }
10059 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
10060 break;
10062 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
10063 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
10064 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
10065 DAG.getConstant(32, DL, VA.getLocVT()));
10066 break;
10067 case CCValAssign::BCvt:
10068 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
10069 break;
10070 case CCValAssign::Trunc:
10071 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10072 break;
10073 case CCValAssign::FPExt:
10074 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
10075 break;
10077 bool isScalable = VA.getValVT().isScalableVT();
10078 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
10079 "Indirect arguments should be scalable on most subtargets");
10080
10081 TypeSize StoreSize = VA.getValVT().getStoreSize();
10082 TypeSize PartSize = StoreSize;
10083 unsigned NumParts = 1;
10084 if (Outs[i].Flags.isInConsecutiveRegs()) {
10085 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
10086 ++NumParts;
10087 StoreSize *= NumParts;
10088 }
10089
10090 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
10091 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
10092 MachineFrameInfo &MFI = MF.getFrameInfo();
10093 int FI =
10094 MFI.CreateStackObject(StoreSize.getKnownMinValue(), Alignment, false);
10095 if (isScalable) {
10096 bool IsPred = VA.getValVT() == MVT::aarch64svcount ||
10097 VA.getValVT().getVectorElementType() == MVT::i1;
10100 }
10101
10102 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
10103 SDValue Ptr = DAG.getFrameIndex(
10105 SDValue SpillSlot = Ptr;
10106
10107 // Ensure we generate all stores for each tuple part, whilst updating the
10108 // pointer after each store correctly using vscale.
10109 while (NumParts) {
10110 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
10111 MemOpChains.push_back(Store);
10112
10113 NumParts--;
10114 if (NumParts > 0) {
10115 SDValue BytesIncrement =
10116 DAG.getTypeSize(DL, Ptr.getValueType(), PartSize);
10117 MPI = MachinePointerInfo(MPI.getAddrSpace());
10118 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
10119 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
10120 ExtraArgLocs++;
10121 i++;
10122 }
10123 }
10124
// The argument actually passed is the address of the spill slot.
10125 Arg = SpillSlot;
10126 break;
10127 }
10128
10129 if (VA.isRegLoc()) {
10130 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
10131 Outs[0].VT == MVT::i64) {
10132 assert(VA.getLocVT() == MVT::i64 &&
10133 "unexpected calling convention register assignment");
10134 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
10135 "unexpected use of 'returned'");
10136 IsThisReturn = true;
10137 }
10138 if (RegsUsed.count(VA.getLocReg())) {
10139 // If this register has already been used then we're trying to pack
10140 // parts of an [N x i32] into an X-register. The extension type will
10141 // take care of putting the two halves in the right place but we have to
10142 // combine them.
10143 SDValue &Bits =
10144 llvm::find_if(RegsToPass,
10145 [=](const std::pair<unsigned, SDValue> &Elt) {
10146 return Elt.first == VA.getLocReg();
10147 })
10148 ->second;
10149 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10150 // Call site info is used for function's parameter entry value
10151 // tracking. For now we track only simple cases when parameter
10152 // is transferred through whole register.
10154 [&VA](MachineFunction::ArgRegPair ArgReg) {
10155 return ArgReg.Reg == VA.getLocReg();
10156 });
10157 } else {
10158 // Add an extra level of indirection for streaming mode changes by
10159 // using a pseudo copy node that cannot be rematerialised between a
10160 // smstart/smstop and the call by the simple register coalescer.
10161 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
10162 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10163 DAG.getVTList(Arg.getValueType(), MVT::Glue), Arg);
10164 RegsToPass.emplace_back(VA.getLocReg(), Arg);
10165 RegsUsed.insert(VA.getLocReg());
10166 const TargetOptions &Options = DAG.getTarget().Options;
10167 if (Options.EmitCallSiteInfo)
10168 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
10169 }
10170 } else {
10171 assert(VA.isMemLoc());
10172
10173 SDValue DstAddr;
10174 MachinePointerInfo DstInfo;
10175
10176 // FIXME: This works on big-endian for composite byvals, which are the
10177 // common case. It should also work for fundamental types too.
10178 uint32_t BEAlign = 0;
10179 unsigned OpSize;
10180 if (VA.getLocInfo() == CCValAssign::Indirect ||
10182 OpSize = VA.getLocVT().getFixedSizeInBits();
10183 else
10184 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
10185 : VA.getValVT().getSizeInBits();
// Convert the size from bits to bytes, rounding up.
10186 OpSize = (OpSize + 7) / 8;
10187 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
10188 !Flags.isInConsecutiveRegs()) {
10189 if (OpSize < 8)
10190 BEAlign = 8 - OpSize;
10191 }
10192 unsigned LocMemOffset = VA.getLocMemOffset();
10193 int32_t Offset = LocMemOffset + BEAlign;
10194
10195 if (IsTailCall) {
10196 // When the frame pointer is perfectly aligned for the tail call and the
10197 // same stack argument is passed down intact, we can reuse it.
10198 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
10199 continue;
10200
10201 Offset = Offset + FPDiff;
10202 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
10203
10204 DstAddr = DAG.getFrameIndex(FI, PtrVT);
10205 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
10206
10207 // Make sure any stack arguments overlapping with where we're storing
10208 // are loaded before this eventual operation. Otherwise they'll be
10209 // clobbered.
10210 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
10211 } else {
10212 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
10213
10214 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
10215 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
10216 }
10217
10218 if (Outs[i].Flags.isByVal()) {
10219 SDValue SizeNode =
10220 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
10221 SDValue Cpy = DAG.getMemcpy(
10222 Chain, DL, DstAddr, Arg, SizeNode,
10223 Outs[i].Flags.getNonZeroByValAlign(),
10224 /*isVol = */ false, /*AlwaysInline = */ false,
10225 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
10226
10227 MemOpChains.push_back(Cpy);
10228 } else {
10229 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
10230 // promoted to a legal register type i32, we should truncate Arg back to
10231 // i1/i8/i16.
10232 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
10233 VA.getValVT() == MVT::i16)
10234 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
10235
10236 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
10237 MemOpChains.push_back(Store);
10238 }
10239 }
10240 }
10241
10242 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
10243 !(CLI.CB && CLI.CB->isMustTailCall())) {
10244 SDValue ParamPtr = StackPtr;
10245 if (IsTailCall) {
10246 // Create a dummy object at the top of the stack that can be used to get
10247 // the SP after the epilogue
10248 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
10249 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
10250 }
10251
10252 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
10253 // describing the argument list. x4 contains the address of the
10254 // first stack parameter. x5 contains the size in bytes of all parameters
10255 // passed on the stack.
10256 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
10257 RegsToPass.emplace_back(AArch64::X5,
10258 DAG.getConstant(NumBytes, DL, MVT::i64));
10259 }
10260
// Join all argument stores/copies into a single chain.
10261 if (!MemOpChains.empty())
10262 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
10263
// Toggle the streaming mode before copying the arguments into registers; the
// direction follows whether the callee has a streaming interface.
10264 SDValue InGlue;
10265 if (RequiresSMChange) {
10266 bool InsertVectorLengthCheck =
10268 Chain = changeStreamingMode(
10269 DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
10270 getSMToggleCondition(CallAttrs), InsertVectorLengthCheck);
10271 InGlue = Chain.getValue(1);
10272 }
10273
10274 // Build a sequence of copy-to-reg nodes chained together with token chain
10275 // and glue operands which copy the outgoing args into the appropriate regs.
10276 for (auto &RegToPass : RegsToPass) {
10277 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
10278 RegToPass.second, InGlue);
10279 InGlue = Chain.getValue(1);
10280 }
10281
10282 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
10283 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
10284 // node so that legalize doesn't hack it.
10285 const GlobalValue *CalledGlobal = nullptr;
10286 unsigned OpFlags = 0;
10287 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
10288 CalledGlobal = G->getGlobal();
10289 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
10291 if (OpFlags & AArch64II::MO_GOT) {
10292 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
10293 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
10294 } else if (!CLI.PAI || !IsTailCall) {
10295 const GlobalValue *GV = G->getGlobal();
10296 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
10297 }
10298 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
10299 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
10300 Subtarget->isTargetMachO()) ||
10302 const char *Sym = S->getSymbol();
10303 if (UseGot) {
10305 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
10306 } else {
10307 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
10308 }
10309 }
10310
10311 // We don't usually want to end the call-sequence here because we would tidy
10312 // the frame up *after* the call, however in the ABI-changing tail-call case
10313 // we've carefully laid out the parameters so that when sp is reset they'll be
10314 // in the correct location.
10315 if (IsTailCall && !IsSibCall) {
10316 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
10317 InGlue = Chain.getValue(1);
10318 }
10319
10320 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
10321
10322 std::vector<SDValue> Ops;
10323 Ops.push_back(Chain);
10324 Ops.push_back(Callee);
10325
10326 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
10327 // be expanded to the call, directly followed by a special marker sequence and
10328 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
10329 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
10330 assert(!IsTailCall &&
10331 "tail calls cannot be marked with clang.arc.attachedcall");
10332 Opc = AArch64ISD::CALL_RVMARKER;
10333
10334 // Add a target global address for the retainRV/claimRV runtime function
10335 // just before the call target.
10336 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
10337 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
10338 Ops.insert(Ops.begin() + 1, GA);
10339
10340 // We may or may not need to emit both the marker and the retain/claim call.
10341 // Tell the pseudo expansion using an additional boolean op.
10342 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CLI.CB);
10343 SDValue DoEmitMarker =
10344 DAG.getTargetConstant(ShouldEmitMarker, DL, MVT::i32);
10345 Ops.insert(Ops.begin() + 2, DoEmitMarker);
10346 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10347 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
10348 } else if (GuardWithBTI) {
10349 Opc = AArch64ISD::CALL_BTI;
10350 }
10351
10352 if (IsTailCall) {
10353 // Each tail call may have to adjust the stack by a different amount, so
10354 // this information must travel along with the operation for eventual
10355 // consumption by emitEpilogue.
10356 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
10357 }
10358
// Pointer-authenticated calls switch to the AUTH_* opcodes and carry the key
// plus the integer and address discriminators as extra operands.
10359 if (CLI.PAI) {
10360 const uint64_t Key = CLI.PAI->Key;
10362 "Invalid auth call key");
10363
10364 // Split the discriminator into address/integer components.
10365 SDValue AddrDisc, IntDisc;
10366 std::tie(IntDisc, AddrDisc) =
10367 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
10368
10369 if (Opc == AArch64ISD::CALL_RVMARKER)
10370 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
10371 else
10372 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
10373 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
10374 Ops.push_back(IntDisc);
10375 Ops.push_back(AddrDisc);
10376 }
10377
10378 // Add argument registers to the end of the list so that they are known live
10379 // into the call.
10380 for (auto &RegToPass : RegsToPass)
10381 Ops.push_back(DAG.getRegister(RegToPass.first,
10382 RegToPass.second.getValueType()));
10383
10384 // Add a register mask operand representing the call-preserved registers.
10385 const uint32_t *Mask;
10386 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10387 if (IsThisReturn) {
10388 // For 'this' returns, use the X0-preserving mask if applicable
10389 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
10390 if (!Mask) {
10391 IsThisReturn = false;
10392 Mask = TRI->getCallPreservedMask(MF, CallConv);
10393 }
10394 } else
10395 Mask = TRI->getCallPreservedMask(MF, CallConv);
10396
10397 if (Subtarget->hasCustomCallingConv())
10398 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
10399
10400 if (TRI->isAnyArgRegReserved(MF))
10401 TRI->emitReservedArgRegCallError(MF);
10402
10403 assert(Mask && "Missing call preserved mask for calling convention");
10404 Ops.push_back(DAG.getRegisterMask(Mask));
10405
10406 if (InGlue.getNode())
10407 Ops.push_back(InGlue);
10408
10409 if (CLI.DeactivationSymbol)
10410 Ops.push_back(DAG.getDeactivationSymbol(CLI.DeactivationSymbol));
10411
10412 // If we're doing a tail call, use a TC_RETURN here rather than an
10413 // actual call instruction.
10414 if (IsTailCall) {
10416 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
10417 if (IsCFICall)
10418 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10419
10420 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
10421 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
10422 if (CalledGlobal &&
10423 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10424 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
10425 return Ret;
10426 }
10427
10428 // Returns a chain and a glue for retval copy to use.
10429 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
10430 if (IsCFICall)
10431 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10432
10433 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
10434 InGlue = Chain.getValue(1);
10435 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
10436 if (CalledGlobal &&
10437 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10438 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
10439
// If the callee restores the stack itself, it pops the 16-byte aligned
// argument size; otherwise nothing is popped by the callee.
10440 uint64_t CalleePopBytes =
10441 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
10442
10443 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
10444 InGlue = Chain.getValue(1);
10445
10446 // Handle result values, copying them out of physregs into vregs that we
10447 // return.
10448 SDValue Result = LowerCallResult(
10449 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
10450 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
10451
// When the call produces values, continue with the last value of Result as the
// incoming glue.
10452 if (!Ins.empty())
10453 InGlue = Result.getValue(Result->getNumValues() - 1);
10454
// Toggle the streaming mode back after the call; note the Enable argument is
// the inverse of the one used before the call.
// NOTE(review): source line 10456 is missing from this listing; presumably it
// reads `Result = changeStreamingMode(` -- confirm against the full file.
10455 if (RequiresSMChange) {
10457 DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
10458 getSMToggleCondition(CallAttrs));
10459 }
10460
10461 if (RequiresSMChange) {
10462 for (unsigned I = 0; I < InVals.size(); ++I) {
10463 // The smstart/smstop is chained as part of the call, but when the
10464 // resulting chain is discarded (which happens when the call is not part
10465 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
10466 // smstart/smstop is chained to the result value. We can do that by doing
10467 // a vreg -> vreg copy.
10470 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
10471 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
10472 InVals[I].getValueType());
10473 }
10474 }
10475
// Swift argument attributes are diagnosed as unsupported with the
// preserve_none calling convention; only the first offender is reported.
10476 if (CallConv == CallingConv::PreserveNone) {
10477 for (const ISD::OutputArg &O : Outs) {
10478 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
10479 O.Flags.isSwiftAsync()) {
10480 MachineFunction &MF = DAG.getMachineFunction();
10481 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10482 MF.getFunction(),
10483 "Swift attributes can't be used with preserve_none",
10484 DL.getDebugLoc()));
10485 break;
10486 }
10487 }
10488 }
10489
10490 return Result;
10491}
10492
/// Returns the result of CCState::CheckReturn for \c Outs using the
/// return-value assignment function selected for \p CallConv.
///
/// NOTE(review): this listing skips source lines 9495 and 9498; the remaining
/// parameters (presumably Outs and Context) and the RVLocs declaration are
/// expected there -- confirm against the full file.
9493bool AArch64TargetLowering::CanLowerReturn(
9494 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
9496 const Type *RetTy) const {
9497 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9499 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
9500 return CCInfo.CheckReturn(Outs, RetCC);
9501}
10502
10503SDValue
10504AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
10505 bool isVarArg,
10507 const SmallVectorImpl<SDValue> &OutVals,
10508 const SDLoc &DL, SelectionDAG &DAG) const {
10509 auto &MF = DAG.getMachineFunction();
10510 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10511
10512 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10514 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
10515 CCInfo.AnalyzeReturn(Outs, RetCC);
10516
10517 // Copy the result values into the output registers.
10518 SDValue Glue;
10520 SmallSet<unsigned, 4> RegsUsed;
10521 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
10522 ++i, ++realRVLocIdx) {
10523 CCValAssign &VA = RVLocs[i];
10524 assert(VA.isRegLoc() && "Can only return in registers!");
10525 SDValue Arg = OutVals[realRVLocIdx];
10526
10527 switch (VA.getLocInfo()) {
10528 default:
10529 llvm_unreachable("Unknown loc info!");
10530 case CCValAssign::Full:
10531 if (Outs[i].ArgVT == MVT::i1) {
10532 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
10533 // value. This is strictly redundant on Darwin (which uses "zeroext
10534 // i1"), but will be optimised out before ISel.
10535 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
10536 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
10537 }
10538 break;
10539 case CCValAssign::BCvt:
10540 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
10541 break;
10542 case CCValAssign::AExt:
10543 case CCValAssign::ZExt:
10544 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10545 break;
10547 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
10548 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10549 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
10550 DAG.getConstant(32, DL, VA.getLocVT()));
10551 break;
10552 }
10553
10554 if (RegsUsed.count(VA.getLocReg())) {
10555 SDValue &Bits =
10556 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
10557 return Elt.first == VA.getLocReg();
10558 })->second;
10559 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10560 } else {
10561 RetVals.emplace_back(VA.getLocReg(), Arg);
10562 RegsUsed.insert(VA.getLocReg());
10563 }
10564 }
10565
10566 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10567
10568 // Emit SMSTOP before returning from a locally streaming function
10569 SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
10570 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
10571 if (FuncAttrs.hasStreamingCompatibleInterface())
10572 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10573 /*Glue*/ SDValue(),
10575 else
10576 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10577 /*Glue*/ SDValue(), AArch64SME::Always);
10578 Glue = Chain.getValue(1);
10579 }
10580
10581 SmallVector<SDValue, 4> RetOps(1, Chain);
10582 for (auto &RetVal : RetVals) {
10583 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
10584 isPassedInFPR(RetVal.second.getValueType()))
10585 RetVal.second =
10586 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10587 DAG.getVTList(RetVal.second.getValueType(), MVT::Glue),
10588 RetVal.second);
10589 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
10590 Glue = Chain.getValue(1);
10591 RetOps.push_back(
10592 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
10593 }
10594
10595 // Windows AArch64 ABIs require that for returning structs by value we copy
10596 // the sret argument into X0 for the return.
10597 // We saved the argument into a virtual register in the entry block,
10598 // so now we copy the value out and into X0.
10599 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
10600 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
10602
10603 unsigned RetValReg = AArch64::X0;
10604 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
10605 RetValReg = AArch64::X8;
10606 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
10607 Glue = Chain.getValue(1);
10608
10609 RetOps.push_back(
10610 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
10611 }
10612
10613 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
10614 if (I) {
10615 for (; *I; ++I) {
10616 if (AArch64::GPR64RegClass.contains(*I))
10617 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
10618 else if (AArch64::FPR64RegClass.contains(*I))
10619 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
10620 else
10621 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
10622 }
10623 }
10624
10625 RetOps[0] = Chain; // Update chain.
10626
10627 // Add the glue if we have it.
10628 if (Glue.getNode())
10629 RetOps.push_back(Glue);
10630
10631 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10632 // ARM64EC entry thunks use a special return sequence: instead of a regular
10633 // "ret" instruction, they need to explicitly call the emulator.
10634 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10635 SDValue Arm64ECRetDest =
10636 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
10637 Arm64ECRetDest =
10638 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
10639 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
10640 MachinePointerInfo());
10641 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
10642 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
10643 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
10644 }
10645
10646 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
10647}
10648
10649//===----------------------------------------------------------------------===//
10650// Other Lowering Code
10651//===----------------------------------------------------------------------===//
10652
10653SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
10654 SelectionDAG &DAG,
10655 unsigned Flag) const {
10656 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
10657 N->getOffset(), Flag);
10658}
10659
10660SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
10661 SelectionDAG &DAG,
10662 unsigned Flag) const {
10663 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
10664}
10665
10666SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
10667 SelectionDAG &DAG,
10668 unsigned Flag) const {
10669 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
10670 N->getOffset(), Flag);
10671}
10672
10673SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
10674 SelectionDAG &DAG,
10675 unsigned Flag) const {
10676 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
10677}
10678
10679SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
10680 SelectionDAG &DAG,
10681 unsigned Flag) const {
10682 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
10683}
10684
10685// (loadGOT sym)
10686template <class NodeTy>
10687SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
10688 unsigned Flags) const {
10689 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
10690 SDLoc DL(N);
10691 EVT Ty = getPointerTy(DAG.getDataLayout());
10692 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
10693 // FIXME: Once remat is capable of dealing with instructions with register
10694 // operands, expand this into two nodes instead of using a wrapper node.
10695 if (DAG.getMachineFunction()
10696 .getInfo<AArch64FunctionInfo>()
10697 ->hasELFSignedGOT())
10698 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
10699 0);
10700 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
10701}
10702
10703// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
10704template <class NodeTy>
10705SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
10706 unsigned Flags) const {
10707 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
10708 SDLoc DL(N);
10709 EVT Ty = getPointerTy(DAG.getDataLayout());
10710 const unsigned char MO_NC = AArch64II::MO_NC;
10711 return DAG.getNode(
10712 AArch64ISD::WrapperLarge, DL, Ty,
10713 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
10714 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
10715 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
10716 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
10717}
10718
10719// (addlow (adrp %hi(sym)) %lo(sym))
10720template <class NodeTy>
10721SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
10722 unsigned Flags) const {
10723 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
10724 SDLoc DL(N);
10725 EVT Ty = getPointerTy(DAG.getDataLayout());
10726 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
10727 SDValue Lo = getTargetNode(N, Ty, DAG,
10729 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
10730 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
10731}
10732
10733// (adr sym)
10734template <class NodeTy>
10735SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
10736 unsigned Flags) const {
10737 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
10738 SDLoc DL(N);
10739 EVT Ty = getPointerTy(DAG.getDataLayout());
10740 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
10741 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
10742}
10743
// Lower a GlobalAddress node. The reference is classified first; a GOT
// reference becomes a GOT load, anything else is materialized with one of the
// getAddrLarge / getAddrTiny / getAddr sequences.
//
// NOTE(review): this listing has lost source lines (the embedded numbering
// skips 10751, 10760-10762, 10771 and 10773), so several statements below are
// visibly truncated. Restore them from the upstream file before relying on
// this text.
10744SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
10745 SelectionDAG &DAG) const {
10746 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
10747 const GlobalValue *GV = GN->getGlobal();
// Operand flags chosen for this reference; they select the sequence below.
10748 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
10749
10750 if (OpFlags != AArch64II::MO_NO_FLAG)
// NOTE(review): the head of the guarded statement (line 10751) is missing; only
// its trailing message string survives on the next line.
10752 "unexpected offset in global node");
10753
10754 // This also catches the large code model case for Darwin, and tiny code
10755 // model with got relocations.
10756 if ((OpFlags & AArch64II::MO_GOT) != 0) {
10757 return getGOT(GN, DAG, OpFlags);
10758 }
10759
// NOTE(review): lines 10760-10762 are missing. The declaration of `Result` and
// the first condition of this if/else chain are not visible here.
10763 Result = getAddrLarge(GN, DAG, OpFlags);
10764 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
10765 Result = getAddrTiny(GN, DAG, OpFlags);
10766 } else {
10767 Result = getAddr(GN, DAG, OpFlags);
10768 }
10769 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10770 SDLoc DL(GN);
// NOTE(review): line 10771 (presumably the condition guarding this extra load)
// and line 10773 (the last argument of getLoad) are missing.
10772 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
10774 return Result;
10775}
10776
10777/// Convert a TLS address reference into the correct sequence of loads
10778/// and calls to compute the variable's address (for Darwin, currently) and
10779/// return an SDValue containing the final node.
10780
10781/// Darwin only has one TLS scheme which must be capable of dealing with the
10782/// fully general situation, in the worst case. This means:
10783/// + "extern __thread" declaration.
10784/// + Defined in a possibly unknown dynamic library.
10785///
10786/// The general system is that each __thread variable has a [3 x i64] descriptor
10787/// which contains information used by the runtime to calculate the address. The
10788/// only part of this the compiler needs to know about is the first xword, which
10789/// contains a function pointer that must be called with the address of the
10790/// entire descriptor in "x0".
10791///
10792/// Since this descriptor may be in a different unit, in general even the
10793/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10794/// is:
10795/// adrp x0, _var@TLVPPAGE
10796/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10797/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10798/// ; the function pointer
10799/// blr x1 ; Uses descriptor address in x0
10800/// ; Address of _var is now in x0.
10801///
10802/// If the address of _var's descriptor *is* known to the linker, then it can
10803/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10804/// a slight efficiency gain.
10805SDValue
10806AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10807 SelectionDAG &DAG) const {
// NOTE(review): this listing has lost source lines inside this function (the
// embedded numbering skips 10825, 10827 and 10850); see the notes at each gap.
10808 assert(Subtarget->isTargetDarwin() &&
10809 "This function expects a Darwin target");
10810
10811 SDLoc DL(Op);
// PtrVT is the in-register pointer type and PtrMemVT the in-memory one; the
// loaded function pointer is converted from the latter to the former below.
10812 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10813 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10814 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10815
// Address of the variable's TLS descriptor, obtained through a LOADgot of the
// MO_TLS-tagged symbol.
10816 SDValue TLVPAddr =
10817 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10818 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10819
10820 // The first entry in the descriptor is a function pointer that we must call
10821 // to obtain the address of the variable.
10822 SDValue Chain = DAG.getEntryNode();
// NOTE(review): two arguments of this getLoad call are missing (line 10825
// before the alignment, line 10827 after it), so the call is incomplete as
// shown.
10823 SDValue FuncTLVGet = DAG.getLoad(
10824 PtrMemVT, DL, Chain, DescAddr,
10826 Align(PtrMemVT.getSizeInBits() / 8),
10828 Chain = FuncTLVGet.getValue(1);
10829
10830 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10831 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10832
// The lowering introduces a call, so the frame info is marked as adjusting the
// stack.
10833 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10834 MFI.setAdjustsStack(true);
10835
10836 // TLS calls preserve all registers except those that absolutely must be
10837 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10838 // silly).
10839 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10840 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10841 if (Subtarget->hasCustomCallingConv())
10842 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10843
10844 // Finally, we can make the call. This is just a degenerate version of a
10845 // normal AArch64 call node: x0 takes the address of the descriptor, and
10846 // returns the address of the variable in this thread.
10847 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10848
10849 unsigned Opcode = AArch64ISD::CALL;
// NOTE(review): line 10850 is missing. The declaration of the `Ops` operand
// list used below is not visible here.
10851 Ops.push_back(Chain);
10852 Ops.push_back(FuncTLVGet);
10853
10854 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10855 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10856 Opcode = AArch64ISD::AUTH_CALL;
10857 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10858 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10859 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10860 }
10861
// Remaining call operands: the X0 argument register, the preserved-register
// mask, and the glue produced by the CopyToReg above.
10862 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10863 Ops.push_back(DAG.getRegisterMask(Mask));
10864 Ops.push_back(Chain.getValue(1));
10865 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
// The result is read back from X0, glued to the call.
10866 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10867}
10868
10869/// Convert a thread-local variable reference into a sequence of instructions to
10870/// compute the variable's address for the local exec TLS model of ELF targets.
10871/// The sequence depends on the maximum TLS area size.
10872SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10873 SDValue ThreadBase,
10874 const SDLoc &DL,
10875 SelectionDAG &DAG) const {
10876 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10877 SDValue TPOff, Addr;
10878
10879 switch (DAG.getTarget().Options.TLSSize) {
10880 default:
10881 llvm_unreachable("Unexpected TLS size");
10882
10883 case 12: {
10884 // mrs x0, TPIDR_EL0
10885 // add x0, x0, :tprel_lo12:a
10887 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10888 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10889 Var,
10890 DAG.getTargetConstant(0, DL, MVT::i32)),
10891 0);
10892 }
10893
10894 case 24: {
10895 // mrs x0, TPIDR_EL0
10896 // add x0, x0, :tprel_hi12:a
10897 // add x0, x0, :tprel_lo12_nc:a
10898 SDValue HiVar = DAG.getTargetGlobalAddress(
10899 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10900 SDValue LoVar = DAG.getTargetGlobalAddress(
10901 GV, DL, PtrVT, 0,
10903 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10904 HiVar,
10905 DAG.getTargetConstant(0, DL, MVT::i32)),
10906 0);
10907 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10908 LoVar,
10909 DAG.getTargetConstant(0, DL, MVT::i32)),
10910 0);
10911 }
10912
10913 case 32: {
10914 // mrs x1, TPIDR_EL0
10915 // movz x0, #:tprel_g1:a
10916 // movk x0, #:tprel_g0_nc:a
10917 // add x0, x1, x0
10918 SDValue HiVar = DAG.getTargetGlobalAddress(
10919 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10920 SDValue LoVar = DAG.getTargetGlobalAddress(
10921 GV, DL, PtrVT, 0,
10923 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10924 DAG.getTargetConstant(16, DL, MVT::i32)),
10925 0);
10926 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10927 DAG.getTargetConstant(0, DL, MVT::i32)),
10928 0);
10929 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10930 }
10931
10932 case 48: {
10933 // mrs x1, TPIDR_EL0
10934 // movz x0, #:tprel_g2:a
10935 // movk x0, #:tprel_g1_nc:a
10936 // movk x0, #:tprel_g0_nc:a
10937 // add x0, x1, x0
10938 SDValue HiVar = DAG.getTargetGlobalAddress(
10939 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10940 SDValue MiVar = DAG.getTargetGlobalAddress(
10941 GV, DL, PtrVT, 0,
10943 SDValue LoVar = DAG.getTargetGlobalAddress(
10944 GV, DL, PtrVT, 0,
10946 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10947 DAG.getTargetConstant(32, DL, MVT::i32)),
10948 0);
10949 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10950 DAG.getTargetConstant(16, DL, MVT::i32)),
10951 0);
10952 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10953 DAG.getTargetConstant(0, DL, MVT::i32)),
10954 0);
10955 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10956 }
10957 }
10958}
10959
10960/// When accessing thread-local variables under either the general-dynamic or
10961/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10962/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10963/// is a function pointer to carry out the resolution.
10964///
10965/// The sequence is:
10966/// adrp x0, :tlsdesc:var
10967/// ldr x1, [x0, #:tlsdesc_lo12:var]
10968/// add x0, x0, #:tlsdesc_lo12:var
10969/// .tlsdesccall var
10970/// blr x1
10971/// (TPIDR_EL0 offset now in x0)
10972///
10973/// The above sequence must be produced unscheduled, to enable the linker to
10974/// optimize/relax this sequence.
10975/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10976/// above sequence, and expanded really late in the compilation flow, to ensure
10977/// the sequence is produced as per above.
10978SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10979 const SDLoc &DL,
10980 SelectionDAG &DAG) const {
10981 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10982 auto &MF = DAG.getMachineFunction();
10983 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10984
10985 SDValue Glue;
10986 SDValue Chain = DAG.getEntryNode();
10987 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10988
10989 SMECallAttrs TLSCallAttrs(FuncInfo->getSMEFnAttrs(), {}, SMEAttrs::Normal);
10990 bool RequiresSMChange = TLSCallAttrs.requiresSMChange();
10991
10992 auto ChainAndGlue = [](SDValue Chain) -> std::pair<SDValue, SDValue> {
10993 return {Chain, Chain.getValue(1)};
10994 };
10995
10996 if (RequiresSMChange)
10997 std::tie(Chain, Glue) =
10998 ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/false, Chain, Glue,
10999 getSMToggleCondition(TLSCallAttrs)));
11000
11001 unsigned Opcode =
11002 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
11003 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
11004 : AArch64ISD::TLSDESC_CALLSEQ;
11005 SDValue Ops[] = {Chain, SymAddr, Glue};
11006 std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
11007 Opcode, DL, NodeTys, Glue ? ArrayRef(Ops) : ArrayRef(Ops).drop_back()));
11008
11009 if (TLSCallAttrs.requiresLazySave())
11010 std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
11011 AArch64ISD::REQUIRES_ZA_SAVE, DL, NodeTys, {Chain, Chain.getValue(1)}));
11012
11013 if (RequiresSMChange)
11014 std::tie(Chain, Glue) =
11015 ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
11016 getSMToggleCondition(TLSCallAttrs)));
11017
11018 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
11019}
11020
// Lower a thread-local GlobalAddress on ELF targets. A thread-pointer node is
// created, the variable's offset from it is computed according to the TLS
// model in `Model`, and the two are added. Local-exec is delegated to
// LowerELFTLSLocalExec.
//
// NOTE(review): this listing has lost many source lines in this function (the
// embedded numbering skips 11030-11032, 11034, 11036, 11039, 11069, 11075 and
// 11087). In particular the declaration of `Model` is not visible. Restore the
// gaps from the upstream file before relying on this text.
11021SDValue
11022AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
11023 SelectionDAG &DAG) const {
11024 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
11025
11026 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
11027 AArch64FunctionInfo *MFI =
11028 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11029
// NOTE(review): lines 11030-11032, 11034 and 11036 are missing. What remains
// is a fragment of a block that tests for the local-dynamic model.
11033
11035 if (Model == TLSModel::LocalDynamic)
11037 }
11038
// NOTE(review): line 11039 (the first half of this condition) is missing.
11040 Model != TLSModel::LocalExec)
11041 report_fatal_error("ELF TLS only supported in small memory model or "
11042 "in local exec TLS model");
11043 // Different choices can be made for the maximum size of the TLS area for a
11044 // module. For the small address model, the default TLS size is 16MiB and the
11045 // maximum TLS size is 4GiB.
11046 // FIXME: add tiny and large code model support for TLS access models other
11047 // than local exec. We currently generate the same code as small for tiny,
11048 // which may be larger than needed.
11049
11050 SDValue TPOff;
11051 EVT PtrVT = getPointerTy(DAG.getDataLayout());
11052 SDLoc DL(Op);
11053 const GlobalValue *GV = GA->getGlobal();
11054
11055 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
11056
11057 if (Model == TLSModel::LocalExec) {
11058 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
11059 } else if (Model == TLSModel::InitialExec) {
// Initial-exec: the offset is loaded through LOADgot of the MO_TLS-tagged
// symbol.
11060 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
11061 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
11062 } else if (Model == TLSModel::LocalDynamic) {
11063 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
11064 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
11065 // the beginning of the module's TLS region, followed by a DTPREL offset
11066 // calculation.
11067
11068 // These accesses will need deduplicating if there's more than one.
// NOTE(review): line 11069 is missing. The statement this comment introduces
// is not visible (presumably it uses `MFI`, which is otherwise unused in the
// visible code).
11070
11071 // The call needs a relocation too for linker relaxation. It doesn't make
11072 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
11073 // the address.
// NOTE(review): line 11075 (the final argument of this call) is missing.
11074 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
11076
11077 // Now we can calculate the offset from TPIDR_EL0 to this module's
11078 // thread-local area.
11079 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
11080
11081 // Now use :dtprel_whatever: operations to calculate this variable's offset
11082 // in its thread-storage area.
11083 SDValue HiVar = DAG.getTargetGlobalAddress(
11084 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
// NOTE(review): line 11087 (the flag argument of LoVar) is missing.
11085 SDValue LoVar = DAG.getTargetGlobalAddress(
11086 GV, DL, MVT::i64, 0,
11088
// Add the high and then the low part of the variable's offset to the module
// base returned by the descriptor call.
11089 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
11090 DAG.getTargetConstant(0, DL, MVT::i32)),
11091 0);
11092 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
11093 DAG.getTargetConstant(0, DL, MVT::i32)),
11094 0);
11095 } else if (Model == TLSModel::GeneralDynamic) {
11096 // The call needs a relocation too for linker relaxation. It doesn't make
11097 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
11098 // the address.
11099 SDValue SymAddr =
11100 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
11101
11102 // Finally we can make a call to calculate the offset from tpidr_el0.
11103 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
11104 } else
11105 llvm_unreachable("Unsupported ELF TLS access model");
11106
// Every model except local-exec (which returned above) ends here: thread
// pointer plus computed offset.
11107 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
11108}
11109
11110SDValue
11111AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
11112 SelectionDAG &DAG) const {
11113 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
11114
11115 SDValue Chain = DAG.getEntryNode();
11116 EVT PtrVT = getPointerTy(DAG.getDataLayout());
11117 SDLoc DL(Op);
11118
11119 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
11120
11121 // Load the ThreadLocalStoragePointer from the TEB
11122 // A pointer to the TLS array is located at offset 0x58 from the TEB.
11123 SDValue TLSArray =
11124 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
11125 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
11126 Chain = TLSArray.getValue(1);
11127
11128 // Load the TLS index from the C runtime;
11129 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
11130 // This also does the same as LOADgot, but using a generic i32 load,
11131 // while LOADgot only loads i64.
11132 SDValue TLSIndexHi =
11133 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
11134 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
11135 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
11136 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
11137 SDValue TLSIndex =
11138 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
11139 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
11140 Chain = TLSIndex.getValue(1);
11141
11142 // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
11143 // offset into the TLSArray.
11144 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
11145 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
11146 DAG.getConstant(3, DL, PtrVT));
11147 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
11148 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
11149 MachinePointerInfo());
11150 Chain = TLS.getValue(1);
11151
11152 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
11153 const GlobalValue *GV = GA->getGlobal();
11154 SDValue TGAHi = DAG.getTargetGlobalAddress(
11155 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
11156 SDValue TGALo = DAG.getTargetGlobalAddress(
11157 GV, DL, PtrVT, 0,
11159
11160 // Add the offset from the start of the .tls section (section base).
11161 SDValue Addr =
11162 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
11163 DAG.getTargetConstant(0, DL, MVT::i32)),
11164 0);
11165 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
11166 return Addr;
11167}
11168
11169SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
11170 SelectionDAG &DAG) const {
11171 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
11172 if (DAG.getTarget().useEmulatedTLS())
11173 return LowerToTLSEmulatedModel(GA, DAG);
11174
11175 if (Subtarget->isTargetDarwin())
11176 return LowerDarwinGlobalTLSAddress(Op, DAG);
11177 if (Subtarget->isTargetELF())
11178 return LowerELFGlobalTLSAddress(Op, DAG);
11179 if (Subtarget->isTargetWindows())
11180 return LowerWindowsGlobalTLSAddress(Op, DAG);
11181
11182 llvm_unreachable("Unexpected platform trying to use TLS");
11183}
11184
11185//===----------------------------------------------------------------------===//
11186// PtrAuthGlobalAddress lowering
11187//
11188// We have 3 lowering alternatives to choose from:
11189// - MOVaddrPAC: similar to MOVaddr, with added PAC.
11190// If the GV doesn't need a GOT load (i.e., is locally defined)
11191// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
11192//
11193// - LOADgotPAC: similar to LOADgot, with added PAC.
11194// If the GV needs a GOT load, materialize the pointer using the usual
11195// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
11196// section is assumed to be read-only (for example, via relro mechanism). See
11197// LowerMOVaddrPAC.
11198//
11199// - LOADauthptrstatic: similar to LOADgot, but use a
11200// special stub slot instead of a GOT slot.
11201// Load a signed pointer for symbol 'sym' from a stub slot named
11202// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
11203// resolving. This usually lowers to adrp+ldr, but also emits an entry into
11204// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
11205//
11206// All 3 are pseudos that are expanded late to longer sequences: this lets us
11207// provide integrity guarantees on the to-be-signed intermediate values.
11208//
11209// LOADauthptrstatic is undesirable because it requires a large section filled
11210// with often similarly-signed pointers, making it a good harvesting target.
11211// Thus, it's only used for ptrauth references to extern_weak to avoid null
11212// checks.
11213
// Helper for signed references to extern_weak globals: emits a
// LOADauthptrstatic machine node taking the target global address, the key and
// the integer discriminator. Non-zero offsets and non-null address
// discriminators are rejected with a fatal error.
//
// NOTE(review): the first line of this definition (line 11214: return type,
// function name and opening parenthesis) is missing from this listing, so the
// text below starts in the middle of the parameter list.
11215 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
11216 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
11217 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
11218 assert(TGN->getGlobal()->hasExternalWeakLinkage());
11219
11220 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
11221 // offset alone as a pointer if the symbol wasn't available, which would
11222 // probably break null checks in users. Ptrauth complicates things further:
11223 // error out.
11224 if (TGN->getOffset() != 0)
// NOTE(review): line 11225 (the call that takes this message, presumably
// report_fatal_error as used just below) is missing.
11226 "unsupported non-zero offset in weak ptrauth global reference");
11227
11228 if (!isNullConstant(AddrDiscriminator))
11229 report_fatal_error("unsupported weak addr-div ptrauth global");
11230
// The key is passed as an i32 target constant; the address discriminator is
// known to be null here and is not forwarded.
11231 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
11232 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
11233 {TGA, Key, Discriminator}),
11234 0);
11235}
11236
// Lower a signed (ptrauth) global address. Operand 0 is the pointer (a global
// address, optionally wrapped in an ADD of a constant offset), operand 1 the
// key, operand 2 the address discriminator and operand 3 the constant integer
// discriminator. After validation, one of the MOVaddrPAC / LOADgotPAC /
// LOADauthptrstatic strategies is chosen.
//
// NOTE(review): this listing has lost source lines in this function (the
// embedded numbering skips 11253 and 11306); see the notes at each gap.
11237SDValue
11238AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
11239 SelectionDAG &DAG) const {
11240 SDValue Ptr = Op.getOperand(0);
11241 uint64_t KeyC = Op.getConstantOperandVal(1);
11242 SDValue AddrDiscriminator = Op.getOperand(2);
11243 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
11244 EVT VT = Op.getValueType();
11245 SDLoc DL(Op);
11246
11247 if (KeyC > AArch64PACKey::LAST)
11248 report_fatal_error("key in ptrauth global out of range [0, " +
11249 Twine((int)AArch64PACKey::LAST) + "]");
11250
11251 // Blend only works if the integer discriminator is 16-bit wide.
11252 if (!isUInt<16>(DiscriminatorC))
// NOTE(review): line 11253 (the call that takes this message, presumably
// report_fatal_error as used above) is missing.
11254 "constant discriminator in ptrauth global out of range [0, 0xffff]");
11255
11256 // Choosing between 3 lowering alternatives is target-specific.
11257 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
11258 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
11259
// Peel an outer ADD: its second operand is read as the constant offset and its
// first operand becomes the pointer.
11260 int64_t PtrOffsetC = 0;
11261 if (Ptr.getOpcode() == ISD::ADD) {
11262 PtrOffsetC = Ptr.getConstantOperandVal(1);
11263 Ptr = Ptr.getOperand(0);
11264 }
11265 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
11266 const GlobalValue *PtrGV = PtrN->getGlobal();
11267
11268 // Classify the reference to determine whether it needs a GOT load.
11269 const unsigned OpFlags =
11270 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
11271 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
11272 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
11273 "unsupported non-GOT op flags on ptrauth global reference");
11274
11275 // Fold any offset into the GV; our pseudos expect it there.
11276 PtrOffsetC += PtrN->getOffset();
11277 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
11278 /*TargetFlags=*/0);
11279 assert(PtrN->getTargetFlags() == 0 &&
11280 "unsupported target flags on ptrauth global");
11281
// Operands shared by the pseudos below. A null address discriminator is
// replaced by the XZR register operand.
11282 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
11283 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
11284 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
11285 ? AddrDiscriminator
11286 : DAG.getRegister(AArch64::XZR, MVT::i64);
11287
11288 // No GOT load needed -> MOVaddrPAC
11289 if (!NeedsGOTLoad) {
11290 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
11291 return SDValue(
11292 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
11293 {TPtr, Key, TAddrDiscriminator, Discriminator}),
11294 0);
11295 }
11296
11297 // GOT load -> LOADgotPAC
11298 // Note that we disallow extern_weak refs to avoid null checks later.
11299 if (!PtrGV->hasExternalWeakLinkage())
11300 return SDValue(
11301 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
11302 {TPtr, Key, TAddrDiscriminator, Discriminator}),
11303 0);
11304
11305 // extern_weak ref -> LOADauthptrstatic
// NOTE(review): line 11306 (the `return` and the name of the helper being
// called) is missing; only the argument list survives. Note that the original
// AddrDiscriminator is passed here, not TAddrDiscriminator.
11307 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
11308 DAG);
11309}
11310
11311// Looks through \param Val to determine the bit that can be used to
11312// check the sign of the value. It returns the unextended value and
11313// the sign bit position.
11314std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
11315 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
11316 return {Val.getOperand(0),
11317 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
11318 1};
11319
11320 if (Val.getOpcode() == ISD::SIGN_EXTEND)
11321 return {Val.getOperand(0),
11322 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
11323
11324 return {Val, Val.getValueSizeInBits() - 1};
11325}
11326
11327// Op is an SDValue that is being compared to 0. If the comparison is a bit
11328// test, optimize it to a TBZ or TBNZ.
// Returns an empty SDValue when Op is not an AND or matches neither pattern
// below. Opcode is used verbatim as the opcode of the emitted branch node.
//
// NOTE(review): the first line of this definition (line 11329: return type,
// function name and leading parameters) is missing from this listing. The body
// uses DL, Op and Chain, which must be declared on that line.
11330 SDValue Dest, unsigned Opcode,
11331 SelectionDAG &DAG) {
11332 if (Op.getOpcode() != ISD::AND)
11333 return SDValue();
11334
11335 // See if we can use a TBZ to fold in an AND as well.
11336 // TBZ has a smaller branch displacement than CBZ. If the offset is
11337 // out of bounds, a late MI-layer pass rewrites branches.
11338 // 403.gcc is an example that hits this case.
11339 if (isa<ConstantSDNode>(Op.getOperand(1)) &&
11340 isPowerOf2_64(Op.getConstantOperandVal(1))) {
// Pattern 1: (and X, 2^k). Test bit k of X directly.
11341 SDValue Test = Op.getOperand(0);
11342 uint64_t Mask = Op.getConstantOperandVal(1);
11343 return DAG.getNode(Opcode, DL, MVT::Other, Chain, Test,
11344 DAG.getConstant(Log2_64(Mask), DL, MVT::i64), Dest);
11345 }
11346
11347 if (Op.getOperand(0).getOpcode() == ISD::SHL) {
// Pattern 2: (and (shl 1, N), Y). Shift Y right by N and test bit 0.
11348 auto Op00 = Op.getOperand(0).getOperand(0);
11349 if (isa<ConstantSDNode>(Op00) && Op00->getAsZExtVal() == 1) {
11350 auto Shr = DAG.getNode(ISD::SRL, DL, Op00.getValueType(),
11351 Op.getOperand(1), Op.getOperand(0).getOperand(1));
11352 return DAG.getNode(Opcode, DL, MVT::Other, Chain, Shr,
11353 DAG.getConstant(0, DL, MVT::i64), Dest);
11354 }
11355 }
11356
11357 return SDValue();
11358}
11359
// Lower a BR_CC node, whose operands are read below as (chain, condition code,
// LHS, RHS, destination).
//
// f128 operands are softened first. Integer compares become TBZ/TBNZ,
// CBZ/CBNZ or CB nodes when the checks below allow it, and otherwise a compare
// feeding BRCOND. Remaining floating-point compares become one or two BRCOND
// nodes. Returns an empty SDValue in the overflow-intrinsic case when the
// operation's type is not legal.
//
// NOTE(review): source lines 11391, 11398, 11431, 11495 and 11498 are absent
// from this listing, so the statements around them are incomplete as shown;
// check them against the upstream file.
11360SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
11361 SDValue Chain = Op.getOperand(0);
11362 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
11363 SDValue LHS = Op.getOperand(2);
11364 SDValue RHS = Op.getOperand(3);
11365 SDValue Dest = Op.getOperand(4);
11366 SDLoc DL(Op);
11367
11368 MachineFunction &MF = DAG.getMachineFunction();
11369 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
11370 // will not be produced, as they are conditional branch instructions that do
11371 // not set flags.
11372 bool ProduceNonFlagSettingCondBr =
11373 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
11374
11375 // Handle f128 first, since lowering it will result in comparing the return
11376 // value of a libcall against zero, which is just what the rest of LowerBR_CC
11377 // is expecting to deal with.
11378 if (LHS.getValueType() == MVT::f128) {
11379 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
11380
11381 // If softenSetCCOperands returned a scalar, we need to compare the result
11382 // against zero to select between true and false values.
11383 if (!RHS.getNode()) {
11384 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11385 CC = ISD::SETNE;
11386 }
11387 }
11388
11389 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
11390 // instruction.
11392 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
11393 // Only lower legal XALUO ops.
11394 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
11395 return SDValue();
11396
11397 // The actual operation with overflow check.
11399 SDValue Value, Overflow;
11400 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
11401
 // For SETNE the branch is taken on the opposite of the overflow condition.
11402 if (CC == ISD::SETNE)
11403 OFCC = getInvertedCondCode(OFCC);
11404 SDValue CCVal = getCondCode(DAG, OFCC);
11405
11406 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11407 Overflow);
11408 }
11409
11410 if (LHS.getValueType().isInteger()) {
11411 assert((LHS.getValueType() == RHS.getValueType()) &&
11412 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11413
11414 // Normalize (LHS CC 1) -> (LHS NewCC 0) when LHS is known to be 0 or 1.
11415 // This enables the CBZ/CBNZ matching below.
11416 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11417 if (RHSC && RHSC->getZExtValue() == 1 && ProduceNonFlagSettingCondBr &&
11418 // Don't do this when LHS is an overflow/carry result (resNo == 1)
11419 // because we can fold cset + cmp #1 + b.cc into a direct
11420 // flag-consuming branch, which CBZ/CBNZ would prevent.
11421 LHS.getResNo() == 0 &&
11422 // This is true only when we somehow know that it's either 0 or 1.
11423 DAG.computeKnownBits(LHS).getMaxValue().ule(1)) {
11424 // Output params unused; we only care whether it returns true.
11425 bool CanNegate, MustBeFirst, PreferFirst;
11426 // Also skip when LHS is a conjunction tree (AND/OR of SETCCs) --
11427 // emitConjunction will lower it as a CCMP chain, which is better
11428 // than materializing the boolean for CBZ.
11429 if (!canEmitConjunction(DAG, LHS, CanNegate, MustBeFirst, PreferFirst,
11430 false)) {
11432 switch (CC) {
11433 // SETLT/SETGE are canonicalized away before reaching here, but
11434 // handle them defensively.
11435 case ISD::SETNE:
11436 case ISD::SETULT:
11437 case ISD::SETLT:
11438 NewCC = ISD::SETEQ;
11439 break;
11440 case ISD::SETEQ:
11441 case ISD::SETUGE:
11442 case ISD::SETGE:
11443 NewCC = ISD::SETNE;
11444 break;
11445 default:
11446 break;
11447 }
 // Only rewrite the compare when one of the cases above picked a new
 // condition code.
11448 if (NewCC != ISD::SETCC_INVALID) {
11449 CC = NewCC;
11450 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11451 RHSC = cast<ConstantSDNode>(RHS);
11452 }
11453 }
11454 }
11455
11456 // If the RHS of the comparison is zero, we can potentially fold this
11457 // to a specialized branch.
11458 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
11459 if (CC == ISD::SETEQ) {
11460 if (SDValue Result =
11461 optimizeBitTest(DL, LHS, Chain, Dest, AArch64ISD::TBZ, DAG))
11462 return Result;
11463
11464 return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
11465 } else if (CC == ISD::SETNE) {
11466 if (SDValue Result =
11467 optimizeBitTest(DL, LHS, Chain, Dest, AArch64ISD::TBNZ, DAG))
11468 return Result;
11469
11470 return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
11471 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
 // (LHS < 0) holds exactly when the sign bit is set, so branch with
 // TBNZ on that bit.
11472 // Don't combine AND since emitComparison converts the AND to an ANDS
11473 // (a.k.a. TST) and the test in the test bit and branch instruction
11474 // becomes redundant. This would also increase register pressure.
11475 uint64_t SignBitPos;
11476 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11477 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, LHS,
11478 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11479 }
11480 }
 // (LHS > -1) holds exactly when the sign bit is clear, so branch with TBZ
 // on that bit.
11481 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
11482 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
11483 // Don't combine AND since emitComparison converts the AND to an ANDS
11484 // (a.k.a. TST) and the test in the test bit and branch instruction
11485 // becomes redundant. This would also increase register pressure.
11486 uint64_t SignBitPos;
11487 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11488 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, LHS,
11489 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11490 }
11491
11492 // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
11493 // larger branch displacement but do prefer CB over cmp + br.
11494 if (Subtarget->hasCMPBR() &&
11496 ProduceNonFlagSettingCondBr) {
11497 SDValue Cond =
11499 return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS,
11500 Dest);
11501 }
11502
 // Generic integer fallback: emit the compare and branch on its flags.
11503 SDValue CCVal;
11504 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11505 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11506 Cmp);
11507 }
11508
11509 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
11510 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11511
11512 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11513 // clean. Some of them require two branches to implement.
11514 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11515 AArch64CC::CondCode CC1, CC2;
11516 changeFPCCToAArch64CC(CC, CC1, CC2);
11517 SDValue CC1Val = getCondCode(DAG, CC1);
11518 SDValue BR1 =
11519 DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp);
 // A second condition (CC2 != AL) gets its own BRCOND to the same
 // destination, chained after the first and reusing the same compare.
11520 if (CC2 != AArch64CC::AL) {
11521 SDValue CC2Val = getCondCode(DAG, CC2);
11522 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val,
11523 Cmp);
11524 }
11525
11526 return BR1;
11527}
11528
// Custom lowering for FCOPYSIGN (operand 0 is In1, operand 1 is In2).
//
// Returns an empty SDValue when NEON is unavailable and
// useSVEForFixedLengthVectors() is false, and on the SVE-only scalar path for
// types other than f16/bf16/f32/f64. Fixed-length vectors selected for SVE and
// scalars on SVE-only targets are re-expressed as FCOPYSIGN on a scalable
// vector. Everything else is lowered to an AArch64ISD::BSP node on integer
// vectors with an "all bits except the sign bit" mask.
//
// NOTE(review): source line 11548 (the right-hand side of the `IntVT =`
// assignment for scalable vectors) is absent from this listing; check it
// against the upstream file.
11529SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
11530 SelectionDAG &DAG) const {
11531 if (!Subtarget->isNeonAvailable() &&
11532 !Subtarget->useSVEForFixedLengthVectors())
11533 return SDValue();
11534
11535 EVT VT = Op.getValueType();
11536 EVT IntVT = VT.changeTypeToInteger();
11537 SDLoc DL(Op);
11538
11539 SDValue In1 = Op.getOperand(0);
11540 SDValue In2 = Op.getOperand(1);
11541 EVT SrcVT = In2.getValueType();
11542
 // Bring the second operand to the result type when its width differs.
11543 if (!SrcVT.bitsEq(VT))
11544 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
11545
11546 if (VT.isScalableVector())
11547 IntVT =
11549
 // Fixed-length vectors handled by SVE: do the FCOPYSIGN on the scalable
 // container type and convert the result back.
11550 if (VT.isFixedLengthVector() &&
11551 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
11552 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
11553
11554 In1 = convertToScalableVector(DAG, ContainerVT, In1);
11555 In2 = convertToScalableVector(DAG, ContainerVT, In2);
11556
11557 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
11558 return convertFromScalableVector(DAG, VT, Res);
11559 }
11560
11561 // With SVE, but without Neon, extend the scalars to scalable vectors and use
11562 // a SVE FCOPYSIGN.
11563 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
11564 Subtarget->isSVEorStreamingSVEAvailable()) {
11565 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
11566 return SDValue();
11567 EVT SVT = getPackedSVEVectorVT(VT);
11568
 // Place each scalar in element 0 of a poison vector, operate on the
 // vectors, then read element 0 back.
11569 SDValue Poison = DAG.getPOISON(SVT);
11570 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11571 SDValue Ins1 =
11572 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, Poison, In1, Zero);
11573 SDValue Ins2 =
11574 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, Poison, In2, Zero);
11575 SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
11576 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS, Zero);
11577 }
11578
 // Bitcast helper: scalable vectors go through getSVESafeBitCast, everything
 // else through a plain bitcast.
11579 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
11580 if (VT.isScalableVector())
11581 return getSVESafeBitCast(VT, Op, DAG);
11582
11583 return DAG.getBitcast(VT, Op);
11584 };
11585
11586 SDValue VecVal1, VecVal2;
11587 EVT VecVT;
 // Fills VecVal1/VecVal2 with the operands as VecVT: scalars are inserted
 // into subregister Idx of a poison vector, vectors are bitcast.
11588 auto SetVecVal = [&](int Idx = -1) {
11589 if (!VT.isVector()) {
11590 SDValue Poison = DAG.getPOISON(VecVT);
11591 VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, Poison, In1);
11592 VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, Poison, In2);
11593 } else {
11594 VecVal1 = BitCast(VecVT, In1, DAG);
11595 VecVal2 = BitCast(VecVT, In2, DAG);
11596 }
11597 };
 // Pick the integer vector type to work in, plus the subregister that holds
 // a scalar of each supported width.
11598 if (VT.isVector()) {
11599 VecVT = IntVT;
11600 SetVecVal();
11601 } else if (VT == MVT::f64) {
11602 VecVT = MVT::v2i64;
11603 SetVecVal(AArch64::dsub);
11604 } else if (VT == MVT::f32) {
11605 VecVT = MVT::v4i32;
11606 SetVecVal(AArch64::ssub);
11607 } else if (VT == MVT::f16 || VT == MVT::bf16) {
11608 VecVT = MVT::v8i16;
11609 SetVecVal(AArch64::hsub);
11610 } else {
11611 llvm_unreachable("Invalid type for copysign!");
11612 }
11613
 // Mask with every bit set except the sign bit of each element.
11614 unsigned BitWidth = In1.getScalarValueSizeInBits();
11615 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
11616
11617 // We want to materialize a mask with every bit but the high bit set, but the
11618 // AdvSIMD immediate moves cannot materialize that in a single instruction for
11619 // 64-bit elements. Instead, materialize all bits set and then negate that.
11620 if (VT == MVT::f64 || VT == MVT::v2f64) {
11621 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
11622 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
11623 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
11624 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
11625 }
11626
 // NOTE(review): presumably BSP takes the masked (non-sign) bits from VecVal1
 // and the remaining sign bit from VecVal2 -- confirm the BSP operand order.
11627 SDValue BSP =
11628 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
 // Scalars are read back from the same subregister they were inserted into.
11629 if (VT == MVT::f16 || VT == MVT::bf16)
11630 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
11631 if (VT == MVT::f32)
11632 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
11633 if (VT == MVT::f64)
11634 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
11635
11636 return BitCast(VT, BSP, DAG);
11637}
11638
// Custom lowering shared by CTPOP and PARITY (IsParity distinguishes them).
//
// Scalable vectors, and fixed-length vectors for which
// useSVEForFixedLengthVectorVT(VT, true) holds, go to the predicated
// CTPOP_MERGE_PASSTHRU node. Scalar i32/i64/i128 values are counted in vector
// registers (SVE when available, otherwise NEON), with parity taken as bit 0
// of the count. The remaining fixed-length vector types are counted per byte
// and then widened. Returns an empty SDValue for i32 parity, when the initial
// NoImplicitFloat check fires, and when NEON is required but unavailable.
//
// NOTE(review): source line 11641 (the start of the first `if` condition) is
// absent from this listing; check it against the upstream file.
11639SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
11640 SelectionDAG &DAG) const {
11642 Attribute::NoImplicitFloat))
11643 return SDValue();
11644
11645 EVT VT = Op.getValueType();
11646 if (VT.isScalableVector() ||
11647 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
11648 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
11649
11650 bool IsParity = Op.getOpcode() == ISD::PARITY;
11651 SDValue Val = Op.getOperand(0);
11652 SDLoc DL(Op);
11653
11654 // for i32, general parity function using EORs is more efficient compared to
11655 // using floating point
11656 if (VT == MVT::i32 && IsParity)
11657 return SDValue();
11658
11659 if (Subtarget->isSVEorStreamingSVEAvailable()) {
 // Scalar i32/i64: count in element 0 of a scalable vector and extract it.
11660 if (VT == MVT::i32 || VT == MVT::i64) {
11661 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
11662 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
11663 DAG.getPOISON(ContainerVT), Val,
11664 DAG.getVectorIdxConstant(0, DL));
11665 Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
11666 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
11667 DAG.getVectorIdxConstant(0, DL));
11668 if (IsParity)
11669 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11670 return Val;
11671 }
11672
 // i128: count the two 64-bit halves as a v2i64, add the two counts, and
 // extend the sum back to i128.
11673 if (VT == MVT::i128) {
11674 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
11675 Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
11676 Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
11677 Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
11678 Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
11679 Val = DAG.getZExtOrTrunc(Val, DL, VT);
11680 if (IsParity)
11681 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11682 return Val;
11683 }
11684 }
11685
 // Everything below uses NEON nodes.
11686 if (!Subtarget->isNeonAvailable())
11687 return SDValue();
11688
11689 // If there is no CNT instruction available, GPR popcount can
11690 // be more efficiently lowered to the following sequence that uses
11691 // AdvSIMD registers/instructions as long as the copies to/from
11692 // the AdvSIMD registers are cheap.
11693 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
11694 // CNT V0.8B, V0.8B // 8xbyte pop-counts
11695 // ADDV B0, V0.8B // sum 8xbyte pop-counts
11696 // FMOV X0, D0 // copy result back to integer reg
11697 if (VT == MVT::i32 || VT == MVT::i64) {
 // i32 is zero-extended so the value fills a 64-bit v8i8.
11698 if (VT == MVT::i32)
11699 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
11700 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
11701
11702 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
11703 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
11704 AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
11705 VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
11706 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
11707 DAG.getConstant(0, DL, MVT::i64));
11708 if (IsParity)
11709 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11710 return AddV;
11711 } else if (VT == MVT::i128) {
 // Same scheme as above on all 16 bytes; the sum is read from element 0
 // as an i64 and then extended to i128.
11712 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
11713
11714 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
11715 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
11716 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
11717 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
11718 DAG.getConstant(0, DL, MVT::i64));
11719 AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
11720 if (IsParity)
11721 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11722 return AddV;
11723 }
11724
 // From here on only fixed-length vector CTPOP is expected.
11725 assert(!IsParity && "ISD::PARITY of vector types not supported");
11726
11727 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
11728 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
11729 "Unexpected type for custom ctpop lowering");
11730
 // Count bits per byte first; the per-byte counts are then summed into the
 // wider elements of VT.
11731 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
11732 Val = DAG.getBitcast(VT8Bit, Val);
11733 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
11734
 // With dot-product support (and elements that are not 16 bits wide, at
 // least two of them), accumulate the byte counts with UDOT against a vector
 // of ones and a zero accumulator; v2i64 needs one extra UADDLP on top of
 // the v4i32 result.
11735 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
11736 VT.getVectorNumElements() >= 2) {
11737 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
11738 SDValue Zeros = DAG.getConstant(0, DL, DT);
11739 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
11740
11741 if (VT == MVT::v2i64) {
11742 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11743 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
11744 } else if (VT == MVT::v2i32) {
11745 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11746 } else if (VT == MVT::v4i32) {
11747 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11748 } else {
11749 llvm_unreachable("Unexpected type for custom ctpop lowering");
11750 }
11751
11752 return Val;
11753 }
11754
11755 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
 // Each step doubles the element size and halves the element count until the
 // element size matches VT.
11756 unsigned EltSize = 8;
11757 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
11758 while (EltSize != VT.getScalarSizeInBits()) {
11759 EltSize *= 2;
11760 NumElts /= 2;
11761 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
11762 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
11763 }
11764
11765 return Val;
11766}
11767
// Lower CTTZ as CTLZ of the bit-reversed operand: reversing the bits turns
// trailing zeros into leading zeros.
11768SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
11769 EVT VT = Op.getValueType();
11770 SDLoc DL(Op);
11771 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
11772 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
11773}
11774
// Lower SMAX/SMIN/UMAX/UMIN. Scalable vectors, and fixed-length vectors for
// which useSVEForFixedLengthVectorVT(VT, true) holds, become the matching
// predicated node; every other type becomes a setcc followed by a select
// between the two operands. Any other opcode is unreachable.
11775SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
11776 SelectionDAG &DAG) const {
11777
11778 EVT VT = Op.getValueType();
11779 SDLoc DL(Op);
11780 unsigned Opcode = Op.getOpcode();
 // Condition under which operand 0 is the result; only used by the
 // setcc/select fallback at the end.
11781 ISD::CondCode CC;
11782 switch (Opcode) {
11783 default:
11784 llvm_unreachable("Wrong instruction");
11785 case ISD::SMAX:
11786 CC = ISD::SETGT;
11787 break;
11788 case ISD::SMIN:
11789 CC = ISD::SETLT;
11790 break;
11791 case ISD::UMAX:
11792 CC = ISD::SETUGT;
11793 break;
11794 case ISD::UMIN:
11795 CC = ISD::SETULT;
11796 break;
11797 }
11798
11799 // Note: This lowering only overrides NEON for v1i64 and v2i64, where we
11800 // prefer using SVE if available.
11801 if (VT.isScalableVector() ||
11802 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) {
11803 switch (Opcode) {
11804 default:
11805 llvm_unreachable("Wrong instruction");
11806 case ISD::SMAX:
11807 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
11808 case ISD::SMIN:
11809 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
11810 case ISD::UMAX:
11811 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
11812 case ISD::UMIN:
11813 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
11814 }
11815 }
11816
 // Fallback: select(Op0 CC Op1, Op0, Op1).
11817 SDValue Op0 = Op.getOperand(0);
11818 SDValue Op1 = Op.getOperand(1);
11819 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
11820 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
11821}
11822
// Lower vector BITREVERSE. Types that pass the first check become the
// predicated BITREVERSE_MERGE_PASSTHRU node. Otherwise v2i32/v4i32/v1i64/v2i64
// are handled in two steps on a byte vector of the same width: a REV32/REV64
// node (presumably reversing the byte order inside each element -- confirm),
// then a BITREVERSE of every byte, with the result cast back to VT. Any other
// type is unreachable.
//
// NOTE(review): source line 11828 (the middle of the first `if` condition) is
// absent from this listing; check it against the upstream file.
11823SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
11824 SelectionDAG &DAG) const {
11825 EVT VT = Op.getValueType();
11826
11827 if (VT.isScalableVector() ||
11829 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
11830 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
11831
11832 SDLoc DL(Op);
11833 SDValue REVB;
11834 MVT VST;
11835
 // VST is the byte vector with the same total width as VT (v8i8 for 64-bit
 // types, v16i8 for 128-bit types); the REV node matches the element width.
11836 switch (VT.getSimpleVT().SimpleTy) {
11837 default:
11838 llvm_unreachable("Invalid type for bitreverse!");
11839
11840 case MVT::v2i32: {
11841 VST = MVT::v8i8;
11842 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11843
11844 break;
11845 }
11846
11847 case MVT::v4i32: {
11848 VST = MVT::v16i8;
11849 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11850
11851 break;
11852 }
11853
11854 case MVT::v1i64: {
11855 VST = MVT::v8i8;
11856 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11857
11858 break;
11859 }
11860
11861 case MVT::v2i64: {
11862 VST = MVT::v16i8;
11863 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11864
11865 break;
11866 }
11867 }
11868
 // Reverse the bits of each byte, then reinterpret the bytes as VT.
11869 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
11870 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
11871}
11872
11873// Check whether N is a tree of ORs whose leaves are all XORs.
// Each XOR leaf appends its operand pair to WorkList and increments Num.
// Returns false when a node does not fit the pattern, or when Num has already
// reached MaxXors on entry. WorkList may hold pairs from leaves visited before
// a failure, so it is only meaningful when the result is true.
11874static bool
11875isOrXorChain(SDValue N, unsigned &Num,
11876 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
 // Bound the number of XOR leaves collected.
11877 if (Num == MaxXors)
11878 return false;
11879
11880 // Skip the one-use zext
11881 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
11882 N = N->getOperand(0);
11883
11884 // The leaf node must be XOR
11885 if (N->getOpcode() == ISD::XOR) {
11886 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
11887 Num++;
11888 return true;
11889 }
11890
11891 // All the non-leaf nodes must be OR.
 // They must also have a single use.
11892 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
11893 return false;
11894
 // Both operands of the OR must themselves be valid sub-chains.
11895 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
11896 isOrXorChain(N->getOperand(1), Num, WorkList))
11897 return true;
11898 return false;
11899}
11900
11901// Transform chains of ORs and XORs, which usually come from memcmp/bcmp.
// For a SETCC node N of the form (seteq/setne (or-of-xors), 0), returns an
// equivalent AND (for SETEQ) or OR (for SETNE) of one setcc per XOR operand
// pair; returns an empty SDValue when the pattern does not match.
// NOTE(review): source lines 11902 (the signature) and 11907 (presumably the
// declaration of WorkList) are absent from this listing; check them against
// the upstream file.
11903 SDValue LHS = N->getOperand(0);
11904 SDValue RHS = N->getOperand(1);
11905 SDLoc DL(N);
11906 EVT VT = N->getValueType(0);
11908
11909 // Only handle SETCC nodes.
11910 if (N->getOpcode() != ISD::SETCC)
11911 return SDValue();
11912
11913 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11914 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
11915 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
11916 unsigned NumXors = 0;
11917 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
11918 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
11919 isOrXorChain(LHS, NumXors, WorkList)) {
 // Compare each collected operand pair with Cond and combine the results:
 // all pairs equal (AND) for SETEQ, any pair different (OR) for SETNE.
11920 SDValue XOR0, XOR1;
11921 std::tie(XOR0, XOR1) = WorkList[0];
11922 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11923 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11924 for (unsigned I = 1; I < WorkList.size(); I++) {
11925 std::tie(XOR0, XOR1) = WorkList[I];
11926 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11927 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11928 }
11929
11930 // NOTE(review): the original comment here suggested exiting early by
 // inverting the condition above to reduce indentation; the code does not
 // do that.
11931 return Cmp;
11932 }
11933
11934 return SDValue();
11935}
11936
// Lower SETCC and its strict floating-point forms to a 0/1 value. Vector
// results are handed to LowerVSETCC. For strict nodes the operands start one
// position later (operand 0 is the chain) and the result is merged with a
// chain. f128 compares are softened first; integer compares become a compare
// plus CSEL (or UMIN with CSSC for "!= 0"); remaining FP compares become a
// compare plus one or two CSELs.
//
// NOTE(review): source line 11982 (the start of the statement that defines
// Cmp in the integer path) is absent from this listing; check it against the
// upstream file.
11937SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11938
11939 if (Op.getValueType().isVector())
11940 return LowerVSETCC(Op, DAG);
11941
11942 bool IsStrict = Op->isStrictFPOpcode();
11943 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11944 unsigned OpNo = IsStrict ? 1 : 0;
11945 SDValue Chain;
11946 if (IsStrict)
11947 Chain = Op.getOperand(0);
11948 SDValue LHS = Op.getOperand(OpNo + 0);
11949 SDValue RHS = Op.getOperand(OpNo + 1);
11950 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11951 SDLoc DL(Op);
11952
11953 // We chose ZeroOrOneBooleanContents, so use zero and one.
11954 EVT VT = Op.getValueType();
11955 SDValue TVal = DAG.getConstant(1, DL, VT);
11956 SDValue FVal = DAG.getConstant(0, DL, VT);
11957
11958 // Handle f128 first, since one possible outcome is a normal integer
11959 // comparison which gets picked up by the next if statement.
11960 if (LHS.getValueType() == MVT::f128) {
11961 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS, Chain,
11962 IsSignaling);
11963
11964 // If softenSetCCOperands returned a scalar, use it.
11965 if (!RHS.getNode()) {
11966 assert(LHS.getValueType() == Op.getValueType() &&
11967 "Unexpected setcc expansion!");
11968 return IsStrict ? DAG.getMergeValues({LHS, Chain}, DL) : LHS;
11969 }
11970 }
11971
11972 if (LHS.getValueType().isInteger()) {
 // With CSSC, (LHS != 0) is umin(LHS, 1), extended or truncated to the
 // result type.
11973 if (Subtarget->hasCSSC() && CC == ISD::SETNE && isNullConstant(RHS)) {
11974 SDValue One = DAG.getConstant(1, DL, LHS.getValueType());
11975 SDValue UMin = DAG.getNode(ISD::UMIN, DL, LHS.getValueType(), LHS, One);
11976 SDValue Res = DAG.getZExtOrTrunc(UMin, DL, VT);
11977 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11978 }
11979 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11980
11981 SDValue CCVal;
11983 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, DL);
11984
11985 // Note that we inverted the condition above, so we reverse the order of
11986 // the true and false operands here. This will allow the setcc to be
11987 // matched to a single CSINC instruction.
11988 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CCVal, Cmp);
11989 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11990 }
11991
11992 // Now we know we're dealing with FP values.
11993 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11994 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11995
11996 // We need to perform an FCMP + CSEL sequence. Go ahead and do the
11997 // comparison; strict nodes use the chained strict comparison.
11998 SDValue Cmp;
11999 if (IsStrict)
12000 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
12001 else
12002 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
12003
12004 AArch64CC::CondCode CC1, CC2;
12005 changeFPCCToAArch64CC(CC, CC1, CC2);
12006 SDValue Res;
 // CC2 == AL means a single AArch64 condition is enough for CC.
12007 if (CC2 == AArch64CC::AL) {
12008 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
12009 CC2);
12010 SDValue CC1Val = getCondCode(DAG, CC1);
12011
12012 // Note that we inverted the condition above, so we reverse the order of
12013 // the true and false operands here. This will allow the setcc to be
12014 // matched to a single CSINC instruction.
12015 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CC1Val, Cmp);
12016 } else {
12017 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
12018 // totally clean. Some of them require two CSELs to implement. As is in
12019 // this case, we emit the first CSEL and then emit a second using the output
12020 // of the first as the RHS. We're effectively OR'ing the two CC's together.
12021
12022 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
12023 SDValue CC1Val = getCondCode(DAG, CC1);
12024 SDValue CS1 =
12025 DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
12026
12027 SDValue CC2Val = getCondCode(DAG, CC2);
12028 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
12029 }
 // For strict nodes the chain comes from result 1 of the comparison.
12030 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res;
12031}
12032
// Lower SETCCCARRY, whose operands are read below as (LHS, RHS, carry,
// condition code). The compare is done with an SBCS node and the 0/1 result is
// selected from its flags. Returns an empty SDValue when the operand type is
// neither i32 nor i64.
//
// NOTE(review): source line 12054 (presumably the definition of CondInv as the
// inverse of Cond) is absent from this listing; check it against the upstream
// file.
12033SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
12034 SelectionDAG &DAG) const {
12035
12036 SDValue LHS = Op.getOperand(0);
12037 SDValue RHS = Op.getOperand(1);
12038 EVT VT = LHS.getValueType();
12039 if (VT != MVT::i32 && VT != MVT::i64)
12040 return SDValue();
12041
12042 SDLoc DL(Op);
12043 SDValue Carry = Op.getOperand(2);
12044 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
12045 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
12046 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
12047 LHS, RHS, InvCarry);
12048
 // The result type may differ from the operand type, so build the 1/0
 // constants in the node's own value type.
12049 EVT OpVT = Op.getValueType();
12050 SDValue TVal = DAG.getConstant(1, DL, OpVT);
12051 SDValue FVal = DAG.getConstant(0, DL, OpVT);
12052
12053 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
12055 SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv));
12056 // Inputs are swapped because the condition is inverted. This will allow
12057 // matching with a single CSINC instruction.
 // Result 1 of the SBCS node carries the flags consumed by the CSEL.
12058 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
12059 Cmp.getValue(1));
12060}
12061
12062/// Emit vector comparison for floating-point values, producing a mask.
/// Returns an empty SDValue for condition codes that are not handled, and for
/// LE/LT unless \p NoNans is set. \p VT is the mask type and must have the
/// same size as the operand type.
/// NOTE(review): source line 12063 (the first line of the signature) is absent
/// from this listing. Presumably this is the emitVectorComparison helper that
/// is called later with (LHS, RHS, CC, NoNaNs, VT, DL, DAG) -- confirm against
/// the upstream file.
12064 AArch64CC::CondCode CC, bool NoNans, EVT VT,
12065 const SDLoc &DL, SelectionDAG &DAG) {
12066 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
12067 "function only supposed to emit natural comparisons");
12068
12069 switch (CC) {
12070 default:
12071 return SDValue();
12072 case AArch64CC::NE: {
 // NE is emitted as the inverse of FCMEQ.
12073 SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
12074 // Use vector semantics for the inversion to potentially save a copy between
12075 // SIMD and regular registers.
12076 if (!LHS.getValueType().isVector()) {
 // Scalar case: put the mask in element 0 of a 128-bit vector, invert
 // the vector, and extract element 0 again.
12077 EVT VecVT =
12078 EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
12079 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12080 SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
12081 DAG.getPOISON(VecVT), Fcmeq, Zero);
12082 SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
12083 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
12084 }
12085 return DAG.getNOT(DL, Fcmeq, VT);
12086 }
12087 case AArch64CC::EQ:
12088 return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
12089 case AArch64CC::GE:
12090 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
12091 case AArch64CC::GT:
12092 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
12093 case AArch64CC::LE:
12094 if (!NoNans)
12095 return SDValue();
12096 // If we ignore NaNs then we can use the LS implementation.
12097 [[fallthrough]];
12098 case AArch64CC::LS:
 // Emitted as FCMGE with the operands swapped.
12099 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
12100 case AArch64CC::LT:
12101 if (!NoNans)
12102 return SDValue();
12103 // If we ignore NaNs then we can use the MI implementation.
12104 [[fallthrough]];
12105 case AArch64CC::MI:
 // Emitted as FCMGT with the operands swapped.
12106 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
12107 }
12108}
12109
12110/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
12111/// values are scalars, try to emit a mask generating vector instruction.
/// The swapped form (0, -1) is also accepted by inverting the condition.
/// Returns an empty SDValue when the select values are not such constants,
/// when the result type and the compared type differ in size, or when a
/// required second comparison cannot be emitted.
/// NOTE(review): source lines 12112 (the first line of the signature), 12148
/// and 12149 (presumably the declarations of CC1 and CC2) are absent from this
/// listing; check them against the upstream file.
12113 SDValue FVal, ISD::CondCode CC, bool NoNaNs,
12114 const SDLoc &DL, SelectionDAG &DAG) {
12115 assert(!LHS.getValueType().isVector());
12116 assert(!RHS.getValueType().isVector());
12117
 // Both select values must be constants, one all-ones and the other zero.
12118 auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
12119 auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
12120 if (!CTVal || !CFVal)
12121 return {};
12122 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
12123 !(CTVal->isZero() && CFVal->isAllOnes()))
12124 return {};
12125
 // For (0, -1), invert the condition so the rest can assume (-1, 0).
12126 if (CTVal->isZero())
12127 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12128
 // The mask is produced in the compared width, so the result type must have
 // the same size.
12129 EVT VT = TVal.getValueType();
12130 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
12131 return {};
12132
 // For SETUO/SETO: when both operands are the same value, or one of them is
 // known never to be NaN, compare the remaining operand against itself with
 // SETUNE/SETOEQ instead.
12133 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
12134 bool OneNaN = false;
12135 if (LHS == RHS) {
12136 OneNaN = true;
12137 } else if (DAG.isKnownNeverNaN(RHS)) {
12138 OneNaN = true;
12139 RHS = LHS;
12140 } else if (DAG.isKnownNeverNaN(LHS)) {
12141 OneNaN = true;
12142 LHS = RHS;
12143 }
12144 if (OneNaN)
12145 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
12146 }
12147
12150 bool ShouldInvert = false;
12151 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
12152 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
12153 SDValue Cmp2;
 // CC2 != AL means a second comparison is needed; give up if it cannot be
 // emitted.
12154 if (CC2 != AArch64CC::AL) {
12155 Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
12156 if (!Cmp2)
12157 return {};
12158 }
 // A single comparison with no inversion is already the final mask.
12159 if (!Cmp2 && !ShouldInvert)
12160 return Cmp;
12161
 // Otherwise do the OR and/or NOT on element 0 of a 128-bit vector and
 // extract the scalar mask afterwards.
12162 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
12163 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12164 SDValue Poison = DAG.getPOISON(VecVT);
12165 Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, Poison, Cmp, Zero);
12166 if (Cmp2) {
12167 Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, Poison, Cmp2, Zero);
12168 Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
12169 }
12170 if (ShouldInvert)
12171 Cmp = DAG.getNOT(DL, Cmp, VecVT);
12172 Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
12173 return Cmp;
12174}
12175
/// Lower a select-on-comparison from its unpacked pieces: produce TVal when
/// (LHS CC RHS) holds and FVal otherwise.
///
/// Order of handling, as written below:
///  1. f128 operands are first rewritten by softenSetCCOperands.
///  2. f16 (without full FP16) and bf16 operands are extended to f32.
///  3. Integer compares try several special forms (AND with a sign mask,
///     an ANDS-based sign-bit test, CSINV/CSINC/CSNEG) before falling back to
///     a plain compare + conditional select.
///  4. FP compares may be emitted as a vector compare mask when all users
///     accept it; otherwise one or two CSEL nodes are emitted.
///
/// NOTE(review): listing lines 12177-12178 (the start of the parameter list)
/// are missing from this listing. The body uses CC, LHS, RHS, TVal, FVal,
/// Users and Flags, so those are presumably declared there -- confirm against
/// the header.
12176SDValue AArch64TargetLowering::LowerSELECT_CC(
12179 const SDLoc &DL, SelectionDAG &DAG) const {
12180 // Handle f128 first, because it will result in a comparison of some RTLIB
12181 // call result against zero.
12182 if (LHS.getValueType() == MVT::f128) {
12183 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
12184
12185 // If softenSetCCOperands returned a scalar, we need to compare the result
12186 // against zero to select between true and false values.
12187 if (!RHS.getNode()) {
12188 RHS = DAG.getConstant(0, DL, LHS.getValueType());
12189 CC = ISD::SETNE;
12190 }
12191 }
12192
12193 // Also handle f16 (when full FP16 is unavailable) and bf16 (always), for
// which we need to do an f32 comparison.
12194 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
12195 LHS.getValueType() == MVT::bf16) {
12196 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
12197 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
12198 }
12199
12200 // Next, handle integers.
12201 if (LHS.getValueType().isInteger()) {
12202 assert((LHS.getValueType() == RHS.getValueType()) &&
12203 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
12204
// Constant views of the operands; each is null when the operand is not a
// ConstantSDNode.
12205 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
12206 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
12207 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
12208
12209 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
12210 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
12211 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
12212 // Both require fewer instructions than compare and conditional select.
12213 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
12214 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
12215 LHS.getValueType() == RHS.getValueType()) {
12216 EVT VT = LHS.getValueType();
12217 SDValue Shift =
12218 DAG.getNode(ISD::SRA, DL, VT, LHS,
12219 DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));
12220
// For SETGT the mask is inverted, which yields the BIC form listed above.
12221 if (CC == ISD::SETGT)
12222 Shift = DAG.getNOT(DL, Shift, VT);
12223
12224 return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
12225 }
12226
12227 // Check for sign bit test patterns that can use TST optimization.
12228 // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval)
12229 // -> TST %operand, sign_bit; CSEL
12230 // (SELECT_CC setlt, sign_extend, 0, tval, fval)
12231 // -> TST %operand, sign_bit; CSEL
12232 if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() &&
12233 (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG ||
12234 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
12235
12236 uint64_t SignBitPos;
// LHS is reassigned here to whatever lookThroughSignExtension returns, so
// the test below is built in that value's type (TestVT).
12237 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
12238 EVT TestVT = LHS.getValueType();
12239 SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT);
12240 SDValue TST =
12241 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32),
12242 LHS, SignBitConst);
12243
// Result 1 of the ANDS node is used as the flags input of the CSEL; NE
// selects TVal when the tested bit is set.
12244 SDValue Flags = TST.getValue(1);
12245 return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal,
12246 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags);
12247 }
12248
12249 // Canonicalise absolute difference patterns:
12250 // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
12251 // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
12252 //
12253 // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
12254 // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
12255 // The second forms can be matched into subs+cneg.
12256 // NOTE: Drop poison generating flags from the negated operand to avoid
12257 // inadvertently propagating poison after the canonicalisation.
12258 if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
12259 if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
12260 FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) {
// NOTE(review): listing line 12261 is missing here; per the NOTE above it
// presumably drops the poison-generating flags from TVal -- confirm.
12262 FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
12263 } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
12264 FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) {
// NOTE(review): listing line 12265 is missing here; presumably the matching
// flag drop on FVal -- confirm.
12266 TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
12267 }
12268 }
12269
12270 unsigned Opcode = AArch64ISD::CSEL;
12271
12272 // If both the TVal and the FVal are constants, see if we can swap them in
12273 // order to form a CSINV or CSINC out of them.
// Every swap below also inverts CC so the selected value is unchanged.
12274 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
12275 std::swap(TVal, FVal);
12276 std::swap(CTVal, CFVal);
12277 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12278 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
12279 std::swap(TVal, FVal);
12280 std::swap(CTVal, CFVal);
12281 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12282 } else if (TVal.getOpcode() == ISD::XOR) {
12283 // If TVal is a NOT we want to swap TVal and FVal so that we can match
12284 // with a CSINV rather than a CSEL.
12285 if (isAllOnesConstant(TVal.getOperand(1))) {
12286 std::swap(TVal, FVal);
12287 std::swap(CTVal, CFVal);
12288 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12289 }
12290 } else if (TVal.getOpcode() == ISD::SUB) {
12291 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
12292 // that we can match with a CSNEG rather than a CSEL.
12293 if (isNullConstant(TVal.getOperand(0))) {
12294 std::swap(TVal, FVal);
12295 std::swap(CTVal, CFVal);
12296 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12297 }
12298 } else if (CTVal && CFVal) {
12299 const int64_t TrueVal = CTVal->getSExtValue();
12300 const int64_t FalseVal = CFVal->getSExtValue();
12301 bool Swap = false;
12302
12303 // If both TVal and FVal are constants, see if FVal is the
12304 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
12305 // instead of a CSEL in that case.
12306 if (TrueVal == ~FalseVal) {
12307 Opcode = AArch64ISD::CSINV;
// The INT64_MIN guard keeps the negation below from overflowing.
12308 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
12309 TrueVal == -FalseVal) {
12310 Opcode = AArch64ISD::CSNEG;
12311 } else if (TVal.getValueType() == MVT::i32) {
12312 // If our operands are only 32-bit wide, make sure we use 32-bit
12313 // arithmetic for the check whether we can use CSINC. This ensures that
12314 // the addition in the check will wrap around properly in case there is
12315 // an overflow (which would not be the case if we do the check with
12316 // 64-bit arithmetic).
12317 const uint32_t TrueVal32 = CTVal->getZExtValue();
12318 const uint32_t FalseVal32 = CFVal->getZExtValue();
12319
12320 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
12321 Opcode = AArch64ISD::CSINC;
12322
12323 if (TrueVal32 > FalseVal32) {
12324 Swap = true;
12325 }
12326 }
12327 } else {
12328 // 64-bit check whether we can use CSINC.
12329 const uint64_t TrueVal64 = TrueVal;
12330 const uint64_t FalseVal64 = FalseVal;
12331
12332 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
12333 Opcode = AArch64ISD::CSINC;
12334
// Note: this ordering test uses the signed values, unlike the unsigned
// comparison in the 32-bit branch above.
12335 if (TrueVal > FalseVal) {
12336 Swap = true;
12337 }
12338 }
12339 }
12340
12341 // Swap TVal and FVal if necessary.
12342 if (Swap) {
12343 std::swap(TVal, FVal);
12344 std::swap(CTVal, CFVal);
12345 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12346 }
12347
12348 if (Opcode != AArch64ISD::CSEL) {
12349 // Drop FVal since we can get its value by simply inverting/negating
12350 // TVal.
12351 FVal = TVal;
12352 }
12353 }
12354
12355 // Avoid materializing a constant when possible by reusing a known value in
12356 // a register. However, don't perform this optimization if the known value
12357 // is one, zero or negative one in the case of a CSEL. We can always
12358 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
12359 // FVal, respectively.
12360 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
12361 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
12362 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
// NOTE(review): listing line 12363 is missing here; it presumably defines the
// AArch64CC value compared below -- confirm.
12364 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
12365 // "a != C ? x : a" to avoid materializing C.
// The constants are matched by node pointer (CTVal == RHSVal), not by value.
12366 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
12367 TVal = LHS;
12368 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
12369 FVal = LHS;
12370 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
12371 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
12372 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
12373 // avoid materializing C.
// NOTE(review): listing line 12374 is missing here; it presumably defines the
// AArch64CC value compared below -- confirm.
12375 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
12376 Opcode = AArch64ISD::CSINV;
12377 TVal = LHS;
12378 FVal = DAG.getConstant(0, DL, FVal.getValueType());
12379 }
12380 }
12381
// Generic integer path: emit the compare, then the chosen conditional-select
// opcode on (TVal, FVal, condition, flags).
12382 SDValue CCVal;
12383 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
12384 EVT VT = TVal.getValueType();
12385 return DAG.getNode(Opcode, DL, VT, TVal, FVal, CCVal, Cmp);
12386 }
12387
12388 // Now we know we're dealing with FP values.
12389 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
12390 LHS.getValueType() == MVT::f64);
12391 assert(LHS.getValueType() == RHS.getValueType());
12392 EVT VT = TVal.getValueType();
12393
12394 // If the purpose of the comparison is to select between all ones
12395 // or all zeros, try to use a vector comparison because the operands are
12396 // already stored in SIMD registers.
12397 if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
12398 switch (U->getOpcode()) {
12399 default:
12400 return false;
// NOTE(review): listing lines 12401-12402 are missing here; presumably further
// case labels that fall through to the DUP case -- confirm.
12403 case AArch64ISD::DUP:
12404 return true;
12405 }
12406 })) {
12407 bool NoNaNs = Flags.hasNoNaNs();
12408 SDValue VectorCmp =
12409 emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
// A null result means the mask form was not applicable; fall through to the
// scalar compare below.
12410 if (VectorCmp)
12411 return VectorCmp;
12412 }
12413
12414 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
12415
12416 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
12417 // clean. Some of them require two CSELs to implement.
12418 AArch64CC::CondCode CC1, CC2;
12419 changeFPCCToAArch64CC(CC, CC1, CC2);
12420
12421 if (Flags.hasNoSignedZeros()) {
12422 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
12423 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
12424 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
12425 if (RHSVal && RHSVal->isZero()) {
12426 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
12427 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
12428
// The type checks ensure LHS can stand in for the selected value directly.
12429 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
12430 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
12431 TVal = LHS;
12432 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
12433 CFVal && CFVal->isZero() &&
12434 FVal.getValueType() == LHS.getValueType())
12435 FVal = LHS;
12436 }
12437 }
12438
12439 // Emit first, and possibly only, CSEL.
12440 SDValue CC1Val = getCondCode(DAG, CC1);
12441 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
12442
12443 // If we need a second CSEL, emit it, using the output of the first as the
12444 // RHS. We're effectively OR'ing the two CC's together.
12445 if (CC2 != AArch64CC::AL) {
12446 SDValue CC2Val = getCondCode(DAG, CC2);
12447 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
12448 }
12449
12450 // Otherwise, return the output of the first CSEL.
12451 return CS1;
12452}
12453
/// Custom lowering for the vector splice nodes on scalable vectors.
///
/// Returns an AArch64ISD::SPLICE node for VECTOR_SPLICE_RIGHT when a predicate
/// pattern exists for the index, returns Op unchanged for VECTOR_SPLICE_LEFT
/// when the byte offset is below 256, and returns an empty SDValue otherwise
/// (including when operand 2 is not a constant).
12454SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
12455 SelectionDAG &DAG) const {
12456 EVT Ty = Op.getValueType();
// Only constant indices are handled.
12457 if (!isa<ConstantSDNode>(Op.getOperand(2)))
12458 return SDValue();
12459 auto Idx = Op.getConstantOperandAPInt(2);
12460 int64_t IdxVal = Idx.getSExtValue();
12461 assert(Ty.isScalableVector() &&
12462 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
12463
12464 // We can use the splice instruction for certain index values where we are
12465 // able to efficiently generate the correct predicate. The index will be
12466 // inverted and used directly as the input to the ptrue instruction, i.e.
12467 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
12468 // splice predicate. However, we can only do this if we can guarantee that
12469 // there are enough elements in the vector, hence we check the index <= min
12470 // number of elements.
// NOTE(review): the code below passes IdxVal to
// getSVEPredPatternFromNumElements unmodified; the "inverted" / "-IdxVal"
// wording in these comments may be stale -- confirm.
12471 std::optional<unsigned> PredPattern;
12472 if (Ty.isScalableVector() && Op.getOpcode() == ISD::VECTOR_SPLICE_RIGHT &&
12473 (PredPattern = getSVEPredPatternFromNumElements(IdxVal)) !=
12474 std::nullopt) {
12475 SDLoc DL(Op);
12476
12477 // Create a predicate where all but the last -IdxVal elements are false.
12478 EVT PredVT = Ty.changeVectorElementType(*DAG.getContext(), MVT::i1);
12479 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
12480 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
12481
12482 // Now splice the two inputs together using the predicate.
12483 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
12484 Op.getOperand(1));
12485 }
12486
12487 // We can select to an EXT instruction when indexing the first 256 bytes.
// NOTE(review): listing line 12488 is missing here; it presumably defines
// BlockSize (bits per element, given the "/ 8" below) -- confirm.
12489 if (Op.getOpcode() == ISD::VECTOR_SPLICE_LEFT &&
12490 (IdxVal * BlockSize / 8) < 256)
12491 return Op;
12492
// No custom lowering applies.
12493 return SDValue();
12494}
12495
/// SELECT_CC entry point taking the node itself: unpacks the operands, flags,
/// users and debug location, then forwards to the overload that takes them
/// individually.
12496SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
12497 SelectionDAG &DAG) const {
// Operand layout read here: 0 = LHS, 1 = RHS, 2 = true value,
// 3 = false value, 4 = condition code.
12498 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
12499 SDValue LHS = Op.getOperand(0);
12500 SDValue RHS = Op.getOperand(1);
12501 SDValue TVal = Op.getOperand(2);
12502 SDValue FVal = Op.getOperand(3);
12503 SDNodeFlags Flags = Op->getFlags();
12504 SDLoc DL(Op);
12505 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
12506}
12507
/// Lower a SELECT node (operand 0 = condition, 1 = true value, 2 = false
/// value).
///
/// Cases, in order: aarch64svcount results are selected as nxv16i1 and cast
/// back; scalable vectors and SVE-lowered fixed-length vectors become a
/// VSELECT on a splatted condition; a condition that is an overflow-intrinsic
/// result becomes a CSEL on the overflow flags; everything else is lowered
/// through LowerSELECT_CC.
12508SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
12509 SelectionDAG &DAG) const {
12510 SDValue CCVal = Op->getOperand(0);
12511 SDValue TVal = Op->getOperand(1);
12512 SDValue FVal = Op->getOperand(2);
12513 SDLoc DL(Op);
12514
12515 EVT Ty = Op.getValueType();
// svcount: bitcast both values to nxv16i1, select there, and bitcast back.
12516 if (Ty == MVT::aarch64svcount) {
12517 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
12518 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
12519 SDValue Sel =
12520 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
12521 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
12522 }
12523
// Scalable vectors: splat the scalar condition into an i1 predicate vector
// with the same element count and use VSELECT.
12524 if (Ty.isScalableVector()) {
12525 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
12526 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
12527 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12528 }
12529
12530 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
12531 // FIXME: Ideally this would be the same as above using i1 types, however
12532 // for the moment we can't deal with fixed i1 vector types properly, so
12533 // instead extend the predicate to a result type sized integer vector.
12534 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
12535 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
12536 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
12537 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
12538 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12539 }
12540
12541 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
12542 // instruction.
12543 if (ISD::isOverflowIntrOpRes(CCVal)) {
12544 // Only lower legal XALUO ops.
12545 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
12546 return SDValue();
12547
// NOTE(review): listing line 12548 is missing here; it presumably declares
// OFCC, which getAArch64XALUOOp fills in below -- confirm.
12549 SDValue Value, Overflow;
12550 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
// This local CCVal shadows the outer CCVal for the rest of this scope; it
// holds the condition-code operand for the CSEL.
12551 SDValue CCVal = getCondCode(DAG, OFCC);
12552
12553 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
12554 CCVal, Overflow);
12555 }
12556
12557 // Lower it the same way as we would lower a SELECT_CC node.
// A SETCC condition is unpacked into (LHS, RHS, CC); any other condition is
// treated as "CCVal != 0".
12558 ISD::CondCode CC;
12559 SDValue LHS, RHS;
12560 if (CCVal.getOpcode() == ISD::SETCC) {
12561 LHS = CCVal.getOperand(0);
12562 RHS = CCVal.getOperand(1);
12563 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
12564 } else {
12565 LHS = CCVal;
12566 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
12567 CC = ISD::SETNE;
12568 }
12569
12570 // If we are lowering an f16 or bf16 and we do not have fullf16, convert to
12571 // an f32 in order to use FCSELSrrr
// The values are placed in the hsub subregister of a poison f32, so this is
// a register-level widening rather than an FP conversion.
12572 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12573 SDValue Poison = DAG.getPOISON(MVT::f32);
12574 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32, Poison, TVal);
12575 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32, Poison, FVal);
12576 }
12577
12578 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
12579 Op->getFlags(), DL, DAG);
12580
// Undo the widening above by extracting the hsub subregister again.
12581 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12582 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
12583 }
12584
12585 return Res;
12586}
12587
/// Lower a JumpTable node to the address of the jump table, choosing between
/// getAddrLarge, getAddrTiny and getAddr.
12588SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
12589 SelectionDAG &DAG) const {
12590 // Jump table entries as PC relative offsets. No additional tweaking
12591 // is necessary here. Just get the address of the jump table.
12592 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12593
// NOTE(review): listing lines 12594-12595 are missing here; they presumably
// define CM and open the condition that the next line completes -- confirm.
12596 !Subtarget->isTargetMachO())
12597 return getAddrLarge(JT, DAG);
12598 if (CM == CodeModel::Tiny)
12599 return getAddrTiny(JT, DAG);
12600 return getAddr(JT, DAG);
12601}
12602
/// Lower a BR_JT node (operand 0 = chain, 1 = jump table, 2 = entry index).
///
/// In the hardened configuration this emits a BR_JumpTable machine node fed
/// by a copy of the entry into X16; otherwise it emits a JumpTableDest32
/// machine node followed by a BRIND.
12603SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
12604 SelectionDAG &DAG) const {
12605 // Jump table entries as PC relative offsets. No additional tweaking
12606 // is necessary here. Just get the address of the jump table.
12607 SDLoc DL(Op);
12608 SDValue JT = Op.getOperand(1);
12609 SDValue Entry = Op.getOperand(2);
12610 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
12611
// Record entry info for this table on the function info.
// NOTE(review): the literal 4 is presumably the entry size in bytes -- confirm
// against AArch64FunctionInfo::setJumpTableEntryInfo.
12612 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12613 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
12614
12615 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
12616 // sequence later, to guarantee the integrity of the intermediate values.
// NOTE(review): listing line 12617 is missing here; it presumably opens the
// check for the attribute named on the next line -- confirm.
12618 "aarch64-jump-table-hardening")) {
// NOTE(review): listing line 12619 is missing here; it presumably defines CM
// (the code model) used below -- confirm.
12620 if (Subtarget->isTargetMachO()) {
12621 if (CM != CodeModel::Small && CM != CodeModel::Large)
12622 report_fatal_error("Unsupported code-model for hardened jump-table");
12623 } else {
12624 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
12625 assert(Subtarget->isTargetELF() &&
12626 "jump table hardening only supported on MachO/ELF");
12627 if (CM != CodeModel::Small)
12628 report_fatal_error("Unsupported code-model for hardened jump-table");
12629 }
12630
// Pass the entry index in X16; the copy's chain and glue results are both
// operands of the BR_JumpTable node.
12631 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
12632 Entry, SDValue());
12633 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
12634 DAG.getTargetJumpTable(JTI, MVT::i32),
12635 X16Copy.getValue(0), X16Copy.getValue(1));
12636 return SDValue(B, 0);
12637 }
12638
// Non-hardened path: compute the destination, attach the jump-table debug
// info to the incoming chain, and branch indirectly.
12639 SDNode *Dest =
12640 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
12641 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
12642 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
12643 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
12644}
12645
/// Lower a BRIND node (operand 0 = chain, 1 = destination).
///
/// Returns an empty SDValue (no custom lowering) for jump-table branches and
/// when no block-address discriminator is enabled for the function; otherwise
/// emits an AArch64::BRA machine node.
12646SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
12647 SDValue Chain = Op.getOperand(0);
12648 SDValue Dest = Op.getOperand(1);
12649
12650 // BR_JT is lowered to BRIND, but the later lowering is specific to indirectbr
12651 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
12652 if (Dest->isMachineOpcode() &&
12653 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
12654 return SDValue();
12655
12656 const MachineFunction &MF = DAG.getMachineFunction();
12657 std::optional<uint16_t> BADisc =
12658 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(MF.getFunction());
12659 if (!BADisc)
12660 return SDValue();
12661
12662 SDLoc DL(Op);
12663
12664 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
// NOTE(review): listing line 12665 is missing here; it presumably defines the
// Key operand used below -- confirm.
// XZR is passed as the address-discriminator operand.
12666 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12667
12668 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
12669 {Dest, Key, Disc, AddrDisc, Chain});
12670 return SDValue(BrA, 0);
12671}
12672
/// Lower a ConstantPool node to an address, selecting the addressing helper
/// from the code model: large uses getGOT on MachO and getAddrLarge
/// otherwise, tiny uses getAddrTiny, and everything else uses getAddr.
12673SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
12674 SelectionDAG &DAG) const {
12675 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
// NOTE(review): listing line 12676 is missing here; it presumably defines CM
// (the code model) -- confirm.
12677 if (CM == CodeModel::Large) {
12678 // Use the GOT for the large code model on iOS.
12679 if (Subtarget->isTargetMachO()) {
12680 return getGOT(CP, DAG);
12681 }
// NOTE(review): listing line 12682 is missing here.
12683 return getAddrLarge(CP, DAG);
12684 } else if (CM == CodeModel::Tiny) {
12685 return getAddrTiny(CP, DAG);
12686 }
12687 return getAddr(CP, DAG);
12688}
12689
/// Lower a BlockAddress node.
///
/// When a block-address discriminator is enabled for the function that owns
/// the block, the address is produced by a MOVaddrPAC machine node and read
/// back from X16. Otherwise the address is formed via getAddrLarge,
/// getAddrTiny or getAddr depending on the code model.
12690SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
12691 SelectionDAG &DAG) const {
12692 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
12693 const BlockAddress *BA = BAN->getBlockAddress();
12694
12695 if (std::optional<uint16_t> BADisc =
12696 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
12697 *BA->getFunction())) {
12698 SDLoc DL(Op);
12699
12700 // This isn't cheap, but BRIND is rare.
12701 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
12702
12703 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12704
// NOTE(review): listing line 12705 is missing here; it presumably defines the
// Key operand used below -- confirm.
12706 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12707
// MOVaddrPAC yields a chain and glue; the result value itself is then copied
// out of X16 using both.
12708 SDNode *MOV =
12709 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
12710 {TargetBA, Key, AddrDisc, Disc});
12711 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
12712 SDValue(MOV, 1));
12713 }
12714
// NOTE(review): listing line 12715 is missing here; it presumably defines CM
// (the code model) -- confirm.
12716 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
// NOTE(review): listing line 12717 is missing here.
12718 return getAddrLarge(BAN, DAG);
12719 } else if (CM == CodeModel::Tiny) {
12720 return getAddrTiny(BAN, DAG);
12721 }
12722 return getAddr(BAN, DAG);
12723}
12724
/// Lower VASTART for Darwin: store the frame index of the variadic stack area
/// to the va_list address in operand 1 (operand 0 = chain, operand 2 = source
/// value used for the pointer info).
12725SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
12726 SelectionDAG &DAG) const {
12727 AArch64FunctionInfo *FuncInfo =
12728 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12729
12730 SDLoc DL(Op);
12731 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
// NOTE(review): listing line 12732 is missing here; it presumably supplies the
// value-type argument that closes the getFrameIndex call -- confirm.
// Convert the address to the in-memory pointer type before storing it.
12733 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
12734 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12735 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12736 MachinePointerInfo(SV));
12737}
12738
/// Lower VASTART for the Win64 calling convention: compute the address of the
/// varargs area and store it to the va_list address in operand 1.
///
/// On Arm64EC the address is X4 plus an offset; otherwise it is a frame
/// index (the GPR save area when it is non-empty, else the stack area).
12739SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
12740 SelectionDAG &DAG) const {
12741 MachineFunction &MF = DAG.getMachineFunction();
12742 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12743
12744 SDLoc DL(Op);
12745 SDValue FR;
12746 if (Subtarget->isWindowsArm64EC()) {
12747 // With the Arm64EC ABI, we compute the address of the varargs save area
12748 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
12749 // but calls from an entry thunk can pass in a different address.
12750 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
12751 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
// With a GPR save area the offset is the negated GPR size (a wrapped
// unsigned value, so the ADD below subtracts); otherwise it is the stack
// offset of the variadic arguments.
12752 uint64_t StackOffset;
12753 if (FuncInfo->getVarArgsGPRSize() > 0)
12754 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
12755 else
12756 StackOffset = FuncInfo->getVarArgsStackOffset();
12757 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
12758 DAG.getConstant(StackOffset, DL, MVT::i64));
12759 } else {
12760 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
12761 ? FuncInfo->getVarArgsGPRIndex()
12762 : FuncInfo->getVarArgsStackIndex(),
// NOTE(review): listing line 12763 is missing here; it presumably supplies the
// value-type argument that closes the getFrameIndex call -- confirm.
12764 }
12765 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12766 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12767 MachinePointerInfo(SV));
12768}
12769
/// Lower VASTART for the AAPCS va_list struct by storing its fields:
/// __stack, __gr_top, __vr_top, __gr_offs and __vr_offs.
///
/// Every store is chained on the incoming chain (operand 0) and the stores
/// are joined with a TokenFactor. __gr_top and __vr_top are only written
/// when the corresponding save-area size is positive.
12770SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
12771 SelectionDAG &DAG) const {
12772 // The layout of the va_list struct is specified in the AArch64 Procedure Call
12773 // Standard, section B.3.
12774 MachineFunction &MF = DAG.getMachineFunction();
12775 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12776 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12777 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12778 auto PtrVT = getPointerTy(DAG.getDataLayout());
12779 SDLoc DL(Op);
12780
12781 SDValue Chain = Op.getOperand(0);
12782 SDValue VAList = Op.getOperand(1);
12783 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
// NOTE(review): listing line 12784 is missing here; it presumably declares the
// MemOps container that collects the stores below -- confirm.
12785
12786 // void *__stack at offset 0
12787 unsigned Offset = 0;
12788 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
12789 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
12790 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
12791 MachinePointerInfo(SV), Align(PtrSize)));
12792
12793 // void *__gr_top at offset 8 (4 on ILP32)
12794 Offset += PtrSize;
12795 int GPRSize = FuncInfo->getVarArgsGPRSize();
12796 if (GPRSize > 0) {
12797 SDValue GRTop, GRTopAddr;
12798
12799 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12800 DAG.getConstant(Offset, DL, PtrVT));
12801
// The stored value is the GPR save area's frame address plus its size.
12802 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
12803 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
12804 DAG.getSignedConstant(GPRSize, DL, PtrVT));
12805 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
12806
12807 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
12808 MachinePointerInfo(SV, Offset),
12809 Align(PtrSize)));
12810 }
12811
12812 // void *__vr_top at offset 16 (8 on ILP32)
12813 Offset += PtrSize;
12814 int FPRSize = FuncInfo->getVarArgsFPRSize();
12815 if (FPRSize > 0) {
12816 SDValue VRTop, VRTopAddr;
12817 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12818 DAG.getConstant(Offset, DL, PtrVT));
12819
// The stored value is the FPR save area's frame address plus its size.
12820 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
12821 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
12822 DAG.getSignedConstant(FPRSize, DL, PtrVT));
12823 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
12824
12825 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
12826 MachinePointerInfo(SV, Offset),
12827 Align(PtrSize)));
12828 }
12829
12830 // int __gr_offs at offset 24 (12 on ILP32)
// Stored unconditionally as the negated GPR save-area size.
12831 Offset += PtrSize;
12832 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12833 DAG.getConstant(Offset, DL, PtrVT));
12834 MemOps.push_back(
12835 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
12836 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12837
12838 // int __vr_offs at offset 28 (16 on ILP32)
// Stored unconditionally as the negated FPR save-area size.
12839 Offset += 4;
12840 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12841 DAG.getConstant(Offset, DL, PtrVT));
12842 MemOps.push_back(
12843 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
12844 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12845
12846 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
12847}
12848
/// Dispatch VASTART lowering by ABI: the Win64 calling convention is checked
/// first, then Darwin targets, and everything else uses the AAPCS lowering.
12849SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
12850 SelectionDAG &DAG) const {
12851 MachineFunction &MF = DAG.getMachineFunction();
12852 Function &F = MF.getFunction();
12853
12854 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
12855 return LowerWin64_VASTART(Op, DAG);
12856 else if (Subtarget->isTargetDarwin())
12857 return LowerDarwin_VASTART(Op, DAG);
12858 else
12859 return LowerAAPCS_VASTART(Op, DAG);
12860}
12861
/// Lower VACOPY as a memcpy of the whole va_list object from operand 2 to
/// operand 1 (operand 0 = chain; operands 3 and 4 carry the destination and
/// source values used for pointer info).
12862SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
12863 SelectionDAG &DAG) const {
12864 // AAPCS has three pointers and two ints (= 32 bytes, or 20 bytes on ILP32);
12865 // Darwin and Windows have a single pointer.
12866 SDLoc DL(Op);
12867 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12868 unsigned VaListSize =
12869 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
12870 ? PtrSize
12871 : Subtarget->isTargetILP32() ? 20 : 32;
12872 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
12873 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12874
// The copy is pointer-size aligned.
12875 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
12876 DAG.getConstant(VaListSize, DL, MVT::i32),
12877 Align(PtrSize), false, false, /*CI=*/nullptr,
12878 std::nullopt, MachinePointerInfo(DestSV),
12879 MachinePointerInfo(SrcSV));
12880}
12881
/// Lower VAARG (Darwin only, per the assert): load the current va_list
/// pointer, align it if required, store back the pointer advanced past the
/// argument, and load the argument itself.
///
/// Operands read: 0 = chain, 1 = va_list address, 2 = source value,
/// 3 = alignment constant. Scalable vector types are rejected with a fatal
/// error.
12882SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
12883 assert(Subtarget->isTargetDarwin() &&
12884 "automatic va_arg instruction only works on Darwin");
12885
12886 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12887 EVT VT = Op.getValueType();
12888 SDLoc DL(Op);
12889 SDValue Chain = Op.getOperand(0);
12890 SDValue Addr = Op.getOperand(1);
// Note: this local is named Align and is a MaybeAlign, so it may be empty.
12891 MaybeAlign Align(Op.getConstantOperandVal(3));
12892 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
12893 auto PtrVT = getPointerTy(DAG.getDataLayout());
12894 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
// Load the current argument pointer and continue on the load's chain.
12895 SDValue VAList =
12896 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
12897 Chain = VAList.getValue(1);
12898 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
12899
12900 if (VT.isScalableVector())
12901 report_fatal_error("Passing SVE types to variadic functions is "
12902 "currently not supported");
12903
// Round the pointer up to the requested alignment: (p + A - 1) & -A. Only
// needed when the alignment exceeds the minimum slot size.
12904 if (Align && *Align > MinSlotSize) {
12905 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12906 DAG.getConstant(Align->value() - 1, DL, PtrVT));
12907 VAList =
12908 DAG.getNode(ISD::AND, DL, PtrVT, VAList,
12909 DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT));
12910 }
12911
12912 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
12913 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
12914
12915 // Scalar integer and FP values smaller than 64 bits are implicitly extended
12916 // up to 64 bits. At the very least, we have to increase the striding of the
12917 // vaargs list to match this, and for FP values we need to introduce
12918 // FP_ROUND nodes as well.
// In the code below, scalar integers are widened to at least MinSlotSize
// (4 on ILP32), while scalar FP types other than f64 always take 8 bytes.
12919 if (VT.isInteger() && !VT.isVector())
12920 ArgSize = std::max(ArgSize, MinSlotSize);
12921 bool NeedFPTrunc = false;
12922 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
12923 ArgSize = 8;
12924 NeedFPTrunc = true;
12925 }
12926
12927 // Increment the pointer, VAList, to the next vaarg
12928 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12929 DAG.getConstant(ArgSize, DL, PtrVT));
12930 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
12931
12932 // Store the incremented VAList to the legalized pointer
12933 SDValue APStore =
12934 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
12935
12936 // Load the actual argument out of the pointer VAList
// Both loads below are chained on APStore, i.e. ordered after the pointer
// update.
12937 if (NeedFPTrunc) {
12938 // Load the value as an f64.
12939 SDValue WideFP =
12940 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
12941 // Round the value down to the requested type VT.
12942 SDValue NarrowFP =
12943 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
12944 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
12945 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
12946 // Merge the rounded value with the chain output of the load.
12947 return DAG.getMergeValues(Ops, DL);
12948 }
12949
12950 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
12951}
12952
/// Lower FRAMEADDR: start from the frame pointer register and follow the
/// saved-frame-pointer chain Depth times (Depth is constant operand 0).
/// Also marks the frame address as taken in the MachineFrameInfo.
12953SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
12954 SelectionDAG &DAG) const {
12955 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12956 MFI.setFrameAddressIsTaken(true);
12957
12958 EVT VT = Op.getValueType();
12959 SDLoc DL(Op);
12960 unsigned Depth = Op.getConstantOperandVal(0);
12961 SDValue FrameAddr =
12962 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
// Each iteration loads a value of type VT from the current frame address,
// chained on the entry node.
12963 while (Depth--)
12964 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
12965 MachinePointerInfo());
12966
// On ILP32, annotate the i64 result as zero-extended from VT.
12967 if (Subtarget->isTargetILP32())
12968 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
12969 DAG.getValueType(VT));
12970
12971 return FrameAddr;
12972}
12973
/// Lower SPONENTRY by creating a fixed stack object and returning its frame
/// index as a pointer-typed value.
12974SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
12975 SelectionDAG &DAG) const {
12976 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12977
12978 EVT VT = getPointerTy(DAG.getDataLayout());
// NOTE(review): the arguments are presumably (size 4, offset 0, not
// immutable) -- confirm against MachineFrameInfo::CreateFixedObject.
12979 int FI = MFI.CreateFixedObject(4, 0, false);
12980 return DAG.getFrameIndex(FI, VT);
12981}
12982
12983#define GET_REGISTER_MATCHER
12984#include "AArch64GenAsmMatcher.inc"
12985
12986// FIXME? Maybe this could be a TableGen attribute on some registers and
12987// this table could be generated automatically from RegInfo.
//
// Returns the register selected for RegName, with extra filtering for the
// range X1..X28: a register in that range is kept only if the subtarget
// reports it as a reserved X register or the register info reports it as
// reserved for MF; otherwise the result is reset to a default Register().
// Registers outside that range are returned unchanged.
// NOTE(review): `Reg` is used below but its declaration (and the lookup from
// RegName) is not visible in this listing; the embedded line numbering skips
// one line right after the signature. Confirm against the original file.
12988Register AArch64TargetLowering::
12989getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
12991 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
12992 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
12993 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
// Not reserved by either query: reject the register.
12994 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
12995 !MRI->isReservedReg(MF, Reg))
12996 Reg = Register();
12997 }
12998 return Reg;
12999}
13000
// Returns the sum (ISD::ADD) of the FP register value, copied from the entry
// node, and `Offset`, both in the result type of Op.
// NOTE(review): `Offset` is not declared in the visible lines, and the
// embedded numbering skips two lines (one after the opening brace, one before
// the final return). Confirm the missing statements against the original file.
13001SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
13002 SelectionDAG &DAG) const {
13004
13005 EVT VT = Op.getValueType();
13006 SDLoc DL(Op);
13007
13008 SDValue FrameAddr =
13009 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
13011
13012 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
13013}
13014
// Produces the return address for the depth given by operand 0 of Op and
// flags the function's frame info as having its return address taken.
//  - Depth != 0: loads from (frame address + Offset), where the frame address
//    comes from LowerFRAMEADDR called on the same Op.
//  - Depth == 0: reads LR, which is registered as a function live-in.
// The value is then passed through XPACI (subtarget has PAuth) or XPACLRI
// (otherwise) and the resulting machine node is returned.
// NOTE(review): `Offset` is used in the Depth != 0 path but its declaration is
// not visible in this listing (the embedded numbering skips one line there).
13015SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
13016 SelectionDAG &DAG) const {
13017 MachineFunction &MF = DAG.getMachineFunction();
13018 MachineFrameInfo &MFI = MF.getFrameInfo();
13019 MFI.setReturnAddressIsTaken(true);
13020
13021 EVT VT = Op.getValueType();
13022 SDLoc DL(Op);
13023 unsigned Depth = Op.getConstantOperandVal(0);
13024 SDValue ReturnAddress;
13025 if (Depth) {
13026 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
13028 ReturnAddress = DAG.getLoad(
13029 VT, DL, DAG.getEntryNode(),
13030 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
13031 } else {
13032 // Return LR, which contains the return address. Mark it an implicit
13033 // live-in.
13034 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
13035 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
13036 }
13037
13038 // The XPACLRI instruction assembles to a hint-space instruction before
13039 // Armv8.3-A therefore this instruction can be safely used for any pre
13040 // Armv8.3-A architectures. On Armv8.3-A and onwards XPACI is available so use
13041 // that instead.
13042 SDNode *St;
13043 if (Subtarget->hasPAuth()) {
13044 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
13045 } else {
13046 // XPACLRI operates on LR therefore we must move the operand accordingly.
13047 SDValue Chain =
13048 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
13049 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
13050 }
13051 return SDValue(St, 0);
13052}
13053
13054/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which returns two
13055/// i32 values and take a 2 x i32 value to shift plus a shift amount.
13056SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
13057 SelectionDAG &DAG) const {
13058 SDValue Lo, Hi;
13059 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
13060 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
13061}
13062
// NOTE(review): the first line of this definition (return type and function
// name) is missing from this listing; only the parameter tail is visible.
// The body unconditionally returns false and never inspects GA.
13064 const GlobalAddressSDNode *GA) const {
13065 // Offsets are folded in the DAG combine rather than here so that we can
13066 // intelligently choose an offset based on the uses.
13067 return false;
13068}
13069
// Reports whether Imm of type VT passes the checks below:
//  - f64 / f32: the bit pattern has an FP64 / FP32 immediate encoding
//    (getFP64Imm / getFP32Imm returns something other than -1), or the value
//    is positive zero;
//  - f16 / bf16: the subtarget has full FP16 and the bit pattern has an FP16
//    immediate encoding, or the value is positive zero;
//  - any other type: false.
// NOTE(review): the first line of the signature (return type, name and the
// `Imm` parameter) is missing from this listing.
13071 EVT VT) const {
13072 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
13073 // 16-bit case when target has full fp16 support.
13074 // We encode bf16 bit patterns as if they were fp16. This results in very
13075 // strange looking assembly but should populate the register with appropriate
13076 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
13077 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
13078 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
13079 // FIXME: We should be able to handle f128 as well with a clever lowering.
13080 const APInt ImmInt = Imm.bitcastToAPInt();
13081
13082 if (VT == MVT::f64)
13083 return AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
13084
13085 if (VT == MVT::f32)
13086 return AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
13087
// Positive zero is accepted for f16/bf16 even without full FP16 support.
13088 if (VT == MVT::f16 || VT == MVT::bf16)
13089 return (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
13090 Imm.isPosZero();
13091
13092 return false;
13093}
13094
// Starts from isFPImmLegalAsFMov(Imm, VT). For f64/f32 values that fail that
// check, falls back to an instruction-count test: the immediate is legal when
// Insn.size() does not exceed Limit, where Limit is 1 when OptForSize is set,
// otherwise 4 with hasFuseLiterals() and 2 without. The verdict is also
// printed through LLVM_DEBUG.
// NOTE(review): the first signature line and the statements that declare and
// populate `Insn` are missing from this listing (the embedded numbering skips
// them). ImmInt is not referenced in the visible lines; presumably the missing
// statements consume it. Confirm against the original file.
13096 bool OptForSize) const {
13097 bool IsLegal = isFPImmLegalAsFMov(Imm, VT);
13098 const APInt ImmInt = Imm.bitcastToAPInt();
13099
13100 // If we can not materialize in immediate field for fmov, check if the
13101 // value can be encoded as the immediate operand of a logical instruction.
13102 // The immediate value will be created with either MOVZ, MOVN, or ORR.
13103 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
13104 // generate that fmov.
13105 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
13106 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
13107 // however the mov+fmov sequence is always better because of the reduced
13108 // cache pressure. The timings are still the same if you consider
13109 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
13110 // movw+movk is fused). So by default we limit up to 2 instructions
13111 // or 4 with hasFuseLiterals.
13114 assert(Insn.size() <= 4 &&
13115 "Should be able to build any value with at most 4 moves");
13116 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
13117 IsLegal = Insn.size() <= Limit;
13118 }
13119
13120 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
13121 << " imm value: "; Imm.dump(););
13122 return IsLegal;
13123}
13124
13125//===----------------------------------------------------------------------===//
13126// AArch64 Optimization Hooks
13127//===----------------------------------------------------------------------===//
13128
// Emits an `Opcode` node on Operand when the operand type is supported:
// f64, v1f64, v2f64, f32, v1f32, v2f32 or v4f32 with NEON, or nxv8f16, nxv4f32
// or nxv2f64 with SVE. Returns an empty SDValue for any other type/feature
// combination. ExtraSteps is an in/out parameter that may be overwritten with
// a refinement-step count derived from the precision of the type.
// NOTE(review): the line that opens the block around the ExtraSteps
// computation is missing from this listing (the embedded numbering skips it),
// so the condition under which ExtraSteps is overwritten cannot be confirmed
// here.
13129static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
13130 SDValue Operand, SelectionDAG &DAG,
13131 int &ExtraSteps) {
13132 EVT VT = Operand.getValueType();
13133 if ((ST->hasNEON() &&
13134 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
13135 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
13136 VT == MVT::v4f32)) ||
13137 (ST->hasSVE() &&
13138 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
13140 // For the reciprocal estimates, convergence is quadratic, so the number
13141 // of digits is doubled after each iteration. In ARMv8, the accuracy of
13142 // the initial estimate is 2^-8. Thus the number of extra steps to refine
13143 // the result for float (23 mantissa bits) is 2 and for double (52
13144 // mantissa bits) is 3.
13145 constexpr unsigned AccurateBits = 8;
13146 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
// Steps = ceil(log2(DesiredBits)) - ceil(log2(AccurateBits)), or 0 when the
// initial estimate already covers the desired precision.
13147 ExtraSteps = DesiredBits <= AccurateBits
13148 ? 0
13149 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
13150 }
13151
13152 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
13153 }
13154
13155 return SDValue();
13156}
13157
13158SDValue AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
13159 const DenormalMode &Mode,
13160 SDNodeFlags Flags) const {
13161 SDLoc DL(Op);
13162 EVT VT = Op.getValueType();
13163 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
13164 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
13165 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ, /*Chain=*/{},
13166 /*Signaling=*/false, Flags);
13167}
13168
13169SDValue
13170AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
13171 SelectionDAG &DAG) const {
13172 return Op;
13173}
13174
// Builds an estimate from an FRSQRTE node refined ExtraSteps times with
// FRSQRTS. When Reciprocal is false the refined value is additionally
// multiplied by Operand. On success ExtraSteps is reset to 0 and the estimate
// is returned; an empty SDValue is returned when the enabling condition fails
// or getEstimate produces no node. UseOneConst is not written in the visible
// lines.
// NOTE(review): the first half of the enabling condition and the initializer
// of `Flags` are missing from this listing (the embedded numbering skips
// them). Confirm against the original file.
13175SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
13176 SelectionDAG &DAG, int Enabled,
13177 int &ExtraSteps,
13178 bool &UseOneConst,
13179 bool Reciprocal) const {
13181 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
13182 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
13183 DAG, ExtraSteps)) {
13184 SDLoc DL(Operand);
13185 EVT VT = Operand.getValueType();
13186
13187 // Ensure nodes can be recognized by isAssociativeAndCommutative.
13188 SDNodeFlags Flags =
13190
13191 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
13192 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
13193 for (int i = ExtraSteps; i > 0; --i) {
13194 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
13195 Flags);
13196 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
13197 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
13198 }
// Non-reciprocal request: multiply the refined estimate by the operand.
13199 if (!Reciprocal)
13200 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
13201
// All refinement steps were emitted here, so none are left for the caller.
13202 ExtraSteps = 0;
13203 return Estimate;
13204 }
13205
13206 return SDValue();
13207}
13208
// Builds an estimate from an FRECPE node refined ExtraSteps times with
// FRECPS. On success ExtraSteps is reset to 0 and the estimate is returned;
// an empty SDValue is returned when getEstimate produces no node.
// NOTE(review): one line between the signature and the getEstimate call, and
// the declaration of `Flags`, are missing from this listing (the embedded
// numbering skips them); how `Enabled` is consulted cannot be confirmed here.
13209SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
13210 SelectionDAG &DAG, int Enabled,
13211 int &ExtraSteps) const {
13213 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
13214 DAG, ExtraSteps)) {
13215 SDLoc DL(Operand);
13216 EVT VT = Operand.getValueType();
13217
13219
13220 // Newton reciprocal iteration: E * (2 - X * E)
13221 // AArch64 reciprocal iteration instruction: (2 - M * N)
13222 for (int i = ExtraSteps; i > 0; --i) {
13223 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
13224 Estimate, Flags);
13225 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
13226 }
13227
// All refinement steps were emitted here, so none are left for the caller.
13228 ExtraSteps = 0;
13229 return Estimate;
13230 }
13231
13232 return SDValue();
13233}
13234
13235//===----------------------------------------------------------------------===//
13236// AArch64 Inline Assembly Support
13237//===----------------------------------------------------------------------===//
13238
13239// Table of Constraints
13240// TODO: This is the current set of constraints supported by ARM for the
13241// compiler, not all of them may make sense.
13242//
13243// r - A general register
13244// w - An FP/SIMD register of some size in the range v0-v31
13245// x - An FP/SIMD register of some size in the range v0-v15
13246// I - Constant that can be used with an ADD instruction
13247// J - Constant that can be used with a SUB instruction
13248// K - Constant that can be used with a 32-bit logical instruction
13249// L - Constant that can be used with a 64-bit logical instruction
13250// M - Constant that can be used as a 32-bit MOV immediate
13251// N - Constant that can be used as a 64-bit MOV immediate
13252// Q - A memory reference with base register and no offset
13253// S - A symbolic address
13254// Y - Floating point constant zero
13255// Z - Integer constant zero
13256//
13257// Note that general register operands will be output using their 64-bit x
13258// register name, whatever the size of the variable, unless the asm operand
13259// is prefixed by the %w modifier. Floating-point and SIMD register operands
13260// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
13261// %q modifier.
13262const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
13263 // At this point, we have to lower this constraint to something else, so we
13264 // lower it to an "r" or "w". However, by doing this we will force the result
13265 // to be in register, while the X constraint is much more permissive.
13266 //
13267 // Although we are correct (we are free to emit anything, without
13268 // constraints), we might break use cases that would expect us to be more
13269 // efficient and emit something else.
13270 if (!Subtarget->hasFPARMv8())
13271 return "r";
13272
13273 if (ConstraintVT.isFloatingPoint())
13274 return "w";
13275
13276 if (ConstraintVT.isVector() &&
13277 (ConstraintVT.getSizeInBits() == 64 ||
13278 ConstraintVT.getSizeInBits() == 128))
13279 return "w";
13280
13281 return "r";
13282}
13283
13285
13286// Returns a {Reg, RegisterClass} tuple if the constraint is
13287// a specific predicate register.
13288//
13289// For some constraint like "{pn3}" the default path in
13290// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
13291// suitable register class for this register is "PPRorPNR", after which it
13292// determines that nxv16i1 is an appropriate type for the constraint, which is
13293// not what we want. The code here pre-empts this by matching the register
13294// explicitly.
//
// Accepted forms, as implemented below: "{p<N>}" -> (P0 + N, PPR),
// "{pn<N>}" -> (PN0 + N, PNR) and "{z<N>}" -> (Z0 + N, ZPR), where N is a
// decimal number no larger than 31. Anything else yields std::nullopt.
// NOTE(review): the line carrying the function name and parameter list is
// missing from this listing; the body reads a parameter named `Constraint`.
// NOTE(review): the same 0..31 bound is applied to all three register kinds.
// Confirm that P0 + V and PN0 + V name valid registers for every V in range.
13295static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
13297 if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
13298 (Constraint[1] != 'p' && Constraint[1] != 'z'))
13299 return std::nullopt;
13300
13301 bool IsPredicate = Constraint[1] == 'p';
// Drop the leading "{p" / "{z" and the trailing "}".
13302 Constraint = Constraint.substr(2, Constraint.size() - 3);
13303 bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
13304 if (IsPredicateAsCount)
13305 Constraint = Constraint.drop_front(1);
13306
13307 unsigned V;
// getAsInteger returns true on failure, so a non-numeric suffix is rejected.
13308 if (Constraint.getAsInteger(10, V) || V > 31)
13309 return std::nullopt;
13310
13311 if (IsPredicateAsCount)
13312 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
13313 if (IsPredicate)
13314 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
13315 return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
13316}
13317
// NOTE(review): most of this definition is missing from this listing (the
// function name and parameters, the start of the switch expression and some
// of its cases). What remains shows a string-keyed mapping that produces
// PredicateConstraint::Uph for "Uph" and std::nullopt when nothing matches.
13318static std::optional<PredicateConstraint>
13321 .Case("Uph", PredicateConstraint::Uph)
13324 .Default(std::nullopt);
13325}
13326
// Selects a predicate register class for a constraint/type pair.
// Returns nullptr unless VT is aarch64svcount or a scalable vector whose
// element type is i1. For each constraint the PNR variant of the class is
// chosen when VT is aarch64svcount and the PPR variant otherwise.
// NOTE(review): the signature line and the `case` labels of the switch are
// missing from this listing, so which constraint value maps to which pair of
// classes cannot be confirmed here.
13327static const TargetRegisterClass *
13329 if (VT != MVT::aarch64svcount &&
13330 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
13331 return nullptr;
13332
13333 switch (Constraint) {
13335 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
13336 : &AArch64::PPR_p8to15RegClass;
13338 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
13339 : &AArch64::PPR_3bRegClass;
13341 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
13342 : &AArch64::PPRRegClass;
13343 }
13344
13345 llvm_unreachable("Missing PredicateConstraint!");
13346}
13347
13349
// NOTE(review): most of this definition is missing from this listing (the
// function name and parameters, the start of the switch expression and at
// least one case). What remains shows a string-keyed mapping that produces
// ReducedGprConstraint::Uci for "Uci" and std::nullopt when nothing matches.
13350static std::optional<ReducedGprConstraint>
13353 .Case("Uci", ReducedGprConstraint::Uci)
13355 .Default(std::nullopt);
13356}
13357
// Selects one of the two MatrixIndexGPR32 register classes (8_11 or 12_15)
// for a constraint/type pair. Returns nullptr unless VT is a scalar integer
// of at most 64 bits.
// NOTE(review): the signature line and the `case` labels of the switch are
// missing from this listing, so which constraint value selects which class
// cannot be confirmed here.
13358static const TargetRegisterClass *
13360 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
13361 return nullptr;
13362
13363 switch (Constraint) {
13365 return &AArch64::MatrixIndexGPR32_8_11RegClass;
13367 return &AArch64::MatrixIndexGPR32_12_15RegClass;
13368 }
13369
13370 llvm_unreachable("Missing ReducedGprConstraint!");
13371}
13372
13373// The set of cc code supported is from
13374// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
//
// Maps a "{@cc<cond>}" constraint string to an AArch64CC condition code and
// returns it. Two spellings are aliases in the table below: "cc" and "lo"
// both give LO, and "cs" and "hs" both give HS.
// NOTE(review): the function header, the start of the switch expression and
// its default case are missing from this listing. Callers elsewhere in this
// file compare the result against AArch64CC::Invalid, so that is presumably
// the default. Confirm against the original file.
13377 .Case("{@cchi}", AArch64CC::HI)
13378 .Case("{@cccs}", AArch64CC::HS)
13379 .Case("{@cclo}", AArch64CC::LO)
13380 .Case("{@ccls}", AArch64CC::LS)
13381 .Case("{@cccc}", AArch64CC::LO)
13382 .Case("{@cceq}", AArch64CC::EQ)
13383 .Case("{@ccgt}", AArch64CC::GT)
13384 .Case("{@ccge}", AArch64CC::GE)
13385 .Case("{@cclt}", AArch64CC::LT)
13386 .Case("{@ccle}", AArch64CC::LE)
13387 .Case("{@cchs}", AArch64CC::HS)
13388 .Case("{@ccne}", AArch64CC::NE)
13389 .Case("{@ccvc}", AArch64CC::VC)
13390 .Case("{@ccpl}", AArch64CC::PL)
13391 .Case("{@ccvs}", AArch64CC::VS)
13392 .Case("{@ccmi}", AArch64CC::MI)
13394 return Cond;
13395}
13396
13397/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
13398/// WZR, invert(<cond>)'.
///
/// The node built is an i32 AArch64ISD::CSINC whose two value operands are
/// the i32 constant 0, whose condition is the inverse of CC, and whose flags
/// operand is NZCV.
/// NOTE(review): the first line of the signature (return type, name and the
/// leading parameters) is missing from this listing; the body uses `CC`,
/// `NZCV` and `DL` in addition to `DAG`.
13400 SelectionDAG &DAG) {
13401 return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
13402 DAG.getConstant(0, DL, MVT::i32),
13403 DAG.getConstant(0, DL, MVT::i32),
13404 getCondCode(DAG, getInvertedCondCode(CC)), NZCV);
13405}
13406
13407// Lower @cc flag output via getSETCC.
//
// Returns an empty SDValue when the constraint code is not one recognized by
// parseConstraintCode. Otherwise the output type must be a scalar integer of
// at least 8 bits (a fatal error is reported if not). NZCV is then read with
// CopyFromReg, turned into an i32 value by getSETCC, and truncated (types up
// to 32 bits) or zero-extended (wider types) to the output type.
// Glue is overwritten with the CopyFromReg value in both paths; Chain is
// updated only when an incoming Glue node existed.
// NOTE(review): the declaration of `Result` is missing from this listing (the
// embedded numbering skips one line before the truncate/extend step).
13408SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
13409 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
13410 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
13411 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
13412 if (Cond == AArch64CC::Invalid)
13413 return SDValue();
13414 // The output variable should be a scalar integer.
13415 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
13416 OpInfo.ConstraintVT.getSizeInBits() < 8)
13417 report_fatal_error("Flag output operand is of invalid type");
13418
13419 // Get NZCV register. Only update chain when copyfrom is glued.
13420 if (Glue.getNode()) {
13421 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
13422 Chain = Glue.getValue(1);
13423 } else
13424 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
13425 // Extract CC code.
13426 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
13427
13429
13430 // Truncate or ZERO_EXTEND based on value types.
13431 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
13432 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
13433 else
13434 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
13435
13436 return Result;
13437}
13438
13439/// getConstraintType - Given a constraint letter, return the type of
13440/// constraint it is for this target.
///
/// Single-letter constraints are classified by the switch below. Longer
/// constraints are classified as C_RegisterClass when they parse as a
/// predicate or reduced-GPR constraint and as C_Other when they parse as a
/// flag-output code. Everything else, including single letters that hit the
/// default case, is delegated to TargetLowering::getConstraintType.
/// NOTE(review): the return-type line of this definition is missing from this
/// listing (the embedded numbering skips one line before the qualified name).
13442AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
13443 if (Constraint.size() == 1) {
13444 switch (Constraint[0]) {
13445 default:
13446 break;
13447 case 'x':
13448 case 'w':
13449 case 'y':
13450 return C_RegisterClass;
13451 // An address with a single base register. Due to the way we
13452 // currently handle addresses it is the same as 'r'.
13453 case 'Q':
13454 return C_Memory;
13455 case 'I':
13456 case 'J':
13457 case 'K':
13458 case 'L':
13459 case 'M':
13460 case 'N':
13461 case 'Y':
13462 case 'Z':
13463 return C_Immediate;
13464 case 'z':
13465 case 'S': // A symbol or label reference with a constant offset
13466 return C_Other;
13467 }
13468 } else if (parsePredicateConstraint(Constraint))
13469 return C_RegisterClass;
13470 else if (parseReducedGprConstraint(Constraint))
13471 return C_RegisterClass;
13472 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
13473 return C_Other;
13474 return TargetLowering::getConstraintType(Constraint);
13475}
13476
13477/// Examine constraint type and operand type and determine a weight value.
13478/// This object must already have been set up with the operand type
13479/// and the current alternative constraint selected.
///
/// Returns CW_Default when the operand has no value. Otherwise the first
/// character of the constraint decides: 'x'/'w'/'y' give CW_Register for
/// floating-point or vector operands, 'z' gives CW_Constant, and 'U' gives
/// CW_Register when the whole string parses as a predicate or reduced-GPR
/// constraint.
/// NOTE(review): three lines are missing from this listing (the embedded
/// numbering skips them): the return-type line, the declaration and initial
/// value of `weight`, and the statement in the `default:` case. The weight
/// returned when no branch assigns it therefore cannot be confirmed here.
13481AArch64TargetLowering::getSingleConstraintMatchWeight(
13482 AsmOperandInfo &info, const char *constraint) const {
13484 Value *CallOperandVal = info.CallOperandVal;
13485 // If we don't have a value, we can't do a match,
13486 // but allow it at the lowest weight.
13487 if (!CallOperandVal)
13488 return CW_Default;
13489 Type *type = CallOperandVal->getType();
13490 // Look at the constraint type.
13491 switch (*constraint) {
13492 default:
13494 break;
13495 case 'x':
13496 case 'w':
13497 case 'y':
13498 if (type->isFloatingPointTy() || type->isVectorTy())
13499 weight = CW_Register;
13500 break;
13501 case 'z':
13502 weight = CW_Constant;
13503 break;
13504 case 'U':
13505 if (parsePredicateConstraint(constraint) ||
13506 parseReducedGprConstraint(constraint))
13507 weight = CW_Register;
13508 break;
13509 }
13510 return weight;
13511}
13512
// Maps an inline-asm constraint and value type to a (register, register
// class) pair. A first element of 0 means "any register of the class"; a
// null class means no match. The lookup proceeds in this order:
//  1. single-letter constraints 'r', 'w', 'x' and 'y';
//  2. multi-character constraints: explicit SVE registers, then predicate
//     constraints, then reduced-GPR constraints;
//  3. the fixed names "{cc}" (case-insensitive), "{za}" and "{zt0}";
//  4. the generic result held in `Res`, with a fallback that resolves
//     "{vN}" names (N in 0..31) to the FPR class matching the size of VT;
//  5. a final filter that rejects any class outside GPR32all/GPR64all when
//     the subtarget has no FP support.
// NOTE(review): two lines are missing from this listing (the embedded
// numbering skips them): the second operand of the "{cc}" condition and the
// statement that assigns `Res` from the default implementation.
13513std::pair<unsigned, const TargetRegisterClass *>
13514AArch64TargetLowering::getRegForInlineAsmConstraint(
13515 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
13516 if (Constraint.size() == 1) {
13517 switch (Constraint[0]) {
13518 case 'r':
13519 if (VT.isScalableVector())
13520 return std::make_pair(0U, nullptr);
13521 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
13522 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
13523 if (VT.getFixedSizeInBits() == 64)
13524 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
13525 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
13526 case 'w': {
13527 if (!Subtarget->hasFPARMv8())
13528 break;
13529 if (VT.isScalableVector()) {
13530 if (VT.getVectorElementType() != MVT::i1)
13531 return std::make_pair(0U, &AArch64::ZPRRegClass);
13532 return std::make_pair(0U, nullptr);
13533 }
13534 if (VT == MVT::Other)
13535 break;
13536 uint64_t VTSize = VT.getFixedSizeInBits();
13537 if (VTSize == 16)
13538 return std::make_pair(0U, &AArch64::FPR16RegClass);
13539 if (VTSize == 32)
13540 return std::make_pair(0U, &AArch64::FPR32RegClass);
13541 if (VTSize == 64)
13542 return std::make_pair(0U, &AArch64::FPR64RegClass);
13543 if (VTSize == 128)
13544 return std::make_pair(0U, &AArch64::FPR128RegClass);
13545 break;
13546 }
13547 // The instructions that this constraint is designed for can
13548 // only take 128-bit registers so just use that regclass.
13549 case 'x':
13550 if (!Subtarget->hasFPARMv8())
13551 break;
13552 if (VT.isScalableVector())
13553 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
13554 if (VT.getSizeInBits() == 128)
13555 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
13556 break;
13557 case 'y':
13558 if (!Subtarget->hasFPARMv8())
13559 break;
13560 if (VT.isScalableVector())
13561 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
13562 break;
13563 }
13564 } else {
13565 if (const auto P = parseSVERegAsConstraint(Constraint)) {
13566 // SME functions that are not in streaming mode, should
13567 // still observe clobbers of Z-registers by clobbering
13568 // the lower 128bits of those registers.
13569 if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
13570 !Subtarget->isSVEorStreamingSVEAvailable())
13571 return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
13572 &AArch64::FPR128RegClass);
13573 return *P;
13574 }
13575 if (const auto PC = parsePredicateConstraint(Constraint))
13576 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
13577 return std::make_pair(0U, RegClass);
13578
13579 if (const auto RGC = parseReducedGprConstraint(Constraint))
13580 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
13581 return std::make_pair(0U, RegClass);
13582 }
13583 if (StringRef("{cc}").equals_insensitive(Constraint) ||
13585 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
13586
13587 if (Constraint == "{za}") {
13588 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
13589 }
13590
13591 if (Constraint == "{zt0}") {
13592 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
13593 }
13594
13595 // Use the default implementation in TargetLowering to convert the register
13596 // constraint into a member of a register class.
13597 std::pair<unsigned, const TargetRegisterClass *> Res;
13599
13600 // Not found as a standard register?
13601 if (!Res.second) {
13602 unsigned Size = Constraint.size();
// Only "{vN}" and "{vNN}" (any case for the 'v') are considered here.
13603 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
13604 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
13605 int RegNo;
13606 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
13607 if (!Failed && RegNo >= 0 && RegNo <= 31) {
13608 // v0 - v31 are aliases of q0/d0/s0/h0 - ...31 depending on size.
13609 // By default we'll emit v0-v31 for this unless there's a modifier where
13610 // we'll emit the correct register as well.
13611 if (VT != MVT::Other) {
13612 switch (VT.getSizeInBits()) {
13613 case 16:
13614 Res.first = AArch64::FPR16RegClass.getRegister(RegNo);
13615 Res.second = &AArch64::FPR16RegClass;
13616 break;
13617 case 32:
13618 Res.first = AArch64::FPR32RegClass.getRegister(RegNo);
13619 Res.second = &AArch64::FPR32RegClass;
13620 break;
13621 case 64:
13622 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
13623 Res.second = &AArch64::FPR64RegClass;
13624 break;
13625 case 128:
13626 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
13627 Res.second = &AArch64::FPR128RegClass;
13628 break;
13629 default:
13630 return std::make_pair(0U, nullptr);
13631 }
13632 } else {
13633 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
13634 Res.second = &AArch64::FPR128RegClass;
13635 }
13636 }
13637 }
13638 }
13639
// Without FP support, only classes contained in GPR32all/GPR64all survive.
13640 if (Res.second && !Subtarget->hasFPARMv8() &&
13641 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
13642 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
13643 return std::make_pair(0U, nullptr);
13644
13645 return Res;
13646}
13647
// Returns MVT::i64x8 for a 512-bit integer type when the subtarget has LS64;
// every other case is delegated to TargetLowering::getAsmOperandValueType.
// NOTE(review): the first line of the signature (return type, name and the
// `DL` parameter) is missing from this listing.
13649 llvm::Type *Ty,
13650 bool AllowUnknown) const {
13651 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
13652 return EVT(MVT::i64x8);
13653
13654 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
13655}
13656
13657/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
13658/// vector. If it is invalid, don't add anything to Ops.
///
/// Only single-letter constraints are handled. 'z' accepts a null constant
/// and yields the XZR (i64 operand) or WZR (otherwise) register node. The
/// letters I/J/K/L/M/N validate a constant operand against the per-letter
/// rule below and yield an i64 target constant. When no result was produced
/// and the function has not already returned, the request is forwarded to
/// TargetLowering::LowerAsmOperandForConstraint.
/// NOTE(review): two lines are missing from this listing (the embedded
/// numbering skips them): the statement in the 'S' case before its `break`,
/// and the declaration of `C` (together with the opening of its scope) at the
/// start of the immediate cases.
13659void AArch64TargetLowering::LowerAsmOperandForConstraint(
13660 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
13661 SelectionDAG &DAG) const {
13662 SDValue Result;
13663
13664 // Currently only support length 1 constraints.
13665 if (Constraint.size() != 1)
13666 return;
13667
13668 char ConstraintLetter = Constraint[0];
13669 switch (ConstraintLetter) {
13670 default:
13671 break;
13672
13673 // This set of constraints deal with valid constants for various instructions.
13674 // Validate and return a target constant for them if we can.
13675 case 'z': {
13676 // 'z' maps to xzr or wzr so it needs an input of 0.
13677 if (!isNullConstant(Op))
13678 return;
13679
13680 if (Op.getValueType() == MVT::i64)
13681 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
13682 else
13683 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
13684 break;
13685 }
13686 case 'S':
13687 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
13688 // supported for PIC while "s" isn't, making "s" less useful. We implement
13689 // "S" but not "s".
13691 break;
13692
13693 case 'I':
13694 case 'J':
13695 case 'K':
13696 case 'L':
13697 case 'M':
13698 case 'N':
13700 if (!C)
13701 return;
13702
13703 // Grab the value and do some validation.
13704 uint64_t CVal = C->getZExtValue();
13705 switch (ConstraintLetter) {
13706 // The I constraint applies only to simple ADD or SUB immediate operands:
13707 // i.e. 0 to 4095 with optional shift by 12
13708 // The J constraint applies only to ADD or SUB immediates that would be
13709 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
13710 // instruction [or vice versa], in other words -1 to -4095 with optional
13711 // left shift by 12.
13712 case 'I':
13713 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
13714 break;
13715 return;
13716 case 'J': {
// Validate the negated value, but emit the original sign-extended one.
13717 uint64_t NVal = -C->getSExtValue();
13718 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
13719 CVal = C->getSExtValue();
13720 break;
13721 }
13722 return;
13723 }
13724 // The K and L constraints apply *only* to logical immediates, including
13725 // what used to be the MOVI alias for ORR (though the MOVI alias has now
13726 // been removed and MOV should be used). So these constraints have to
13727 // distinguish between bit patterns that are valid 32-bit or 64-bit
13728 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
13729 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
13730 // versa.
13731 case 'K':
13732 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13733 break;
13734 return;
13735 case 'L':
13736 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13737 break;
13738 return;
13739 // The M and N constraints are a superset of K and L respectively, for use
13740 // with the MOV (immediate) alias. As well as the logical immediates they
13741 // also match 32 or 64-bit immediates that can be loaded either using a
13742 // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
13743 // (M) or 64-bit 0x1234000000000000 (N) etc.
13744 // As a note some of this code is liberally stolen from the asm parser.
13745 case 'M': {
13746 if (!isUInt<32>(CVal))
13747 return;
13748 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13749 break;
// Value occupies a single 16-bit chunk of the 32-bit word.
13750 if ((CVal & 0xFFFF) == CVal)
13751 break;
13752 if ((CVal & 0xFFFF0000ULL) == CVal)
13753 break;
// Same single-chunk test on the 32-bit complement of the value.
13754 uint64_t NCVal = ~(uint32_t)CVal;
13755 if ((NCVal & 0xFFFFULL) == NCVal)
13756 break;
13757 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13758 break;
13759 return;
13760 }
13761 case 'N': {
13762 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13763 break;
// Value occupies a single 16-bit chunk of the 64-bit word.
13764 if ((CVal & 0xFFFFULL) == CVal)
13765 break;
13766 if ((CVal & 0xFFFF0000ULL) == CVal)
13767 break;
13768 if ((CVal & 0xFFFF00000000ULL) == CVal)
13769 break;
13770 if ((CVal & 0xFFFF000000000000ULL) == CVal)
13771 break;
// Same single-chunk test on the 64-bit complement of the value.
13772 uint64_t NCVal = ~CVal;
13773 if ((NCVal & 0xFFFFULL) == NCVal)
13774 break;
13775 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13776 break;
13777 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
13778 break;
13779 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
13780 break;
13781 return;
13782 }
13783 default:
13784 return;
13785 }
13786
13787 // All assembler immediates are 64-bit integers.
13788 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
13789 break;
13790 }
13791
13792 if (Result.getNode()) {
13793 Ops.push_back(Result);
13794 return;
13795 }
13796
13797 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
13798}
13799
13800//===----------------------------------------------------------------------===//
13801// AArch64 Advanced SIMD Support
13802//===----------------------------------------------------------------------===//
13803
13804/// WidenVector - Given a value in the V64 register class, produce the
13805/// equivalent value in the V128 register class.
///
/// The wide type keeps the element type and doubles the element count. The
/// input is inserted at index 0 of a poison vector of the wide type, so the
/// upper half of the result is poison.
/// NOTE(review): the signature line is missing from this listing; the body
/// uses parameters named `V64Reg` and `DAG`.
13807 EVT VT = V64Reg.getValueType();
13808 unsigned NarrowSize = VT.getVectorNumElements();
13809 MVT EltTy = VT.getVectorElementType().getSimpleVT();
13810 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
13811 SDLoc DL(V64Reg);
13812
13813 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getPOISON(WideTy),
13814 V64Reg, DAG.getConstant(0, DL, MVT::i64));
13815}
13816
13817/// getExtFactor - Determine the adjustment factor for the position when
13818/// generating an "extract from vector registers" instruction.
13819static unsigned getExtFactor(SDValue &V) {
13820 EVT EltType = V.getValueType().getVectorElementType();
13821 return EltType.getSizeInBits() / 8;
13822}
13823
13824// Check if a vector is built from one vector via extracted elements of
13825// another together with an AND mask, ensuring that all elements fit
13826// within range. This can be reconstructed using AND and NEON's TBL1.
//
// Returns an empty SDValue whenever the BUILD_VECTOR does not match the
// pattern. Requirements checked below: the result type is v16i8 or v8i8;
// every operand is an EXTRACT_VECTOR_ELT from one common source vector; the
// index of operand i is itself extracted from lane i of one common mask
// vector of the same type as the result, optionally through an ANY_EXTEND;
// and either every index or none is masked with a constant AND.
// NOTE(review): the signature line and the first argument line of the final
// getNode call (opcode, location and result type) are missing from this
// listing. The body uses parameters named `Op` and `DAG`.
13828 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13829 SDLoc DL(Op);
13830 EVT VT = Op.getValueType();
13831 assert(!VT.isScalableVector() &&
13832 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13833
13834 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
13835 // directly to TBL1.
13836 if (VT != MVT::v16i8 && VT != MVT::v8i8)
13837 return SDValue();
13838
13839 unsigned NumElts = VT.getVectorNumElements();
13840 assert((NumElts == 8 || NumElts == 16) &&
13841 "Need to have exactly 8 or 16 elements in vector.");
13842
13843 SDValue SourceVec;
13844 SDValue MaskSourceVec;
13845 SmallVector<SDValue, 16> AndMaskConstants;
13846
13847 for (unsigned i = 0; i < NumElts; ++i) {
13848 SDValue V = Op.getOperand(i);
13849 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13850 return SDValue();
13851
// Every element must be extracted from the same source vector.
13852 SDValue OperandSourceVec = V.getOperand(0);
13853 if (!SourceVec)
13854 SourceVec = OperandSourceVec;
13855 else if (SourceVec != OperandSourceVec)
13856 return SDValue();
13857
13858 // This only looks at shuffles with elements that are
13859 // a) truncated by a constant AND mask extracted from a mask vector, or
13860 // b) extracted directly from a mask vector.
13861 SDValue MaskSource = V.getOperand(1);
13862 if (MaskSource.getOpcode() == ISD::AND) {
13863 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
13864 return SDValue();
13865
13866 AndMaskConstants.push_back(MaskSource.getOperand(1));
13867 MaskSource = MaskSource->getOperand(0);
13868 } else if (!AndMaskConstants.empty()) {
13869 // Either all or no operands should have an AND mask.
13870 return SDValue();
13871 }
13872
13873 // An ANY_EXTEND may be inserted between the AND and the source vector
13874 // extraction. We don't care about that, so we can just skip it.
13875 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
13876 MaskSource = MaskSource.getOperand(0);
13877
13878 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13879 return SDValue();
13880
// The index for output lane i must come from lane i of the mask vector.
13881 SDValue MaskIdx = MaskSource.getOperand(1);
13882 if (!isa<ConstantSDNode>(MaskIdx) ||
13883 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
13884 return SDValue();
13885
13886 // We only apply this if all elements come from the same vector with the
13887 // same vector type.
13888 if (!MaskSourceVec) {
13889 MaskSourceVec = MaskSource->getOperand(0);
13890 if (MaskSourceVec.getValueType() != VT)
13891 return SDValue();
13892 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
13893 return SDValue();
13894 }
13895 }
13896
13897 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
13898 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
13899 // insert, we know that the index in the mask must be smaller than the number
13900 // of elements in the source, or we would have an out-of-bounds access.
13901 if (NumElts == 8)
13902 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, SourceVec,
13903 DAG.getPOISON(VT));
13904
13905 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
13906 if (!AndMaskConstants.empty())
13907 MaskSourceVec = DAG.getNode(ISD::AND, DL, VT, MaskSourceVec,
13908 DAG.getBuildVector(VT, DL, AndMaskConstants));
13909
13910 return DAG.getNode(
13912 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
13913 SourceVec, MaskSourceVec);
13914}
13915
13916// Gather data to see if the operation can be modelled as a
13917// shuffle in combination with VEXTs.
// Returns an empty SDValue when no such lowering is found.
// NOTE(review): the first line of the declaration and several local
// declarations (the containers named `Sources`, `Mask`, `TBLMask` and
// `ShuffleOps`, and the head of one getNode call) are absent from this
// listing.
13919 SelectionDAG &DAG) const {
13920 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13921 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
13922 SDLoc DL(Op);
13923 EVT VT = Op.getValueType();
13924 assert(!VT.isScalableVector() &&
13925 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13926 unsigned NumElts = VT.getVectorNumElements();
13927
13928 struct ShuffleSourceInfo {
// The vector that BUILD_VECTOR operands are extracted from, and the smallest
// and largest lane index extracted from it.
13929 SDValue Vec;
13930 unsigned MinElt;
13931 unsigned MaxElt;
13932
13933 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
13934 // be compatible with the shuffle we intend to construct. As a result
13935 // ShuffleVec will be some sliding window into the original Vec.
13936 SDValue ShuffleVec;
13937
13938 // Code should guarantee that element i in Vec starts at element "WindowBase
13939 // + i * WindowScale in ShuffleVec".
13940 int WindowBase;
13941 int WindowScale;
13942
13943 ShuffleSourceInfo(SDValue Vec)
13944 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
13945 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
13946
// Lets find() locate an entry by its original source vector.
13947 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
13948 };
13949
13950 // First gather all vectors used as an immediate source for this BUILD_VECTOR
13951 // node.
13953 for (unsigned i = 0; i < NumElts; ++i) {
13954 SDValue V = Op.getOperand(i);
13955 if (V.isUndef())
13956 continue;
13957 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13958 !isa<ConstantSDNode>(V.getOperand(1)) ||
13959 V.getOperand(0).getValueType().isScalableVector()) {
13960 LLVM_DEBUG(
13961 dbgs() << "Reshuffle failed: "
13962 "a shuffle can only come from building a vector from "
13963 "various elements of other fixed-width vectors, provided "
13964 "their indices are constant\n");
13965 return SDValue();
13966 }
13967
13968 // Add this element source to the list if it's not already there.
13969 SDValue SourceVec = V.getOperand(0);
13970 auto Source = find(Sources, SourceVec);
13971 if (Source == Sources.end())
13972 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
13973
13974 // Update the minimum and maximum lane number seen.
13975 unsigned EltNo = V.getConstantOperandVal(1);
13976 Source->MinElt = std::min(Source->MinElt, EltNo);
13977 Source->MaxElt = std::max(Source->MaxElt, EltNo);
13978 }
13979
13980 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
13981 // better than moving to/from gpr registers for larger vectors.
13982 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
13983 // Construct a mask for the tbl. We may need to adjust the index for types
13984 // larger than i8.
13986 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
13987 for (unsigned I = 0; I < NumElts; ++I) {
13988 SDValue V = Op.getOperand(I);
13989 if (V.isUndef()) {
13990 for (unsigned OF = 0; OF < OutputFactor; OF++)
13991 Mask.push_back(-1);
13992 continue;
13993 }
13994 // Set the Mask lanes adjusted for the size of the input and output
13995 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
13996 // output element, adjusted in their positions per input and output types.
13997 unsigned Lane = V.getConstantOperandVal(1);
13998 for (unsigned S = 0; S < Sources.size(); S++) {
13999 if (V.getOperand(0) == Sources[S].Vec) {
// Each source is passed to the TBL as a v16i8 (see below), so source S starts
// at byte 16 * S of the table.
14000 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
14001 unsigned InputBase = 16 * S + Lane * InputSize / 8;
14002 for (unsigned OF = 0; OF < OutputFactor; OF++)
14003 Mask.push_back(InputBase + OF);
14004 break;
14005 }
14006 }
14007 }
14008
14009 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
14010 // v16i8, and the TBLMask
14011 SmallVector<SDValue, 16> TBLOperands;
14012 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
14013 ? Intrinsic::aarch64_neon_tbl3
14014 : Intrinsic::aarch64_neon_tbl4,
14015 DL, MVT::i32));
14016 for (unsigned i = 0; i < Sources.size(); i++) {
14017 SDValue Src = Sources[i].Vec;
14018 EVT SrcVT = Src.getValueType();
14019 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
14020 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
14021 "Expected a legally typed vector");
// 64-bit sources are widened to v16i8 with a poison upper half.
14022 if (SrcVT.is64BitVector())
14023 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Src,
14024 DAG.getPOISON(MVT::v8i8));
14025 TBLOperands.push_back(Src);
14026 }
14027
14029 for (unsigned i = 0; i < Mask.size(); i++)
14030 TBLMask.push_back(DAG.getConstant(Mask[i], DL, MVT::i32));
14031 assert((Mask.size() == 8 || Mask.size() == 16) &&
14032 "Expected a v8i8 or v16i8 Mask");
14033 TBLOperands.push_back(DAG.getBuildVector(
14034 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, TBLMask));
14035
14036 SDValue Shuffle =
14038 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
14039 return DAG.getBitcast(VT, Shuffle);
14040 }
14041
14042 if (Sources.size() > 2) {
14043 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
14044 << "sensible when at most two source vectors are "
14045 << "involved\n");
14046 return SDValue();
14047 }
14048
14049 // Find out the smallest element size among result and two sources, and use
14050 // it as element size to build the shuffle_vector.
14051 EVT SmallestEltTy = VT.getVectorElementType();
14052 for (auto &Source : Sources) {
14053 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
14054 if (SrcEltTy.bitsLT(SmallestEltTy)) {
14055 SmallestEltTy = SrcEltTy;
14056 }
14057 }
// From here on NumElts is the lane count of the shuffle type (ShuffleVT), not
// of the original BUILD_VECTOR type.
14058 unsigned ResMultiplier =
14059 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
14060 uint64_t VTSize = VT.getFixedSizeInBits();
14061 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
14062 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
14063
14064 // If the source vector is too wide or too narrow, we may nevertheless be able
14065 // to construct a compatible shuffle either by concatenating it with UNDEF or
14066 // extracting a suitable range of elements.
14067 for (auto &Src : Sources) {
14068 EVT SrcVT = Src.ShuffleVec.getValueType();
14069
14070 TypeSize SrcVTSize = SrcVT.getSizeInBits();
14071 if (SrcVTSize == TypeSize::getFixed(VTSize))
14072 continue;
14073
14074 // This stage of the search produces a source with the same element type as
14075 // the original, but with a total width matching the BUILD_VECTOR output.
14076 EVT EltVT = SrcVT.getVectorElementType();
14077 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
14078 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
14079
14080 if (SrcVTSize.getFixedValue() < VTSize) {
14081 assert(2 * SrcVTSize == VTSize);
14082 // We can pad out the smaller vector for free, so if it's part of a
14083 // shuffle...
14084 Src.ShuffleVec =
14085 DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Src.ShuffleVec,
14086 DAG.getPOISON(Src.ShuffleVec.getValueType()));
14087 continue;
14088 }
14089
14090 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
14091 LLVM_DEBUG(
14092 dbgs() << "Reshuffle failed: result vector too small to extract\n");
14093 return SDValue();
14094 }
14095
14096 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
14097 LLVM_DEBUG(
14098 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
14099 return SDValue();
14100 }
14101
14102 if (Src.MinElt >= NumSrcElts) {
14103 // The extraction can just take the second half
14104 Src.ShuffleVec =
14105 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14106 DAG.getConstant(NumSrcElts, DL, MVT::i64));
14107 Src.WindowBase = -NumSrcElts;
14108 } else if (Src.MaxElt < NumSrcElts) {
14109 // The extraction can just take the first half
14110 Src.ShuffleVec =
14111 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14112 DAG.getConstant(0, DL, MVT::i64));
14113 } else {
14114 // An actual VEXT is needed
14115 SDValue VEXTSrc1 =
14116 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14117 DAG.getConstant(0, DL, MVT::i64));
14118 SDValue VEXTSrc2 =
14119 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14120 DAG.getConstant(NumSrcElts, DL, MVT::i64));
14121 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
14122
14123 if (!SrcVT.is64BitVector()) {
14124 LLVM_DEBUG(
14125 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
14126 "for SVE vectors.");
14127 return SDValue();
14128 }
14129
14130 Src.ShuffleVec =
14131 DAG.getNode(AArch64ISD::EXT, DL, DestVT, VEXTSrc1, VEXTSrc2,
14132 DAG.getConstant(Imm, DL, MVT::i32));
14133 Src.WindowBase = -Src.MinElt;
14134 }
14135 }
14136
14137 // Another possible incompatibility occurs from the vector element types. We
14138 // can fix this by bitcasting the source vectors to the same type we intend
14139 // for the shuffle.
14140 for (auto &Src : Sources) {
14141 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
14142 if (SrcEltTy == SmallestEltTy)
14143 continue;
14144 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
14145 if (DAG.getDataLayout().isBigEndian()) {
14146 Src.ShuffleVec =
14147 DAG.getNode(AArch64ISD::NVCAST, DL, ShuffleVT, Src.ShuffleVec);
14148 } else {
14149 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Src.ShuffleVec);
14150 }
// One original element now spans WindowScale shuffle lanes, so the window
// offset is rescaled to shuffle lanes as well.
14151 Src.WindowScale =
14152 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
14153 Src.WindowBase *= Src.WindowScale;
14154 }
14155
14156 // Final check before we try to actually produce a shuffle.
14157 LLVM_DEBUG({
14158 for (auto Src : Sources)
14159 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
14160 });
14161
14162 // The stars all align, our next step is to produce the mask for the shuffle.
14163 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
14164 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
14165 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
14166 SDValue Entry = Op.getOperand(i);
14167 if (Entry.isUndef())
14168 continue;
14169
14170 auto Src = find(Sources, Entry.getOperand(0));
14171 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
14172
14173 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
14174 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
14175 // segment.
14176 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
14177 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
14178 VT.getScalarSizeInBits());
14179 int LanesDefined = BitsDefined / BitsPerShuffleLane;
14180
14181 // This source is expected to fill ResMultiplier lanes of the final shuffle,
14182 // starting at the appropriate offset.
14183 int *LaneMask = &Mask[i * ResMultiplier];
14184
// Lanes taken from the second source are offset by NumElts, the lane count of
// ShuffleVT.
14185 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
14186 ExtractBase += NumElts * (Src - Sources.begin());
14187 for (int j = 0; j < LanesDefined; ++j)
14188 LaneMask[j] = ExtractBase + j;
14189 }
14190
14191 // Final check before we try to produce nonsense...
14192 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
14193 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
14194 return SDValue();
14195 }
14196
14197 SDValue Poison = DAG.getPOISON(ShuffleVT);
14199 for (unsigned i = 0; i < Sources.size(); ++i)
14200 ShuffleOps[i] = Sources[i].ShuffleVec;
14201
14202 SDValue Shuffle =
14203 DAG.getVectorShuffle(ShuffleVT, DL, ShuffleOps[0], ShuffleOps[1], Mask);
// Convert the shuffle result back to the requested type; big-endian targets
// use NVCAST instead of BITCAST here, as in the element-type fix-up above.
14204 SDValue V;
14205 if (DAG.getDataLayout().isBigEndian()) {
14206 V = DAG.getNode(AArch64ISD::NVCAST, DL, VT, Shuffle);
14207 } else {
14208 V = DAG.getNode(ISD::BITCAST, DL, VT, Shuffle);
14209 }
14210
14211 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
14212 dbgs() << "Reshuffle, creating node: "; V.dump(););
14213
14214 return V;
14215}
14216
14217// check if an EXT instruction can handle the shuffle mask when the
14218// vector sources of the shuffle are the same.
14219static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
14220 unsigned NumElts = VT.getVectorNumElements();
14221
14222 // Assume that the first shuffle index is not UNDEF. Fail if it is.
14223 if (M[0] < 0)
14224 return false;
14225
14226 Imm = M[0];
14227
14228 // If this is a VEXT shuffle, the immediate value is the index of the first
14229 // element. The other shuffle indices must be the successive elements after
14230 // the first one.
14231 unsigned ExpectedElt = Imm;
14232 for (unsigned i = 1; i < NumElts; ++i) {
14233 // Increment the expected index. If it wraps around, just follow it
14234 // back to index zero and keep going.
14235 ++ExpectedElt;
14236 if (ExpectedElt == NumElts)
14237 ExpectedElt = 0;
14238
14239 if (M[i] < 0)
14240 continue; // ignore UNDEF indices
14241 if (ExpectedElt != static_cast<unsigned>(M[i]))
14242 return false;
14243 }
14244
14245 return true;
14246}
14247
14248// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
14249// v4i32s. This is really a truncate, which we can construct out of (legal)
14250// concats and truncate nodes.
// Returns an empty SDValue when the pattern is not matched.
// NOTE(review): the declaration line of this function is absent from this
// listing; the body below uses the names `V` and `DAG`.
14252 if (V.getValueType() != MVT::v16i8)
14253 return SDValue();
14254 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
14255
// The 16 operands are examined as four groups of four consecutive operands.
14256 for (unsigned X = 0; X < 4; X++) {
14257 // Check the first item in each group is an extract from lane 0 of a v4i32
14258 // or v4i16.
14259 SDValue BaseExt = V.getOperand(X * 4);
14260 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14261 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
14262 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
14263 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
14264 BaseExt.getConstantOperandVal(1) != 0)
14265 return SDValue();
14266 SDValue Base = BaseExt.getOperand(0);
14267 // And check the other items are extracts from the same vector.
14268 for (unsigned Y = 1; Y < 4; Y++) {
14269 SDValue Ext = V.getOperand(X * 4 + Y);
// NOTE(review): one line of the condition below is absent from this listing.
14270 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14271 Ext.getOperand(0) != Base ||
14273 Ext.getConstantOperandVal(1) != Y)
14274 return SDValue();
14275 }
14276 }
14277
14278 // Turn the buildvector into a series of truncates and concats, which will
14279 // become UZP1s. Any v4i32s we found get truncated to v4i16, which are
14280 // concatenated to produce two v8i16. These are both truncated to v8i8 and
14281 // concatenated into the final v16i8.
14282 SDLoc DL(V);
14283 SDValue Trunc[4] = {
14284 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
14285 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
14286 for (SDValue &V : Trunc)
14287 if (V.getValueType() == MVT::v4i32)
14288 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
14289 SDValue Concat0 =
14290 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
14291 SDValue Concat1 =
14292 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
14293 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
14294 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
14295 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
14296}
14297
14298/// Check if a vector shuffle corresponds to a DUP instructions with a larger
14299/// element width than the vector lane type. If that is the case the function
14300/// returns true and writes the value of the DUP instruction lane operand into
14301/// DupLaneOp
14302static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
14303 unsigned &DupLaneOp) {
14304 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
14305 "Only possible block sizes for wide DUP are: 16, 32, 64");
14306
14307 if (BlockSize <= VT.getScalarSizeInBits())
14308 return false;
14309 if (BlockSize % VT.getScalarSizeInBits() != 0)
14310 return false;
14311 if (VT.getSizeInBits() % BlockSize != 0)
14312 return false;
14313
14314 size_t SingleVecNumElements = VT.getVectorNumElements();
14315 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
14316 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
14317
14318 // We are looking for masks like
14319 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
14320 // might be replaced by 'undefined'. BlockIndices will eventually contain
14321 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
14322 // for the above examples)
14323 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
14324 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
14325 for (size_t I = 0; I < NumEltsPerBlock; I++) {
14326 int Elt = M[BlockIndex * NumEltsPerBlock + I];
14327 if (Elt < 0)
14328 continue;
14329 // For now we don't support shuffles that use the second operand
14330 if ((unsigned)Elt >= SingleVecNumElements)
14331 return false;
14332 if (BlockElts[I] < 0)
14333 BlockElts[I] = Elt;
14334 else if (BlockElts[I] != Elt)
14335 return false;
14336 }
14337
14338 // We found a candidate block (possibly with some undefs). It must be a
14339 // sequence of consecutive integers starting with a value divisible by
14340 // NumEltsPerBlock with some values possibly replaced by undef-s.
14341
14342 // Find first non-undef element
14343 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
14344 assert(FirstRealEltIter != BlockElts.end() &&
14345 "Shuffle with all-undefs must have been caught by previous cases, "
14346 "e.g. isSplat()");
14347 if (FirstRealEltIter == BlockElts.end()) {
14348 DupLaneOp = 0;
14349 return true;
14350 }
14351
14352 // Index of FirstRealElt in BlockElts
14353 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
14354
14355 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
14356 return false;
14357 // BlockElts[0] must have the following value if it isn't undef:
14358 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
14359
14360 // Check the first element
14361 if (Elt0 % NumEltsPerBlock != 0)
14362 return false;
14363 // Check that the sequence indeed consists of consecutive integers (modulo
14364 // undefs)
14365 for (size_t I = 0; I < NumEltsPerBlock; I++)
14366 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
14367 return false;
14368
14369 DupLaneOp = Elt0 / NumEltsPerBlock;
14370 return true;
14371}
14372
14373// check if an EXT instruction can handle the shuffle mask when the
14374// vector sources of the shuffle are different.
14375static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
14376 unsigned &Imm) {
14377 // Look for the first non-undef element.
14378 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
14379
14380 // Benefit from APInt to handle overflow when calculating expected element.
14381 unsigned NumElts = VT.getVectorNumElements();
14382 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
14383 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
14384 /*implicitTrunc=*/true);
14385 // The following shuffle indices must be the successive elements after the
14386 // first real element.
14387 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
14388 return Elt != ExpectedElt++ && Elt >= 0;
14389 });
14390 if (FoundWrongElt)
14391 return false;
14392
14393 // The index of an EXT is the first element if it is not UNDEF.
14394 // Watch out for the beginning UNDEFs. The EXT index should be the expected
14395 // value of the first element. E.g.
14396 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
14397 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
14398 // ExpectedElt is the last mask index plus 1.
14399 Imm = ExpectedElt.getZExtValue();
14400
14401 // There are two difference cases requiring to reverse input vectors.
14402 // For example, for vector <4 x i32> we have the following cases,
14403 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
14404 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
14405 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
14406 // to reverse two input vectors.
14407 if (Imm < NumElts)
14408 ReverseEXT = true;
14409 else
14410 Imm -= NumElts;
14411
14412 return true;
14413}
14414
14415// Check if an EXT instruction can handle the shuffle mask when one source is a
14416// splat. This matches shuffles where the splat occupies either a prefix or a
14417// suffix and the remaining lanes are a contiguous slice from the non-splat
14418// source.
14419static bool isEXTMaskWithSplat(ArrayRef<int> M, EVT VT, unsigned SplatOperand,
14420 bool &ReverseEXT, unsigned &Imm) {
14421 unsigned NumElts = VT.getVectorNumElements();
14422 unsigned OtherBase = SplatOperand == 0 ? NumElts : 0;
14423 auto IsSplatElt = [=](int Elt) {
14424 return Elt < 0 ||
14425 (SplatOperand == 0 ? Elt < (int)NumElts : Elt >= (int)NumElts);
14426 };
14427
14428 unsigned PrefixSplatElts = 0;
14429 while (PrefixSplatElts != NumElts && IsSplatElt(M[PrefixSplatElts]))
14430 ++PrefixSplatElts;
14431
14432 if (PrefixSplatElts > 0 && PrefixSplatElts < NumElts) {
14433 bool Match = true;
14434 for (unsigned I = PrefixSplatElts; I != NumElts; ++I) {
14435 int Expected = OtherBase + I - PrefixSplatElts;
14436 if (M[I] >= 0 && M[I] != Expected) {
14437 Match = false;
14438 break;
14439 }
14440 }
14441
14442 if (Match) {
14443 ReverseEXT = SplatOperand == 1;
14444 Imm = NumElts - PrefixSplatElts;
14445 return true;
14446 }
14447 }
14448
14449 unsigned SuffixSplatElts = 0;
14450 while (SuffixSplatElts != NumElts &&
14451 IsSplatElt(M[NumElts - 1 - SuffixSplatElts]))
14452 ++SuffixSplatElts;
14453
14454 if (0 < SuffixSplatElts && SuffixSplatElts < NumElts) {
14455 bool Match = true;
14456 for (unsigned I = 0; I != NumElts - SuffixSplatElts; ++I) {
14457 int Expected = OtherBase + I + SuffixSplatElts;
14458 if (M[I] >= 0 && M[I] != Expected) {
14459 Match = false;
14460 break;
14461 }
14462 }
14463
14464 if (Match) {
14465 ReverseEXT = SplatOperand == 0;
14466 Imm = SuffixSplatElts;
14467 return true;
14468 }
14469 }
14470
14471 return false;
14472}
14473
14474/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
14475/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14476/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
14477static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14478 unsigned NumElts = VT.getVectorNumElements();
14479 if (NumElts % 2 != 0)
14480 return false;
14481 WhichResult = (M[0] == 0 ? 0 : 1);
14482 unsigned Idx = WhichResult * NumElts / 2;
14483 for (unsigned i = 0; i != NumElts; i += 2) {
14484 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
14485 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
14486 return false;
14487 Idx += 1;
14488 }
14489
14490 return true;
14491}
14492
14493/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
14494/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14495/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
14496static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14497 unsigned Half = VT.getVectorNumElements() / 2;
14498 WhichResult = (M[0] == 0 ? 0 : 1);
14499 for (unsigned j = 0; j != 2; ++j) {
14500 unsigned Idx = WhichResult;
14501 for (unsigned i = 0; i != Half; ++i) {
14502 int MIdx = M[i + j * Half];
14503 if (MIdx >= 0 && (unsigned)MIdx != Idx)
14504 return false;
14505 Idx += 2;
14506 }
14507 }
14508
14509 return true;
14510}
14511
14512/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
14513/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14514/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
14515static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14516 unsigned NumElts = VT.getVectorNumElements();
14517 if (NumElts % 2 != 0)
14518 return false;
14519 WhichResult = (M[0] == 0 ? 0 : 1);
14520 for (unsigned i = 0; i < NumElts; i += 2) {
14521 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
14522 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
14523 return false;
14524 }
14525 return true;
14526}
14527
14528static bool isINSMask(ArrayRef<int> M, int NumInputElements,
14529 bool &DstIsLeft, int &Anomaly) {
14530 if (M.size() != static_cast<size_t>(NumInputElements))
14531 return false;
14532
14533 int NumLHSMatch = 0, NumRHSMatch = 0;
14534 int LastLHSMismatch = -1, LastRHSMismatch = -1;
14535
14536 for (int i = 0; i < NumInputElements; ++i) {
14537 if (M[i] == -1) {
14538 ++NumLHSMatch;
14539 ++NumRHSMatch;
14540 continue;
14541 }
14542
14543 if (M[i] == i)
14544 ++NumLHSMatch;
14545 else
14546 LastLHSMismatch = i;
14547
14548 if (M[i] == i + NumInputElements)
14549 ++NumRHSMatch;
14550 else
14551 LastRHSMismatch = i;
14552 }
14553
14554 if (NumLHSMatch == NumInputElements - 1) {
14555 DstIsLeft = true;
14556 Anomaly = LastLHSMismatch;
14557 return true;
14558 } else if (NumRHSMatch == NumInputElements - 1) {
14559 DstIsLeft = false;
14560 Anomaly = LastRHSMismatch;
14561 return true;
14562 }
14563
14564 return false;
14565}
14566
14567static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
14568 if (VT.getSizeInBits() != 128)
14569 return false;
14570
14571 unsigned NumElts = VT.getVectorNumElements();
14572
14573 for (int I = 0, E = NumElts / 2; I != E; I++) {
14574 if (Mask[I] != I)
14575 return false;
14576 }
14577
14578 int Offset = NumElts / 2;
14579 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
14580 if (Mask[I] != I + SplitLHS * Offset)
14581 return false;
14582 }
14583
14584 return true;
14585}
14586
// Tries to rewrite a vector shuffle whose mask passes isConcatMask as a
// CONCAT_VECTORS of the (low halves of the) two shuffle operands. Returns an
// empty SDValue when the rewrite does not apply.
// NOTE(review): the declaration line of this function and the condition
// guarding the first early return are absent from this listing.
14588 SDLoc DL(Op);
14589 EVT VT = Op.getValueType();
14590 SDValue V0 = Op.getOperand(0);
14591 SDValue V1 = Op.getOperand(1);
14592 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14593
14596 return SDValue();
14597
// A 128-bit first operand has to be split: only its low half is used.
14598 bool SplitV0 = V0.getValueSizeInBits() == 128;
14599
14600 if (!isConcatMask(Mask, VT, SplitV0))
14601 return SDValue();
14602
// Take the low half of each operand that is 128 bits wide, then concatenate.
14603 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14604 if (SplitV0) {
14605 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
14606 DAG.getConstant(0, DL, MVT::i64));
14607 }
14608 if (V1.getValueSizeInBits() == 128) {
14609 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
14610 DAG.getConstant(0, DL, MVT::i64));
14611 }
14612 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
14613}
14614
14615/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
14616/// the specified operations to build the shuffle. ID is the perfect-shuffle
14617//ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
14618//table entry and LHS/RHS are the immediate inputs for this stage of the
14619//shuffle.
14621 unsigned PFEntry, SDValue LHS,
14622 SDValue RHS, SelectionDAG &DAG,
14623 const SDLoc &DL) {
14624 unsigned OpNum = (PFEntry >> 26) & 0x0F;
14625 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
14626 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
14627
14628 enum {
14629 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
14630 OP_VREV,
14631 OP_VDUP0,
14632 OP_VDUP1,
14633 OP_VDUP2,
14634 OP_VDUP3,
14635 OP_VEXT1,
14636 OP_VEXT2,
14637 OP_VEXT3,
14638 OP_VUZPL, // VUZP, left result
14639 OP_VUZPR, // VUZP, right result
14640 OP_VZIPL, // VZIP, left result
14641 OP_VZIPR, // VZIP, right result
14642 OP_VTRNL, // VTRN, left result
14643 OP_VTRNR, // VTRN, right result
14644 OP_MOVLANE // Move lane. RHSID is the lane to move into
14645 };
14646
14647 if (OpNum == OP_COPY) {
14648 if (LHSID == (1 * 9 + 2) * 9 + 3)
14649 return LHS;
14650 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
14651 return RHS;
14652 }
14653
14654 if (OpNum == OP_MOVLANE) {
14655 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
14656 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
14657 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
14658 Elt = 3 - Elt;
14659 while (Elt > 0) {
14660 ID /= 9;
14661 Elt--;
14662 }
14663 return (ID % 9 == 8) ? -1 : ID % 9;
14664 };
14665
14666 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
14667 // get the lane to move from the PFID, which is always from the
14668 // original vectors (V1 or V2).
14670 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
14671 EVT VT = OpLHS.getValueType();
14672 assert(RHSID < 8 && "Expected a lane index for RHSID!");
14673 unsigned ExtLane = 0;
14674 SDValue Input;
14675
14676 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
14677 // convert into a higher type.
14678 if (RHSID & 0x4) {
14679 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
14680 if (MaskElt == -1)
14681 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
14682 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14683 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
14684 Input = MaskElt < 2 ? V1 : V2;
14685 if (VT.getScalarSizeInBits() == 16) {
14686 Input = DAG.getBitcast(MVT::v2f32, Input);
14687 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
14688 } else {
14689 assert(VT.getScalarSizeInBits() == 32 &&
14690 "Expected 16 or 32 bit shuffle elements");
14691 Input = DAG.getBitcast(MVT::v2f64, Input);
14692 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
14693 }
14694 } else {
14695 int MaskElt = getPFIDLane(ID, RHSID);
14696 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14697 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
14698 Input = MaskElt < 4 ? V1 : V2;
14699 // Be careful about creating illegal types. Use f16 instead of i16.
14700 if (VT == MVT::v4i16) {
14701 Input = DAG.getBitcast(MVT::v4f16, Input);
14702 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
14703 }
14704 }
14706 Input.getValueType().getVectorElementType(),
14707 Input, DAG.getVectorIdxConstant(ExtLane, DL));
14708 SDValue Ins =
14709 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Input.getValueType(), OpLHS,
14710 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, DL));
14711 return DAG.getBitcast(VT, Ins);
14712 }
14713
14714 SDValue OpLHS, OpRHS;
14715 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
14716 RHS, DAG, DL);
14717 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
14718 RHS, DAG, DL);
14719 EVT VT = OpLHS.getValueType();
14720
14721 switch (OpNum) {
14722 default:
14723 llvm_unreachable("Unknown shuffle opcode!");
14724 case OP_VREV: {
14725 // VREV divides the vector in half and swaps within the half.
14726 if (VT.getVectorElementType() == MVT::i32 ||
14727 VT.getVectorElementType() == MVT::f32)
14728 return DAG.getNode(AArch64ISD::REV64, DL, VT, OpLHS);
14729 // vrev <4 x i16> -> REV32
14730 if (VT.getVectorElementType() == MVT::i16 ||
14731 VT.getVectorElementType() == MVT::f16 ||
14732 VT.getVectorElementType() == MVT::bf16)
14733 return DAG.getNode(AArch64ISD::REV32, DL, VT, OpLHS);
14734 // vrev <4 x i8> -> BSWAP which is REV16
14735 assert(VT == MVT::v8i8 || VT == MVT::v16i8);
14736 EVT BSVT = VT == MVT::v8i8 ? MVT::v4i16 : MVT::v8i16;
14737 return DAG.getNode(
14738 AArch64ISD::NVCAST, DL, VT,
14739 DAG.getNode(ISD::BSWAP, DL, BSVT,
14740 DAG.getNode(AArch64ISD::NVCAST, DL, BSVT, OpLHS)));
14741 }
14742 case OP_VDUP0:
14743 case OP_VDUP1:
14744 case OP_VDUP2:
14745 case OP_VDUP3: {
14746 EVT EltTy = VT.getVectorElementType();
14747 unsigned Opcode;
14748 if (EltTy == MVT::i8)
14749 Opcode = AArch64ISD::DUPLANE8;
14750 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
14751 Opcode = AArch64ISD::DUPLANE16;
14752 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
14753 Opcode = AArch64ISD::DUPLANE32;
14754 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
14755 Opcode = AArch64ISD::DUPLANE64;
14756 else
14757 llvm_unreachable("Invalid vector element type?");
14758
14759 if (VT.getSizeInBits() == 64)
14760 OpLHS = WidenVector(OpLHS, DAG);
14761 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, DL, MVT::i64);
14762 return DAG.getNode(Opcode, DL, VT, OpLHS, Lane);
14763 }
14764 case OP_VEXT1:
14765 case OP_VEXT2:
14766 case OP_VEXT3: {
14767 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
14768 return DAG.getNode(AArch64ISD::EXT, DL, VT, OpLHS, OpRHS,
14769 DAG.getConstant(Imm, DL, MVT::i32));
14770 }
14771 case OP_VUZPL:
14772 return DAG.getNode(AArch64ISD::UZP1, DL, VT, OpLHS, OpRHS);
14773 case OP_VUZPR:
14774 return DAG.getNode(AArch64ISD::UZP2, DL, VT, OpLHS, OpRHS);
14775 case OP_VZIPL:
14776 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, OpLHS, OpRHS);
14777 case OP_VZIPR:
14778 return DAG.getNode(AArch64ISD::ZIP2, DL, VT, OpLHS, OpRHS);
14779 case OP_VTRNL:
14780 return DAG.getNode(AArch64ISD::TRN1, DL, VT, OpLHS, OpRHS);
14781 case OP_VTRNR:
14782 return DAG.getNode(AArch64ISD::TRN2, DL, VT, OpLHS, OpRHS);
14783 }
14784}
14785
14787 SelectionDAG &DAG) {
// NOTE(review): the line that opens this definition (return type, name and
// leading parameters) is not present in this listing; the body below reads
// `Op`, `ShuffleMask` and `DAG`.
//
// Lowers the shuffle Op to a NEON tbl1/tbl2 intrinsic: every shuffle-mask
// element is expanded into BytesPerElt byte indices, the operands are viewed
// as byte vectors, and the looked-up bytes are bitcast back to Op's type.
14788 // Check to see if we can use the TBL instruction.
14789 SDValue V1 = Op.getOperand(0);
14790 SDValue V2 = Op.getOperand(1);
14791 SDLoc DL(Op);
14792
14793 EVT EltVT = Op.getValueType().getVectorElementType();
14794 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
14795
// If the first operand is undef or all zeros, exchange the operands so that
// the undef/zero operand ends up as V2; `Swap` records that the byte offsets
// computed below must be rebased to match.
14796 bool Swap = false;
14797 if (V1.isUndef() || isZerosVector(V1.getNode())) {
14798 std::swap(V1, V2);
14799 Swap = true;
14800 }
14801
14802 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
14803 // out of range values with 0s. We do need to make sure that any out-of-range
14804 // values are really out-of-range for a v16i8 vector.
14805 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
// The index vector is 8 bytes unless Op is 128 bits wide, in which case it
// is 16 bytes.
14806 MVT IndexVT = MVT::v8i8;
14807 unsigned IndexLen = 8;
14808 if (Op.getValueSizeInBits() == 128) {
14809 IndexVT = MVT::v16i8;
14810 IndexLen = 16;
14811 }
14812
// NOTE(review): the declaration of `TBLMask` is not present in this listing
// (the listing numbering skips a line here); it is used below as a growable
// sequence of SDValue with push_back() and data().
14814 for (int Val : ShuffleMask) {
14815 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
14816 unsigned Offset = Byte + Val * BytesPerElt;
// After a swap, bytes that addressed the first IndexLen-byte operand must
// address the second one, and vice versa.
14817 if (Swap)
14818 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
// With a single real table, force every reference to the other operand to
// 255 so that it is out of range even for a 16-byte table.
14819 if (IsUndefOrZero && Offset >= IndexLen)
14820 Offset = 255;
14821 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
14822 }
14823 }
14824
14825 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
14826 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
14827
14828 SDValue Shuffle;
14829 if (IsUndefOrZero) {
// Single-table lookup. A 64-bit source is concatenated with itself to form
// the v16i8 table operand.
14830 if (IndexLen == 8)
14831 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
14832 Shuffle = DAG.getNode(
14833 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14834 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
14835 V1Cst,
14836 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14837 } else {
14838 if (IndexLen == 8) {
// Two 64-bit sources fit in one v16i8 table, so tbl1 is still sufficient.
14839 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
14840 Shuffle = DAG.getNode(
14841 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14842 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
14843 V1Cst,
14844 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14845 } else {
14846 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
14847 // cannot currently represent the register constraints on the input
14848 // table registers.
14849 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
14850 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
14851 // IndexLen));
14852 Shuffle = DAG.getNode(
14853 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14854 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32),
14855 V1Cst, V2Cst,
14856 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14857 }
14858 }
14859 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
14860 }
14861
14862static unsigned getDUPLANEOp(EVT EltType) {
14863 if (EltType == MVT::i8)
14864 return AArch64ISD::DUPLANE8;
14865 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
14866 return AArch64ISD::DUPLANE16;
14867 if (EltType == MVT::i32 || EltType == MVT::f32)
14868 return AArch64ISD::DUPLANE32;
14869 if (EltType == MVT::i64 || EltType == MVT::f64)
14870 return AArch64ISD::DUPLANE64;
14871
14872 llvm_unreachable("Invalid vector element type?");
14873}
14874
// Builds the node `Opcode` (a lane-duplicate opcode chosen by the caller)
// with result type VT that splats lane `Lane` of V. Before emitting the node
// it looks through a few producers of V (bitcast-of-extract_subvector,
// extract_subvector of a 128-bit vector, concat_vectors), adjusting `Lane` so
// that it indexes the underlying source instead; otherwise a 64-bit VT has
// its operand widened with WidenVector.
14875 static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
14876 unsigned Opcode, SelectionDAG &DAG) {
14877 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
// On success the lambda updates LaneC in place and sets CastVT; on failure it
// returns false before modifying either.
14878 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
14879 // Match: dup (bitcast (extract_subv X, C)), LaneC
// NOTE(review): the second half of this condition is not present in this
// listing (the listing numbering skips a line); per the comment above it
// presumably checks that the bitcast operand is an extract_subvector.
14880 if (BitCast.getOpcode() != ISD::BITCAST ||
14882 return false;
14883
14884 // The extract index must align in the destination type. That may not
14885 // happen if the bitcast is from narrow to wide type.
14886 SDValue Extract = BitCast.getOperand(0);
14887 unsigned ExtIdx = Extract.getConstantOperandVal(1);
14888 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
14889 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
14890 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
14891 if (ExtIdxInBits % CastedEltBitWidth != 0)
14892 return false;
14893
14894 // Can't handle cases where vector size is not 128-bit
14895 if (!Extract.getOperand(0).getValueType().is128BitVector())
14896 return false;
14897
14898 // Update the lane value by offsetting with the scaled extract index.
14899 LaneC += ExtIdxInBits / CastedEltBitWidth;
14900
14901 // Determine the casted vector type of the wide vector input.
14902 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
14903 // Examples:
14904 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
14905 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
14906 unsigned SrcVecNumElts =
14907 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
// NOTE(review): the start of the statement that consumes SrcVecNumElts
// (presumably the assignment to CastVT) is not present in this listing.
14909 SrcVecNumElts);
14910 return true;
14911 };
14912 MVT CastVT;
14913 if (getScaledOffsetDup(V, Lane, CastVT)) {
// Source directly from the wide vector X, reinterpreted as CastVT.
14914 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
14915 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14916 V.getOperand(0).getValueType().is128BitVector()) {
14917 // The lane is incremented by the index of the extract.
14918 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
14919 Lane += V.getConstantOperandVal(1);
14920 V = V.getOperand(0);
14921 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
14922 // The lane is decremented if we are splatting from the 2nd operand.
14923 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
// Idx is 1 when Lane falls in the upper half of VT's lanes, else 0; it is
// used both as the concat operand index and to rebase Lane.
14924 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
14925 Lane -= Idx * VT.getVectorNumElements() / 2;
14926 V = WidenVector(V.getOperand(Idx), DAG);
14927 } else if (VT.getSizeInBits() == 64) {
14928 // Widen the operand to 128-bit register with undef.
14929 V = WidenVector(V, DAG);
14930 }
14931 return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
14932 }
14933
14934 // Try to widen element type to get a new mask value for a better permutation
14935 // sequence, so that we can use NEON shuffle instructions, such as zip1/2,
14936 // UZP1/2, TRN1/2, REV, INS, etc.
14937 // For example:
14938 // shufflevector <4 x i32> %a, <4 x i32> %b,
14939 // <4 x i32> <i32 6, i32 7, i32 2, i32 3>
14940 // is equivalent to:
14941 // shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
14942 // Finally, we can get:
14943 // mov v0.d[0], v1.d[1]
//
// Returns the widened shuffle bitcast back to the original type, or an empty
// SDValue when the element size is 1 bit or larger than 32 bits, when
// widenShuffleMaskElts() rejects the mask, or when the widened vector type is
// not legal.
//
// NOTE(review): the line that opens this definition (return type, name and
// parameters) is not present in this listing; the body reads `Op` and `DAG`.
14945 SDLoc DL(Op);
14946 EVT VT = Op.getValueType();
14947 EVT ScalarVT = VT.getVectorElementType();
14948 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
14949 SDValue V0 = Op.getOperand(0);
14950 SDValue V1 = Op.getOperand(1);
14951 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14952
14953 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
14954 // We need to make sure the wider element type is legal. Thus, ElementSize
14955 // should be not larger than 32 bits, and i1 type should also be excluded.
14956 if (ElementSize > 32 || ElementSize == 1)
14957 return SDValue();
14958
14959 SmallVector<int, 8> NewMask;
14960 if (widenShuffleMaskElts(Mask, NewMask)) {
// The widened type keeps the float/integer class of VT, doubles the element
// width and halves the element count.
14961 MVT NewEltVT = VT.isFloatingPoint()
14962 ? MVT::getFloatingPointVT(ElementSize * 2)
14963 : MVT::getIntegerVT(ElementSize * 2);
14964 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14965 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14966 V0 = DAG.getBitcast(NewVT, V0);
14967 V1 = DAG.getBitcast(NewVT, V1);
14968 return DAG.getBitcast(VT,
14969 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
14970 }
14971 }
14972
14973 return SDValue();
14974 }
14975
14976 // Try to fold shuffle (tbl2, tbl2) into a single tbl4.
//
// Both shuffle operands must be v16i8 aarch64_neon_tbl2 intrinsic nodes whose
// mask operand (operand 3) is a BUILD_VECTOR. The combined tbl4 uses the four
// table operands of the two tbl2 nodes and a mask assembled per result byte
// from the two original masks. Returns an empty SDValue when the pattern does
// not match.
//
// NOTE(review): the line that opens this definition (return type, name and
// first parameter) is not present in this listing; the body reads `Op`.
14978 ArrayRef<int> ShuffleMask,
14979 SelectionDAG &DAG) {
14980 SDValue Tbl1 = Op->getOperand(0);
14981 SDValue Tbl2 = Op->getOperand(1);
14982 SDLoc DL(Op);
14983 SDValue Tbl2ID =
14984 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i64);
14985
14986 EVT VT = Op.getValueType();
// NOTE(review): one line of this condition is not present in this listing
// (the listing numbering skips a line); presumably it is the opcode check on
// Tbl2 that mirrors the one on Tbl1.
14987 if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14988 Tbl1.getOperand(0) != Tbl2ID ||
14990 Tbl2.getOperand(0) != Tbl2ID)
14991 return SDValue();
14992
14993 if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
14994 return SDValue();
14995
14996 SDValue Mask1 = Tbl1.getOperand(3);
14997 SDValue Mask2 = Tbl2.getOperand(3);
14998 if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
14999 Mask2.getOpcode() != ISD::BUILD_VECTOR)
15000 return SDValue();
15001
15002 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
15003 for (unsigned I = 0; I < 16; I++) {
// Result bytes taken from the first tbl2 reuse its mask element unchanged.
// NOTE(review): a negative (undef) ShuffleMask entry also takes this branch
// and would index Mask1 with it - confirm such entries cannot reach here.
15004 if (ShuffleMask[I] < 16)
15005 TBLMaskParts[I] = Mask1.getOperand(ShuffleMask[I]);
15006 else {
// Result bytes taken from the second tbl2 need a constant mask element,
// which is rebased by 32 because that node's two 16-byte tables follow the
// first node's two tables in the tbl4 operand list.
15007 auto *C = dyn_cast<ConstantSDNode>(Mask2.getOperand(ShuffleMask[I] - 16));
15008 if (!C)
15009 return SDValue();
15010 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, DL, MVT::i32);
15011 }
15012 }
15013
15014 SDValue TBLMask = DAG.getBuildVector(VT, DL, TBLMaskParts);
15015 SDValue ID =
15016 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, DL, MVT::i64);
15017
15018 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
15019 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
15020 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
15021 }
15022
15023SDValue
15024AArch64TargetLowering::LowerEXTEND_VECTOR_INREG(SDValue Op,
15025 SelectionDAG &DAG) const {
15026 SDLoc DL(Op);
15027 EVT VT = Op.getValueType();
15028 assert(VT.isScalableVector() && "Unexpected result type!");
15029
15030 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG;
15031 unsigned UnpackOpcode = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
15032
15033 // Repeatedly unpack Val until the result is of the desired type.
15034 SDValue Val = Op.getOperand(0);
15035 switch (Val.getSimpleValueType().SimpleTy) {
15036 default:
15037 return SDValue();
15038 case MVT::nxv16i8:
15039 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv8i16, Val);
15040 if (VT == MVT::nxv8i16)
15041 break;
15042 [[fallthrough]];
15043 case MVT::nxv8i16:
15044 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv4i32, Val);
15045 if (VT == MVT::nxv4i32)
15046 break;
15047 [[fallthrough]];
15048 case MVT::nxv4i32:
15049 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv2i64, Val);
15050 assert(VT == MVT::nxv2i64 && "Unexpected result type!");
15051 break;
15052 }
15053
15054 return Val;
15055}
15056
15057// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
15058// but we don't have an appropriate instruction,
15059// so custom-lower it as ZIP1-with-zeros.
15060SDValue
15061AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
15062 SelectionDAG &DAG) const {
15063 SDLoc DL(Op);
15064 EVT VT = Op.getValueType();
15065
15066 if (VT.isScalableVector())
15067 return LowerEXTEND_VECTOR_INREG(Op, DAG);
15068
15069 SDValue SrcOp = Op.getOperand(0);
15070 EVT SrcVT = SrcOp.getValueType();
15071 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
15072 "Unexpected extension factor.");
15073 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
15074 // FIXME: support multi-step zipping?
15075 if (Scale != 2)
15076 return SDValue();
15077 SDValue Zeros = DAG.getConstant(0, DL, SrcVT);
15078 return DAG.getBitcast(VT,
15079 DAG.getNode(AArch64ISD::ZIP1, DL, SrcVT, SrcOp, Zeros));
15080}
15081
// Custom lowering for VECTOR_SHUFFLE. The strategies below are tried in order
// and the first one that matches the mask is returned:
//   - fixed-length vectors routed to the SVE lowering;
//   - shuffle of two tbl2 intrinsics folded into one tbl4;
//   - splat masks (DUP / DUPLANE), including splats of a wider lane;
//   - REV64 / REV32 / REV16 (the latter as BSWAP), and full reversal of
//     8 x 16-bit or 16 x 8-bit vectors as REV64 followed by EXT;
//   - EXT, with or without a splat operand, and the single-source EXT form;
//   - ZIP / UZP / TRN, in both two-operand and single-operand forms;
//   - a single-lane insert (INS-style mask);
//   - the same shuffle on elements of twice the width;
//   - the perfect-shuffle table for 4-element vectors;
//   - a lane select between V1 and V2 via BSP;
//   - a generic TBL as the final fallback.
15082 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
15083 SelectionDAG &DAG) const {
15084 SDLoc DL(Op);
15085 EVT VT = Op.getValueType();
15086
15087 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
15088
15089 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15090 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
15091
15092 // Convert shuffles that are directly supported on NEON to target-specific
15093 // DAG nodes, instead of keeping them as shuffles and matching them again
15094 // during code selection. This is more efficient and avoids the possibility
15095 // of inconsistencies between legalization and selection.
15096 ArrayRef<int> ShuffleMask = SVN->getMask();
15097
15098 SDValue V1 = Op.getOperand(0);
15099 SDValue V2 = Op.getOperand(1);
15100
15101 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
15102 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
15103 "Unexpected VECTOR_SHUFFLE mask size!");
15104
15105 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
15106 return Res;
15107
15108 if (SVN->isSplat()) {
15109 int Lane = SVN->getSplatIndex();
15110 // If this is undef splat, generate it via "just" vdup, if possible.
15111 if (Lane == -1)
15112 Lane = 0;
15113
15114 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
15115 return DAG.getNode(AArch64ISD::DUP, DL, V1.getValueType(),
15116 V1.getOperand(0));
15117 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
15118 // constant. If so, we can just reference the lane's definition directly.
// NOTE(review): the second half of this condition is not present in this
// listing (the listing numbering skips a line); per the comment above it
// presumably tests that operand `Lane` of V1 is not a constant.
15119 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
15121 return DAG.getNode(AArch64ISD::DUP, DL, VT, V1.getOperand(Lane));
15122
15123 // Otherwise, duplicate from the lane of the input vector.
15124 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
15125 return constructDup(V1, Lane, DL, VT, Opcode, DAG);
15126 }
15127
15128 // Check if the mask matches a DUP for a wider element
// Lane sizes are tried from widest to narrowest.
15129 for (unsigned LaneSize : {64U, 32U, 16U}) {
15130 unsigned Lane = 0;
15131 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
15132 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
15133 : LaneSize == 32 ? AArch64ISD::DUPLANE32
15134 : AArch64ISD::DUPLANE16;
15135 // Cast V1 to an integer vector with required lane size
15136 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
15137 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
15138 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
15139 V1 = DAG.getBitcast(NewVecTy, V1);
15140 // Construct the DUP instruction
15141 V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
15142 // Cast back to the original type
15143 return DAG.getBitcast(VT, V1);
15144 }
15145 }
15146
15147 unsigned NumElts = VT.getVectorNumElements();
15148 unsigned EltSize = VT.getScalarSizeInBits();
15149 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
15150 return DAG.getNode(AArch64ISD::REV64, DL, V1.getValueType(), V1);
15151 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
15152 return DAG.getNode(AArch64ISD::REV32, DL, V1.getValueType(), V1);
15153 if (isREVMask(ShuffleMask, EltSize, NumElts, 16)) {
// Byte reversal within 16-bit blocks is emitted as a BSWAP on the vector
// reinterpreted with 16-bit elements. This inner VT shadows the outer VT.
15154 EVT VT = V1.getValueType();
15155 assert(VT == MVT::v8i8 || VT == MVT::v16i8);
15156 EVT BSVT = VT == MVT::v8i8 ? MVT::v4i16 : MVT::v8i16;
15157 return DAG.getNode(
15158 AArch64ISD::NVCAST, DL, VT,
15159 DAG.getNode(ISD::BSWAP, DL, BSVT,
15160 DAG.getNode(AArch64ISD::NVCAST, DL, BSVT, V1)));
15161 }
15162
// Full reversal of a 128-bit vector with 8- or 16-bit elements: reverse
// within each 64-bit half, then exchange the halves with EXT #8.
15163 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
15164 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
15165 SDValue Rev = DAG.getNode(AArch64ISD::REV64, DL, VT, V1);
15166 return DAG.getNode(AArch64ISD::EXT, DL, VT, Rev, Rev,
15167 DAG.getConstant(8, DL, MVT::i32));
15168 }
15169
// EXT where one operand is a splat (undef lanes not allowed in the splat).
15170 bool IsSplat1 =
15171 V1.getValueType() == VT && DAG.isSplatValue(V1, /*AllowUndefs=*/false);
15172 bool IsSplat2 =
15173 V2.getValueType() == VT && DAG.isSplatValue(V2, /*AllowUndefs=*/false);
15174 for (unsigned SplatOperand : {0U, 1U}) {
15175 if ((SplatOperand == 0 && !IsSplat1) || (SplatOperand == 1 && !IsSplat2))
15176 continue;
15177
15178 bool ReverseSplatEXT = false;
15179 unsigned SplatImm;
15180 if (isEXTMaskWithSplat(ShuffleMask, VT, SplatOperand, ReverseSplatEXT,
15181 SplatImm)) {
15182 SDValue ExtOp1 = V1;
15183 SDValue ExtOp2 = V2;
15184 if (ReverseSplatEXT)
15185 std::swap(ExtOp1, ExtOp2);
// Scale the immediate from an element count by getExtFactor().
15186 SplatImm *= getExtFactor(ExtOp1);
15187 return DAG.getNode(AArch64ISD::EXT, DL, VT, ExtOp1, ExtOp2,
15188 DAG.getConstant(SplatImm, DL, MVT::i32));
15189 }
15190 }
15191
15192 bool ReverseEXT = false;
15193 unsigned Imm;
15194 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
15195 if (ReverseEXT)
15196 std::swap(V1, V2);
15197 Imm *= getExtFactor(V1);
15198 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V2,
15199 DAG.getConstant(Imm, DL, MVT::i32));
15200 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
15201 Imm *= getExtFactor(V1);
15202 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V1,
15203 DAG.getConstant(Imm, DL, MVT::i32));
15204 }
15205
// Two-operand ZIP / UZP / TRN. WhichResult selects the 1 or 2 variant of the
// instruction; OperandOrder != 0 means the operands are passed swapped.
15206 unsigned WhichResult;
15207 unsigned OperandOrder;
15208 if (isZIPMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
15209 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
15210 return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
15211 OperandOrder == 0 ? V2 : V1);
15212 }
15213 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
15214 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
15215 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
15216 }
15217 if (isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
15218 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
15219 return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
15220 OperandOrder == 0 ? V2 : V1);
15221 }
15222
// Single-source variants: V1 is passed as both operands.
15223 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
15224 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
15225 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
15226 }
15227 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
15228 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
15229 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
15230 }
15231 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
15232 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
15233 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
15234 }
15235
// NOTE(review): the `if` that declares `Concat` and guards this return is not
// present in this listing (the listing numbering skips a line).
15237 return Concat;
15238
// A mask that is the identity on one operand except for a single lane
// ("Anomaly") becomes an extract of the source lane plus an insert into the
// destination vector.
15239 bool DstIsLeft;
15240 int Anomaly;
15241 int NumInputElements = V1.getValueType().getVectorNumElements();
15242 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
15243 SDValue DstVec = DstIsLeft ? V1 : V2;
15244 SDValue DstLaneV = DAG.getConstant(Anomaly, DL, MVT::i64);
15245
15246 SDValue SrcVec = V1;
15247 int SrcLane = ShuffleMask[Anomaly];
15248 if (SrcLane >= NumInputElements) {
15249 SrcVec = V2;
15250 SrcLane -= NumElts;
15251 }
15252 SDValue SrcLaneV = DAG.getConstant(SrcLane, DL, MVT::i64);
15253
15254 EVT ScalarVT = VT.getVectorElementType();
15255
// Integer elements narrower than 32 bits are extracted as i32.
15256 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
15257 ScalarVT = MVT::i32;
15258
15259 return DAG.getNode(
15260 ISD::INSERT_VECTOR_ELT, DL, VT, DstVec,
15261 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SrcVec, SrcLaneV),
15262 DstLaneV);
15263 }
15264
15265 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
15266 return NewSD;
15267
15268 // If the shuffle is not directly supported and it has 4 elements, use
15269 // the PerfectShuffle-generated table to synthesize it from other shuffles.
15270 if (NumElts == 4) {
// Each mask element becomes one base-9 digit: 0-7 select a lane of V1/V2 and
// 8 stands for an undef element.
15271 unsigned PFIndexes[4];
15272 for (unsigned i = 0; i != 4; ++i) {
15273 if (ShuffleMask[i] < 0)
15274 PFIndexes[i] = 8;
15275 else
15276 PFIndexes[i] = ShuffleMask[i];
15277 }
15278
15279 // Compute the index in the perfect shuffle table.
15280 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
15281 PFIndexes[2] * 9 + PFIndexes[3];
15282 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
15283 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
15284 DL);
15285 }
15286
15287 // Check for a "select shuffle", generating a BSL to pick between lanes in
15288 // V1/V2.
15289 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
15290 assert(VT.getScalarSizeInBits() <= 32 &&
15291 "Expected larger vector element sizes to be handled already");
// Per-lane mask constant: 0xffffffff where the lane comes from V1, 0 where it
// comes from V2.
15292 SmallVector<SDValue> MaskElts;
15293 for (int M : ShuffleMask)
15294 MaskElts.push_back(DAG.getConstant(
15295 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, MVT::i32));
15296 EVT IVT = VT.changeVectorElementTypeToInteger();
15297 SDValue MaskConst = DAG.getBuildVector(IVT, DL, MaskElts);
15298 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, DL, IVT, MaskConst,
15299 DAG.getBitcast(IVT, V1),
15300 DAG.getBitcast(IVT, V2)));
15301 }
15302
15303 // Fall back to generating a TBL
15304 return GenerateTBL(Op, ShuffleMask, DAG);
15305 }
15306
15307SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
15308 SelectionDAG &DAG) const {
15309 EVT VT = Op.getValueType();
15310
15311 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15312 return LowerToScalableOp(Op, DAG);
15313
15314 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
15315 "Unexpected vector type!");
15316
15317 // We can handle the constant cases during isel.
15318 if (isa<ConstantSDNode>(Op.getOperand(0)))
15319 return Op;
15320
15321 // There isn't a natural way to handle the general i1 case, so we use some
15322 // trickery with whilelo.
15323 SDLoc DL(Op);
15324 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
15325 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
15326 DAG.getValueType(MVT::i1));
15327 SDValue ID =
15328 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
15329 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
15330 if (VT == MVT::nxv1i1)
15331 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
15332 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
15333 Zero, SplatVal),
15334 Zero);
15335 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
15336}
15337
// Lowers a DUPQ-lane operation: replicates the 128-bit quadword selected by
// operand 2 (the index) of operand 1 (the data vector) across the result.
// A constant index in [0, 3] maps directly to DUPLANE128; any other index is
// lowered to a TBL over 64-bit elements with a computed index vector.
// Returns an empty SDValue for illegal or non-scalable result types.
15338 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
15339 SelectionDAG &DAG) const {
15340 SDLoc DL(Op);
15341
15342 EVT VT = Op.getValueType();
15343 if (!isTypeLegal(VT) || !VT.isScalableVector())
15344 return SDValue();
15345
15346 // Current lowering only supports the SVE-ACLE types.
// NOTE(review): the condition guarding this return is not present in this
// listing (the listing numbering skips a line).
15348 return SDValue();
15349
15350 // The DUPQ operation is independent of element type so normalise to i64s.
15351 SDValue Idx128 = Op.getOperand(2);
15352
15353 // DUPQ can be used when idx is in range.
15354 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
15355 if (CIdx && (CIdx->getZExtValue() <= 3)) {
15356 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
15357 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
15358 }
15359
15360 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
15361
15362 // The ACLE says this must produce the same result as:
15363 // svtbl(data, svadd_x(svptrue_b64(),
15364 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
15365 // index * 2))
15366 SDValue One = DAG.getConstant(1, DL, MVT::i64);
15367 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
15368
15369 // create the vector 0,1,0,1,...
15370 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
15371 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
15372
15373 // create the vector idx64,idx64+1,idx64,idx64+1,...
// idx64 = 2 * idx128 (computed as idx128 + idx128), since each 128-bit
// quadword spans two 64-bit elements.
15374 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
15375 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
15376 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
15377
15378 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
15379 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
15380 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
15381 }
15382
15383
15384static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
15385 APInt &UndefBits) {
15386 EVT VT = BVN->getValueType(0);
15387 APInt SplatBits, SplatUndef;
15388 unsigned SplatBitSize;
15389 bool HasAnyUndefs;
15390 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
15391 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
15392
15393 for (unsigned i = 0; i < NumSplats; ++i) {
15394 CnstBits <<= SplatBitSize;
15395 UndefBits <<= SplatBitSize;
15396 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
15397 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
15398 }
15399
15400 return true;
15401 }
15402
15403 return false;
15404}
15405
15406 // Try 64-bit splatted SIMD immediate.
//
// Only considered when the high and low 64 bits of Bits are equal. On success
// returns NewOp (result type v2i64 for a 128-bit Op, f64 otherwise) applied
// to the immediate, NVCAST to Op's type; otherwise an empty SDValue.
15407 static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15408 const APInt &Bits) {
15409 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15410 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15411 EVT VT = Op.getValueType();
15412 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
15413
// NOTE(review): two lines are not present in this listing here (the listing
// numbering skips them) - presumably the `if` that tests whether Value is
// encodable, opening the scope closed below, and the re-encoding of Value.
15416
15417 SDLoc DL(Op);
15418 SDValue Mov =
15419 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15420 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15421 }
15422 }
15423
15424 return SDValue();
15425 }
15426
15427 // Try 32-bit splatted SIMD immediate.
//
// Only considered when the high and low 64 bits of Bits are equal. The low
// 64-bit pattern is tested against modified-immediate types 1 to 4, which
// select a Shift operand of 0, 8, 16 or 24. On a match NewOp is built on
// v4i32 (128-bit Op) or v2i32 with the immediate and Shift as operands -
// preceded by *LHS, NVCAST to that type, when LHS is non-null - and the
// result is NVCAST to Op's type. Otherwise returns an empty SDValue.
15428 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15429 const APInt &Bits,
15430 const SDValue *LHS = nullptr) {
15431 EVT VT = Op.getValueType();
// NOTE(review): the second half of this condition is not present in this
// listing (the listing numbering skips a line).
15432 if (VT.isFixedLengthVector() &&
15434 return SDValue();
15435
15436 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15437 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15438 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
15439 bool isAdvSIMDModImm = false;
// Shift is only read when isAdvSIMDModImm is true, i.e. after one of the
// branches below has assigned it.
15440 uint64_t Shift;
15441
// NOTE(review): in each branch below one line is not present in this listing
// (the listing numbering skips it) - presumably the statement that re-encodes
// Value for the matched immediate type.
15442 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
15444 Shift = 0;
15445 }
15446 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
15448 Shift = 8;
15449 }
15450 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
15452 Shift = 16;
15453 }
15454 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
15456 Shift = 24;
15457 }
15458
15459 if (isAdvSIMDModImm) {
15460 SDLoc DL(Op);
15461 SDValue Mov;
15462
15463 if (LHS)
15464 Mov = DAG.getNode(NewOp, DL, MovTy,
15465 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
15466 DAG.getConstant(Value, DL, MVT::i32),
15467 DAG.getConstant(Shift, DL, MVT::i32));
15468 else
15469 Mov =
15470 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
15471 DAG.getConstant(Shift, DL, MVT::i32));
15472
15473 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15474 }
15475 }
15476
15477 return SDValue();
15478 }
15479
15480 // Try 16-bit splatted SIMD immediate.
//
// Only considered when the high and low 64 bits of Bits are equal. The low
// 64-bit pattern is tested against modified-immediate types 5 and 6, which
// select a Shift operand of 0 or 8. On a match NewOp is built on v8i16
// (128-bit Op) or v4i16 with the immediate and Shift as operands - preceded
// by *LHS, NVCAST to that type, when LHS is non-null - and the result is
// NVCAST to Op's type. Otherwise returns an empty SDValue.
15481 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15482 const APInt &Bits,
15483 const SDValue *LHS = nullptr) {
15484 EVT VT = Op.getValueType();
// NOTE(review): the second half of this condition is not present in this
// listing (the listing numbering skips a line).
15485 if (VT.isFixedLengthVector() &&
15487 return SDValue();
15488
15489 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15490 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15491 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
15492 bool isAdvSIMDModImm = false;
// Shift is only read when isAdvSIMDModImm is true, i.e. after one of the
// branches below has assigned it.
15493 uint64_t Shift;
15494
// NOTE(review): in each branch below one line is not present in this listing
// (the listing numbering skips it) - presumably the statement that re-encodes
// Value for the matched immediate type.
15495 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
15497 Shift = 0;
15498 }
15499 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
15501 Shift = 8;
15502 }
15503
15504 if (isAdvSIMDModImm) {
15505 SDLoc DL(Op);
15506 SDValue Mov;
15507
15508 if (LHS)
15509 Mov = DAG.getNode(NewOp, DL, MovTy,
15510 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
15511 DAG.getConstant(Value, DL, MVT::i32),
15512 DAG.getConstant(Shift, DL, MVT::i32));
15513 else
15514 Mov =
15515 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
15516 DAG.getConstant(Shift, DL, MVT::i32));
15517
15518 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15519 }
15520 }
15521
15522 return SDValue();
15523 }
15524
15525 // Try 32-bit splatted SIMD immediate with shifted ones.
//
// Only considered when the high and low 64 bits of Bits are equal. The low
// 64-bit pattern is tested against modified-immediate types 7 and 8, which
// select a Shift operand of 264 or 272. On a match NewOp is built on v4i32
// (128-bit Op) or v2i32 with the immediate and Shift as operands and NVCAST
// to Op's type. Otherwise returns an empty SDValue.
//
// NOTE(review): the line that opens this definition (return type, name and
// leading parameters) is not present in this listing; the body reads `NewOp`
// and `Op`.
15527 SelectionDAG &DAG, const APInt &Bits) {
15528 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15529 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15530 EVT VT = Op.getValueType();
15531 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
15532 bool isAdvSIMDModImm = false;
// Shift is only read when isAdvSIMDModImm is true, i.e. after one of the
// branches below has assigned it.
15533 uint64_t Shift;
15534
// NOTE(review): in each branch below one line is not present in this listing
// (the listing numbering skips it) - presumably the statement that re-encodes
// Value. The constants 264 and 272 look like encoded shifter operands rather
// than plain bit counts - confirm against the shifter encoding helpers.
15535 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
15537 Shift = 264;
15538 }
15539 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
15541 Shift = 272;
15542 }
15543
15544 if (isAdvSIMDModImm) {
15545 SDLoc DL(Op);
15546 SDValue Mov =
15547 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
15548 DAG.getConstant(Shift, DL, MVT::i32));
15549 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15550 }
15551 }
15552
15553 return SDValue();
15554 }
15555
15556 // Try 8-bit splatted SIMD immediate.
//
// Only considered when the high and low 64 bits of Bits are equal. On success
// returns NewOp (result type v16i8 for a 128-bit Op, v8i8 otherwise) applied
// to the immediate, NVCAST to Op's type; otherwise an empty SDValue.
15557 static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15558 const APInt &Bits) {
15559 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15560 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15561 EVT VT = Op.getValueType();
15562 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
15563
// NOTE(review): two lines are not present in this listing here (the listing
// numbering skips them) - presumably the `if` that tests whether Value is
// encodable, opening the scope closed below, and the re-encoding of Value.
15566
15567 SDLoc DL(Op);
15568 SDValue Mov =
15569 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15570 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15571 }
15572 }
15573
15574 return SDValue();
15575 }
15576
15577 // Try FP splatted SIMD immediate.
//
// Only considered when the high and low 64 bits of Bits are equal. The low
// 64-bit pattern is tested against modified-immediate type 11 (result built
// on v4f32 for a 128-bit Op, v2f32 otherwise) and, for 128-bit Op only,
// type 12 (result built on v2f64). On a match NewOp is applied to the
// immediate and NVCAST to Op's type. Otherwise returns an empty SDValue.
15578 static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15579 const APInt &Bits) {
15580 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15581 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15582 EVT VT = Op.getValueType();
15583 bool isWide = (VT.getSizeInBits() == 128);
// MovTy is only read when isAdvSIMDModImm is true, i.e. after one of the
// branches below has assigned it.
15584 MVT MovTy;
15585 bool isAdvSIMDModImm = false;
15586
// NOTE(review): in each branch below one line is not present in this listing
// (the listing numbering skips it) - presumably the statement that re-encodes
// Value for the matched immediate type.
15587 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
15589 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
15590 }
15591 else if (isWide &&
15592 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
15594 MovTy = MVT::v2f64;
15595 }
15596
15597 if (isAdvSIMDModImm) {
15598 SDLoc DL(Op);
15599 SDValue Mov =
15600 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15601 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15602 }
15603 }
15604
15605 return SDValue();
15606 }
15607
15608 // Specialized code to quickly find if PotentialBVec is a BuildVector that
15609 // consists of only the same constant int value, returned in reference arg
15610 // ConstVal
// Returns true and stores the element's zero-extended value in ConstVal on
// success; returns false and leaves ConstVal untouched otherwise.
15611 static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
15612 uint64_t &ConstVal) {
15613 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
15614 if (!Bvec)
15615 return false;
// NOTE(review): source line 15616 is missing from this listing; it declares
// FirstElt, presumably as operand 0 cast to ConstantSDNode - confirm.
15617 if (!FirstElt)
15618 return false;
15619 EVT VT = Bvec->getValueType(0);
15620 unsigned NumElts = VT.getVectorNumElements();
// The comparison below is on node pointers: each remaining operand must be the
// very same ConstantSDNode as FirstElt. A non-constant operand makes dyn_cast
// yield nullptr, which also differs from FirstElt and rejects the vector.
15621 for (unsigned i = 1; i < NumElts; ++i)
15622 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
15623 return false;
15624 ConstVal = FirstElt->getZExtValue();
15625 return true;
15626}
15627
// Returns true when N, after stripping any AArch64ISD::REINTERPRET_CAST
// wrappers, is a constant splat vector of all zeros.
// NOTE(review): the listing skips source line 15628, which holds this
// function's signature (N is declared there) - confirm in the full file.
15629 // Look through cast.
15630 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
15631 N = N.getOperand(0);
15632
15633 return ISD::isConstantSplatVectorAllZeros(N.getNode());
15634}
15635
// Returns true when N can be treated as a predicate with every lane active:
// either a constant all-ones splat, or a "ptrue ..., all" with at least as
// many (minimum) elements as the type N had on entry.
// NOTE(review): the listing skips source line 15636, which holds this
// function's signature (N is declared there) - confirm in the full file.
// Element count of the type the caller actually uses, captured before N is
// replaced by the cast's source below.
15637 unsigned NumElts = N.getValueType().getVectorMinNumElements();
15638
15639 // Look through cast.
15640 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
15641 N = N.getOperand(0);
15642 // When reinterpreting from a type with fewer elements the "new" elements
15643 // are not active, so bail if they're likely to be used.
15644 if (N.getValueType().getVectorMinNumElements() < NumElts)
15645 return false;
15646 }
15647
15648 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
15649 return true;
15650
15651 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
15652 // or smaller than the implicit element type represented by N.
15653 // NOTE: A larger element count implies a smaller element type.
15654 if (N.getOpcode() == AArch64ISD::PTRUE &&
15655 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
15656 return N.getValueType().getVectorMinNumElements() >= NumElts;
15657
// Anything else is not provably all-active.
15658 return false;
15659}
15660
15661 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
15662 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
15663 // BUILD_VECTOR with constant element C1, C2 is a constant, and:
15664 // - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
15665 // - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
15666 // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
// Returns the new VSLI/VSRI node on success and an empty SDValue otherwise.
// NOTE(review): the listing skips source line 15667, which holds this
// function's signature (N and DAG are declared there) - confirm.
15668 EVT VT = N->getValueType(0);
15669
15670 if (!VT.isVector())
15671 return SDValue();
15672
15673 SDLoc DL(N);
15674
15675 SDValue And;
15676 SDValue Shift;
15677
15678 SDValue FirstOp = N->getOperand(0);
15679 unsigned FirstOpc = FirstOp.getOpcode();
15680 SDValue SecondOp = N->getOperand(1);
15681 unsigned SecondOpc = SecondOp.getOpcode();
15682
15683 // Is one of the operands an AND or a BICi? The AND may have been optimised to
15684 // a BICi in order to use an immediate instead of a register.
15685 // Is the other operand an shl or lshr? This will have been turned into:
15686 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
15687 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
15688 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
15689 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
15690 SecondOpc == AArch64ISD::SHL_PRED ||
15691 SecondOpc == AArch64ISD::SRL_PRED)) {
15692 And = FirstOp;
15693 Shift = SecondOp;
15694
15695 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
15696 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
15697 FirstOpc == AArch64ISD::SHL_PRED ||
15698 FirstOpc == AArch64ISD::SRL_PRED)) {
15699 And = SecondOp;
15700 Shift = FirstOp;
15701 } else
15702 return SDValue();
15703
// Classify what was matched: plain AND vs BICi, left vs right shift, and
// whether the shift is a predicated form (which carries a leading mask operand).
15704 bool IsAnd = And.getOpcode() == ISD::AND;
15705 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
15706 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15707 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
15708 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15709
15710 // Is the shift amount constant and are all lanes active?
// NOTE(review): source lines 15716 and 15720 are missing from this listing;
// presumably 15716 tests that the predicated shift amount is a constant splat
// (filling C) and 15720 completes the dyn_cast that defines C2node - confirm.
15711 uint64_t C2;
15712 if (ShiftHasPredOp) {
15713 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
15714 return SDValue();
15715 APInt C;
15717 return SDValue();
15718 C2 = C.getZExtValue();
15719 } else if (ConstantSDNode *C2node =
15721 C2 = C2node->getZExtValue();
15722 else
15723 return SDValue();
15724
15725 APInt C1AsAPInt;
15726 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
15727 if (IsAnd) {
15728 // Is the and mask vector all constant?
15729 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
15730 return SDValue();
15731 } else {
15732 // Reconstruct the corresponding AND immediate from the two BICi immediates.
15733 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
15734 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
15735 assert(C1nodeImm && C1nodeShift);
// BICi clears (imm << shift), so the equivalent AND mask is its complement,
// resized to the element width.
15736 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
15737 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
15738 }
15739
15740 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
15741 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
15742 // how much one can shift elements of a particular size?
15743 if (C2 > ElemSizeInBits)
15744 return SDValue();
15745
// The mask must keep exactly the bits the shifted value cannot reach: the top
// C2 bits for a right shift, the bottom C2 bits for a left shift.
15746 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
15747 : APInt::getLowBitsSet(ElemSizeInBits, C2);
15748 if (C1AsAPInt != RequiredC1)
15749 return SDValue();
15750
// Predicated shifts carry the mask as operand 0, so the shifted vector is
// operand 1 and the immediate has to be rebuilt from C2; unpredicated shifts
// already hold the vector and the immediate as operands 0 and 1.
15751 SDValue X = And.getOperand(0);
15752 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
15753 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
15754 : Shift.getOperand(1);
15755
15756 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
15757 return DAG.getNode(Inst, DL, VT, X, Y, Imm);
15758}
15759
// Tries to turn an OR of two ANDs into an AArch64ISD::BSP node. Returns the
// new node on success and an empty SDValue otherwise.
// NOTE(review): the listing skips source line 15760, which holds this
// function's signature (N and DAG are declared there) - confirm.
15761 EVT VT = N->getValueType(0);
15762 assert(VT.isVector() && "Expected vector type in tryLowerToBSL\n");
15763 SDLoc DL(N);
15764 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15765
// Scalable vectors are only handled when SVE2 is available.
15766 if (VT.isScalableVector() && !Subtarget.hasSVE2())
15767 return SDValue();
15768
// Both operands of N must be ANDs.
15769 SDValue N0 = N->getOperand(0);
15770 if (N0.getOpcode() != ISD::AND)
15771 return SDValue();
15772
15773 SDValue N1 = N->getOperand(1);
15774 if (N1.getOpcode() != ISD::AND)
15775 return SDValue();
15776
15777 // InstCombine does (not (neg a)) => (add a -1).
15778 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
15779 // Loop over all combinations of AND operands.
15780 for (int i = 1; i >= 0; --i) {
15781 for (int j = 1; j >= 0; --j) {
15782 SDValue O0 = N0->getOperand(i);
15783 SDValue O1 = N1->getOperand(j);
15784 SDValue Sub, Add, SubSibling, AddSibling;
15785
15786 // Find a SUB and an ADD operand, one from each AND.
15787 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
15788 Sub = O0;
15789 Add = O1;
15790 SubSibling = N0->getOperand(1 - i);
15791 AddSibling = N1->getOperand(1 - j);
15792 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
15793 Add = O0;
15794 Sub = O1;
15795 AddSibling = N0->getOperand(1 - i);
15796 SubSibling = N1->getOperand(1 - j);
15797 } else
15798 continue;
15799
// The SUB must be (0 - a), i.e. a negation.
15800 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
15801 continue;
15802
15803 // Constant ones is always righthand operand of the Add.
15804 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
15805 continue;
15806
// The negated value and the decremented value must be the same 'a'.
15807 if (Sub.getOperand(1) != Add.getOperand(0))
15808 continue;
15809
15810 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
15811 }
15812 }
15813
15814 // (or (and a b) (and (not a) c)) => (bsl a b c)
15815 // We only have to look for constant vectors here since the general, variable
15816 // case can be handled in TableGen.
// NOTE(review): source lines 15823, 15828-15829 and 15835-15836 are missing
// from this listing. Presumably 15823 is the matching constant-splat test on
// N1's operand j (filling Val2), 15828-15829 declare BVN0/BVN1 and
// 15835-15836 declare CN0/CN1 for lane k - confirm in the full file.
15817 unsigned Bits = VT.getScalarSizeInBits();
15818 for (int i = 1; i >= 0; --i)
15819 for (int j = 1; j >= 0; --j) {
15820 APInt Val1, Val2;
15821
15822 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
15824 ~Val1.trunc(Bits) == Val2.trunc(Bits)) {
15825 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15826 N0->getOperand(1 - i), N1->getOperand(1 - j));
15827 }
15830 if (!BVN0 || !BVN1)
15831 continue;
15832
// Per-lane check: every lane of one mask must be the bitwise complement of the
// corresponding lane of the other, compared at the element width.
15833 bool FoundMatch = true;
15834 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
15837 if (!CN0 || !CN1 ||
15838 CN0->getAPIntValue().trunc(Bits) !=
15839 ~CN1->getAsAPIntVal().trunc(Bits)) {
15840 FoundMatch = false;
15841 break;
15842 }
15843 }
15844 if (FoundMatch)
15845 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15846 N0->getOperand(1 - i), N1->getOperand(1 - j));
15847 }
15848
// No bit-select pattern matched.
15849 return SDValue();
15850}
15851
15852SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
15853 SelectionDAG &DAG) const {
15854 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15855 !Subtarget->isNeonAvailable()))
15856 return LowerToScalableOp(Op, DAG);
15857
15858 if (SDValue Res = tryLowerToBSL(Op, DAG))
15859 return Res;
15860
15861 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
15862 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
15863 return Res;
15864
15865 EVT VT = Op.getValueType();
15866 if (VT.isScalableVector())
15867 return Op;
15868
15869 SDValue LHS = Op.getOperand(0);
15870 BuildVectorSDNode *BVN =
15871 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
15872 if (!BVN) {
15873 // OR commutes, so try swapping the operands.
15874 LHS = Op.getOperand(1);
15875 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
15876 }
15877 if (!BVN)
15878 return Op;
15879
15880 APInt DefBits(VT.getSizeInBits(), 0);
15881 APInt UndefBits(VT.getSizeInBits(), 0);
15882 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15883 SDValue NewOp;
15884
15885 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15886 DefBits, &LHS)) ||
15887 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15888 DefBits, &LHS)))
15889 return NewOp;
15890
15891 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15892 UndefBits, &LHS)) ||
15893 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15894 UndefBits, &LHS)))
15895 return NewOp;
15896 }
15897
15898 // We can always fall back to a non-immediate OR.
15899 return Op;
15900}
15901
15902 // Normalize the operands of BUILD_VECTOR. The value of constant operands will
15903 // be truncated to fit element width.
// Vectors with floating-point elements, or elements wider than 16 bits, are
// returned unchanged. Otherwise a new BUILD_VECTOR of the same type is built
// whose constant, POISON and UNDEF lanes are all re-created as i32 nodes.
// NOTE(review): the listing skips source lines 15904 (start of the signature,
// where Op is declared) and 15914 (presumably the declaration of the Ops
// container used below) - confirm in the full file.
15905 SelectionDAG &DAG) {
15906 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
15907 SDLoc DL(Op);
15908 EVT VT = Op.getValueType();
15909 EVT EltTy= VT.getVectorElementType();
15910
15911 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
15912 return Op;
15913
15915 for (SDValue Lane : Op->ops()) {
15916 // For integer vectors, type legalization would have promoted the
15917 // operands already. Otherwise, if Op is a floating-point splat
15918 // (with operands cast to integers), then the only possibilities
15919 // are constants and UNDEFs.
15920 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
// Drop any bits above the element width, then re-materialise as an i32.
15921 Lane = DAG.getConstant(
15922 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
15923 DL, MVT::i32);
15924 } else if (Lane.getOpcode() == ISD::POISON) {
15925 Lane = DAG.getPOISON(MVT::i32);
15926 } else if (Lane.getOpcode() == ISD::UNDEF) {
15927 Lane = DAG.getUNDEF(MVT::i32);
15928 } else {
// Any other lane is kept as-is and is expected to already be an i32.
15929 assert(Lane.getValueType() == MVT::i32 &&
15930 "Unexpected BUILD_VECTOR operand type");
15931 }
15932 Ops.push_back(Lane);
15933 }
15934 return DAG.getBuildVector(VT, DL, Ops);
15935}
15936
// Tries to materialise a 128-bit constant as a 64-bit splat: on success
// returns an AArch64ISD::DUP of the 64-bit value on v2i64, NVCAST to Op's
// value type; otherwise returns an empty SDValue.
// NOTE(review): the listing skips source line 15937, which holds the start of
// this signature (Op and DAG are declared there) - confirm in the full file.
15938 const AArch64Subtarget *ST, APInt &DefBits) {
15939 EVT VT = Op.getValueType();
15940 // TODO: We should be able to support 64-bit destinations too
// Requires SVE, a 128-bit vector, and identical upper and lower 64-bit halves.
15941 if (!ST->hasSVE() || !VT.is128BitVector() ||
15942 DefBits.getHiBits(64) != DefBits.getLoBits(64))
15943 return SDValue();
15944
15945 // See if we can make use of the SVE dup instruction.
// Give up only if the 64-bit value is neither accepted by isSVECpyDupImm nor
// by isSVELogicalImm; ImmVal, ShiftVal and Encoding are outputs of those
// checks and are not used afterwards.
15946 APInt Val64 = DefBits.trunc(64);
15947 int32_t ImmVal, ShiftVal;
15948 uint64_t Encoding;
15949 if (!AArch64_AM::isSVECpyDupImm(64, Val64.getSExtValue(), ImmVal, ShiftVal) &&
15950 !AArch64_AM::isSVELogicalImm(64, Val64.getZExtValue(), Encoding))
15951 return SDValue();
15952
15953 SDLoc DL(Op);
15954 SDValue SplatVal = DAG.getNode(AArch64ISD::DUP, DL, MVT::v2i64,
15955 DAG.getConstant(Val64, DL, MVT::i64));
15956 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, SplatVal);
15957}
15958
// Tries to materialise a constant 64- or 128-bit BUILD_VECTOR without a load.
// Attempts, in order: the MOVI/MVNI/FMOV immediate forms on the defined bits,
// the same on the undef-filled bits, an SVE 64-bit splat, and finally an
// immediate whose per-element sign bits are flipped followed by an FNEG.
// Returns an empty SDValue when nothing applies.
// NOTE(review): the listing skips source lines 15959 (start of the signature,
// where Op and DAG are declared) and 15967 (presumably the declaration of
// BVN, the BuildVectorSDNode for Op) - confirm in the full file.
15960 const AArch64Subtarget *ST) {
15961 EVT VT = Op.getValueType();
15962 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
15963 "Expected a legal NEON vector");
15964
15965 APInt DefBits(VT.getSizeInBits(), 0);
15966 APInt UndefBits(VT.getSizeInBits(), 0);
15968 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
// Tries every immediate form on the given bit pattern, then the inverting
// (MVNI) forms on its complement. Note the parameter shadows the outer DefBits.
15969 auto TryMOVIWithBits = [&](APInt DefBits) {
15970 SDValue NewOp;
15971 if ((NewOp =
15972 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
15973 (NewOp =
15974 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15975 (NewOp =
15976 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
15977 (NewOp =
15978 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15979 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
15980 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
15981 return NewOp;
15982
15983 APInt NotDefBits = ~DefBits;
15984 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
15985 NotDefBits)) ||
15986 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
15987 NotDefBits)) ||
15988 (NewOp =
15989 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
15990 return NewOp;
15991 return SDValue();
15992 };
15993 if (SDValue R = TryMOVIWithBits(DefBits))
15994 return R;
15995 if (SDValue R = TryMOVIWithBits(UndefBits))
15996 return R;
15997
15998 // Try to materialise the constant using SVE when available.
15999 if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits))
16000 return R;
16001
16002 // See if a fneg of the constant can be materialized with a MOVI, etc
16003 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
16004 // FNegate each sub-element of the constant
16005 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
// Neg is the top (sign) bit of one FVT-sized element, widened to the vector
// width; the loop ORs a copy of it into every element position of NegBits.
16006 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
16007 .zext(VT.getSizeInBits());
16008 APInt NegBits(VT.getSizeInBits(), 0);
16009 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
16010 for (unsigned i = 0; i < NumElts; i++)
16011 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
// XOR flips the top bit of every element of the requested constant.
16012 NegBits = DefBits ^ NegBits;
16013
16014 // Try to create the new constants with MOVI, and if so generate a fneg
16015 // for it.
16016 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
16017 SDLoc DL(Op);
// A single element uses the scalar FVT itself rather than a vector type.
16018 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
16019 return DAG.getNode(
16020 AArch64ISD::NVCAST, DL, VT,
16021 DAG.getNode(ISD::FNEG, DL, VFVT,
16022 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
16023 }
16024 return SDValue();
16025 };
// Element sizes are tried as f32, then f64, then f16; f16 is only attempted
// when the subtarget has full FP16 support.
16026 SDValue R;
16027 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
16028 (R = TryWithFNeg(DefBits, MVT::f64)) ||
16029 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
16030 return R;
16031 }
16032
16033 return SDValue();
16034}
16035
// Lowers a fixed-length BUILD_VECTOR through its scalable container type.
// Arithmetic sequences become (splat start) + step-vector; other eligible
// vectors are built from single-lane vectors merged pairwise with ZIP1.
// Returns an empty SDValue when neither strategy applies.
16036 SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
16037 SDValue Op, SelectionDAG &DAG) const {
16038 EVT VT = Op.getValueType();
16039 SDLoc DL(Op);
16040 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
16041 auto *BVN = cast<BuildVectorSDNode>(Op);
16042
// SeqInfo->first is used as the start value and SeqInfo->second as the step.
16043 if (auto SeqInfo = BVN->isArithmeticSequence()) {
16044 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
16045 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
16046 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
16047 return convertFromScalableVector(DAG, VT, Seq);
16048 }
16049
// The ZIP1 path needs a power-of-two vector type of at most 128 bits with more
// than one element, and is not used for constant vectors.
16050 unsigned NumElems = VT.getVectorNumElements();
16051 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
16052 NumElems <= 1 || BVN->isConstant())
16053 return SDValue();
16054
16055 auto IsExtractElt = [](SDValue Op) {
16056 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
16057 };
16058
16059 // For integer types that are not already in vectors limit to at most four
16060 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
16061 if (VT.getScalarType().isInteger() &&
16062 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
16063 return SDValue();
16064
16065 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
// NOTE(review): source line 16067 is missing from this listing; presumably it
// declares Intermediates and opens the call that maps each operand through
// the lambda below - confirm in the full file.
// The lambda maps an undef operand to the shared poison vector and any other
// operand to a vector with that value inserted at lane 0.
16066 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
16068 Op->op_values(), [&, Poison = DAG.getPOISON(ContainerVT)](SDValue Op) {
16069 return Op.isUndef() ? Poison
16070 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
16071 ContainerVT, Poison, Op, ZeroI64);
16072 });
16073
// Each pass halves the number of intermediates by zipping neighbours, and
// halves the element count of the type used for the next pass.
16074 ElementCount ZipEC = ContainerVT.getVectorElementCount();
16075 while (Intermediates.size() > 1) {
16076 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
16077
16078 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
16079 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
16080 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
// When the second half of a pair is undef the first is kept without a ZIP1.
16081 Intermediates[I / 2] =
16082 Op1.isUndef() ? Op0
16083 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
16084 }
16085
16086 Intermediates.resize(Intermediates.size() / 2);
16087 ZipEC = ZipEC.divideCoefficientBy(2);
16088 }
16089
16090 assert(Intermediates.size() == 1);
16091 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
16092 return convertFromScalableVector(DAG, VT, Vec);
16093}
16094
16095SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
16096 SelectionDAG &DAG) const {
16097 EVT VT = Op.getValueType();
16098
16099 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16100 cast<BuildVectorSDNode>(Op)->isArithmeticSequence();
16101 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
16102 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
16103
16104 // Try to build a simple constant vector.
16105 Op = NormalizeBuildVector(Op, DAG);
16106 // NormalizeBuildVector might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS);
16107 // if so, abort.
16108 if (Op.getOpcode() != ISD::BUILD_VECTOR)
16109 return SDValue();
16110
16111 // Certain vector constants, used to express things like logical NOT and
16112 // arithmetic NEG, are passed through unmodified. This allows special
16113 // patterns for these operations to match, which will lower these constants
16114 // to whatever is proven necessary.
16115 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
16116 if (BVN->isConstant()) {
16117 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
16118 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
16119 APInt Val(BitSize,
16120 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
16121 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
16122 return Op;
16123 }
16124 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
16125 if (Const->isZero() && !Const->isNegative())
16126 return Op;
16127 }
16128
16129 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
16130 return V;
16131
16132 // Scan through the operands to find some interesting properties we can
16133 // exploit:
16134 // 1) If only one value is used, we can use a DUP, or
16135 // 2) if only the low element is not undef, we can just insert that, or
16136 // 3) if only one constant value is used (w/ some non-constant lanes),
16137 // we can splat the constant value into the whole vector then fill
16138 // in the non-constant lanes.
16139 // 4) FIXME: If different constant values are used, but we can intelligently
16140 // select the values we'll be overwriting for the non-constant
16141 // lanes such that we can directly materialize the vector
16142 // some other way (MOVI, e.g.), we can be sneaky.
16143 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
16144 SDLoc DL(Op);
16145 unsigned NumElts = VT.getVectorNumElements();
16146 bool isOnlyLowElement = true;
16147 bool usesOnlyOneValue = true;
16148 bool usesOnlyOneConstantValue = true;
16149 bool isConstant = true;
16150 bool AllLanesExtractElt = true;
16151 unsigned NumConstantLanes = 0;
16152 unsigned NumDifferentLanes = 0;
16153 unsigned NumUndefLanes = 0;
16154 SDValue Value;
16155 SDValue ConstantValue;
16156 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
16157 unsigned ConsecutiveValCount = 0;
16158 SDValue PrevVal;
16159 auto IsZero = [&](SDValue V) {
16160 return isNullConstant(V) || isNullFPConstant(V);
16161 };
16162 bool MaybeLowHalfZeroHigh =
16163 VT.isFixedLengthVector() && VT.getSizeInBits() == 128;
16164 unsigned HalfElts = MaybeLowHalfZeroHigh ? (NumElts >> 1) : 0;
16165 SDValue LowHalfFirstVal = MaybeLowHalfZeroHigh ? Op.getOperand(0) : SDValue();
16166 for (unsigned i = 0; i < NumElts; ++i) {
16167 SDValue V = Op.getOperand(i);
16168 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16169 AllLanesExtractElt = false;
16170 if (V.isUndef()) {
16171 ++NumUndefLanes;
16172 MaybeLowHalfZeroHigh = false;
16173 continue;
16174 }
16175 if (i > 0)
16176 isOnlyLowElement = false;
16177 if (!isIntOrFPConstant(V))
16178 isConstant = false;
16179
16180 if (isIntOrFPConstant(V)) {
16181 ++NumConstantLanes;
16182 if (!ConstantValue.getNode())
16183 ConstantValue = V;
16184 else if (ConstantValue != V)
16185 usesOnlyOneConstantValue = false;
16186 }
16187
16188 if (!Value.getNode())
16189 Value = V;
16190 else if (V != Value) {
16191 usesOnlyOneValue = false;
16192 ++NumDifferentLanes;
16193 }
16194
16195 if (PrevVal != V) {
16196 ConsecutiveValCount = 0;
16197 PrevVal = V;
16198 }
16199 if (MaybeLowHalfZeroHigh) {
16200 if (i < HalfElts) {
16201 if (V != LowHalfFirstVal)
16202 MaybeLowHalfZeroHigh = false;
16203 } else if (!IsZero(V)) {
16204 MaybeLowHalfZeroHigh = false;
16205 }
16206 }
16207
16208 // Keep different values and its last consecutive count. For example,
16209 //
16210 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
16211 // t24, t24, t24, t24, t24, t24, t24, t24
16212 // t23 = consecutive count 8
16213 // t24 = consecutive count 8
16214 // ------------------------------------------------------------------
16215 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
16216 // t24, t24, t24, t24, t24, t24, t24, t24
16217 // t23 = consecutive count 5
16218 // t24 = consecutive count 9
16219 DifferentValueMap[V] = ++ConsecutiveValCount;
16220 }
16221
16222 if (!Value.getNode()) {
16223 LLVM_DEBUG(
16224 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
16225 return DAG.getUNDEF(VT);
16226 }
16227
16228 // Convert BUILD_VECTOR where all elements but the lowest are undef into
16229 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
16230 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
16231 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
16232 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
16233 "SCALAR_TO_VECTOR node\n");
16234 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
16235 }
16236
16237 if (MaybeLowHalfZeroHigh && LowHalfFirstVal.getNode() &&
16238 !LowHalfFirstVal.isUndef() && !isIntOrFPConstant(LowHalfFirstVal)) {
16239 EVT LaneVT = VT.getVectorElementType();
16240 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16241
16242 SDValue HiZero = LaneVT.isInteger() ? DAG.getConstant(0, DL, HalfVT)
16243 : DAG.getConstantFP(0.0, DL, HalfVT);
16244
16245 SDValue LoHalf =
16246 LaneVT.getSizeInBits() == 64
16247 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, HalfVT, LowHalfFirstVal)
16248 : DAG.getNode(AArch64ISD::DUP, DL, HalfVT, LowHalfFirstVal);
16249
16250 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoHalf, HiZero);
16251 }
16252
16253 if (AllLanesExtractElt) {
16254 SDNode *Vector = nullptr;
16255 bool Even = false;
16256 bool Odd = false;
16257 // Check whether the extract elements match the Even pattern <0,2,4,...> or
16258 // the Odd pattern <1,3,5,...>.
16259 for (unsigned i = 0; i < NumElts; ++i) {
16260 SDValue V = Op.getOperand(i);
16261 const SDNode *N = V.getNode();
16262 if (!isa<ConstantSDNode>(N->getOperand(1))) {
16263 Even = false;
16264 Odd = false;
16265 break;
16266 }
16267 SDValue N0 = N->getOperand(0);
16268
16269 // All elements are extracted from the same vector.
16270 if (!Vector) {
16271 Vector = N0.getNode();
16272 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
16273 // BUILD_VECTOR.
16274 if (VT.getVectorElementType() !=
16276 break;
16277 } else if (Vector != N0.getNode()) {
16278 Odd = false;
16279 Even = false;
16280 break;
16281 }
16282
16283 // Extracted values are either at Even indices <0,2,4,...> or at Odd
16284 // indices <1,3,5,...>.
16285 uint64_t Val = N->getConstantOperandVal(1);
16286 if (Val == 2 * i) {
16287 Even = true;
16288 continue;
16289 }
16290 if (Val - 1 == 2 * i) {
16291 Odd = true;
16292 continue;
16293 }
16294
16295 // Something does not match: abort.
16296 Odd = false;
16297 Even = false;
16298 break;
16299 }
16300 if (Even || Odd) {
16301 SDValue LHS =
16303 DAG.getConstant(0, DL, MVT::i64));
16304 SDValue RHS =
16306 DAG.getConstant(NumElts, DL, MVT::i64));
16307
16308 if (Even && !Odd)
16309 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LHS, RHS);
16310 if (Odd && !Even)
16311 return DAG.getNode(AArch64ISD::UZP2, DL, VT, LHS, RHS);
16312 }
16313 }
16314
16315 // Use DUP for non-constant splats. For f32 constant splats, reduce to
16316 // i32 and try again.
16317 if (usesOnlyOneValue) {
16318 if (!isConstant) {
16319 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16320 Value.getValueType() != VT) {
16321 LLVM_DEBUG(
16322 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
16323 return DAG.getNode(AArch64ISD::DUP, DL, VT, Value);
16324 }
16325
16326 // This is actually a DUPLANExx operation, which keeps everything vectory.
16327
16328 SDValue Lane = Value.getOperand(1);
16329 Value = Value.getOperand(0);
16330 if (Value.getValueSizeInBits() == 64) {
16331 LLVM_DEBUG(
16332 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
16333 "widening it\n");
16334 Value = WidenVector(Value, DAG);
16335 }
16336
16337 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
16338 return DAG.getNode(Opcode, DL, VT, Value, Lane);
16339 }
16340
16343 EVT EltTy = VT.getVectorElementType();
16344 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
16345 EltTy == MVT::f64) && "Unsupported floating-point vector type");
16346 LLVM_DEBUG(
16347 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
16348 "BITCASTS, and try again\n");
16349 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
16350 for (unsigned i = 0; i < NumElts; ++i)
16351 Ops.push_back(DAG.getNode(ISD::BITCAST, DL, NewType, Op.getOperand(i)));
16352 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
16353 SDValue Val = DAG.getBuildVector(VecVT, DL, Ops);
16354 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
16355 Val.dump(););
16356 Val = LowerBUILD_VECTOR(Val, DAG);
16357 if (Val.getNode())
16358 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
16359 }
16360 }
16361
16362 // If we need to insert a small number of different non-constant elements and
16363 // the vector width is sufficiently large, prefer using DUP with the common
16364 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
16365 // skip the constant lane handling below.
16366 bool PreferDUPAndInsert =
16367 !isConstant && NumDifferentLanes >= 1 &&
16368 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
16369 NumDifferentLanes >= NumConstantLanes;
16370
16371 // If there was only one constant value used and for more than one lane,
16372 // start by splatting that value, then replace the non-constant lanes. This
16373 // is better than the default, which will perform a separate initialization
16374 // for each lane.
16375 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
16376 // Firstly, try to materialize the splat constant.
16377 SDValue Val = DAG.getSplatBuildVector(VT, DL, ConstantValue);
16378 unsigned BitSize = VT.getScalarSizeInBits();
16379 APInt ConstantValueAPInt(1, 0);
16380 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
16381 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
16382 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
16383 !ConstantValueAPInt.isAllOnes()) {
16384 Val = ConstantBuildVector(Val, DAG, Subtarget);
16385 if (!Val)
16386 // Otherwise, materialize the constant and splat it.
16387 Val = DAG.getNode(AArch64ISD::DUP, DL, VT, ConstantValue);
16388 }
16389
16390 // Now insert the non-constant lanes.
16391 for (unsigned i = 0; i < NumElts; ++i) {
16392 SDValue V = Op.getOperand(i);
16393 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
16394 if (!isIntOrFPConstant(V) && !V.isUndef())
16395 // Note that type legalization likely mucked about with the VT of the
16396 // source operand, so we may have to convert it here before inserting.
16397 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Val, V, LaneIdx);
16398 }
16399 return Val;
16400 }
16401
16402 // Handle 64-bit constant BUILD_VECTORs by packing them into an i64 immediate.
16403 // This is cheaper than a load if the immediate can be materialized in a few
16404 // mov instructions. This optimization is disabled for big-endian targets for
16405 // now.
16406 if (BVN->isConstant() && VT.isFixedLengthVector() &&
16407 VT.getSizeInBits() == 64 && !DAG.getDataLayout().isBigEndian()) {
16408 const SDLoc DL(Op);
16409 APInt PackedVal(64, 0);
16410 unsigned BitPos = 0;
16411
16412 unsigned EltSizeInBits = VT.getScalarSizeInBits();
16413 for (unsigned i = 0, e = BVN->getNumOperands(); i != e; ++i) {
16414 const SDValue &LaneOp = BVN->getOperand(i);
16415 APInt LaneBits;
16416 if (LaneOp.getOpcode() == ISD::UNDEF)
16417 LaneBits = APInt(EltSizeInBits, 0);
16418 else if (auto *C = dyn_cast<ConstantSDNode>(LaneOp))
16419 LaneBits = C->getAPIntValue();
16420 else if (auto *CFP = dyn_cast<ConstantFPSDNode>(LaneOp))
16421 LaneBits = CFP->getValueAPF().bitcastToAPInt();
16422 else
16423 return SDValue();
16424
16425 PackedVal |= LaneBits.trunc(VT.getScalarSizeInBits()).zext(64) << BitPos;
16426 BitPos += EltSizeInBits;
16427 }
16428
16429 // This optimization kicks in if the number of mov instructions
16430 // is at most 2
16432 AArch64_IMM::expandMOVImm(PackedVal.getZExtValue(), 64, Insns);
16433 if (Insns.size() > 2)
16434 return SDValue();
16435
16436 SDValue ScalarConst = DAG.getConstant(PackedVal, DL, MVT::i64);
16437 // Use BITCAST to reinterpret the scalar constant's bits as a vector.
16438 return DAG.getNode(ISD::BITCAST, DL, VT, ScalarConst);
16439 }
16440
16441 // This will generate a load from the constant pool.
16442 if (isConstant) {
16443 LLVM_DEBUG(
16444 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
16445 "expansion\n");
16446 return SDValue();
16447 }
16448
16449 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
16450 // v4i32s. This is really a truncate, which we can construct out of (legal)
16451 // concats and truncate nodes.
16453 return M;
16454
16455 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
16456 if (NumElts >= 4) {
16457 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
16458 return Shuffle;
16459
16460 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
16461 return Shuffle;
16462 }
16463
16464 if (PreferDUPAndInsert) {
16465 // First, build a constant vector with the common element.
16467 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
16468 // Next, insert the elements that do not match the common value.
16469 for (unsigned I = 0; I < NumElts; ++I)
16470 if (Op.getOperand(I) != Value)
16471 NewVector =
16472 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
16473 Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
16474
16475 return NewVector;
16476 }
16477
16478 // If vector consists of two different values, try to generate two DUPs and
16479 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
16480 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
16482 // Check the consecutive count of the value is the half number of vector
16483 // elements. In this case, we can use CONCAT_VECTORS. For example,
16484 //
16485 // canUseVECTOR_CONCAT = true;
16486 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
16487 // t24, t24, t24, t24, t24, t24, t24, t24
16488 //
16489 // canUseVECTOR_CONCAT = false;
16490 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
16491 // t24, t24, t24, t24, t24, t24, t24, t24
16492 bool canUseVECTOR_CONCAT = true;
16493 for (auto Pair : DifferentValueMap) {
16494 // Check different values have same length which is NumElts / 2.
16495 if (Pair.second != NumElts / 2)
16496 canUseVECTOR_CONCAT = false;
16497 Vals.push_back(Pair.first);
16498 }
16499
16500 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
16501 // CONCAT_VECTORs. For example,
16502 //
16503 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
16504 // t24, t24, t24, t24, t24, t24, t24, t24
16505 // ==>
16506 // t26: v8i8 = AArch64ISD::DUP t23
16507 // t28: v8i8 = AArch64ISD::DUP t24
16508 // t29: v16i8 = concat_vectors t26, t28
16509 if (canUseVECTOR_CONCAT) {
16510 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16511 if (isTypeLegal(SubVT) && SubVT.isVector() &&
16512 SubVT.getVectorNumElements() >= 2) {
16513 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
16514 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
16515 SDValue DUP1 =
16516 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops1), DAG);
16517 SDValue DUP2 =
16518 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops2), DAG);
16520 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, DUP1, DUP2);
16521 return CONCAT_VECTORS;
16522 }
16523 }
16524
16525 // Let's try to generate VECTOR_SHUFFLE. For example,
16526 //
16527 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
16528 // ==>
16529 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
16530 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
16531 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
16532 if (NumElts >= 8) {
16533 SmallVector<int, 16> MaskVec;
16534 // Build mask for VECTOR_SHUFFLE.
16535 SDValue FirstLaneVal = Op.getOperand(0);
16536 for (unsigned i = 0; i < NumElts; ++i) {
16537 SDValue Val = Op.getOperand(i);
16538 if (FirstLaneVal == Val)
16539 MaskVec.push_back(i);
16540 else
16541 MaskVec.push_back(i + NumElts);
16542 }
16543
16544 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
16545 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
16546 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops1);
16547 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops2);
16549 DAG.getVectorShuffle(VT, DL, VEC1, VEC2, MaskVec);
16550 return VECTOR_SHUFFLE;
16551 }
16552 }
16553
16554 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
16555 // know the default expansion would otherwise fall back on something even
16556 // worse. For a vector with one or two non-undef values, that's
16557 // scalar_to_vector for the elements followed by a shuffle (provided the
16558 // shuffle is valid for the target) and materialization element by element
16559 // on the stack followed by a load for everything else.
16560 if (!isConstant && !usesOnlyOneValue) {
16561 LLVM_DEBUG(
16562 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
16563 "of INSERT_VECTOR_ELT\n");
16564
16565 SDValue Vec = DAG.getPOISON(VT);
16566 SDValue Op0 = Op.getOperand(0);
16567 unsigned i = 0;
16568
16569 // Use SCALAR_TO_VECTOR for lane zero to
16570 // a) Avoid a RMW dependency on the full vector register, and
16571 // b) Allow the register coalescer to fold away the copy if the
16572 // value is already in an S or D register, and we're forced to emit an
16573 // INSERT_SUBREG that we can't fold anywhere.
16574 //
16575 // We also allow types like i8 and i16 which are illegal scalar but legal
16576 // vector element types. After type-legalization the inserted value is
16577 // extended (i32) and it is safe to cast them to the vector type by ignoring
16578 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
16579 if (!Op0.isUndef()) {
16580 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
16581 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Op0);
16582 ++i;
16583 }
16584 LLVM_DEBUG({
16585 if (i < NumElts)
16586 dbgs() << "Creating nodes for the other vector elements:\n";
16587 });
16588 for (; i < NumElts; ++i) {
16589 SDValue V = Op.getOperand(i);
16590 if (V.isUndef())
16591 continue;
16592 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
16593 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
16594 }
16595 return Vec;
16596 }
16597
16598 LLVM_DEBUG(
16599 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
16600 "better alternative\n");
16601 return SDValue();
16602}
16603
16604SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
16605 SelectionDAG &DAG) const {
16606 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
16607 !Subtarget->isNeonAvailable()))
16608 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
16609
16610 assert(Op.getValueType().isScalableVector() &&
16611 isTypeLegal(Op.getValueType()) &&
16612 "Expected legal scalable vector type!");
16613
16614 if (isTypeLegal(Op.getOperand(0).getValueType())) {
16615 unsigned NumOperands = Op->getNumOperands();
16616 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
16617 "Unexpected number of operands in CONCAT_VECTORS");
16618
16619 if (NumOperands == 2)
16620 return Op;
16621
16622 // Concat each pair of subvectors and pack into the lower half of the array.
16623 SmallVector<SDValue> ConcatOps(Op->ops());
16624 while (ConcatOps.size() > 1) {
16625 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
16626 SDValue V1 = ConcatOps[I];
16627 SDValue V2 = ConcatOps[I + 1];
16628 EVT SubVT = V1.getValueType();
16629 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
16630 ConcatOps[I / 2] =
16631 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
16632 }
16633 ConcatOps.resize(ConcatOps.size() / 2);
16634 }
16635 return ConcatOps[0];
16636 }
16637
16638 return SDValue();
16639}
16640
// Custom lowering for ISD::INSERT_VECTOR_ELT.
//
// Fixed-length vectors that should use SVE are forwarded to
// LowerFixedLengthInsertVectorElt. Vectors of i1 are handled by performing the
// insert on a wider type and converting back. For everything else the node is
// returned unchanged when the lane index is an in-range constant, and an empty
// SDValue is returned otherwise.
16641SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
16642 SelectionDAG &DAG) const {
16643 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
16644
16645 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
16646 !Subtarget->isNeonAvailable()))
16647 return LowerFixedLengthInsertVectorElt(Op, DAG);
16648
16649 EVT VT = Op.getValueType();
16650 SDValue Vec = Op.getOperand(0);
16651 SDValue Elt = Op.getOperand(1);
16652 SDValue Idx = Op.getOperand(2);
16653
16654 if (VT.getScalarType() == MVT::i1) {
16655 SDLoc DL(Op);
16656
16657 // MVT::nxv1i8 is not a legal type so widen->insert->shrink instead:
// place the nxv1i1 input at index 0 of a poison nxv2i1, insert there, and
// extract the low nxv1i1 again.
16658 if (VT == MVT::nxv1i1) {
16659 SDValue WidenVec =
16660 DAG.getInsertSubvector(DL, DAG.getPOISON(MVT::nxv2i1), Vec, 0);
16661 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::nxv2i1,
16662 WidenVec, Elt, Idx);
16663 return DAG.getExtractSubvector(DL, MVT::nxv1i1, Insert, 0);
16664 }
16665
// Any-extend the predicate to its promoted vector type, insert into that,
// and truncate the result back to the predicate type.
16666 EVT PromoteVT = getPromotedVTForPredicate(VT);
16667 SDValue PromoteVec = DAG.getNode(ISD::ANY_EXTEND, DL, PromoteVT, Vec);
// The inserted scalar is only widened when the promoted element type is i64.
16668 if (PromoteVT.getVectorElementType() == MVT::i64)
16669 Elt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Elt);
// NOTE(review): the statement that declares 'Insert' and opens this call is
// not present in this listing (the embedded numbering jumps from 16669 to
// 16671) - restore it from the upstream file.
16671 PromoteVec, Elt, Idx);
16672 return DAG.getNode(ISD::TRUNCATE, DL, VT, Insert);
16673 }
16674
16675 // Check for non-constant or out of range lane.
16676 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Idx);
16677 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
16678 return SDValue();
16679
16680 return Op;
16681}
16682
16683SDValue
16684AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
16685 SelectionDAG &DAG) const {
16686 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
16687 EVT VT = Op.getOperand(0).getValueType();
16688
16689 if (VT.getScalarType() == MVT::i1) {
16690 SDLoc DL(Op);
16691 // There are no operations to extend a nxv1i1 predicate to a nxv1i128 vector
16692 // An easy lowering is widening the input predicate to nxv2i1.
16693 if (VT == MVT::nxv1i1) {
16694 SDValue WidenedPred = DAG.getInsertSubvector(
16695 DL, DAG.getPOISON(MVT::nxv2i1), Op->getOperand(0), 0);
16696 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
16697 WidenedPred, Op.getOperand(1));
16698 }
16699 // We can't directly extract from an SVE predicate; extend it first.
16700 // (This isn't the only possible lowering, but it's straightforward.)
16701 EVT VectorVT = getPromotedVTForPredicate(VT);
16702 SDValue Extend =
16703 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
16704 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
16705 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
16706 Extend, Op.getOperand(1));
16707 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
16708 }
16709
16710 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16711 return LowerFixedLengthExtractVectorElt(Op, DAG);
16712
16713 // Check for non-constant or out of range lane.
16714 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16715 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
16716 return SDValue();
16717
16718 // Insertion/extraction are legal for V128 types.
16719 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
16720 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
16721 VT == MVT::v8f16 || VT == MVT::v8bf16)
16722 return Op;
16723
16724 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
16725 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
16726 VT != MVT::v4bf16)
16727 return SDValue();
16728
16729 // For V64 types, we perform extraction by expanding the value
16730 // to a V128 type and perform the extraction on that.
16731 SDLoc DL(Op);
16732 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
16733 EVT WideTy = WideVec.getValueType();
16734
16735 EVT ExtrTy = WideTy.getVectorElementType();
16736 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
16737 ExtrTy = MVT::i32;
16738
16739 // For extractions, we just return the result directly.
16740 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
16741 Op.getOperand(1));
16742}
16743
// Custom lowering for ISD::EXTRACT_SUBVECTOR.
//
// Returns the node unchanged for the forms that are matched directly (index 0
// of a 128-bit vector, the upper 64 bits of a 128-bit vector when NEON is
// available, and index 0 of a packed SVE input), rewrites the remaining
// scalable / SVE-fixed-length cases, and returns an empty SDValue when the
// input type is not legal yet or no case applies.
16744SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
16745 SelectionDAG &DAG) const {
16746 EVT VT = Op.getValueType();
// NOTE(review): the line that opens the assertion whose message follows is not
// present in this listing (the embedded numbering jumps from 16746 to 16748) -
// restore it from the upstream file.
16748 "Only cases that extract a fixed length vector are supported!");
16749 EVT InVT = Op.getOperand(0).getValueType();
16750
16751 // If we don't have legal types yet, do nothing
16752 if (!isTypeLegal(InVT))
16753 return SDValue();
16754
16755 if (InVT.is128BitVector()) {
16756 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
16757 unsigned Idx = Op.getConstantOperandVal(1);
16758
16759 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
16760 if (Idx == 0)
16761 return Op;
16762
16763 // If this is extracting the upper 64-bits of a 128-bit vector, we match
16764 // that directly.
16765 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
16766 return Op;
16767 }
16768
16769 if (InVT.isScalableVector() ||
16770 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
16771 SDLoc DL(Op);
16772 SDValue Vec = Op.getOperand(0);
16773 SDValue Idx = Op.getOperand(1);
16774
16775 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
16776 if (PackedVT != InVT) {
16777 // Pack input into the bottom part of an SVE register and try again.
16778 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
16779 DAG.getPOISON(PackedVT), Vec,
16780 DAG.getVectorIdxConstant(0, DL));
16781 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
16782 }
16783
16784 // This will get matched by custom code during ISelDAGToDAG.
16785 if (isNullConstant(Idx))
16786 return Op;
16787
16788 assert(InVT.isScalableVector() && "Unexpected vector type!");
16789 // Move requested subvector to the start of the vector and try again.
16790 SDValue Splice =
16791 DAG.getNode(ISD::VECTOR_SPLICE_LEFT, DL, InVT, Vec, Vec, Idx);
16792 return convertFromScalableVector(DAG, VT, Splice);
16793 }
16794
16795 return SDValue();
16796}
16797
// Custom lowering for ISD::INSERT_SUBVECTOR into a scalable vector.
//
// Operand 0 (Vec0) is the vector being inserted into, operand 1 (Vec1) is the
// subvector and operand 2 is the constant insertion index. Scalable
// subvectors are handled in the first branch; a fixed-length subvector placed
// at index 0 of a packed vector type is handled afterwards. An empty SDValue
// is returned when no case applies.
16798SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
16799 SelectionDAG &DAG) const {
16800 assert(Op.getValueType().isScalableVector() &&
16801 "Only expect to lower inserts into scalable vectors!");
16802
16803 EVT InVT = Op.getOperand(1).getValueType();
16804 unsigned Idx = Op.getConstantOperandVal(2);
16805
16806 SDValue Vec0 = Op.getOperand(0);
16807 SDValue Vec1 = Op.getOperand(1);
16808 SDLoc DL(Op);
16809 EVT VT = Op.getValueType();
16810
16811 if (InVT.isScalableVector()) {
16812 if (!isTypeLegal(VT))
16813 return SDValue();
16814
16815 // Break down insert_subvector into simpler parts.
// For i1 vectors: split Vec0 into halves, insert Vec1 into whichever half
// contains Idx, and concatenate the halves again.
16816 if (VT.getVectorElementType() == MVT::i1) {
16817 unsigned NumElts = VT.getVectorMinNumElements();
16818 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16819
16820 SDValue Lo, Hi;
16821 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16822 DAG.getVectorIdxConstant(0, DL));
16823 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16824 DAG.getVectorIdxConstant(NumElts / 2, DL));
16825 if (Idx < (NumElts / 2))
16826 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
16827 DAG.getVectorIdxConstant(Idx, DL));
16828 else
16829 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
16830 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
16831
16832 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16833 }
16834
16835 // We can select these directly.
16836 if (isTypeLegal(InVT) && Vec0.isUndef())
16837 return Op;
16838
16839 // Ensure the subvector is half the size of the main vector.
16840 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
16841 return SDValue();
16842
16843 // Here narrow and wide refers to the vector element types. After "casting"
16844 // both vectors must have the same bit length and so because the subvector
16845 // has fewer elements, those elements need to be bigger.
16846 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
16847 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
16848
16849 // NOP cast operands to the largest legal vector of the same element count.
16850 if (VT.isFloatingPoint()) {
16851 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
16852 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
16853 } else {
16854 // Legal integer vectors are already their largest so Vec0 is fine as is.
16855 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
16856 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
16857 }
16858
16859 // To replace the top/bottom half of vector V with vector SubV we widen the
16860 // preserved half of V, concatenate this to SubV (the order depending on the
16861 // half being replaced) and then narrow the result.
16862 SDValue Narrow;
16863 if (Idx == 0) {
16864 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
16865 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
16866 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
16867 } else {
16868 assert(Idx == InVT.getVectorMinNumElements() &&
16869 "Invalid subvector index!");
16870 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
16871 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
16872 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
16873 }
16874
16875 return getSVESafeBitCast(VT, Narrow, DAG);
16876 }
16877
// Fixed-length subvector inserted at the start of a packed vector type.
16878 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
16879 // This will be matched by custom code during ISelDAGToDAG.
16880 if (Vec0.isUndef())
16881 return Op;
16882
// Otherwise select between the converted subvector and Vec0 under a
// predicate built from PredPattern.
16883 std::optional<unsigned> PredPattern =
// NOTE(review): the initializer expression for 'PredPattern' is not present in
// this listing (the embedded numbering jumps from 16883 to 16885) - restore it
// from the upstream file. '*PredPattern' below is dereferenced without a
// visible has-value check; confirm the initializer cannot yield std::nullopt
// here.
16885 auto PredTy = VT.changeVectorElementType(*DAG.getContext(), MVT::i1);
16886 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
16887 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
16888 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
16889 }
16890
16891 return SDValue();
16892}
16893
16894static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
16895 if (Op.getOpcode() != AArch64ISD::DUP &&
16896 Op.getOpcode() != ISD::SPLAT_VECTOR &&
16897 Op.getOpcode() != ISD::BUILD_VECTOR)
16898 return false;
16899
16900 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
16901 !isAllConstantBuildVector(Op, SplatVal))
16902 return false;
16903
16904 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
16905 !isa<ConstantSDNode>(Op->getOperand(0)))
16906 return false;
16907
16908 SplatVal = Op->getConstantOperandVal(0);
16909 if (Op.getValueType().getVectorElementType() != MVT::i64)
16910 SplatVal = (int32_t)SplatVal;
16911
16912 Negated = false;
16913 if (isPowerOf2_64(SplatVal))
16914 return true;
16915
16916 Negated = true;
16917 if (isPowerOf2_64(-SplatVal)) {
16918 SplatVal = -SplatVal;
16919 return true;
16920 }
16921
16922 return false;
16923}
16924
// Custom lowering for ISD::SDIV / ISD::UDIV.
//
// Fixed-length vectors are forwarded to LowerFixedLengthVectorIntDivideToSVE.
// For scalable vectors:
//  - a signed divide by a (possibly negated) power-of-two splat greater than
//    one becomes ASRD_MERGE_OP1, negated afterwards when the divisor was
//    negative;
//  - nxv4i32 / nxv2i64 become a MASKED_SDIV / MASKED_UDIV node;
//  - nxv16i8 / nxv8i16 are unpacked into two halves of the next wider element
//    type, divided, and the two results are recombined with UZP1.
16925SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
16926 EVT VT = Op.getValueType();
16927 SDLoc DL(Op);
16928
16929 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
16930 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
16931
16932 unsigned Opc = Op.getOpcode();
16933 assert((Opc == ISD::SDIV || Opc == ISD::UDIV) && "Expected a DIV opcode.");
16934 assert(VT.isScalableVector() && "Expected a scalable vector.");
16935 bool Signed = Opc == ISD::SDIV;
16936
16937 bool Negated;
16938 uint64_t SplatVal;
16939 // NOTE: ASRD cannot be used to represent sdiv-by-one.
16940 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) &&
16941 SplatVal > 1) {
// NOTE(review): the declaration of the predicate 'Pg' used below is not
// present in this listing (the embedded numbering jumps from 16941 to 16943) -
// restore it from the upstream file.
16943 SDValue Res =
16944 DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
16945 DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
// Dividing by -(2^k) is the negation of dividing by 2^k.
16946 if (Negated)
16947 Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
16948
16949 return Res;
16950 }
16951
// 32- and 64-bit element divides map onto a masked divide node.
16952 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64) {
16953 unsigned MaskedOpcode = Signed ? ISD::MASKED_SDIV : ISD::MASKED_UDIV;
16954 return DAG.getNode(MaskedOpcode, DL, VT, Op.getOperand(0), Op.getOperand(1),
16955 getPredicateForVector(DAG, DL, VT));
16956 }
16957
16958 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
16959 // operations, and truncate the result.
// Each step here widens by one element size only (i8 -> i16, i16 -> i32); the
// widened divides created below are ordinary SDIV/UDIV nodes.
16960 EVT WidenedVT;
16961 if (VT == MVT::nxv16i8)
16962 WidenedVT = MVT::nxv8i16;
16963 else if (VT == MVT::nxv8i16)
16964 WidenedVT = MVT::nxv4i32;
16965 else
16966 llvm_unreachable("Unexpected Custom DIV operation");
16967
// Signed divides need sign-extending unpacks, unsigned ones zero-extending.
16968 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
16969 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
16970 SDValue Op0Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(0));
16971 SDValue Op1Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(1));
16972 SDValue Op0Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(0));
16973 SDValue Op1Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(1));
16974 SDValue ResultLo = DAG.getNode(Opc, DL, WidenedVT, Op0Lo, Op1Lo);
16975 SDValue ResultHi = DAG.getNode(Opc, DL, WidenedVT, Op0Hi, Op1Hi);
// Reinterpret both wide results in the original type and combine them.
16976 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultLo);
16977 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultHi);
16978 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ResultLoCast, ResultHiCast);
16979}
16980
// Target hook deciding whether a BUILD_VECTOR should be expanded with
// shuffles. Always answers false when NEON is not available.
16981bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
16982 EVT VT, unsigned DefinedValues) const {
16983 if (!Subtarget->isNeonAvailable())
16984 return false;
// NOTE(review): the final return statement is not present in this listing (the
// embedded numbering jumps from 16984 to 16986); as shown, control would fall
// off the end of a bool function - restore it from the upstream file.
16986}
16987
// NOTE(review): the signature line of this function is not present in this
// listing (the embedded numbering jumps from 16987 to 16989). From the body it
// takes a shuffle mask 'M' and a vector type 'VT'; presumably this is
// AArch64TargetLowering::isShuffleMaskLegal - confirm against the upstream
// file.
//
// Answers whether shuffle mask M is considered legal for VT: false for
// fixed-length types that need SVE, true for 4-element 64/128-bit vectors with
// a perfect-shuffle cost of at most 1, otherwise true when the mask matches
// one of the recognised patterns tested below.
16989 // Currently no fixed length shuffles that require SVE are legal.
16990 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16991 return false;
16992
16993 if (VT.getVectorNumElements() == 4 &&
16994 (VT.is128BitVector() || VT.is64BitVector())) {
16995 unsigned Cost = getPerfectShuffleCost(M);
16996 if (Cost <= 1)
16997 return true;
16998 }
16999
// The mask predicates below report details through out-parameters; only the
// yes/no answers matter here, so the outputs go to throwaway variables.
17000 bool DummyBool;
17001 int DummyInt;
17002 unsigned DummyUnsigned;
17003
17004 unsigned EltSize = VT.getScalarSizeInBits();
17005 unsigned NumElts = VT.getVectorNumElements();
// NOTE(review): the line that opens the returned expression (and its first
// alternative) is not present in this listing (the embedded numbering jumps
// from 17005 to 17007) - restore it from the upstream file.
17007 isREVMask(M, EltSize, NumElts, 64) ||
17008 isREVMask(M, EltSize, NumElts, 32) ||
17009 isREVMask(M, EltSize, NumElts, 16) ||
17010 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
17011 isSingletonEXTMask(M, VT, DummyUnsigned) ||
17012 isTRNMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
17013 isUZPMask(M, NumElts, DummyUnsigned) ||
17014 isZIPMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
17015 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
17016 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
17017 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
17018 isINSMask(M, NumElts, DummyBool, DummyInt) ||
17019 isConcatMask(M, VT, VT.getSizeInBits() == 128));
17020}
17021
// NOTE(review): the first line of this function's signature is not present in
// this listing (the embedded numbering jumps from 17021 to 17023). The body
// uses a mask 'M' and the comment speaks of clear masks, so this is presumably
// AArch64TargetLowering::isVectorClearMaskLegal - confirm against the
// upstream file.
17023 EVT VT) const {
17024 // Just delegate to the generic legality, clear masks aren't special.
17025 return isShuffleMaskLegal(M, VT);
17026}
17027
17028/// getVShiftImm - Check if this is a valid build_vector for the immediate
17029/// operand of a vector shift operation, where all the elements of the
17030/// build_vector must have the same constant integer value.
/// On success the sign-extended splat value is stored in \p Cnt and true is
/// returned; \p Cnt is left untouched when false is returned.
17031static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
17032 // Ignore bit_converts.
17033 while (Op.getOpcode() == ISD::BITCAST)
17034 Op = Op.getOperand(0);
// NOTE(review): the declaration of 'BVN' is not present in this listing (the
// embedded numbering jumps from 17034 to 17036) - restore it from the upstream
// file.
17036 APInt SplatBits, SplatUndef;
17037 unsigned SplatBitSize;
17038 bool HasAnyUndefs;
// Reject anything that is not a constant splat, or whose splat is wider than
// one element.
17039 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
17040 HasAnyUndefs, ElementBits) ||
17041 SplatBitSize > ElementBits)
17042 return false;
17043 Cnt = SplatBits.getSExtValue();
17044 return true;
17045}
17046
17047/// isVShiftLImm - Check if this is a valid build_vector for the immediate
17048/// operand of a vector shift left operation. That value must be in the range:
17049/// 0 <= Value < ElementBits for a left shift; or
17050/// 0 <= Value <= ElementBits for a long left shift.
17051static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
17052 assert(VT.isVector() && "vector shift count is not a vector type");
17053 int64_t ElementBits = VT.getScalarSizeInBits();
17054 if (!getVShiftImm(Op, ElementBits, Cnt))
17055 return false;
17056 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
17057}
17058
17059/// isVShiftRImm - Check if this is a valid build_vector for the immediate
17060/// operand of a vector shift right operation. The value must be in the range:
17061/// 1 <= Value <= ElementBits for a right shift; or
17062static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
17063 assert(VT.isVector() && "vector shift count is not a vector type");
17064 int64_t ElementBits = VT.getScalarSizeInBits();
17065 if (!getVShiftImm(Op, ElementBits, Cnt))
17066 return false;
17067 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
17068}
17069
17070SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
17071 SelectionDAG &DAG) const {
17072 EVT VT = Op.getValueType();
17073
17074 if (VT.getScalarType() == MVT::i1) {
17075 // Lower i1 truncate to `(x & 1) != 0`.
17076 SDLoc DL(Op);
17077 EVT OpVT = Op.getOperand(0).getValueType();
17078 SDValue Zero = DAG.getConstant(0, DL, OpVT);
17079 SDValue One = DAG.getConstant(1, DL, OpVT);
17080 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Op.getOperand(0), One);
17081 return DAG.getSetCC(DL, VT, And, Zero, ISD::SETNE);
17082 }
17083
17084 if (!VT.isVector() || VT.isScalableVector())
17085 return SDValue();
17086
17087 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
17088 !Subtarget->isNeonAvailable()))
17089 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
17090
17091 // We can select these directly.
17092 if (VT.is64BitVector() && Op.getOperand(0).getValueType().is128BitVector())
17093 return Op;
17094
17095 return SDValue();
17096}
17097
17098// Check if we can lower this SRL to a rounding shift instruction. ResVT is
17099// possibly a truncated type, it tells how many bits of the value are to be
17100// used.
// On success ShiftValue receives the constant shift amount and RShOperand the
// first operand of the ADD feeding the shift.
// NOTE(review): the first line of the signature (return type, name and the
// leading parameters; the body uses 'Shift' and 'ResVT') is not present in
// this listing (the embedded numbering jumps from 17100 to 17102) - restore it
// from the upstream file.
17102 SelectionDAG &DAG,
17103 unsigned &ShiftValue,
17104 SDValue &RShOperand) {
17105 if (Shift->getOpcode() != ISD::SRL)
17106 return false;
17107
17108 EVT VT = Shift.getValueType();
17109 assert(VT.isScalableVT());
17110
17111 auto ShiftOp1 =
// NOTE(review): the initializer for 'ShiftOp1' is not present in this listing
// (the embedded numbering jumps from 17111 to 17113) - restore it from the
// upstream file.
17113 if (!ShiftOp1)
17114 return false;
17115
// The shift amount must lie in [1, number of bits in a ResVT element].
17116 ShiftValue = ShiftOp1->getZExtValue();
17117 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
17118 return false;
17119
// The shifted value must be an ADD whose only user is this shift.
17120 SDValue Add = Shift->getOperand(0);
17121 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
17122 return false;
17123
// NOTE(review): the line that opens the assertion whose message follows is not
// present in this listing (the embedded numbering jumps from 17123 to 17125) -
// restore it from the upstream file.
17125 "ResVT must be truncated or same type as the shift.");
17126 // Check if an overflow can lead to incorrect results.
17127 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
17128 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
17129 return false;
17130
17131 auto AddOp1 =
// NOTE(review): the initializer for 'AddOp1' is not present in this listing
// (the embedded numbering jumps from 17131 to 17133) - restore it from the
// upstream file.
17133 if (!AddOp1)
17134 return false;
// The added constant must be exactly 1 << (ShiftValue - 1).
17135 uint64_t AddValue = AddOp1->getZExtValue();
17136 if (AddValue != 1ULL << (ShiftValue - 1))
17137 return false;
17138
17139 RShOperand = Add->getOperand(0);
17140 return true;
17141}
17142
// Custom lowering for vector ISD::SHL, ISD::SRA and ISD::SRL.
//
// Nodes whose shift amount is not a vector are returned unchanged. Scalable
// vectors and fixed-length vectors that should use SVE become predicated
// shift nodes (or URSHR_I_PRED when an SRL matches the rounding-shift
// pattern). For NEON, constant splat amounts become VSHL / VASHR / VLSHR and
// all other amounts go through the neon ushl / sshl intrinsics.
17143SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
17144 SelectionDAG &DAG) const {
17145 EVT VT = Op.getValueType();
17146 SDLoc DL(Op);
17147 int64_t Cnt;
17148
17149 if (!Op.getOperand(1).getValueType().isVector())
17150 return Op;
17151 unsigned EltSize = VT.getScalarSizeInBits();
17152
17153 switch (Op.getOpcode()) {
17154 case ISD::SHL:
17155 if (VT.isScalableVector() ||
17156 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
17157 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
17158
// Left shift by a constant splat amount.
17159 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
17160 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
17161 DAG.getTargetConstant(Cnt, DL, MVT::i32));
// Left shift by a non-immediate amount: use the neon ushl intrinsic.
17162 return DAG.getNode(
// NOTE(review): the leading arguments of this getNode call (opcode, location,
// result type) are not present in this listing (the embedded numbering jumps
// from 17162 to 17164) - restore them from the upstream file.
17164 DAG.getTargetConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32),
17165 Op.getOperand(0), Op.getOperand(1));
17166 case ISD::SRA:
17167 case ISD::SRL:
// With SVE2, or SME in streaming mode, first try to match a rounding shift.
// canLowerSRLToRoundingShiftForVT itself rejects anything that is not an SRL.
17168 if (VT.isScalableVector() &&
17169 (Subtarget->hasSVE2() ||
17170 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
17171 SDValue RShOperand;
17172 unsigned ShiftValue;
17173 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
17174 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
17175 getPredicateForVector(DAG, DL, VT), RShOperand,
17176 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
17177 }
17178
17179 if (VT.isScalableVector() ||
17180 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
17181 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
17182 : AArch64ISD::SRL_PRED;
17183 return LowerToPredicatedOp(Op, DAG, Opc);
17184 }
17185
17186 // Right shift immediate
17187 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
17188 unsigned Opc =
17189 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
17190 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
17191 DAG.getTargetConstant(Cnt, DL, MVT::i32),
17192 Op->getFlags());
17193 }
17194
17195 // Right shift register. Note, there is not a shift right register
17196 // instruction, but the shift left register instruction takes a signed
17197 // value, where negative numbers specify a right shift.
17198 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
17199 : Intrinsic::aarch64_neon_ushl;
17200 // negate the shift amount
17201 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
17202 Op.getOperand(1));
17203 SDValue NegShiftLeft =
// NOTE(review): the line that opens the getNode call initializing
// 'NegShiftLeft' is not present in this listing (the embedded numbering jumps
// from 17203 to 17205) - restore it from the upstream file.
17205 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
17206 NegShift);
17207 return NegShiftLeft;
17208 }
17209
17210 llvm_unreachable("unexpected shift opcode");
17211}
17212
17213SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
17214 SelectionDAG &DAG) const {
17215 if (Op.getValueType().isScalableVector())
17216 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
17217
17218 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
17219 !Subtarget->isNeonAvailable()))
17220 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
17221
17222 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17223 SDValue LHS = Op.getOperand(0);
17224 SDValue RHS = Op.getOperand(1);
17225 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
17226 SDLoc DL(Op);
17227
17228 if (LHS.getValueType().getVectorElementType().isInteger())
17229 return Op;
17230
17231 assert(((!Subtarget->hasFullFP16() &&
17232 LHS.getValueType().getVectorElementType() != MVT::f16) ||
17233 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
17234 LHS.getValueType().getVectorElementType() != MVT::f128) &&
17235 "Unexpected type!");
17236
17237 // Lower isnan(x) | isnan(never-nan) to x != x.
17238 // Lower !isnan(x) & !isnan(never-nan) to x == x.
17239 if (CC == ISD::SETUO || CC == ISD::SETO) {
17240 bool OneNaN = false;
17241 if (LHS == RHS) {
17242 OneNaN = true;
17243 } else if (DAG.isKnownNeverNaN(RHS)) {
17244 OneNaN = true;
17245 RHS = LHS;
17246 } else if (DAG.isKnownNeverNaN(LHS)) {
17247 OneNaN = true;
17248 LHS = RHS;
17249 }
17250 if (OneNaN) {
17251 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
17252 }
17253 }
17254
17255 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
17256 // clean. Some of them require two branches to implement.
17257 AArch64CC::CondCode CC1, CC2;
17258 bool ShouldInvert;
17259 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
17260
17261 bool NoNaNs = Op->getFlags().hasNoNaNs();
17262 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, DL, DAG);
17263 if (!Cmp.getNode())
17264 return SDValue();
17265
17266 if (CC2 != AArch64CC::AL) {
17267 SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, DL, DAG);
17268 if (!Cmp2.getNode())
17269 return SDValue();
17270
17271 Cmp = DAG.getNode(ISD::OR, DL, CmpVT, Cmp, Cmp2);
17272 }
17273
17274 Cmp = DAG.getSExtOrTrunc(Cmp, DL, Op.getValueType());
17275
17276 if (ShouldInvert)
17277 Cmp = DAG.getNOT(DL, Cmp, Cmp.getValueType());
17278
17279 return Cmp;
17280}
17281
17282static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
17283 SelectionDAG &DAG) {
17284 SDValue VecOp = ScalarOp.getOperand(0);
17285 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
17286 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
17287 DAG.getConstant(0, DL, MVT::i64));
17288}
17289
17290static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
17291 SDLoc DL, SelectionDAG &DAG) {
17292 unsigned ScalarOpcode;
17293 switch (Opcode) {
17294 case ISD::VECREDUCE_AND:
17295 ScalarOpcode = ISD::AND;
17296 break;
17297 case ISD::VECREDUCE_OR:
17298 ScalarOpcode = ISD::OR;
17299 break;
17300 case ISD::VECREDUCE_XOR:
17301 ScalarOpcode = ISD::XOR;
17302 break;
17303 default:
17304 llvm_unreachable("Expected bitwise vector reduction");
17305 return SDValue();
17306 }
17307
17308 EVT VecVT = Vec.getValueType();
17309 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
17310 "Expected power-of-2 length vector");
17311
17312 EVT ElemVT = VecVT.getVectorElementType();
17313
17314 SDValue Result;
17315 unsigned NumElems = VecVT.getVectorNumElements();
17316
17317 // Special case for boolean reductions
17318 if (ElemVT == MVT::i1) {
17319 // Split large vectors into smaller ones
17320 if (NumElems > 16) {
17321 SDValue Lo, Hi;
17322 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
17323 EVT HalfVT = Lo.getValueType();
17324 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
17325 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
17326 }
17327
17328 // Results of setcc operations get widened to 128 bits if their input
17329 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
17330 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
17331 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
17332 // size leads to the best codegen, since e.g. setcc results might need to be
17333 // truncated otherwise.
17334 unsigned ExtendedWidth = 64;
17335 if (Vec.getOpcode() == ISD::SETCC &&
17336 Vec.getOperand(0).getValueSizeInBits() >= 128) {
17337 ExtendedWidth = 128;
17338 }
17339 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
17340
17341 // any_ext doesn't work with umin/umax, so only use it for uadd.
17342 unsigned ExtendOp =
17343 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
17344 SDValue Extended = DAG.getNode(
17345 ExtendOp, DL,
17346 VecVT.changeVectorElementType(*DAG.getContext(), ExtendedVT), Vec);
17347 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
17348 // in that case we bitcast the sign extended values from v2i64 to v4i32
17349 // before reduction for optimal code generation.
17350 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
17351 NumElems == 2 && ExtendedWidth == 128) {
17352 Extended = DAG.getBitcast(MVT::v4i32, Extended);
17353 ExtendedVT = MVT::i32;
17354 }
17355 switch (ScalarOpcode) {
17356 case ISD::AND:
17357 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
17358 break;
17359 case ISD::OR:
17360 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
17361 break;
17362 case ISD::XOR:
17363 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
17364 break;
17365 default:
17366 llvm_unreachable("Unexpected Opcode");
17367 }
17368
17369 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
17370 } else {
17371 // Iteratively split the vector in half and combine using the bitwise
17372 // operation until it fits in a 64 bit register.
17373 while (VecVT.getSizeInBits() > 64) {
17374 SDValue Lo, Hi;
17375 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
17376 VecVT = Lo.getValueType();
17377 NumElems = VecVT.getVectorNumElements();
17378 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
17379 }
17380
17381 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
17382
17383 // Do the remaining work on a scalar since it allows the code generator to
17384 // combine the shift and bitwise operation into one instruction and since
17385 // integer instructions can have higher throughput than vector instructions.
17386 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
17387
17388 // Iteratively combine the lower and upper halves of the scalar using the
17389 // bitwise operation, halving the relevant region of the scalar in each
17390 // iteration, until the relevant region is just one element of the original
17391 // vector.
17392 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
17393 SDValue ShiftAmount =
17394 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
17395 SDValue Shifted =
17396 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
17397 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
17398 }
17399
17400 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
17401 }
17402
17403 return DAG.getAnyExtOrTrunc(Result, DL, VT);
17404}
17405
// Custom lowering for vector reduction nodes.
//
// Order of attempts, as visible below:
//   1. VECREDUCE_FADD of v2f16 (with full FP16) becomes a scalar FADD of the
//      two extracted lanes.
//   2. LowerReductionToSVE is tried; a non-null result is returned directly.
//   3. Otherwise the NEON lowering in the switch is used: bitwise reductions
//      go through getVectorBitwiseReduce, the rest through getReductionSDNode
//      with the matching AArch64ISD across-lanes opcode.
//
// NOTE(review): the embedded listing numbers skip 17435, 17437, 17439 and
// 17441, so the `case` labels in front of the SMAXV/SMINV/UMAXV/UMINV returns
// are not visible in this copy. Presumably they are the VECREDUCE_SMAX/SMIN/
// UMAX/UMIN labels -- confirm against the original file before editing.
17406SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
17407 SelectionDAG &DAG) const {
17408 SDLoc DL(Op);
17409 SDValue Src = Op.getOperand(0);
17410 EVT SrcVT = Src.getValueType();
17411
17412 // Scalarize v2f16 to turn it into a faddp. This will be more efficient than
17413 // widening by inserting zeroes.
17414 if (Subtarget->hasFullFP16() && Op.getOpcode() == ISD::VECREDUCE_FADD &&
17415 SrcVT == MVT::v2f16) {
17416 return DAG.getNode(ISD::FADD, DL, MVT::f16,
17417 DAG.getExtractVectorElt(DL, MVT::f16, Src, 0),
17418 DAG.getExtractVectorElt(DL, MVT::f16, Src, 1));
17419 }
17420
17421 // Try lowering the reduction to SVE. This will fail for NEON reductions where
17422 // SVE is not preferred.
17423 if (SDValue Result = LowerReductionToSVE(Op, DAG))
17424 return Result;
17425
17426 // Lower NEON reductions.
17427 switch (Op.getOpcode()) {
17428 case ISD::VECREDUCE_AND:
17429 case ISD::VECREDUCE_OR:
17430 case ISD::VECREDUCE_XOR:
17431 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
17432 Op.getValueType(), DL, DAG);
17433 case ISD::VECREDUCE_ADD:
17434 return getReductionSDNode(AArch64ISD::UADDV, DL, Op, DAG);
17436 return getReductionSDNode(AArch64ISD::SMAXV, DL, Op, DAG);
17438 return getReductionSDNode(AArch64ISD::SMINV, DL, Op, DAG);
17440 return getReductionSDNode(AArch64ISD::UMAXV, DL, Op, DAG);
17442 return getReductionSDNode(AArch64ISD::UMINV, DL, Op, DAG);
17443 default:
// Any other opcode reaching this point is treated as a programming error.
17444 llvm_unreachable("Unhandled reduction");
17445 }
17446}
17447
// Lower a reduction over a scalable vector as a tree reduction.
//
// Each stage deinterleaves the running vector with an identity vector and
// combines the two resulting vectors with the reduction's base opcode; the
// final value is read from element 0. The assert below rejects anything that
// is not a scalable vector.
//
// NOTE(review): `Segments` is used below but its declaration (listing line
// 17464) is not visible in this copy -- check the original file for its
// definition.
17448SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op,
17449 SelectionDAG &DAG) const {
17450 SDLoc DL(Op);
17451 SDValue Src = Op.getOperand(0);
17452 EVT SrcVT = Src.getValueType();
17453 assert(SrcVT.isScalableVector() && "Unexpected operand type!");
17454
// VECTOR_DEINTERLEAVE yields two results, both of the source vector type.
17455 SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT);
17456 unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
17457 SDValue Identity = DAG.getIdentityElement(BaseOpc, DL, SrcVT, Op->getFlags());
17458
17459 // Whilst we don't know the size of the vector we do know the maximum size so
17460 // can perform a tree reduction with an identity vector, which means once we
17461 // arrive at the result the remaining stages (when the vector is smaller than
17462 // the maximum) have no effect.
17463
// Number of halving stages: log2 of (Segments * minimum element count).
17465 unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());
17466
17467 for (unsigned I = 0; I < Stages; ++I) {
17468 Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity);
17469 Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1));
17470 }
17471
17472 return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0);
17473}
17474
17475SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
17476 SelectionDAG &DAG) const {
17477 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17478 // No point replacing if we don't have the relevant instruction/libcall anyway
17479 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
17480 return SDValue();
17481
17482 // LSE has an atomic load-clear instruction, but not a load-and.
17483 SDLoc DL(Op);
17484 MVT VT = Op.getSimpleValueType();
17485 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
17486 SDValue RHS = Op.getOperand(2);
17487 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
17488 RHS = DAG.getNode(ISD::XOR, DL, VT, DAG.getAllOnesConstant(DL, VT), RHS);
17489 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, DL, AN->getMemoryVT(),
17490 Op.getOperand(0), Op.getOperand(1), RHS,
17491 AN->getMemOperand());
17492}
17493
// Lower DYNAMIC_STACKALLOC for Windows targets.
//
// Operands read from the node: 0 = chain, 1 = allocation size, 2 = alignment
// (a constant, possibly "no alignment"). The result is a merge of the new SP
// value and the output chain.
//
// Two paths are visible:
//   * an early path that only adjusts SP (subtract the size, optionally mask
//     for alignment) without calling the stack probe;
//   * the probing path, which passes the size in 16-byte units in X15 to the
//     STACK_PROBE libcall inside a call sequence and then adjusts SP.
// If no STACK_PROBE libcall implementation is available an empty SDValue is
// returned.
//
// NOTE(review): listing lines 17507 (start of the `if` condition that ends in
// "no-stack-arg-probe") and 17527 (the declaration of `Callee`) are not
// visible in this copy; consult the original file before changing this code.
17494SDValue
17495AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
17496 SelectionDAG &DAG) const {
17497
17498 SDLoc DL(Op);
17499 // Get the inputs.
17500 SDNode *Node = Op.getNode();
17501 SDValue Chain = Op.getOperand(0);
17502 SDValue Size = Op.getOperand(1);
17503 MaybeAlign Align =
17504 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
17505 EVT VT = Node->getValueType(0);
17506
// Non-probing path: compute SP - Size, apply the alignment mask if one was
// requested, write the result back to SP and return it with the chain.
17508 "no-stack-arg-probe")) {
17509 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17510 Chain = SP.getValue(1);
17511 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17512 if (Align)
17513 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17514 DAG.getSignedConstant(-Align->value(), DL, VT));
17515 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
17516 SDValue Ops[2] = {SP, Chain};
17517 return DAG.getMergeValues(Ops, DL);
17518 }
17519
// Probing path: bail out if the stack-probe libcall is unavailable.
17520 RTLIB::LibcallImpl ChkStkImpl = getLibcallImpl(RTLIB::STACK_PROBE);
17521 if (ChkStkImpl == RTLIB::Unsupported)
17522 return SDValue();
17523
17524 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
17525
17526 EVT PtrVT = getPointerTy(DAG.getDataLayout());
17528 getLibcallImplName(ChkStkImpl).data(), PtrVT, 0);
17529
// Register mask for the probe call; updated when the subtarget uses a custom
// calling convention.
17530 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
17531 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
17532 if (Subtarget->hasCustomCallingConv())
17533 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
17534
// The size is shifted right by 4 (i.e. divided by 16) before being copied
// into X15 for the call.
17535 Size = DAG.getNode(ISD::SRL, DL, MVT::i64, Size,
17536 DAG.getConstant(4, DL, MVT::i64));
17537 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X15, Size, SDValue());
17538 Chain =
17539 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
17540 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
17541 DAG.getRegisterMask(Mask), Chain.getValue(1));
17542 // To match the actual intent better, we should read the output from X15 here
17543 // again (instead of potentially spilling it to the stack), but rereading Size
17544 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
17545 // here.
17546
// Scale the already-shifted size back up by 16 to get a byte count again.
17547 Size = DAG.getNode(ISD::SHL, DL, MVT::i64, Size,
17548 DAG.getConstant(4, DL, MVT::i64));
17549
// Same SP adjustment as the non-probing path, performed after the call.
17550 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17551 Chain = SP.getValue(1);
17552 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17553 if (Align)
17554 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17555 DAG.getSignedConstant(-Align->value(), DL, VT));
17556 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
17557
17558 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
17559
17560 SDValue Ops[2] = {SP, Chain};
17561 return DAG.getMergeValues(Ops, DL);
17562}
17563
17564SDValue
17565AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
17566 SelectionDAG &DAG) const {
17567 // Get the inputs.
17568 SDNode *Node = Op.getNode();
17569 SDValue Chain = Op.getOperand(0);
17570 SDValue Size = Op.getOperand(1);
17571
17572 MaybeAlign Align =
17573 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
17574 SDLoc DL(Op);
17575 EVT VT = Node->getValueType(0);
17576
17577 // Construct the new SP value in a GPR.
17578 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17579 Chain = SP.getValue(1);
17580 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17581 if (Align)
17582 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17583 DAG.getSignedConstant(-Align->value(), DL, VT));
17584
17585 // Set the real SP to the new value with a probing loop.
17586 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP);
17587 SDValue Ops[2] = {SP, Chain};
17588 return DAG.getMergeValues(Ops, DL);
17589}
17590
17591SDValue
17592AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
17593 SelectionDAG &DAG) const {
17594 MachineFunction &MF = DAG.getMachineFunction();
17595
17596 if (Subtarget->isTargetWindows())
17597 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
17598 else if (hasInlineStackProbe(MF))
17599 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
17600 else
17601 return SDValue();
17602}
17603
17604SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
17605 unsigned NewOp) const {
17606 if (Subtarget->hasSVE2())
17607 return LowerToPredicatedOp(Op, DAG, NewOp);
17608
17609 // Default to expand.
17610 return SDValue();
17611}
17612
17613SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
17614 SelectionDAG &DAG) const {
17615 EVT VT = Op.getValueType();
17616 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
17617
17618 SDLoc DL(Op);
17619 APInt MulImm = Op.getConstantOperandAPInt(0);
17620 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
17621 VT);
17622}
17623
17624/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
17625template <unsigned NumVecs>
17626static void
17629 Info.opc = ISD::INTRINSIC_VOID;
17630 // Retrieve EC from first vector argument.
17631 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
17633#ifndef NDEBUG
17634 // Check the assumption that all input vectors are the same type.
17635 for (unsigned I = 0; I < NumVecs; ++I)
17636 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
17637 "Invalid type.");
17638#endif
17639 // memVT is `NumVecs * VT`.
17640 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
17641 EC * NumVecs);
17642 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
17643 Info.offset = 0;
17644 Info.align.reset();
17645 Info.flags = MachineMemOperand::MOStore;
17646}
17647
17648/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
17649/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
17650/// specified in the intrinsic calls.
17653 MachineFunction &MF, unsigned Intrinsic) const {
17654 IntrinsicInfo Info;
17655 auto &DL = I.getDataLayout();
17656 switch (Intrinsic) {
17657 case Intrinsic::aarch64_sve_st2:
17658 setInfoSVEStN<2>(*this, DL, Info, I);
17659 Infos.push_back(Info);
17660 return;
17661 case Intrinsic::aarch64_sve_st3:
17662 setInfoSVEStN<3>(*this, DL, Info, I);
17663 Infos.push_back(Info);
17664 return;
17665 case Intrinsic::aarch64_sve_st4:
17666 setInfoSVEStN<4>(*this, DL, Info, I);
17667 Infos.push_back(Info);
17668 return;
17669 case Intrinsic::aarch64_neon_ld2:
17670 case Intrinsic::aarch64_neon_ld3:
17671 case Intrinsic::aarch64_neon_ld4:
17672 case Intrinsic::aarch64_neon_ld1x2:
17673 case Intrinsic::aarch64_neon_ld1x3:
17674 case Intrinsic::aarch64_neon_ld1x4: {
17675 Info.opc = ISD::INTRINSIC_W_CHAIN;
17676 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
17677 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17678 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17679 Info.offset = 0;
17680 Info.align.reset();
17681 // volatile loads with NEON intrinsics not supported
17682 Info.flags = MachineMemOperand::MOLoad;
17683 Infos.push_back(Info);
17684 return;
17685 }
17686 case Intrinsic::aarch64_neon_ld2lane:
17687 case Intrinsic::aarch64_neon_ld3lane:
17688 case Intrinsic::aarch64_neon_ld4lane:
17689 case Intrinsic::aarch64_neon_ld2r:
17690 case Intrinsic::aarch64_neon_ld3r:
17691 case Intrinsic::aarch64_neon_ld4r: {
17692 Info.opc = ISD::INTRINSIC_W_CHAIN;
17693 // ldx return struct with the same vec type
17694 Type *RetTy = I.getType();
17695 auto *StructTy = cast<StructType>(RetTy);
17696 unsigned NumElts = StructTy->getNumElements();
17697 Type *VecTy = StructTy->getElementType(0);
17698 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17699 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17700 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17701 Info.offset = 0;
17702 Info.align.reset();
17703 // volatile loads with NEON intrinsics not supported
17704 Info.flags = MachineMemOperand::MOLoad;
17705 Infos.push_back(Info);
17706 return;
17707 }
17708 case Intrinsic::aarch64_neon_st2:
17709 case Intrinsic::aarch64_neon_st3:
17710 case Intrinsic::aarch64_neon_st4:
17711 case Intrinsic::aarch64_neon_st1x2:
17712 case Intrinsic::aarch64_neon_st1x3:
17713 case Intrinsic::aarch64_neon_st1x4: {
17714 Info.opc = ISD::INTRINSIC_VOID;
17715 unsigned NumElts = 0;
17716 for (const Value *Arg : I.args()) {
17717 Type *ArgTy = Arg->getType();
17718 if (!ArgTy->isVectorTy())
17719 break;
17720 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
17721 }
17722 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17723 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17724 Info.offset = 0;
17725 Info.align.reset();
17726 // volatile stores with NEON intrinsics not supported
17727 Info.flags = MachineMemOperand::MOStore;
17728 Infos.push_back(Info);
17729 return;
17730 }
17731 case Intrinsic::aarch64_neon_st2lane:
17732 case Intrinsic::aarch64_neon_st3lane:
17733 case Intrinsic::aarch64_neon_st4lane: {
17734 Info.opc = ISD::INTRINSIC_VOID;
17735 unsigned NumElts = 0;
17736 // all the vector type is same
17737 Type *VecTy = I.getArgOperand(0)->getType();
17738 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17739
17740 for (const Value *Arg : I.args()) {
17741 Type *ArgTy = Arg->getType();
17742 if (!ArgTy->isVectorTy())
17743 break;
17744 NumElts += 1;
17745 }
17746
17747 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17748 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17749 Info.offset = 0;
17750 Info.align.reset();
17751 // volatile stores with NEON intrinsics not supported
17752 Info.flags = MachineMemOperand::MOStore;
17753 Infos.push_back(Info);
17754 return;
17755 }
17756 case Intrinsic::aarch64_ldaxr:
17757 case Intrinsic::aarch64_ldxr: {
17758 Type *ValTy = I.getParamElementType(0);
17759 Info.opc = ISD::INTRINSIC_W_CHAIN;
17760 Info.memVT = MVT::getVT(ValTy);
17761 Info.ptrVal = I.getArgOperand(0);
17762 Info.offset = 0;
17763 Info.align = DL.getABITypeAlign(ValTy);
17765 Infos.push_back(Info);
17766 return;
17767 }
17768 case Intrinsic::aarch64_stlxr:
17769 case Intrinsic::aarch64_stxr: {
17770 Type *ValTy = I.getParamElementType(1);
17771 Info.opc = ISD::INTRINSIC_W_CHAIN;
17772 Info.memVT = MVT::getVT(ValTy);
17773 Info.ptrVal = I.getArgOperand(1);
17774 Info.offset = 0;
17775 Info.align = DL.getABITypeAlign(ValTy);
17777 Infos.push_back(Info);
17778 return;
17779 }
17780 case Intrinsic::aarch64_ldaxp:
17781 case Intrinsic::aarch64_ldxp:
17782 Info.opc = ISD::INTRINSIC_W_CHAIN;
17783 Info.memVT = MVT::i128;
17784 Info.ptrVal = I.getArgOperand(0);
17785 Info.offset = 0;
17786 Info.align = Align(16);
17788 Infos.push_back(Info);
17789 return;
17790 case Intrinsic::aarch64_stlxp:
17791 case Intrinsic::aarch64_stxp:
17792 Info.opc = ISD::INTRINSIC_W_CHAIN;
17793 Info.memVT = MVT::i128;
17794 Info.ptrVal = I.getArgOperand(2);
17795 Info.offset = 0;
17796 Info.align = Align(16);
17798 Infos.push_back(Info);
17799 return;
17800 case Intrinsic::aarch64_sve_ldnt1: {
17801 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
17802 Info.opc = ISD::INTRINSIC_W_CHAIN;
17803 Info.memVT = MVT::getVT(I.getType());
17804 Info.ptrVal = I.getArgOperand(1);
17805 Info.offset = 0;
17806 Info.align = DL.getABITypeAlign(ElTy);
17808 Infos.push_back(Info);
17809 return;
17810 }
17811 case Intrinsic::aarch64_sve_stnt1: {
17812 Type *ElTy =
17813 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
17814 Info.opc = ISD::INTRINSIC_W_CHAIN;
17815 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
17816 Info.ptrVal = I.getArgOperand(2);
17817 Info.offset = 0;
17818 Info.align = DL.getABITypeAlign(ElTy);
17820 Infos.push_back(Info);
17821 return;
17822 }
17823 case Intrinsic::aarch64_mops_memset_tag: {
17824 Value *Dst = I.getArgOperand(0);
17825 Value *Val = I.getArgOperand(1);
17826 Info.opc = ISD::INTRINSIC_W_CHAIN;
17827 Info.memVT = MVT::getVT(Val->getType());
17828 Info.ptrVal = Dst;
17829 Info.offset = 0;
17830 Info.align = I.getParamAlign(0).valueOrOne();
17831 Info.flags = MachineMemOperand::MOStore;
17832 // The size of the memory being operated on is unknown at this point
17833 Info.size = MemoryLocation::UnknownSize;
17834 Infos.push_back(Info);
17835 return;
17836 }
17837 default:
17838 break;
17839 }
17840}
17841
17843 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
17844 std::optional<unsigned> ByteOffset) const {
17845 // TODO: This may be worth removing. Check regression tests for diffs.
17846 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
17847 ByteOffset))
17848 return false;
17849
17850 // If we're reducing the load width in order to avoid having to use an extra
17851 // instruction to do extension then it's probably a good idea.
17852 if (ExtTy != ISD::NON_EXTLOAD)
17853 return true;
17854 // Don't reduce load width if it would prevent us from combining a shift into
17855 // the offset.
17856 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
17857 assert(Mem);
17858 const SDValue &Base = Mem->getBasePtr();
17859 if (Base.getOpcode() == ISD::ADD &&
17860 Base.getOperand(1).getOpcode() == ISD::SHL &&
17861 Base.getOperand(1).hasOneUse() &&
17862 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
17863 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
17864 if (Mem->getMemoryVT().isScalableVector())
17865 return false;
17866 // The shift can be combined if it matches the size of the value being
17867 // loaded (and so reducing the width would make it not match).
17868 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
17869 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
17870 if (ShiftAmount == Log2_32(LoadBytes))
17871 return false;
17872 }
17873 // We have no reason to disallow reducing the load width, so allow it.
17874 return true;
17875}
17876
17877// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
17879 EVT VT = Extend.getValueType();
17880 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
17881 SDValue Extract = Extend.getOperand(0);
17882 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
17883 Extract = Extract.getOperand(0);
17884 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
17885 EVT VecVT = Extract.getOperand(0).getValueType();
17886 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
17887 return false;
17888 }
17889 }
17890 return true;
17891}
17892
17893// Truncation from a 64-bit GPR to a 32-bit GPR is free.
17895 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17896 return false;
17897 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
17898 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
17899 return NumBits1 > NumBits2;
17900}
17902 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17903 return false;
17904 uint64_t NumBits1 = VT1.getFixedSizeInBits();
17905 uint64_t NumBits2 = VT2.getFixedSizeInBits();
17906 return NumBits1 > NumBits2;
17907}
17908
17909/// Check if it is profitable to hoist instruction in then/else to if.
17910/// Not profitable if I and its user can form an FMA instruction
17911/// because we prefer FMSUB/FMADD.
17913 if (I->getOpcode() != Instruction::FMul)
17914 return true;
17915
17916 if (!I->hasOneUse())
17917 return true;
17918
17919 Instruction *User = I->user_back();
17920
17921 if (!(User->getOpcode() == Instruction::FSub ||
17922 User->getOpcode() == Instruction::FAdd))
17923 return true;
17924
17926 const Function *F = I->getFunction();
17927 const DataLayout &DL = F->getDataLayout();
17928 Type *Ty = User->getOperand(0)->getType();
17929
17930 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
17932 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17933 I->getFastMathFlags().allowContract()));
17934}
17935
17936// All 32-bit GPR operations implicitly zero the high-half of the corresponding
17937// 64-bit GPR.
17939 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17940 return false;
17941 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
17942 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
17943 return NumBits1 == 32 && NumBits2 == 64;
17944}
17946 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17947 return false;
17948 unsigned NumBits1 = VT1.getSizeInBits();
17949 unsigned NumBits2 = VT2.getSizeInBits();
17950 return NumBits1 == 32 && NumBits2 == 64;
17951}
17952
17954 EVT VT1 = Val.getValueType();
17955 if (isZExtFree(VT1, VT2)) {
17956 return true;
17957 }
17958
17959 if (Val.getOpcode() != ISD::LOAD)
17960 return false;
17961
17962 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
17963 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
17964 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
17965 VT1.getSizeInBits() <= 32);
17966}
17967
17968bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
17969 if (isa<FPExtInst>(Ext))
17970 return false;
17971
17972 // Vector types are not free.
17973 if (Ext->getType()->isVectorTy())
17974 return false;
17975
17976 for (const Use &U : Ext->uses()) {
17977 // The extension is free if we can fold it with a left shift in an
17978 // addressing mode or an arithmetic operation: add, sub, and cmp.
17979
17980 // Is there a shift?
17981 const Instruction *Instr = cast<Instruction>(U.getUser());
17982
17983 // Is this a constant shift?
17984 switch (Instr->getOpcode()) {
17985 case Instruction::Shl:
17986 if (!isa<ConstantInt>(Instr->getOperand(1)))
17987 return false;
17988 break;
17989 case Instruction::GetElementPtr: {
17990 gep_type_iterator GTI = gep_type_begin(Instr);
17991 auto &DL = Ext->getDataLayout();
17992 std::advance(GTI, U.getOperandNo()-1);
17993 Type *IdxTy = GTI.getIndexedType();
17994 // This extension will end up with a shift because of the scaling factor.
17995 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
17996 // Get the shift amount based on the scaling factor:
17997 // log2(sizeof(IdxTy)) - log2(8).
17998 if (IdxTy->isScalableTy())
17999 return false;
18000 uint64_t ShiftAmt =
18001 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
18002 3;
18003 // Is the constant foldable in the shift of the addressing mode?
18004 // I.e., shift amount is between 1 and 4 inclusive.
18005 if (ShiftAmt == 0 || ShiftAmt > 4)
18006 return false;
18007 break;
18008 }
18009 case Instruction::Trunc:
18010 // Check if this is a noop.
18011 // trunc(sext ty1 to ty2) to ty1.
18012 if (Instr->getType() == Ext->getOperand(0)->getType())
18013 continue;
18014 [[fallthrough]];
18015 default:
18016 return false;
18017 }
18018
18019 // At this point we can use the bfm family, so this extension is free
18020 // for that use.
18021 }
18022 return true;
18023}
18024
18025static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
18026 unsigned NumElts, bool IsLittleEndian,
18027 SmallVectorImpl<int> &Mask) {
18028 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
18029 return false;
18030
18031 assert(DstWidth % SrcWidth == 0 &&
18032 "TBL lowering is not supported for a conversion instruction with this "
18033 "source and destination element type.");
18034
18035 unsigned Factor = DstWidth / SrcWidth;
18036 unsigned MaskLen = NumElts * Factor;
18037
18038 Mask.clear();
18039 Mask.resize(MaskLen, NumElts);
18040
18041 unsigned SrcIndex = 0;
18042 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
18043 Mask[I] = SrcIndex++;
18044
18045 return true;
18046}
18047
18049 FixedVectorType *ZExtTy,
18050 FixedVectorType *DstTy,
18051 bool IsLittleEndian) {
18052 auto *SrcTy = cast<FixedVectorType>(Op->getType());
18053 unsigned NumElts = SrcTy->getNumElements();
18054 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
18055 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
18056
18057 SmallVector<int> Mask;
18058 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
18059 return nullptr;
18060
18061 auto *FirstEltZero = Builder.CreateInsertElement(
18062 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
18063 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
18064 Result = Builder.CreateBitCast(Result, DstTy);
18065 if (DstTy != ZExtTy)
18066 Result = Builder.CreateZExt(Result, ZExtTy);
18067 return Result;
18068}
18069
18071 FixedVectorType *DstTy,
18072 bool IsLittleEndian) {
18073 auto *SrcTy = cast<FixedVectorType>(Op->getType());
18074 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
18075 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
18076
18077 SmallVector<int> Mask;
18078 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
18079 !IsLittleEndian, Mask))
18080 return nullptr;
18081
18082 auto *FirstEltZero = Builder.CreateInsertElement(
18083 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
18084
18085 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
18086}
18087
18088static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
18089 IRBuilder<> Builder(TI);
18091 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
18092 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
18093 auto *DstTy = cast<FixedVectorType>(TI->getType());
18094 assert(SrcTy->getElementType()->isIntegerTy() &&
18095 "Non-integer type source vector element is not supported");
18096 assert(DstTy->getElementType()->isIntegerTy(8) &&
18097 "Unsupported destination vector element type");
18098 unsigned SrcElemTySz =
18099 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
18100 unsigned DstElemTySz =
18101 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
18102 assert((SrcElemTySz % DstElemTySz == 0) &&
18103 "Cannot lower truncate to tbl instructions for a source element size "
18104 "that is not divisible by the destination element size");
18105 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
18106 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
18107 "Unsupported source vector element type size");
18108 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
18109
18110 // Create a mask to choose every nth byte from the source vector table of
18111 // bytes to create the truncated destination vector, where 'n' is the truncate
18112 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
18113 // 0,8,16,..Y*8th bytes for the little-endian format
18115 for (int Itr = 0; Itr < 16; Itr++) {
18116 if (Itr < NumElements)
18117 MaskConst.push_back(Builder.getInt8(
18118 IsLittleEndian ? Itr * TruncFactor
18119 : Itr * TruncFactor + (TruncFactor - 1)));
18120 else
18121 MaskConst.push_back(Builder.getInt8(255));
18122 }
18123
18124 int MaxTblSz = 128 * 4;
18125 int MaxSrcSz = SrcElemTySz * NumElements;
18126 int ElemsPerTbl =
18127 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
18128 assert(ElemsPerTbl <= 16 &&
18129 "Maximum elements selected using TBL instruction cannot exceed 16!");
18130
18131 int ShuffleCount = 128 / SrcElemTySz;
18132 SmallVector<int> ShuffleLanes;
18133 for (int i = 0; i < ShuffleCount; ++i)
18134 ShuffleLanes.push_back(i);
18135
18136 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
18137 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
18138 // call TBL & save the result in a vector of TBL results for combining later.
18140 while (ShuffleLanes.back() < NumElements) {
18141 Parts.push_back(Builder.CreateBitCast(
18142 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
18143
18144 if (Parts.size() == 4) {
18145 Parts.push_back(ConstantVector::get(MaskConst));
18146 Results.push_back(
18147 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
18148 Parts.clear();
18149 }
18150
18151 for (int i = 0; i < ShuffleCount; ++i)
18152 ShuffleLanes[i] += ShuffleCount;
18153 }
18154
18155 assert((Parts.empty() || Results.empty()) &&
18156 "Lowering trunc for vectors requiring different TBL instructions is "
18157 "not supported!");
18158 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
18159 // registers
18160 if (!Parts.empty()) {
18161 Intrinsic::ID TblID;
18162 switch (Parts.size()) {
18163 case 1:
18164 TblID = Intrinsic::aarch64_neon_tbl1;
18165 break;
18166 case 2:
18167 TblID = Intrinsic::aarch64_neon_tbl2;
18168 break;
18169 case 3:
18170 TblID = Intrinsic::aarch64_neon_tbl3;
18171 break;
18172 }
18173
18174 Parts.push_back(ConstantVector::get(MaskConst));
18175 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
18176 }
18177
18178 // Extract the destination vector from TBL result(s) after combining them
18179 // where applicable. Currently, at most two TBLs are supported.
18180 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
18181 "more than 2 tbl instructions!");
18182 Value *FinalResult = Results[0];
18183 if (Results.size() == 1) {
18184 if (ElemsPerTbl < 16) {
18185 SmallVector<int> FinalMask(ElemsPerTbl);
18186 std::iota(FinalMask.begin(), FinalMask.end(), 0);
18187 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
18188 }
18189 } else {
18190 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
18191 if (ElemsPerTbl < 16) {
18192 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
18193 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
18194 } else {
18195 std::iota(FinalMask.begin(), FinalMask.end(), 0);
18196 }
18197 FinalResult =
18198 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
18199 }
18200
18201 TI->replaceAllUsesWith(FinalResult);
18202 TI->eraseFromParent();
18203}
18204
18206 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
// Attempts to rewrite a fixed-vector zext, uitofp, sitofp, fptoui or trunc so
// that it can be lowered with tbl. Returns true when I has been replaced,
// false when it is left untouched.
// NOTE(review): the embedded listing numbers skip 18205, 18235, 18240-18241,
// 18272, 18288 and 18303, so the function header and parts of several
// statements are missing from this text.
18207 // shuffle_vector instructions are serialized when targeting SVE,
18208 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
18209 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
18210 return false;
18211
18212 // Try to optimize conversions using tbl. This requires materializing constant
18213 // index vectors, which can increase code size and add loads. Skip the
18214 // transform unless the conversion is in a loop block guaranteed to execute
18215 // and we are not optimizing for size.
18216 Function *F = I->getParent()->getParent();
// Only instructions located in the header block of L are considered.
18217 if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
18218 return false;
18219
// Both the source and the result must be fixed-width vectors.
18220 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
18221 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
18222 if (!SrcTy || !DstTy)
18223 return false;
18224
18225 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
18226 // lowered to tbl instructions to insert the original i8 elements
18227 // into i8x lanes. This is enabled for cases where it is beneficial.
18228 auto *ZExt = dyn_cast<ZExtInst>(I);
18229 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
18230 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
// The destination element width must be a whole number of bytes.
18231 if (DstWidth % 8 != 0)
18232 return false;
18233
18234 auto *TruncDstType =
18236 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
18237 // the remaining ZExt folded into the user, don't use tbl lowering.
18238 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
18239 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
18242 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
18243 return false;
18244
18245 DstTy = TruncDstType;
18246 }
18247
18248 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
18249 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
18250 // most one extra extend step is needed and using tbl is not profitable.
18251 // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
18252 // udot instruction.
18253 if (SrcWidth * 4 <= DstWidth) {
// Give up only when every user of I matches one of the two patterns.
18254 if (all_of(I->users(), [&](auto *U) {
18255 using namespace llvm::PatternMatch;
18256 auto *SingleUser = cast<Instruction>(&*U);
18257 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
18258 return true;
18259 if (match(SingleUser,
18260 m_Intrinsic<Intrinsic::vector_partial_reduce_add>(
18261 m_Value(), m_Specific(I))))
18262 return true;
18263 return false;
18264 }))
18265 return false;
18266 }
18267
// Destination elements of 64 bits or more are not handled by this path.
18268 if (DstTy->getScalarSizeInBits() >= 64)
18269 return false;
18270
18271 IRBuilder<> Builder(ZExt);
18273 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
18274 DstTy, Subtarget->isLittleEndian());
// A null Result means no replacement was built; leave the zext in place.
18275 if (!Result)
18276 return false;
18277 ZExt->replaceAllUsesWith(Result);
18278 ZExt->eraseFromParent();
18279 return true;
18280 }
18281
// uitofp is handled for i8 -> float and i16 -> double: the value is first
// widened to the integer vector type matching DstTy, then converted.
18282 auto *UIToFP = dyn_cast<UIToFPInst>(I);
18283 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
18284 DstTy->getElementType()->isFloatTy()) ||
18285 (SrcTy->getElementType()->isIntegerTy(16) &&
18286 DstTy->getElementType()->isDoubleTy()))) {
18287 IRBuilder<> Builder(I);
18289 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
18290 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
18291 assert(ZExt && "Cannot fail for the i8 to float conversion");
18292 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
18293 I->replaceAllUsesWith(UI);
18294 I->eraseFromParent();
18295 return true;
18296 }
18297
// sitofp is handled for i8 -> float only.
18298 auto *SIToFP = dyn_cast<SIToFPInst>(I);
18299 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
18300 DstTy->getElementType()->isFloatTy()) {
18301 IRBuilder<> Builder(I);
18302 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
18304 Subtarget->isLittleEndian());
18305 assert(Shuffle && "Cannot fail for the i8 to float conversion");
18306 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
// Presumably the shuffle puts each i8 into the top byte of its 32-bit lane,
// so the arithmetic shift right by 24 sign-extends it - TODO confirm against
// createTblShuffleForSExt.
18307 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
18308 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
18309 I->replaceAllUsesWith(SI);
18310 I->eraseFromParent();
18311 return true;
18312 }
18313
18314 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
18315 // followed by a truncate lowered to using tbl.4.
18316 auto *FPToUI = dyn_cast<FPToUIInst>(I);
18317 if (FPToUI &&
18318 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
18319 SrcTy->getElementType()->isFloatTy() &&
18320 DstTy->getElementType()->isIntegerTy(8)) {
18321 IRBuilder<> Builder(I);
18322 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
18323 VectorType::getInteger(SrcTy));
18324 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
18325 I->replaceAllUsesWith(TruncI);
18326 I->eraseFromParent();
// The freshly created trunc is handed straight to the tbl-based lowering.
18327 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
18328 return true;
18329 }
18330
18331 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
18332 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
18333 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
18334 // registers
18335 auto *TI = dyn_cast<TruncInst>(I);
18336 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
18337 ((SrcTy->getElementType()->isIntegerTy(32) ||
18338 SrcTy->getElementType()->isIntegerTy(64)) &&
18339 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
18340 createTblForTrunc(TI, Subtarget->isLittleEndian());
18341 return true;
18342 }
18343
// No supported conversion pattern matched.
18344 return false;
18345}
18346
18348 Align &RequiredAlignment) const {
// Returns true for simple integer or floating-point types that are exactly 32
// or 64 bits wide. RequiredAlignment is set to Align(1) whenever the type
// passes the simple integer/FP check, even if false is returned for its size.
// NOTE(review): the line carrying the function name and first parameter
// (listing line 18347) is missing from this text.
18349 if (!LoadedType.isSimple() ||
18350 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
18351 return false;
18352 // Cyclone supports unaligned accesses.
18353 RequiredAlignment = Align(1);
18354 unsigned NumBits = LoadedType.getSizeInBits();
18355 return NumBits == 32 || NumBits == 64;
18356}
18357
18358/// A helper function for determining the number of interleaved accesses we
18359/// will generate when lowering accesses of the given type.
///
/// The result is the minimum bit size of \p VecTy divided by the access width
/// and is never less than 1. The access width is 128 bits, or
/// max(minimum SVE vector size, 128) when \p UseScalable is set for a fixed
/// vector type.
18361 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
18362 unsigned VecSize = 128;
18363 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
18364 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
18365 if (UseScalable && isa<FixedVectorType>(VecTy))
18366 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
// NOTE(review): the round-up term is a fixed 127 even when VecSize is larger
// than 128, so this is not a true ceiling division in that case - confirm
// that this is intended.
18367 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
18368}
18369
// On Falkor, an instruction carrying FALKOR_STRIDED_ACCESS_MD metadata is
// given the MOStridedAccess flag.
// NOTE(review): listing lines 18370-18371 (function header) and 18375 (the
// fall-through return) are missing from this text.
18372 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
18373 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
18374 return MOStridedAccess;
18376}
18377
18379 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
// Checks whether VecTy can be used for an interleaved access. UseScalable is
// an output: it is reset to false on entry and set to true on the paths that
// choose the scalable (SVE) form.
// NOTE(review): listing lines 18378 (function header) and 18388 (second half
// of the first condition) are missing from this text.
18380 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
18381 auto EC = VecTy->getElementCount();
18382 unsigned MinElts = EC.getKnownMinValue();
18383
18384 UseScalable = false;
18385
// Fixed-width vectors without NEON are only accepted when SVE is used for
// fixed-length vectors (plus the condition on the missing line).
18386 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
18387 (!Subtarget->useSVEForFixedLengthVectors() ||
18389 return false;
18390
// Scalable vectors need SVE or streaming SVE.
18391 if (isa<ScalableVectorType>(VecTy) &&
18392 !Subtarget->isSVEorStreamingSVEAvailable())
18393 return false;
18394
18395 // Ensure the number of vector elements is greater than 1.
18396 if (MinElts < 2)
18397 return false;
18398
18399 // Ensure the element type is legal.
18400 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
18401 return false;
18402
// Scalable types: the minimum element count must be a power of two and the
// minimum size a multiple of 128 bits.
18403 if (EC.isScalable()) {
18404 UseScalable = true;
18405 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
18406 }
18407
18408 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
18409 if (Subtarget->useSVEForFixedLengthVectors()) {
18410 unsigned MinSVEVectorSize =
18411 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
// Use SVE when the vector is a whole multiple of the minimum SVE size, or
// when it is smaller with a power-of-two element count and either NEON is
// unavailable or the vector is wider than 128 bits.
18412 if (VecSize % MinSVEVectorSize == 0 ||
18413 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
18414 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
18415 UseScalable = true;
18416 return true;
18417 }
18418 }
18419
18420 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
18421 // 128 will be split into multiple interleaved accesses.
18422 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
18423}
18424
// Maps the element type of VTy to a scalable vector type with the same element
// type and a fixed minimum element count: 2 for double/i64, 4 for float/i32,
// 8 for bfloat/half/i16 and 16 for i8. Any other element type is unreachable.
// NOTE(review): the function header (listing line 18425) is missing from this
// text.
18426 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
18427 return ScalableVectorType::get(VTy->getElementType(), 2);
18428
18429 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
18430 return ScalableVectorType::get(VTy->getElementType(), 4);
18431
18432 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
18433 return ScalableVectorType::get(VTy->getElementType(), 8);
18434
18435 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
18436 return ScalableVectorType::get(VTy->getElementType(), 8);
18437
18438 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
18439 return ScalableVectorType::get(VTy->getElementType(), 2);
18440
18441 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
18442 return ScalableVectorType::get(VTy->getElementType(), 4);
18443
18444 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
18445 return ScalableVectorType::get(VTy->getElementType(), 8);
18446
18447 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
18448 return ScalableVectorType::get(VTy->getElementType(), 16);
18449
18450 llvm_unreachable("Cannot handle input vector type");
18451}
18452
18453static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
18454 bool Scalable, Type *LDVTy,
18455 Type *PtrTy) {
18456 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
18457 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
18458 Intrinsic::aarch64_sve_ld3_sret,
18459 Intrinsic::aarch64_sve_ld4_sret};
18460 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
18461 Intrinsic::aarch64_neon_ld3,
18462 Intrinsic::aarch64_neon_ld4};
18463 if (Scalable)
18464 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2],
18465 {LDVTy, PtrTy});
18466
18467 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
18468 {LDVTy, PtrTy});
18469}
18470
18471static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
18472 bool Scalable, Type *STVTy,
18473 Type *PtrTy) {
18474 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
18475 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
18476 Intrinsic::aarch64_sve_st3,
18477 Intrinsic::aarch64_sve_st4};
18478 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
18479 Intrinsic::aarch64_neon_st3,
18480 Intrinsic::aarch64_neon_st4};
18481 if (Scalable)
18482 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2],
18483 {STVTy, PtrTy});
18484
18485 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
18486 {STVTy, PtrTy});
18487}
18488
18489/// Lower an interleaved load into a ldN intrinsic.
18490///
18491/// E.g. Lower an interleaved load (Factor = 2):
18492/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
18493/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
18494/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
18495///
18496/// Into:
18497/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
18498/// %vec0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, i32 0
18499/// %vec1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, i32 1
///
/// Returns false, leaving the IR untouched, when \p Load is not a plain
/// LoadInst, when the shuffle type is not a legal interleaved access type, or
/// when the uitofp pattern checked below is found.
18501 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
18502 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
// NOTE(review): listing lines 18500 (function header), 18570 (declaration of
// SubVecs) and 18614 (start of the IntToPtr type argument) are missing from
// this text.
18503 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18504 "Invalid interleave factor");
18505 assert(!Shuffles.empty() && "Empty shufflevector input");
18506 assert(Shuffles.size() == Indices.size() &&
18507 "Unmatched number of shufflevectors and indices");
18508
18509 auto *LI = dyn_cast<LoadInst>(Load);
18510 if (!LI)
18511 return false;
18512 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
18513
18514 const DataLayout &DL = LI->getDataLayout();
18515
18516 VectorType *VTy = Shuffles[0]->getType();
18517
18518 // Skip if we do not have NEON and skip illegal vector types. We can
18519 // "legalize" wide vector types into multiple interleaved accesses as long as
18520 // the vector types are divisible by 128.
18521 bool UseScalable;
18522 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18523 return false;
18524
18525 // Check if the interleave is a zext(shuffle), that can be better optimized
18526 // into shift / and masks. For the moment we do this just for uitofp (not
18527 // zext) to avoid issues with widening instructions.
18528 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
18529 using namespace llvm::PatternMatch;
18530 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
18531 SI->getType()->getScalarSizeInBits() * 4 ==
18532 SI->user_back()->getType()->getScalarSizeInBits();
18533 }))
18534 return false;
18535
18536 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18537
18538 auto *FVTy = cast<FixedVectorType>(VTy);
18539
18540 // A pointer vector can not be the return type of the ldN intrinsics. Need to
18541 // load integer vectors first and then convert to pointer vectors.
18542 Type *EltTy = FVTy->getElementType();
18543 if (EltTy->isPointerTy())
18544 FVTy =
18545 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
18546
18547 // If we're going to generate more than one load, reset the sub-vector type
18548 // to something legal.
18549 FVTy = FixedVectorType::get(FVTy->getElementType(),
18550 FVTy->getNumElements() / NumLoads);
18551
// With SVE the intrinsic operates on the scalable container type of FVTy.
18552 auto *LDVTy =
18553 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
18554
18555 IRBuilder<> Builder(LI);
18556
18557 // The base address of the load.
18558 Value *BaseAddr = LI->getPointerOperand();
18559
18560 Type *PtrTy = LI->getPointerOperandType();
18561 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
18562 LDVTy->getElementCount());
18563
18564 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18565 UseScalable, LDVTy, PtrTy);
18566
18567 // Holds sub-vectors extracted from the load intrinsic return values. The
18568 // sub-vectors are associated with the shufflevector instructions they will
18569 // replace.
18571
18572 Value *PTrue = nullptr;
18573 if (UseScalable) {
18574 std::optional<unsigned> PgPattern =
18575 getSVEPredPatternFromNumElements(FVTy->getNumElements());
// When the SVE vector length is fixed and equals the size of FVTy, the "all"
// pattern is used instead.
18576 if (Subtarget->getMinSVEVectorSizeInBits() ==
18577 Subtarget->getMaxSVEVectorSizeInBits() &&
18578 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
18579 PgPattern = AArch64SVEPredPattern::all;
18580
// NOTE(review): PgPattern is dereferenced without checking that it holds a
// value - confirm getSVEPredPatternFromNumElements cannot return nullopt here.
18581 auto *PTruePat =
18582 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
18583 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18584 {PTruePat});
18585 }
18586
18587 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
18588
18589 // If we're generating more than one load, compute the base address of
18590 // subsequent loads as an offset from the previous.
18591 if (LoadCount > 0)
18592 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
18593 FVTy->getNumElements() * Factor);
18594
// The SVE form takes the predicate as an extra leading operand.
18595 CallInst *LdN;
18596 if (UseScalable)
18597 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
18598 else
18599 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18600
18601 // Extract and store the sub-vectors returned by the load intrinsic.
18602 for (unsigned i = 0; i < Shuffles.size(); i++) {
18603 ShuffleVectorInst *SVI = Shuffles[i];
18604 unsigned Index = Indices[i];
18605
18606 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
18607
// Take the fixed-width part (starting at element 0) out of the scalable
// result.
18608 if (UseScalable)
18609 SubVec = Builder.CreateExtractVector(FVTy, SubVec, uint64_t(0));
18610
18611 // Convert the integer vector to pointer vector if the element is pointer.
18612 if (EltTy->isPointerTy())
18613 SubVec = Builder.CreateIntToPtr(
18615 FVTy->getNumElements()));
18616
18617 SubVecs[SVI].push_back(SubVec);
18618 }
18619 }
18620
18621 // Replace uses of the shufflevector instructions with the sub-vectors
18622 // returned by the load intrinsic. If a shufflevector instruction is
18623 // associated with more than one sub-vector, those sub-vectors will be
18624 // concatenated into a single wide vector.
18625 for (ShuffleVectorInst *SVI : Shuffles) {
18626 auto &SubVec = SubVecs[SVI];
18627 auto *WideVec =
18628 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
18629 SVI->replaceAllUsesWith(WideVec);
18630 }
18631
18632 return true;
18633}
18634
18635template <typename Iter>
18636bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
18637 int MaxLookupDist = 20;
18638 unsigned IdxWidth = DL.getIndexSizeInBits(0);
18639 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
18640 const Value *PtrA1 =
18642
18643 while (++It != End) {
18644 if (It->isDebugOrPseudoInst())
18645 continue;
18646 if (MaxLookupDist-- == 0)
18647 break;
18648 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
18649 const Value *PtrB1 =
18650 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
18651 DL, OffsetB);
18652 if (PtrA1 == PtrB1 &&
18653 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
18654 .abs() == 16)
18655 return true;
18656 }
18657 }
18658
18659 return false;
18660}
18661
18662/// Lower an interleaved store into a stN intrinsic.
18663///
18664/// E.g. Lower an interleaved store (Factor = 3):
18665/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
18666/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
18667/// store <12 x i32> %i.vec, <12 x i32>* %ptr
18668///
18669/// Into:
18670/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
18671/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
18672/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
18673/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
18674///
18675/// Note that the new shufflevectors will be removed and we'll only generate one
18676/// st3 instruction in CodeGen.
18677///
18678/// Example for a more general valid mask (Factor 3). Lower:
18679/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
18680/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
18681/// store <12 x i32> %i.vec, <12 x i32>* %ptr
18682///
18683/// Into:
18684/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7>
18685/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35>
18686/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19>
18687/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
///
/// Returns false, leaving the IR untouched, when the store is not a plain
/// StoreInst, when the sub-vector type is not a legal interleaved access type,
/// or when one of the profitability checks in the body rejects the transform.
18689 Value *LaneMask,
18690 ShuffleVectorInst *SVI,
18691 unsigned Factor,
18692 const APInt &GapMask) const {
18693
// NOTE(review): listing lines 18688 (function header with the first parameter)
// and 18811 (declaration of Ops) are missing from this text.
18694 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18695 "Invalid interleave factor");
18696 auto *SI = dyn_cast<StoreInst>(Store);
18697 if (!SI)
18698 return false;
18699 assert(!LaneMask && GapMask.popcount() == Factor &&
18700 "Unexpected mask on store");
18701
18702 auto *VecTy = cast<FixedVectorType>(SVI->getType());
18703 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
18704
// Each of the Factor interleaved sub-vectors has LaneLen elements.
18705 unsigned LaneLen = VecTy->getNumElements() / Factor;
18706 Type *EltTy = VecTy->getElementType();
18707 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
18708
18709 const DataLayout &DL = SI->getDataLayout();
18710 bool UseScalable;
18711
18712 // Skip if we do not have NEON and skip illegal vector types. We can
18713 // "legalize" wide vector types into multiple interleaved accesses as long as
18714 // the vector types are divisible by 128.
18715 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
18716 return false;
18717
18718 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
18719
18720 Value *Op0 = SVI->getOperand(0);
18721 Value *Op1 = SVI->getOperand(1);
18722 IRBuilder<> Builder(SI);
18723
18724 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
18725 // vectors to integer vectors.
18726 if (EltTy->isPointerTy()) {
18727 Type *IntTy = DL.getIntPtrType(EltTy);
18728 unsigned NumOpElts =
18729 cast<FixedVectorType>(Op0->getType())->getNumElements();
18730
18731 // Convert to the corresponding integer vector.
18732 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
18733 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
18734 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
18735
18736 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
18737 }
18738
18739 // If we're going to generate more than one store, reset the lane length
18740 // and sub-vector type to something legal.
18741 LaneLen /= NumStores;
18742 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
18743
// With SVE the intrinsic operates on the scalable container type of SubVecTy.
18744 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
18745 : SubVecTy;
18746
18747 // The base address of the store.
18748 Value *BaseAddr = SI->getPointerOperand();
18749
18750 auto Mask = SVI->getShuffleMask();
18751
18752 // Bail out if none of the mask indices is in range.
18753 // If the mask is `poison`, `Mask` may be a vector of -1s.
18754 // If all of them are `poison`, an OOB read would happen later.
18755 if (llvm::all_of(Mask, equal_to(PoisonMaskElem))) {
18756 return false;
18757 }
18758 // A 64bit st2 which does not start at element 0 will involve adding extra
18759 // ext elements making the st2 unprofitable, and if there is a nearby store
18760 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
18761 // zip;ldp pair which has higher throughput.
18762 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
18763 (Mask[0] != 0 ||
18764 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
18765 DL) ||
18766 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
18767 BaseAddr, DL)))
18768 return false;
18769
18770 // Conditionally skip nontemporal stores to prioritize emitting non-temporal
18771 // store instructions, even though AArch64 doesn't have non-temporal
18772 // interleaved stores.
18773 //
18774 // The check is conservative:
18775 //
18776 // - Only when not optimizing for size, as STNP lowering can increase size.
18777 // - Don't skip if the interleaving factor is greater than 2, as the shuffling
18778 // overhead becomes higher.
18779 // - Don't skip if the stored value type is not directly legal.
18780 Function *F = SI->getFunction();
18781 if (Factor == 2 && SI->hasMetadata(LLVMContext::MD_nontemporal) &&
18782 !F->hasOptSize() && !F->hasMinSize() &&
18783 isLegalNTStore(SI->getValueOperand()->getType(), SI->getAlign(), DL))
18784 return false;
18785
18786 Type *PtrTy = SI->getPointerOperandType();
18787 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
18788 STVTy->getElementCount());
18789
18790 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18791 UseScalable, STVTy, PtrTy);
18792
18793 Value *PTrue = nullptr;
18794 if (UseScalable) {
18795 std::optional<unsigned> PgPattern =
18796 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
// When the SVE vector length is fixed and equals the size of SubVecTy, the
// "all" pattern is used instead.
18797 if (Subtarget->getMinSVEVectorSizeInBits() ==
18798 Subtarget->getMaxSVEVectorSizeInBits() &&
18799 Subtarget->getMinSVEVectorSizeInBits() ==
18800 DL.getTypeSizeInBits(SubVecTy))
18801 PgPattern = AArch64SVEPredPattern::all;
18802
// NOTE(review): PgPattern is dereferenced without checking that it holds a
// value - confirm getSVEPredPatternFromNumElements cannot return nullopt here.
18803 auto *PTruePat =
18804 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
18805 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18806 {PTruePat});
18807 }
18808
18809 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
18810
18812
18813 // Split the shufflevector operands into sub vectors for the new stN call.
18814 for (unsigned i = 0; i < Factor; i++) {
18815 Value *Shuffle;
// Index in Mask of the first element of field i for this store.
18816 unsigned IdxI = StoreCount * LaneLen * Factor + i;
18817 if (Mask[IdxI] >= 0) {
18818 Shuffle = Builder.CreateShuffleVector(
18819 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
18820 } else {
// The first element of this field is undefined: derive the start index from
// the first defined element of the same field, if there is one.
18821 unsigned StartMask = 0;
18822 for (unsigned j = 1; j < LaneLen; j++) {
18823 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
18824 if (Mask[IdxJ] >= 0) {
18825 StartMask = Mask[IdxJ] - j;
18826 break;
18827 }
18828 }
18829 // Note: Filling undef gaps with random elements is ok, since
18830 // those elements were being written anyway (with undefs).
18831 // In the case of all undefs we're defaulting to using elems from 0
18832 // Note: StartMask cannot be negative, it's checked in
18833 // isReInterleaveMask
18834 Shuffle = Builder.CreateShuffleVector(
18835 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
18836 }
18837
// Place the fixed-width sub-vector at element 0 of a poison scalable vector.
18838 if (UseScalable)
18839 Shuffle = Builder.CreateInsertVector(STVTy, PoisonValue::get(STVTy),
18840 Shuffle, uint64_t(0));
18841
18842 Ops.push_back(Shuffle);
18843 }
18844
// The SVE form takes the predicate after the data operands.
18845 if (UseScalable)
18846 Ops.push_back(PTrue);
18847
18848 // If we are generating more than one store, we compute the base address of
18849 // subsequent stores as an offset from the previous.
18850 if (StoreCount > 0)
18851 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
18852 BaseAddr, LaneLen * Factor);
18853
18854 Ops.push_back(BaseAddr);
18855 Builder.CreateCall(StNFunc, Ops);
18856 }
18857 return true;
18858}
18859
18861 Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
// Replaces the deinterleave intrinsic DI, fed by a plain load, with one or
// more ldN intrinsic calls. Returns true when DI's uses were replaced.
// NOTE(review): listing lines 18860 (function header), 18872 (declaration of
// VTy) and 18886 (start of the LdTy initialiser) are missing from this text.
18862 const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
18863 if (Factor != 2 && Factor != 3 && Factor != 4) {
18864 LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
18865 return false;
18866 }
18867 auto *LI = dyn_cast<LoadInst>(Load);
18868 if (!LI)
18869 return false;
18870 assert(!Mask && "Unexpected mask on a load\n");
18871
18873
18874 const DataLayout &DL = LI->getModule()->getDataLayout();
18875 bool UseScalable;
18876 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18877 return false;
18878
18879 // TODO: Add support for using SVE instructions with fixed types later, using
18880 // the code from lowerInterleavedLoad to obtain the correct container type.
18881 if (UseScalable && !VTy->isScalableTy())
18882 return false;
18883
// LdTy is the per-load vector type: VTy with its element count divided by the
// number of loads.
18884 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18885 VectorType *LdTy =
18887 VTy->getElementCount().divideCoefficientBy(NumLoads));
18888
18889 Type *PtrTy = LI->getPointerOperandType();
18890 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18891 UseScalable, LdTy, PtrTy);
18892
18893 IRBuilder<> Builder(LI);
// The SVE form is predicated; an all-true predicate is used.
18894 Value *Pred = nullptr;
18895 if (UseScalable)
18896 Pred =
18897 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
18898
18899 Value *BaseAddr = LI->getPointerOperand();
18900 Value *Result = nullptr;
18901 if (NumLoads > 1) {
18902 // Create multiple legal small ldN.
18903 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
18904 for (unsigned I = 0; I < NumLoads; ++I) {
// Load number I starts I * Factor vectors of type LdTy past the base address.
18905 Value *Offset = Builder.getInt64(I * Factor);
18906
18907 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
18908 Value *LdN = nullptr;
18909 if (UseScalable)
18910 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
18911 else
18912 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
// Insert field J of this load into the wide result for field J at the
// element index belonging to load I.
18913 Value *Idx =
18914 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
18915 for (unsigned J = 0; J < Factor; ++J) {
18916 ExtractedLdValues[J] = Builder.CreateInsertVector(
18917 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
18918 }
18919 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
18920 }
18921
18922 // Merge the values from different factors.
18923 Result = PoisonValue::get(DI->getType());
18924 for (unsigned J = 0; J < Factor; ++J)
18925 Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
18926 } else {
// A single ldN call produces the whole result.
18927 if (UseScalable)
18928 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
18929 else
18930 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18931 }
18932
18933 // Replace the output of the deinterleave intrinsic with the ldN result.
18934 DI->replaceAllUsesWith(Result);
18935 return true;
18936}
18937
18939 Instruction *Store, Value *Mask,
18940 ArrayRef<Value *> InterleavedValues) const {
// Emits one or more stN intrinsic calls that store InterleavedValues (2, 3 or
// 4 vectors) interleaved at the address of a plain store. Returns true when
// the calls were emitted.
// NOTE(review): listing lines 18938 (function header), 18946 (declaration of
// SI) and 18966 (start of the StTy initialiser) are missing from this text.
18941 unsigned Factor = InterleavedValues.size();
18942 if (Factor != 2 && Factor != 3 && Factor != 4) {
18943 LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n");
18944 return false;
18945 }
18947 if (!SI)
18948 return false;
18949 assert(!Mask && "Unexpected mask on plain store");
18950
18951 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
18952 const DataLayout &DL = SI->getModule()->getDataLayout();
18953
18954 bool UseScalable;
18955 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18956 return false;
18957
18958 // TODO: Add support for using SVE instructions with fixed types later, using
18959 // the code from lowerInterleavedStore to obtain the correct container type.
18960 if (UseScalable && !VTy->isScalableTy())
18961 return false;
18962
18963 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
18964
// StTy is the per-store vector type: VTy with its element count divided by
// the number of stores.
18965 VectorType *StTy =
18967 VTy->getElementCount().divideCoefficientBy(NumStores));
18968
18969 Type *PtrTy = SI->getPointerOperandType();
18970 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18971 UseScalable, StTy, PtrTy);
18972
18973 IRBuilder<> Builder(SI);
18974
18975 Value *BaseAddr = SI->getPointerOperand();
18976 Value *Pred = nullptr;
18977
// The SVE form is predicated; an all-true predicate is used.
18978 if (UseScalable)
18979 Pred =
18980 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
18981
// Operand layout of the call: the Factor data vectors, then the predicate
// (SVE only), then the address as the last operand.
18982 auto ExtractedValues = InterleavedValues;
18983 SmallVector<Value *, 4> StoreOperands(InterleavedValues);
18984 if (UseScalable)
18985 StoreOperands.push_back(Pred);
18986 StoreOperands.push_back(BaseAddr);
18987 for (unsigned I = 0; I < NumStores; ++I) {
18988 Value *Address = BaseAddr;
18989 if (NumStores > 1) {
// Store number I starts I * Factor vectors of type StTy past the base address
// and writes the StTy-sized slice of each input starting at element Idx.
18990 Value *Offset = Builder.getInt64(I * Factor);
18991 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
18992 Value *Idx =
18993 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
18994 for (unsigned J = 0; J < Factor; J++) {
18995 StoreOperands[J] =
18996 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
18997 }
18998 // update the address
18999 StoreOperands[StoreOperands.size() - 1] = Address;
19000 }
19001 Builder.CreateCall(StNFunc, StoreOperands);
19002 }
19003 return true;
19004}
19005
// Picks the value type used to lower the memory operation described by `Op`.
// Candidates are tried in this order: v16i8 (memsets only, when NEON may be
// used), f128, i64, i32; MVT::Other is returned when none is acceptable.
// NOTE(review): this listing is missing the line carrying the return type and
// function name, and the middle line of the allowsMisalignedMemoryAccesses
// call inside the lambda; take them from the original source file.
19007 LLVMContext &Context, const MemOp &Op,
19008 const AttributeList &FuncAttributes) const {
// NEON and FP types are only considered when the function does not carry the
// NoImplicitFloat attribute.
19009 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
19010 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
19011 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
19012 // For zero memset, only use AdvSIMD for 32-byte and above. It would have
19013 // taken one instruction to materialize the v2i64 zero and one store (with
19014 // restrictive addressing mode). Just do i64 stores.
19015 // For non-zero memset, use NEON even for smaller sizes as dup is efficient.
19016 bool IsSmallZeroMemset = Op.isMemset() && Op.size() < 32 && Op.isZeroMemset();
// A type is acceptable when the operation is aligned to AlignCheck, or
// otherwise according to the misaligned-access query and its `Fast` output.
19017 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
19018 if (Op.isAligned(AlignCheck))
19019 return true;
19020 unsigned Fast;
19021 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
19023 Fast;
19024 };
19025
19026 // For non-zero memset, use NEON even for smaller sizes as dup + scalar store
19027 // is efficient
19028 if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset &&
19029 AlignmentIsAcceptable(MVT::v16i8, Align(1)))
19030 return MVT::v16i8;
19031 if (CanUseFP && !IsSmallZeroMemset &&
19032 AlignmentIsAcceptable(MVT::f128, Align(16)))
19033 return MVT::f128;
19034 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
19035 return MVT::i64;
19036 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
19037 return MVT::i32;
19038 return MVT::Other;
19039}
19040
19042 LLVMContext &Context, std::vector<EVT> &MemOps, unsigned Limit,
19043 const MemOp &Op, unsigned DstAS, unsigned SrcAS,
19044 const AttributeList &FuncAttributes, EVT *LargestVT) const {
19045 // For non-zero memset with v16i8, don't downgrade. We can extract smaller
19046 // stores (i64, i32, i16, i8) from the v16i8 splat efficiently.
19047 EVT VT = getOptimalMemOpType(Context, Op, FuncAttributes);
19048 if (VT == MVT::v16i8 && Op.isMemset() && !Op.isZeroMemset() &&
19049 Op.size() < 16) {
19050 unsigned Size = Op.size();
19051 unsigned RemainingSize = Size;
19052
19053 // Break down the size into stores that we can extract from v16i8.
19054 // We support: i64 (8 bytes), i32 (4 bytes), i16 (2 bytes), i8 (1 byte)
19055 // Use the largest possible stores first to minimize the number of
19056 // operations.
19057 while (RemainingSize > 0) {
19058 EVT TargetVT;
19059
19060 // Try largest stores first
19061 if (RemainingSize >= 8) {
19062 TargetVT = MVT::i64;
19063 RemainingSize -= 8;
19064 } else if (RemainingSize >= 4) {
19065 TargetVT = MVT::i32;
19066 RemainingSize -= 4;
19067 } else if (RemainingSize >= 2) {
19068 TargetVT = MVT::i16;
19069 RemainingSize -= 2;
19070 } else if (RemainingSize >= 1) {
19071 TargetVT = MVT::i8;
19072 RemainingSize -= 1;
19073 } else {
19074 // Should not reach here, but fall back to default implementation
19075 break;
19076 }
19077
19078 MemOps.push_back(TargetVT);
19079 }
19080
19081 // If we successfully decomposed the entire size, set LargestVT to v16i8
19082 // to ensure getMemsetValue generates the efficient vector splat (DUP).
19083 // We don't add v16i8 to MemOps since we only need it for value generation.
19084 if (RemainingSize == 0 && !MemOps.empty()) {
19085 if (LargestVT)
19086 *LargestVT = VT; // v16i8 for vector splat generation
19087 return true;
19088 }
19089
19090 // Clear MemOps if we didn't successfully handle everything
19091 MemOps.clear();
19092 }
19093 // Otherwise, use the default implementation
19095 Context, MemOps, Limit, Op, DstAS, SrcAS, FuncAttributes, LargestVT);
19096}
19097
// Picks the low-level type (LLT) used to lower the memory operation described
// by `Op`. Candidates are tried in this order: a 2 x 64-bit integer vector
// (memsets only, when NEON may be used), a 128-bit IEEE float, a 64-bit
// integer, a 32-bit integer; a default-constructed LLT is returned otherwise.
// NOTE(review): this listing is missing the line carrying the return type and
// function name, and the middle line of the allowsMisalignedMemoryAccesses
// call inside the lambda; take them from the original source file.
19099 const MemOp &Op, const AttributeList &FuncAttributes) const {
// NEON and FP types are only considered when the function does not carry the
// NoImplicitFloat attribute.
19100 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
19101 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
19102 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
19103 // For zero memset, only use AdvSIMD for 32-byte and above. It would have
19104 // taken one instruction to materialize the v2i64 zero and one store (with
19105 // restrictive addressing mode). Just do i64 stores.
19106 // For non-zero memset, use NEON even for smaller sizes as dup is efficient.
19107 bool IsSmallZeroMemset = Op.isMemset() && Op.size() < 32 && Op.isZeroMemset();
// A type is acceptable when the operation is aligned to AlignCheck, or
// otherwise according to the misaligned-access query and its `Fast` output.
19108 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
19109 if (Op.isAligned(AlignCheck))
19110 return true;
19111 unsigned Fast;
19112 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
19114 Fast;
19115 };
19116
19117 // For non-zero memset, use NEON for all sizes where it's beneficial.
19118 // NEON dup + scalar store works for any alignment and is efficient.
// The alignment query uses v16i8 while the returned LLT is a 2 x 64-bit vector.
19119 if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset &&
19120 AlignmentIsAcceptable(MVT::v16i8, Align(1)))
19121 return LLT::fixed_vector(2, LLT::integer(64));
19122 if (CanUseFP && !IsSmallZeroMemset &&
19123 AlignmentIsAcceptable(MVT::f128, Align(16)))
19124 return LLT::floatIEEE(128);
19125 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
19126 return LLT::integer(64);
19127 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
19128 return LLT::integer(32);
19129 return LLT();
19130}
19131
19132// 12-bit optionally shifted immediates are legal for adds.
// Returns whether `Immed` can be used as an add immediate: the magnitude of
// `Immed` is passed to isLegalArithImmed.
// NOTE(review): the signature line of this function is absent from this
// listing; the body reads a parameter named `Immed`.
// The minimum int64_t value is rejected up front because its magnitude cannot
// be represented by std::abs on int64_t.
19134 if (Immed == std::numeric_limits<int64_t>::min()) {
19135 return false;
19136 }
19137 // Same encoding for add/sub, just flip the sign.
19138 return isLegalArithImmed((uint64_t)std::abs(Immed));
19139}
19140
// Returns whether `Imm` can be added as a vscale-scaled immediate. Requires
// SVE2. Multiples of 16 are accepted when Imm / 16 fits a signed 6-bit value;
// other multiples of 8, 4 or 2 are accepted when the quotient's magnitude is
// at most 16; odd values are rejected.
// NOTE(review): the signature line of this function is absent from this
// listing; the body reads a parameter named `Imm`.
19142 // We will only emit addvl/inc* instructions for SVE2
19143 if (!Subtarget->hasSVE2())
19144 return false;
19145
19146 // addvl's immediates are in terms of the number of bytes in a register.
19147 // Since there are 16 in the base supported size (128bits), we need to
19148 // divide the immediate by that much to give us a useful immediate to
19149 // multiply by vscale. We can't have a remainder as a result of this.
19150 if (Imm % 16 == 0)
19151 return isInt<6>(Imm / 16);
19152
19153 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
19154 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
19155 // of addvl as a result, so only take h|w|d into account.
19156 // Dec[h|w|d] will cover subtractions.
19157 // Immediates are in the range [1,16], so we can't do a 2's complement check.
19158 // FIXME: Can we make use of other patterns to cover other immediates?
19159
// The checks run from the largest divisor to the smallest, so each value is
// classified by the largest of 8, 4 and 2 that divides it.
19160 // inch|dech
19161 if (Imm % 8 == 0)
19162 return std::abs(Imm / 8) <= 16;
19163 // incw|decw
19164 if (Imm % 4 == 0)
19165 return std::abs(Imm / 4) <= 16;
19166 // incd|decd
19167 if (Imm % 2 == 0)
19168 return std::abs(Imm / 2) <= 16;
19169
19170 return false;
19171}
19172
19173// Return false to prevent folding
19174// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
19175// if the folding leads to worse code.
// NOTE(review): this listing is missing the line carrying the return type and
// function name, the `if` condition that guards the first `return true` after
// C1C2 is computed, and the declaration of `Insn`; take them from the original
// source file before editing.
19177 SDValue AddNode, SDValue ConstNode) const {
19178 // Let the DAGCombiner decide for vector types and large types.
19179 const EVT VT = AddNode.getValueType();
19180 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
19181 return true;
19182
19183 // It is worse if c1 is legal add immediate, while c1*c2 is not
19184 // and has to be composed by at least two instructions.
// Both casts require constant operands: operand 1 of AddNode and ConstNode.
19185 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19186 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
19187 const int64_t C1 = C1Node->getSExtValue();
19188 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
19190 return true;
19192 // Adapt to the width of a register.
19193 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
// Expand the product into a move-immediate sequence; more than one instruction
// means the fold is rejected.
19194 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
19195 if (Insn.size() > 1)
19196 return false;
19197
19198 // Default to true and let the DAGCombiner decide.
19199 return true;
19200}
19201
19202// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
19203// immediates is the same as for an add or a sub.
// Delegates directly to isLegalAddImmediate.
// NOTE(review): the signature line of this function is absent from this
// listing; the body reads a parameter named `Immed`.
19205 return isLegalAddImmediate(Immed);
19206}
19207
19208/// isLegalAddressingMode - Return true if the addressing mode represented
19209/// by AM is legal for this target, for a load/store of the specified type.
// NOTE(review): the first signature line (return type, function name and the
// declaration of the `DL` parameter used below) is absent from this listing.
// `AS` and `I` are not referenced in the visible body.
19211 const AddrMode &AMode, Type *Ty,
19212 unsigned AS, Instruction *I) const {
19213 // AArch64 has five basic addressing modes:
19214 // reg
19215 // reg + 9-bit signed offset
19216 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
19217 // reg1 + reg2
19218 // reg + SIZE_IN_BYTES * reg
19219
19220 // No global is ever allowed as a base.
19221 if (AMode.BaseGV)
19222 return false;
19223
19224 // No reg+reg+imm addressing.
19225 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
19226 return false;
19227
19228 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
19229 // `2*ScaledReg` into `BaseReg + ScaledReg`
// Work on a copy so the caller's AMode is left untouched. Any other scale
// without a base register is rejected.
19230 AddrMode AM = AMode;
19231 if (AM.Scale && !AM.HasBaseReg) {
19232 if (AM.Scale == 1) {
19233 AM.HasBaseReg = true;
19234 AM.Scale = 0;
19235 } else if (AM.Scale == 2) {
19236 AM.HasBaseReg = true;
19237 AM.Scale = 1;
19238 } else {
19239 return false;
19240 }
19241 }
19242
19243 // A base register is required in all addressing modes.
19244 if (!AM.HasBaseReg)
19245 return false;
19246
19247 if (Ty->isScalableTy()) {
19248 if (isa<ScalableVectorType>(Ty)) {
19249 // See if we have a foldable vscale-based offset, for vector types which
19250 // are either legal or smaller than the minimum; more work will be
19251 // required if we need to consider addressing for types which need
19252 // legalization by splitting.
19253 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
// The scalable offset must be a whole number of vectors (of at most 16 bytes,
// power-of-two size) and that count must fit a signed 4-bit value.
19254 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
19255 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
19256 isPowerOf2_64(VecNumBytes))
19257 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
19258
// Otherwise only base, or base plus a register scaled by the element size in
// bytes, is accepted, with no offsets of either kind.
19259 uint64_t VecElemNumBytes =
19260 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
19261 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
19262 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
19263 }
19264
// Scalable types that are not scalable vectors: plain base register only.
19265 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
19266 }
19267
19268 // No scalable offsets allowed for non-scalable types.
19269 if (AM.ScalableOffset)
19270 return false;
19271
19272 // check reg + imm case:
19273 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
// NumBytes stays 0 for unsized types and for sizes whose bit count is not a
// power of two.
19274 uint64_t NumBytes = 0;
19275 if (Ty->isSized()) {
19276 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
19277 NumBytes = NumBits / 8;
19278 if (!isPowerOf2_64(NumBits))
19279 NumBytes = 0;
19280 }
19281
19282 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
19283 AM.Scale);
19284}
19285
19286// If both offsets agree in all bits above the low 12 and that shared high part
19287// is a legal add immediate, return the high part as the base; otherwise 0.
// NOTE(review): the signature line carrying the function name and the
// `MinOffset` parameter is absent from this listing.
19288int64_t
19290 int64_t MaxOffset) const {
// Clear the low 12 bits of MinOffset.
19291 int64_t HighPart = MinOffset & ~0xfffULL;
19292 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
19293 // Rebase the value to an integer multiple of imm12.
19294 return HighPart;
19295 }
19296
19297 return 0;
19298}
19299
// Unconditionally returns true.
// NOTE(review): the signature line of this function is absent from this
// listing.
19301 // Consider splitting large offset of struct or array.
19302 return true;
19303}
19304
// Returns a per-scalar-type answer for `VT`: f32 and f64 always yield true,
// f16 requires full FP16 support, bf16 requires a scalable vector type plus
// the BF16 and SVE/SME2 subtarget checks below, and anything else yields false.
// NOTE(review): the line carrying the return type and function name is absent
// from this listing. `MF` is not referenced in the visible body.
19306 const MachineFunction &MF, EVT VT) const {
19307 EVT ScalarVT = VT.getScalarType();
19308
// Extended (non-simple) scalar types are never accepted.
19309 if (!ScalarVT.isSimple())
19310 return false;
19311
19312 switch (ScalarVT.getSimpleVT().SimpleTy) {
19313 case MVT::f16:
19314 return Subtarget->hasFullFP16();
19315 case MVT::f32:
19316 case MVT::f64:
19317 return true;
19318 case MVT::bf16:
19319 return VT.isScalableVector() && Subtarget->hasBF16() &&
19320 Subtarget->isNonStreamingSVEorSME2Available();
19321 default:
19322 break;
19323 }
19324
19325 return false;
19326}
19327
// IR-type variant: returns true only when the scalar type of `Ty` is float or
// double.
// NOTE(review): the first signature line (return type, function name and any
// preceding parameter) is absent from this listing.
19329 Type *Ty) const {
19330 switch (Ty->getScalarType()->getTypeID()) {
19331 case Type::FloatTyID:
19332 case Type::DoubleTyID:
19333 return true;
19334 default:
19335 return false;
19336 }
19337}
19338
// Requires at least the Aggressive optimization level and a non-scalable `VT`.
// NOTE(review): the line carrying the return type and function name, and the
// final operand of the return expression (after the trailing `&&`), are absent
// from this listing; take them from the original source file.
19340 EVT VT, CodeGenOptLevel OptLevel) const {
19341 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
19343}
19344
// Returns a pointer to a static, zero-terminated list holding X16, X17 and LR.
// NOTE(review): the signature line carrying the function name is absent from
// this listing.
19345const MCPhysReg *
19347 // LR is a callee-save register, but we must treat it as clobbered by any call
19348 // site. Hence we include LR in the scratch registers, which are in turn added
19349 // as implicit-defs for stackmaps and patchpoints.
19350 static const MCPhysReg ScratchRegs[] = {
19351 AArch64::X16, AArch64::X17, AArch64::LR, 0
19352 };
19353 return ScratchRegs;
19354}
19355
// Returns a static one-element register list containing FPCR.
// NOTE(review): the signature line of this function is absent from this
// listing.
19357 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
19358 return RCRegs;
19359}
19360
// Decides whether the shift node `N` (SHL, SRA or SRL) may be commuted with
// its first operand. Returns false when that operand has other uses, when it
// is a sign-extend of a multi-use value, or when it forms an unsigned
// bit-extraction pattern that should be kept; true otherwise.
// NOTE(review): the signature line carrying the function name and the `N`
// parameter is absent from this listing. `Level` is not referenced in the
// visible body.
19361bool
19363 CombineLevel Level) const {
19364 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
19365 N->getOpcode() == ISD::SRL) &&
19366 "Expected shift op");
19367
19368 SDValue ShiftLHS = N->getOperand(0);
19369 EVT VT = N->getValueType(0);
19370
19371 if (!ShiftLHS->hasOneUse())
19372 return false;
19373
19374 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
19375 !ShiftLHS.getOperand(0)->hasOneUse())
19376 return false;
19377
19378 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
19379 // combine it with shift 'N' to let it be lowered to UBFX except:
19380 // ((x >> C) & mask) << C.
19381 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
19382 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
19383 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
19384 if (isMask_64(TruncMask)) {
19385 SDValue AndLHS = ShiftLHS.getOperand(0);
19386 if (AndLHS.getOpcode() == ISD::SRL) {
19387 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
// Only a left shift by exactly the SRL amount is allowed to commute; every
// other shift over this pattern is refused.
19388 if (N->getOpcode() == ISD::SHL)
19389 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
19390 return SRLC->getZExtValue() == SHLC->getZExtValue();
19391 return false;
19392 }
19393 }
19394 }
19395 }
19396 return true;
19397}
19398
// For `N` = XOR(SHL/SRL(x, c), mask): returns true only when both the XOR mask
// and the shift amount are constants and the mask is a shifted mask covering
// exactly the bits the shift can produce; false otherwise.
// NOTE(review): the line carrying the return type and function name is absent
// from this listing.
19400 const SDNode *N) const {
19401 assert(N->getOpcode() == ISD::XOR &&
19402 (N->getOperand(0).getOpcode() == ISD::SHL ||
19403 N->getOperand(0).getOpcode() == ISD::SRL) &&
19404 "Expected XOR(SHIFT) pattern");
19405
19406 // Only commute if the entire NOT mask is a hidden shifted mask.
19407 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
19408 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
19409 if (XorC && ShiftC) {
19410 unsigned MaskIdx, MaskLen;
19411 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
19412 unsigned ShiftAmt = ShiftC->getZExtValue();
19413 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
// SHL: the mask must start at bit ShiftAmt and run to the top bit.
// SRL: the mask must start at bit 0 and span BitWidth - ShiftAmt bits.
19414 if (N->getOperand(0).getOpcode() == ISD::SHL)
19415 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
19416 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
19417 }
19418 }
19419
19420 return false;
19421}
19422
// For `N` = SHL(SRL(x, c1), c2) or SRL(SHL(x, c1), c2): decides whether the
// shift pair may be folded into a mask. Returns false when the inner shift has
// several uses, when an i32/i64 SRL-of-SHL has constant amounts with c1 < c2,
// or in the load-addressing case described below; true otherwise.
// NOTE(review): the line carrying the return type and function name is absent
// from this listing.
19424 const SDNode *N) const {
19425 assert(((N->getOpcode() == ISD::SHL &&
19426 N->getOperand(0).getOpcode() == ISD::SRL) ||
19427 (N->getOpcode() == ISD::SRL &&
19428 N->getOperand(0).getOpcode() == ISD::SHL)) &&
19429 "Expected shift-shift mask");
19430 // Don't allow multiuse shift folding with the same shift amount.
19431 if (!N->getOperand(0)->hasOneUse())
19432 return false;
19433
19434 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
19435 EVT VT = N->getValueType(0);
19436 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
19437 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
19438 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
// Non-constant shift amounts do not block the fold.
19439 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
19440 }
19441
19442 // We do not need to fold when this shifting used in specific load case:
19443 // (ldr x, (add x, (shl (srl x, c1) 2)))
19444 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
19445 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
19446 unsigned ShlAmt = C2->getZExtValue();
19447 if (auto ShouldADD = *N->user_begin();
19448 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
19449 if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
19450 EVT MemVT = Load->getMemoryVT();
19451
// (8 << ShlAmt) is the bit width implied by the shift amount; the fold is
// refused when it equals the loaded element width (scalable vectors) or the
// whole memory type width (types with a legal pre-increment indexed load).
19452 if (Load->getValueType(0).isScalableVector())
19453 return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
19454
19455 if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
19456 return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
19457 }
19458 }
19459 }
19460 }
19461
19462 return true;
19463}
19464
// Returns true only for legal scalable vector types when the select opcode is
// VSELECT. `BinOpcode`, `X` and `Y` are not referenced in the visible body.
// NOTE(review): the line carrying the return type and function name is absent
// from this listing.
19466 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
19467 SDValue Y) const {
19468 return VT.isScalableVector() && isTypeLegal(VT) &&
19469 SelectOpcode == ISD::VSELECT;
19470}
19471
// Decides whether the integer constant `Imm` of type `Ty` is cheap enough to
// materialize as an immediate: zero and logical immediates are accepted
// directly; otherwise the answer depends on the position of the highest set
// bit of the (possibly complemented) value.
// NOTE(review): the first signature line (return type, function name and the
// declaration of `Imm`) is absent from this listing.
19473 Type *Ty) const {
19474 assert(Ty->isIntegerTy());
19475
19476 unsigned BitSize = Ty->getPrimitiveSizeInBits();
19477 if (BitSize == 0)
19478 return false;
19479
19480 int64_t Val = Imm.getSExtValue();
19481 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
19482 return true;
19483
// Negative values are complemented before measuring; 32-bit values are
// truncated to their low 32 bits.
// NOTE(review): if Val == -1 reaches this point the complement is 0 and
// Log2_64 is called on 0 - confirm the intended result for that input.
19484 if (Val < 0)
19485 Val = ~Val;
19486 if (BitSize == 32)
19487 Val &= (1LL << 32) - 1;
19488
// Index of the 16-bit chunk that holds the highest set bit.
19489 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
19490 // MOVZ is free so return true for one or fewer MOVK.
19491 return Shift < 3;
19492}
19493
// Reports an extract as cheap only at index 0 or at an index equal to the
// minimum element count of `ResVT`.
// NOTE(review): the first signature line (return type, function name and the
// declaration of `ResVT`) and the condition guarding the `return false` below
// are absent from this listing; take them from the original source file.
19495 unsigned Index) const {
19497 return false;
19498
19499 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
19500}
19501
// Returns true when `VT` is legalized by integer expansion and the integer
// type of half its bit width is legal.
// NOTE(review): the line carrying the return type and function name is absent
// from this listing, so the hook this implements cannot be confirmed from the
// visible text.
19503 LLVMContext &Context, EVT VT) const {
19504 if (getTypeAction(Context, VT) != TypeExpandInteger)
19505 return false;
19506
19507 EVT LegalTy = EVT::getIntegerVT(Context, VT.getSizeInBits() / 2);
19508 return getTypeAction(Context, LegalTy) == TargetLowering::TypeLegal;
19509}
19510
19511/// Turn vector tests of the signbit in the form of:
19512/// xor (sra X, elt_size(X)-1), -1
19513/// into:
19514/// cmge X, X, #0
// Returns an empty SDValue when the pattern does not match; on success returns
// a SETGE comparison of the shifted value against zero.
// NOTE(review): the first signature line (return type, function name and the
// declarations of `N` and `DAG`) and the last operand of the condition that
// inspects `Ones` are absent from this listing.
19516 const AArch64Subtarget *Subtarget) {
19517 EVT VT = N->getValueType(0);
19518 if (!Subtarget->hasNEON() || !VT.isVector())
19519 return SDValue();
19520
19521 // There must be a shift right algebraic before the xor, and the xor must be a
19522 // 'not' operation.
19523 SDValue Shift = N->getOperand(0);
19524 SDValue Ones = N->getOperand(1);
19525 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
19527 return SDValue();
19528
19529 // The shift should be smearing the sign bit across each vector element.
19530 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
19531 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
19532 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
19533 return SDValue();
19534
19535 SDLoc DL(N);
19536 SDValue Zero = DAG.getConstant(0, DL, Shift.getValueType());
19537 return DAG.getSetCC(DL, VT, Shift.getOperand(0), Zero, ISD::SETGE);
19538}
19539
19540// Given a vecreduce_add node, detect the below pattern and convert it to the
19541// node sequence with UABDL, [S|U]ABD and UADDLP.
19542//
19543// i32 vecreduce_add(
19544// v16i32 abs(
19545// v16i32 sub(
19546// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
19547//
19548// or
19549//
19550// i32 vecreduce_add(
19551// v16i32 zext(
19552// v16i16 abs(
19553// v16i16 sub(
19554// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
19555//
19556// =================>
19557// i32 vecreduce_add(
19558// v4i32 UADDLP(
19559// v8i16 add(
19560// v8i16 zext(
19561// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
19562// v8i16 zext(
19563// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
// Returns an empty SDValue when the pattern does not match.
// NOTE(review): the first signature line (return type, function name and the
// declaration of `N`) is absent from this listing.
19565 SelectionDAG &DAG) {
19566 // Assumed i32 vecreduce_add
19567 if (N->getValueType(0) != MVT::i32)
19568 return SDValue();
19569
19570 SDValue VecReduceOp0 = N->getOperand(0);
19571 bool SawTrailingZext = false;
19572 // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
19573 if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
19574 VecReduceOp0->getValueType(0) == MVT::v16i32 &&
19575 VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
19576 VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
19577 SawTrailingZext = true;
19578 VecReduceOp0 = VecReduceOp0.getOperand(0);
19579 }
19580
19581 // Peel off an optional post-ABS extend (v16i16 -> v16i32).
19582 MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
19583 // Assumed v16i16 or v16i32 abs input
19584 unsigned Opcode = VecReduceOp0.getOpcode();
19585 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
19586 return SDValue();
19587
19588 SDValue ABS = VecReduceOp0;
19589 // Assumed v16i16 or v16i32 sub
19590 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
19591 ABS->getOperand(0)->getValueType(0) != AbsInputVT)
19592 return SDValue();
19593
19594 SDValue SUB = ABS->getOperand(0);
19595 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
19596 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
19597 // Assumed v16i16 or v16i32 type
19598 if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
19599 SUB->getOperand(1)->getValueType(0) != AbsInputVT)
19600 return SDValue();
19601
19602 // Assumed zext or sext
// Both operands must use the same kind of extension; mixed sign/zero
// extension is rejected.
19603 bool IsZExt = false;
19604 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
19605 IsZExt = true;
19606 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
19607 IsZExt = false;
19608 } else
19609 return SDValue();
19610
19611 SDValue EXT0 = SUB->getOperand(0);
19612 SDValue EXT1 = SUB->getOperand(1);
19613 // Assumed zext's operand has v16i8 type
19614 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
19615 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
19616 return SDValue();
19617
19618 // Pattern is detected. Let's convert it to sequence of nodes.
19619 SDLoc DL(N);
19620
19621 // First, create the node pattern of UABD/SABD.
// High half: elements 8..15 of each v16i8 input.
19622 SDValue UABDHigh8Op0 =
19623 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
19624 DAG.getConstant(8, DL, MVT::i64));
19625 SDValue UABDHigh8Op1 =
19626 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
19627 DAG.getConstant(8, DL, MVT::i64));
19628 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
19629 UABDHigh8Op0, UABDHigh8Op1);
19630 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
19631
19632 // Second, create the node pattern of UABAL.
// Low half: elements 0..7 of each v16i8 input, added to the high-half result.
19633 SDValue UABDLo8Op0 =
19634 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
19635 DAG.getConstant(0, DL, MVT::i64));
19636 SDValue UABDLo8Op1 =
19637 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
19638 DAG.getConstant(0, DL, MVT::i64));
19639 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
19640 UABDLo8Op0, UABDLo8Op1);
19641 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
19642 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
19643
19644 // Third, create the node of UADDLP.
19645 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
19646
19647 // Fourth, create the node of VECREDUCE_ADD.
19648 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
19649}
19650
19651static SDValue
19653 const AArch64Subtarget *ST) {
19654 if (DCI.isBeforeLegalize())
19655 return SDValue();
19656
19657 if (SDValue Brk = optimizeBrk(N, DCI.DAG))
19658 return Brk;
19659
19660 if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false,
19661 /*IsEqual=*/false))
19662 return While;
19663
19664 if (!N->getValueType(0).isScalableVector() ||
19665 !ST->isSVEorStreamingSVEAvailable() ||
19666 !(ST->hasSVE2p1() || ST->hasSME2()))
19667 return SDValue();
19668
19669 // Count the number of users which are extract_vectors.
19670 unsigned NumExts = count_if(N->users(), [](SDNode *Use) {
19671 return Use->getOpcode() == ISD::EXTRACT_SUBVECTOR;
19672 });
19673
19674 auto MaskEC = N->getValueType(0).getVectorElementCount();
19675 if (!MaskEC.isKnownMultipleOf(NumExts))
19676 return SDValue();
19677
19678 ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumExts);
19679 if (ExtMinEC.getKnownMinValue() < 2)
19680 return SDValue();
19681
19682 SmallVector<SDNode *> Extracts(NumExts, nullptr);
19683 for (SDNode *Use : N->users()) {
19684 if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR)
19685 continue;
19686
19687 // Ensure the extract type is correct (e.g. if NumExts is 4 and
19688 // the mask return type is nxv8i1, each extract should be nxv2i1.
19689 if (Use->getValueType(0).getVectorElementCount() != ExtMinEC)
19690 return SDValue();
19691
19692 // There should be exactly one extract for each part of the mask.
19693 unsigned Offset = Use->getConstantOperandVal(1);
19694 unsigned Part = Offset / ExtMinEC.getKnownMinValue();
19695 if (Extracts[Part] != nullptr)
19696 return SDValue();
19697
19698 Extracts[Part] = Use;
19699 }
19700
19701 SelectionDAG &DAG = DCI.DAG;
19702 SDLoc DL(N);
19703 SDValue ID =
19704 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
19705
19706 SDValue Idx = N->getOperand(0);
19707 SDValue TC = N->getOperand(1);
19708 if (Idx.getValueType() != MVT::i64) {
19709 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
19710 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
19711 }
19712
19713 // Create the whilelo_x2 intrinsics from each pair of extracts
19714 EVT ExtVT = Extracts[0]->getValueType(0);
19715 EVT DoubleExtVT = ExtVT.getDoubleNumVectorElementsVT(*DAG.getContext());
19716 auto R =
19717 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
19718 DCI.CombineTo(Extracts[0], R.getValue(0));
19719 DCI.CombineTo(Extracts[1], R.getValue(1));
19720 SmallVector<SDValue> Concats = {DAG.getNode(
19721 ISD::CONCAT_VECTORS, DL, DoubleExtVT, R.getValue(0), R.getValue(1))};
19722
19723 if (NumExts == 2) {
19724 assert(N->getValueType(0) == DoubleExtVT);
19725 return Concats[0];
19726 }
19727
19728 auto Elts =
19729 DAG.getElementCount(DL, MVT::i64, ExtVT.getVectorElementCount() * 2);
19730 for (unsigned I = 2; I < NumExts; I += 2) {
19731 // After the first whilelo_x2, we need to increment the starting value.
19732 Idx = DAG.getNode(ISD::UADDSAT, DL, MVT::i64, Idx, Elts);
19733 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
19734 DCI.CombineTo(Extracts[I], R.getValue(0));
19735 DCI.CombineTo(Extracts[I + 1], R.getValue(1));
19736 Concats.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, DoubleExtVT,
19737 R.getValue(0), R.getValue(1)));
19738 }
19739
19740 return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), Concats);
19741}
19742
19743// Turn vecreduce.add(ZExt(predicate)) into cntp(predicate).
// The predicate is passed as both vector operands of the aarch64_sve_cntp
// intrinsic, and the i64 result is zero-extended or truncated to the type of
// `N`. Returns an empty SDValue when the operand is not a zero-extend of a
// scalable i1 vector.
// NOTE(review): the first signature line (return type, function name and the
// declarations of `N` and `DAG`) and the last operand of the guard condition
// below are absent from this listing.
19745 const AArch64Subtarget *ST) {
19746 SDValue Op = N->getOperand(0);
19747 if (Op->getOpcode() != ISD::ZERO_EXTEND)
19748 return SDValue();
19749
19750 SDValue ZExtOp = Op->getOperand(0);
19751 EVT VT = ZExtOp.getValueType();
19752 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
19754 return SDValue();
19755
19756 SDLoc DL(N);
19757 SDValue Cntp = DAG.getNode(
19758 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
19759 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), ZExtOp,
19760 ZExtOp);
19761 return DAG.getZExtOrTrunc(Cntp, DL, N->getValueType(0));
19762}
19763
19765 const AArch64Subtarget *ST) {
19766 if (SDValue Result = performVecReduceAddCntpCombine(N, DAG, ST))
19767 return Result;
19768
19769 if (!ST->isNeonAvailable())
19770 return SDValue();
19771
19772 if (!ST->hasDotProd())
19774
19775 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
19776 // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
19777 // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
19778 // If we have vectors larger than v16i8 we extract v16i8 vectors,
19779 // Follow the same steps above to get DOT instructions concatenate them
19780 // and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
19781
19782 SDValue Op0 = N->getOperand(0);
19783 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
19784 Op0.getValueType().getVectorElementType() != MVT::i32)
19785 return SDValue();
19786
19787 unsigned ExtOpcode = Op0.getOpcode();
19788 SDValue A = Op0;
19789 SDValue B;
19790 unsigned DotOpcode;
19791 if (ExtOpcode == ISD::MUL) {
19792 A = Op0.getOperand(0);
19793 B = Op0.getOperand(1);
19794 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
19795 return SDValue();
19796 auto OpCodeA = A.getOpcode();
19797 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
19798 return SDValue();
19799
19800 auto OpCodeB = B.getOpcode();
19801 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
19802 return SDValue();
19803
19804 if (OpCodeA == OpCodeB) {
19805 DotOpcode =
19806 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
19807 } else {
19808 // Check USDOT support support
19809 if (!ST->hasMatMulInt8())
19810 return SDValue();
19811 DotOpcode = AArch64ISD::USDOT;
19812 if (OpCodeA == ISD::SIGN_EXTEND)
19813 std::swap(A, B);
19814 }
19815 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
19816 DotOpcode = AArch64ISD::UDOT;
19817 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
19818 DotOpcode = AArch64ISD::SDOT;
19819 } else {
19820 return SDValue();
19821 }
19822
19823 EVT Op0VT = A.getOperand(0).getValueType();
19824 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
19825 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
19826 if (!IsValidElementCount || !IsValidSize)
19827 return SDValue();
19828
19829 SDLoc DL(Op0);
19830 // For non-mla reductions B can be set to 1. For MLA we take the operand of
19831 // the extend B.
19832 if (!B)
19833 B = DAG.getConstant(1, DL, Op0VT);
19834 else
19835 B = B.getOperand(0);
19836
19837 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
19838 unsigned NumOfVecReduce;
19839 EVT TargetType;
19840 if (IsMultipleOf16) {
19841 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
19842 TargetType = MVT::v4i32;
19843 } else {
19844 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
19845 TargetType = MVT::v2i32;
19846 }
19847 // Handle the case where we need to generate only one Dot operation.
19848 if (NumOfVecReduce == 1) {
19849 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
19850 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
19851 A.getOperand(0), B);
19852 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19853 }
19854 // Generate Dot instructions that are multiple of 16.
19855 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
19856 SmallVector<SDValue, 4> SDotVec16;
19857 unsigned I = 0;
19858 for (; I < VecReduce16Num; I += 1) {
19859 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
19860 SDValue Op0 =
19861 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
19862 DAG.getConstant(I * 16, DL, MVT::i64));
19863 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
19864 DAG.getConstant(I * 16, DL, MVT::i64));
19865 SDValue Dot =
19866 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
19867 SDotVec16.push_back(Dot);
19868 }
19869 // Concatenate dot operations.
19870 EVT SDot16EVT =
19871 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
19872 SDValue ConcatSDot16 =
19873 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
19874 SDValue VecReduceAdd16 =
19875 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
19876 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
19877 if (VecReduce8Num == 0)
19878 return VecReduceAdd16;
19879
19880 // Generate the remainder Dot operation that is multiple of 8.
19881 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
19882 SDValue Vec8Op0 =
19883 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
19884 DAG.getConstant(I * 16, DL, MVT::i64));
19885 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
19886 DAG.getConstant(I * 16, DL, MVT::i64));
19887 SDValue Dot =
19888 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
19889 SDValue VecReduceAdd8 =
19890 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19891 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
19892 VecReduceAdd8);
19893}
19894
19895// Given an (integer) vecreduce, we know the order of the inputs does not
19896// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
19897// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
19898// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
19900 auto DetectAddExtract = [&](SDValue A) {
19901 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
19902 // UADDLP(x) if found.
19903 assert(A.getOpcode() == ISD::ADD);
19904 EVT VT = A.getValueType();
19905 SDValue Op0 = A.getOperand(0);
19906 SDValue Op1 = A.getOperand(1);
19907 if (Op0.getOpcode() != Op1.getOpcode() ||
19908 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
19909 Op0.getOpcode() != ISD::SIGN_EXTEND))
19910 return SDValue();
19911 SDValue Ext0 = Op0.getOperand(0);
19912 SDValue Ext1 = Op1.getOperand(0);
19913 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19915 Ext0.getOperand(0) != Ext1.getOperand(0) ||
19917 return SDValue();
19918 // Check that the type is twice the add types, and the extract are from
19919 // upper/lower parts of the same source.
19921 VT.getVectorNumElements() * 2)
19922 return SDValue();
19923 if ((Ext0.getConstantOperandVal(1) != 0 ||
19925 (Ext1.getConstantOperandVal(1) != 0 ||
19927 return SDValue();
19928 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
19929 : AArch64ISD::SADDLP;
19930 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
19931 };
19932
19933 if (SDValue R = DetectAddExtract(A))
19934 return R;
19935
19936 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
19937 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
19938 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19939 A.getOperand(1));
19940 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
19941 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
19942 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19943 A.getOperand(0));
19944 return SDValue();
19945}
19946
19947// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
19948// UADDLV(concat), where the concat represents the 64-bit zext sources.
19950 // Look for add(zext(64-bit source), zext(64-bit source)), returning
19951 // UADDLV(concat(zext, zext)) if found.
19952 assert(A.getOpcode() == ISD::ADD);
19953 EVT VT = A.getValueType();
19954 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19955 return SDValue();
19956 SDValue Op0 = A.getOperand(0);
19957 SDValue Op1 = A.getOperand(1);
19958 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
19959 return SDValue();
19960 SDValue Ext0 = Op0.getOperand(0);
19961 SDValue Ext1 = Op1.getOperand(0);
19962 EVT ExtVT0 = Ext0.getValueType();
19963 EVT ExtVT1 = Ext1.getValueType();
19964 // Check zext VTs are the same and 64-bit length.
19965 if (ExtVT0 != ExtVT1 ||
19966 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
19967 return SDValue();
19968 // Get VT for concat of zext sources.
19969 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
19970 SDValue Concat =
19971 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
19972
19973 switch (VT.getSimpleVT().SimpleTy) {
19974 case MVT::v2i64:
19975 case MVT::v4i32:
19976 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
19977 case MVT::v8i16: {
19978 SDValue Uaddlv =
19979 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
19980 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
19981 }
19982 default:
19983 llvm_unreachable("Unhandled vector type");
19984 }
19985}
19986
19988 SDValue A = N->getOperand(0);
19989 if (A.getOpcode() == ISD::ADD) {
19990 if (SDValue R = performUADDVAddCombine(A, DAG))
19991 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
19992 else if (SDValue R = performUADDVZextCombine(A, DAG))
19993 return R;
19994 }
19995
19996 // uaddv(A) --> A if all lanes of A are known to be zeros except the 0th lane.
19997 MVT OpVT = A.getSimpleValueType();
19998 assert(N->getSimpleValueType(0) == OpVT &&
19999 "The operand type should be consistent with the result type of UADDV");
20001 Mask.clearBit(0);
20002 KnownBits KnownLeadingLanes = DAG.computeKnownBits(A, Mask);
20003 if (KnownLeadingLanes.isZero())
20004 return A;
20005
20006 return SDValue();
20007}
20008
20012 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
20013 APInt DemandedElts =
20014 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
20015
20017 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
20018 return SDValue(N, 0);
20019 return SDValue();
20020}
20021
20024 const AArch64Subtarget *Subtarget) {
20025 if (DCI.isBeforeLegalizeOps())
20026 return SDValue();
20027
20028 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
20029}
20030
20031SDValue
20032AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
20033 SelectionDAG &DAG,
20034 SmallVectorImpl<SDNode *> &Created) const {
20035 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
20036 if (isIntDivCheap(N->getValueType(0), Attr))
20037 return SDValue(N, 0); // Lower SDIV as SDIV
20038
20039 EVT VT = N->getValueType(0);
20040
20041 // If SVE is available, we can generate
20042 // sdiv(x,y) -> ptrue + asrd , where 'y' is positive pow-2 divisor.
20043 // sdiv(x,y) -> ptrue + asrd + subr , where 'y' is negative pow-2 divisor.
20044 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
20045 return SDValue(N, 0);
20046
20047 // fold (sdiv X, pow2)
20048 if ((VT != MVT::i32 && VT != MVT::i64) ||
20049 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
20050 return SDValue();
20051
20052 // If the divisor is 2 or -2, the default expansion is better. It will add
20053 // (N->getValueType(0) >> (BitWidth - 1)) to it before shifting right.
20054 if (Divisor == 2 ||
20055 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
20056 return SDValue();
20057
20058 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
20059}
20060
20061SDValue
20062AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
20063 SelectionDAG &DAG,
20064 SmallVectorImpl<SDNode *> &Created) const {
20065 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
20066 if (isIntDivCheap(N->getValueType(0), Attr))
20067 return SDValue(N, 0); // Lower SREM as SREM
20068
20069 EVT VT = N->getValueType(0);
20070
20071 // For scalable and fixed types, mark them as cheap so we can handle it much
20072 // later. This allows us to handle larger than legal types.
20073 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
20074 return SDValue(N, 0);
20075
20076 // fold (srem X, pow2)
20077 if ((VT != MVT::i32 && VT != MVT::i64) ||
20078 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
20079 return SDValue();
20080
20081 unsigned Lg2 = Divisor.countr_zero();
20082 if (Lg2 == 0)
20083 return SDValue();
20084
20085 SDLoc DL(N);
20086 SDValue N0 = N->getOperand(0);
20087 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
20088 SDValue Zero = DAG.getConstant(0, DL, VT);
20089 SDValue CCVal, CSNeg;
20090 if (Lg2 == 1) {
20091 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
20092 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
20093 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
20094
20095 Created.push_back(Cmp.getNode());
20096 Created.push_back(And.getNode());
20097 } else {
20098 SDValue CCVal = getCondCode(DAG, AArch64CC::MI);
20099 SDVTList VTs = DAG.getVTList(VT, FlagsVT);
20100
20101 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
20102 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
20103 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
20104 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
20105 Negs.getValue(1));
20106
20107 Created.push_back(Negs.getNode());
20108 Created.push_back(AndPos.getNode());
20109 Created.push_back(AndNeg.getNode());
20110 }
20111
20112 return CSNeg;
20113}
20114
20116 switch(getIntrinsicID(S.getNode())) {
20117 default:
20118 break;
20119 case Intrinsic::aarch64_sve_cntb:
20120 case Intrinsic::aarch64_sve_cnth:
20121 case Intrinsic::aarch64_sve_cntw:
20122 case Intrinsic::aarch64_sve_cntd:
20123 return true;
20124 }
20125 return false;
20126}
20127
20128// Returns the maximum (scalable) value that can be returned by an SVE count
20129// intrinsic. Returns std::nullopt if \p Op is not aarch64_sve_cnt*.
20130static std::optional<ElementCount> getMaxValueForSVECntIntrinsic(SDValue Op) {
20131 Intrinsic::ID IID = getIntrinsicID(Op.getNode());
20132 if (IID == Intrinsic::aarch64_sve_cntp)
20133 return Op.getOperand(1).getValueType().getVectorElementCount();
20134 switch (IID) {
20135 case Intrinsic::aarch64_sve_cntd:
20136 return ElementCount::getScalable(2);
20137 case Intrinsic::aarch64_sve_cntw:
20138 return ElementCount::getScalable(4);
20139 case Intrinsic::aarch64_sve_cnth:
20140 return ElementCount::getScalable(8);
20141 case Intrinsic::aarch64_sve_cntb:
20142 return ElementCount::getScalable(16);
20143 default:
20144 return std::nullopt;
20145 }
20146}
20147
20148/// Calculates what the pre-extend type is, based on the extension
20149/// operation node provided by \p Extend.
20150///
20151/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
20152/// pre-extend type is pulled directly from the operand, while other extend
20153/// operations need a bit more inspection to get this information.
20154///
20155/// \param Extend The SDNode from the DAG that represents the extend operation
20156///
20157/// \returns The type representing the \p Extend source type, or \p MVT::Other
20158/// if no valid type can be determined
20160 switch (Extend.getOpcode()) {
20161 case ISD::SIGN_EXTEND:
20162 case ISD::ZERO_EXTEND:
20163 case ISD::ANY_EXTEND:
20164 return Extend.getOperand(0).getValueType();
20165 case ISD::AssertSext:
20166 case ISD::AssertZext:
20168 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
20169 if (!TypeNode)
20170 return MVT::Other;
20171 return TypeNode->getVT();
20172 }
20173 case ISD::AND: {
20176 if (!Constant)
20177 return MVT::Other;
20178
20179 uint32_t Mask = Constant->getZExtValue();
20180
20181 if (Mask == UCHAR_MAX)
20182 return MVT::i8;
20183 else if (Mask == USHRT_MAX)
20184 return MVT::i16;
20185 else if (Mask == UINT_MAX)
20186 return MVT::i32;
20187
20188 return MVT::Other;
20189 }
20190 default:
20191 return MVT::Other;
20192 }
20193}
20194
20195/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
20196/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
20197/// SExt/ZExt rather than the scalar SExt/ZExt
20199 EVT VT = BV.getValueType();
20200 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
20202 return SDValue();
20203
20204 // Use the first item in the buildvector/shuffle to get the size of the
20205 // extend, and make sure it looks valid.
20206 SDValue Extend = BV->getOperand(0);
20207 unsigned ExtendOpcode = Extend.getOpcode();
20208 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
20209 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
20210 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
20211 ExtendOpcode == ISD::AssertSext;
20212 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
20213 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
20214 return SDValue();
20215 // Shuffle inputs are vector, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
20216 // ensure calculatePreExtendType will work without issue.
20217 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
20218 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND &&
20219 ExtendOpcode != ISD::ANY_EXTEND)
20220 return SDValue();
20221
20222 // Restrict valid pre-extend data type
20223 EVT PreExtendType = calculatePreExtendType(Extend);
20224 if (PreExtendType == MVT::Other ||
20225 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
20226 return SDValue();
20227
20228 // Make sure all other operands are equally extended.
20229 bool SeenZExtOrSExt = !IsAnyExt;
20230 for (SDValue Op : drop_begin(BV->ops())) {
20231 if (Op.isUndef())
20232 continue;
20233
20234 if (calculatePreExtendType(Op) != PreExtendType)
20235 return SDValue();
20236
20237 unsigned Opc = Op.getOpcode();
20240 return SDValue();
20241
20242 if (Opc == ISD::ANY_EXTEND)
20243 continue;
20244
20245 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
20247
20248 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
20249 return SDValue();
20250
20251 IsSExt = OpcIsSExt;
20252 SeenZExtOrSExt = true;
20253 }
20254
20255 SDValue NBV;
20256 SDLoc DL(BV);
20257 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
20258 EVT PreExtendVT =
20259 VT.changeVectorElementType(*DAG.getContext(), PreExtendType);
20260 EVT PreExtendLegalType =
20261 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
20263 for (SDValue Op : BV->ops())
20264 NewOps.push_back(Op.isUndef() ? DAG.getPOISON(PreExtendLegalType)
20265 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
20266 PreExtendLegalType));
20267 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
20268 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
20269 EVT PreExtendVT = VT.changeVectorElementType(*DAG.getContext(),
20270 PreExtendType.getScalarType());
20271 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
20272 BV.getOperand(1).isUndef()
20273 ? DAG.getPOISON(PreExtendVT)
20274 : BV.getOperand(1).getOperand(0),
20275 cast<ShuffleVectorSDNode>(BV)->getMask());
20276 }
20277 unsigned ExtOpc = !SeenZExtOrSExt
20279 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
20280 return DAG.getNode(ExtOpc, DL, VT, NBV);
20281}
20282
20283/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
20284/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
20286 // If the value type isn't a vector, none of the operands are going to be dups
20287 EVT VT = Mul->getValueType(0);
20288 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
20289 return SDValue();
20290
20291 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
20292 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
20293
20294 // Neither operands have been changed, don't make any further changes
20295 if (!Op0 && !Op1)
20296 return SDValue();
20297
20298 SDLoc DL(Mul);
20299 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
20300 Op1 ? Op1 : Mul->getOperand(1));
20301}
20302
20303// Multiplying an RDSVL value by a constant can sometimes be done cheaper by
20304// folding a power-of-two factor of the constant into the RDSVL immediate and
20305// compensating with an extra shift.
20306//
20307// We rewrite:
20308// (mul (srl (rdsvl 1), w), x)
20309// to one of:
20310// (shl (rdsvl y), z) if z > 0
20311// (srl (rdsvl y), abs(z)) if z < 0
20312// where integers y, z satisfy x = y * 2^(w + z) and y ∈ [-32, 31].
20314 SDLoc DL(Mul);
20315 EVT VT = Mul->getValueType(0);
20316 SDValue MulOp0 = Mul->getOperand(0);
20317 int ConstMultiplier =
20318 cast<ConstantSDNode>(Mul->getOperand(1))->getSExtValue();
20319 if ((MulOp0->getOpcode() != ISD::SRL) ||
20320 (MulOp0->getOperand(0).getOpcode() != AArch64ISD::RDSVL))
20321 return SDValue();
20322
20323 unsigned AbsConstValue = abs(ConstMultiplier);
20324 unsigned OperandShift =
20325 cast<ConstantSDNode>(MulOp0->getOperand(1))->getZExtValue();
20326
20327 // z ≤ ctz(|x|) - w (largest extra shift we can take while keeping y
20328 // integral)
20329 int UpperBound = llvm::countr_zero(AbsConstValue) - OperandShift;
20330
20331 // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need:
20332 // 2^(w + z) ≥ ceil(x / B) ⇒ z ≥ ceil_log2(ceil(x / B)) - w (LowerBound).
20333 unsigned B = ConstMultiplier < 0 ? 32 : 31;
20334 unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B)
20335 int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - OperandShift;
20336
20337 // No valid solution found.
20338 if (LowerBound > UpperBound)
20339 return SDValue();
20340
20341 // Any value of z in [LowerBound, UpperBound] is valid. Prefer no extra
20342 // shift if possible.
20343 int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound);
20344
20345 // y = x / 2^(w + z)
20346 int32_t RdsvlMul = (AbsConstValue >> (OperandShift + Shift)) *
20347 (ConstMultiplier < 0 ? -1 : 1);
20348 auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
20349 DAG.getSignedConstant(RdsvlMul, DL, MVT::i32));
20350
20351 if (Shift == 0)
20352 return Rdsvl;
20353 return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl,
20354 DAG.getConstant(abs(Shift), DL, MVT::i32),
20356}
20357
20358// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
20359// Same for other types with equivalent constants.
20361 EVT VT = N->getValueType(0);
20362 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
20363 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
20364 return SDValue();
20365 if (N->getOperand(0).getOpcode() != ISD::AND ||
20366 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
20367 return SDValue();
20368
20369 SDValue And = N->getOperand(0);
20370 SDValue Srl = And.getOperand(0);
20371
20372 APInt V1, V2, V3;
20373 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
20374 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
20376 return SDValue();
20377
20378 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
20379 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
20380 V3 != (HalfSize - 1))
20381 return SDValue();
20382
20383 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
20384 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
20385 VT.getVectorElementCount() * 2);
20386
20387 SDLoc DL(N);
20388 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
20389 SDValue Zero = DAG.getConstant(0, DL, In.getValueType());
20390 SDValue CM = DAG.getSetCC(DL, HalfVT, Zero, In, ISD::SETGT);
20391 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
20392}
20393
20394// Transform vector add(zext i8 to i32, zext i8 to i32)
20395// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
20396// This allows extra uses of saddl/uaddl at the lower vector widths, and less
20397// extends.
20399 EVT VT = N->getValueType(0);
20400 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
20401 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
20402 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
20403 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
20404 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
20405 N->getOperand(0).getOperand(0).getValueType() !=
20406 N->getOperand(1).getOperand(0).getValueType())
20407 return SDValue();
20408
20409 if (N->getOpcode() == ISD::MUL &&
20410 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
20411 return SDValue();
20412
20413 SDValue N0 = N->getOperand(0).getOperand(0);
20414 SDValue N1 = N->getOperand(1).getOperand(0);
20415 EVT InVT = N0.getValueType();
20416
20417 EVT S1 = InVT.getScalarType();
20418 EVT S2 = VT.getScalarType();
20419 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
20420 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
20421 SDLoc DL(N);
20422 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
20425 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
20426 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
20427 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
20428 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
20429 : (unsigned)ISD::SIGN_EXTEND,
20430 DL, VT, NewOp);
20431 }
20432 return SDValue();
20433}
20434
20437 const AArch64Subtarget *Subtarget) {
20438
20439 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
20440 return Ext;
20442 return Ext;
20443 if (SDValue Ext = performVectorExtCombine(N, DAG))
20444 return Ext;
20445 if (DCI.isBeforeLegalizeOps())
20446 return SDValue();
20447
20448 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
20449 // and in MachineCombiner pass, add+mul will be combined into madd.
20450 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
20451 SDLoc DL(N);
20452 EVT VT = N->getValueType(0);
20453 SDValue N0 = N->getOperand(0);
20454 SDValue N1 = N->getOperand(1);
20455 SDValue MulOper;
20456 unsigned AddSubOpc;
20457
20458 auto IsAddSubWith1 = [&](SDValue V) -> bool {
20459 AddSubOpc = V->getOpcode();
20460 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
20461 SDValue Opnd = V->getOperand(1);
20462 MulOper = V->getOperand(0);
20463 if (AddSubOpc == ISD::SUB)
20464 std::swap(Opnd, MulOper);
20465 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
20466 return C->isOne();
20467 }
20468 return false;
20469 };
20470
20471 if (IsAddSubWith1(N0)) {
20472 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
20473 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
20474 }
20475
20476 if (IsAddSubWith1(N1)) {
20477 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
20478 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
20479 }
20480
20481 // The below optimizations require a constant RHS.
20482 if (!isa<ConstantSDNode>(N1))
20483 return SDValue();
20484
20485 if (SDValue Ext = performMulRdsvlCombine(N, DAG))
20486 return Ext;
20487
20489 const APInt &ConstValue = C->getAPIntValue();
20490
20491 // Allow the scaling to be folded into the `cnt` instruction by preventing
20492 // the scaling to be obscured here. This makes it easier to pattern match.
20493 if (IsSVECntIntrinsic(N0) ||
20494 (N0->getOpcode() == ISD::TRUNCATE &&
20495 (IsSVECntIntrinsic(N0->getOperand(0)))))
20496 if (ConstValue.sge(1) && ConstValue.sle(16))
20497 return SDValue();
20498
20499 // Multiplication of a power of two plus/minus one can be done more
20500 // cheaply as shift+add/sub. For now, this is true unilaterally. If
20501 // future CPUs have a cheaper MADD instruction, this may need to be
20502 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
20503 // 64-bit is 5 cycles, so this is always a win.
20504 // More aggressively, some multiplications N0 * C can be lowered to
20505 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
20506 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
20507 // TODO: lower more cases.
20508
20509 // TrailingZeroes is used to test if the mul can be lowered to
20510 // shift+add+shift.
20511 unsigned TrailingZeroes = ConstValue.countr_zero();
20512 if (TrailingZeroes) {
20513 // Conservatively do not lower to shift+add+shift if the mul might be
20514 // folded into smul or umul.
20515 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
20516 isZeroExtended(N0, DAG)))
20517 return SDValue();
20518 // Conservatively do not lower to shift+add+shift if the mul might be
20519 // folded into madd or msub.
20520 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
20521 N->user_begin()->getOpcode() == ISD::SUB))
20522 return SDValue();
20523 }
20524 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
20525 // and shift+add+shift.
20526 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
20527 unsigned ShiftAmt;
20528
20529 auto Shl = [&](SDValue N0, unsigned N1) {
20530 if (!N0.getNode())
20531 return SDValue();
20532 // If shift causes overflow, ignore this combine.
20533 if (N1 >= N0.getValueSizeInBits())
20534 return SDValue();
20535 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
20536 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
20537 };
20538 auto Add = [&](SDValue N0, SDValue N1) {
20539 if (!N0.getNode() || !N1.getNode())
20540 return SDValue();
20541 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
20542 };
20543 auto Sub = [&](SDValue N0, SDValue N1) {
20544 if (!N0.getNode() || !N1.getNode())
20545 return SDValue();
20546 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
20547 };
20548 auto Negate = [&](SDValue N) {
20549 if (!N0.getNode())
20550 return SDValue();
20551 SDValue Zero = DAG.getConstant(0, DL, VT);
20552 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
20553 };
20554
20555 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
20556 // C = 45 is equal to (1+4)*(1+8), we don't decompose it into (1+2)*(16-1) as
20557 // the (2^N - 1) can't be executed via a single instruction.
20558 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
20559 unsigned BitWidth = C.getBitWidth();
20560 for (unsigned i = 1; i < BitWidth / 2; i++) {
20561 APInt Rem;
20562 APInt X(BitWidth, (1 << i) + 1);
20563 APInt::sdivrem(C, X, N, Rem);
20564 APInt NVMinus1 = N - 1;
20565 if (Rem == 0 && NVMinus1.isPowerOf2()) {
20566 M = X;
20567 return true;
20568 }
20569 }
20570 return false;
20571 };
20572
20573 // Can the const C be decomposed into (2^M + 1) * 2^N + 1), eg:
20574 // C = 11 is equal to (1+4)*2+1, we don't decompose it into (1+2)*4-1 as
20575 // the (2^N - 1) can't be executed via a single instruction.
20576 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
20577 APInt CVMinus1 = C - 1;
20578 if (CVMinus1.isNegative())
20579 return false;
20580 unsigned TrailingZeroes = CVMinus1.countr_zero();
20581 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
20582 if (SCVMinus1.isPowerOf2()) {
20583 unsigned BitWidth = SCVMinus1.getBitWidth();
20584 M = APInt(BitWidth, SCVMinus1.logBase2());
20585 N = APInt(BitWidth, TrailingZeroes);
20586 return true;
20587 }
20588 return false;
20589 };
20590
20591 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
20592 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
20593 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
20594 APInt CVMinus1 = C - 1;
20595 if (CVMinus1.isNegative())
20596 return false;
20597 unsigned TrailingZeroes = CVMinus1.countr_zero();
20598 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
20599 if (CVPlus1.isPowerOf2()) {
20600 unsigned BitWidth = CVPlus1.getBitWidth();
20601 M = APInt(BitWidth, CVPlus1.logBase2());
20602 N = APInt(BitWidth, TrailingZeroes);
20603 return true;
20604 }
20605 return false;
20606 };
20607
20608 if (ConstValue.isNonNegative()) {
20609 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
20610 // (mul x, 2^N - 1) => (sub (shl x, N), x)
20611 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
20612 // (mul x, (2^M + 1) * (2^N + 1))
20613 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
20614 // (mul x, (2^M + 1) * 2^N + 1))
20615 // => MV = add (shl x, M), x); add (shl MV, N), x)
20616 // (mul x, 1 - (1 - 2^M) * 2^N))
20617 // => MV = sub (x - (shl x, M)); sub (x - (shl MV, N))
20618 APInt SCVMinus1 = ShiftedConstValue - 1;
20619 APInt SCVPlus1 = ShiftedConstValue + 1;
20620 APInt CVPlus1 = ConstValue + 1;
20621 APInt CVM, CVN;
20622 if (SCVMinus1.isPowerOf2()) {
20623 ShiftAmt = SCVMinus1.logBase2();
20624 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
20625 } else if (CVPlus1.isPowerOf2()) {
20626 ShiftAmt = CVPlus1.