LLVM 23.0.0git
AArch64ISelLowering.cpp
Go to the documentation of this file.
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
20#include "AArch64Subtarget.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/ArrayRef.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringRef.h"
33#include "llvm/ADT/Twine.h"
61#include "llvm/IR/Attributes.h"
62#include "llvm/IR/Constants.h"
63#include "llvm/IR/DataLayout.h"
64#include "llvm/IR/DebugLoc.h"
66#include "llvm/IR/Function.h"
68#include "llvm/IR/GlobalValue.h"
69#include "llvm/IR/IRBuilder.h"
70#include "llvm/IR/Instruction.h"
73#include "llvm/IR/Intrinsics.h"
74#include "llvm/IR/IntrinsicsAArch64.h"
75#include "llvm/IR/Module.h"
77#include "llvm/IR/Type.h"
78#include "llvm/IR/Use.h"
79#include "llvm/IR/Value.h"
84#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <bitset>
96#include <cassert>
97#include <cctype>
98#include <cstdint>
99#include <cstdlib>
100#include <iterator>
101#include <limits>
102#include <optional>
103#include <tuple>
104#include <utility>
105#include <vector>
106
107using namespace llvm;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in future when both implementations will be based off MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP use ALU ports, and a data dependency will become
143// the bottleneck after this transform on high-end CPUs. So this maximum
144// leaf-node limit guards that the cmp+ccmp transform stays profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// By turning this on, we will not fall back to DAG ISel when encountering
149// scalable vector types for any instruction, even if SVE is not yet supported
150// for some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157// TODO: This option should be removed once we switch to always using PTRADD in
158// the SelectionDAG.
160 "aarch64-use-featcpa-codegen", cl::Hidden,
161 cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
162 "SelectionDAG for FEAT_CPA"),
163 cl::init(false));
164
165/// Value type used for condition codes.
166constexpr MVT CondCodeVT = MVT::i32;
167
168/// Value type used for NZCV flags.
169constexpr MVT FlagsVT = MVT::i32;
170
171static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
172 AArch64::X3, AArch64::X4, AArch64::X5,
173 AArch64::X6, AArch64::X7};
174static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
175 AArch64::Q3, AArch64::Q4, AArch64::Q5,
176 AArch64::Q6, AArch64::Q7};
177
179
181
182static inline EVT getPackedSVEVectorVT(EVT VT) {
183 switch (VT.getSimpleVT().SimpleTy) {
184 default:
185 llvm_unreachable("unexpected element type for vector");
186 case MVT::i8:
187 return MVT::nxv16i8;
188 case MVT::i16:
189 return MVT::nxv8i16;
190 case MVT::i32:
191 return MVT::nxv4i32;
192 case MVT::i64:
193 return MVT::nxv2i64;
194 case MVT::f16:
195 return MVT::nxv8f16;
196 case MVT::f32:
197 return MVT::nxv4f32;
198 case MVT::f64:
199 return MVT::nxv2f64;
200 case MVT::bf16:
201 return MVT::nxv8bf16;
202 }
203}
204
205// NOTE: Currently there's only a need to return integer vector types. If this
206// changes then just add an extra "type" parameter.
208 switch (EC.getKnownMinValue()) {
209 default:
210 llvm_unreachable("unexpected element count for vector");
211 case 16:
212 return MVT::nxv16i8;
213 case 8:
214 return MVT::nxv8i16;
215 case 4:
216 return MVT::nxv4i32;
217 case 2:
218 return MVT::nxv2i64;
219 }
220}
221
223 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
224 "Expected scalable predicate vector type!");
225 switch (VT.getVectorMinNumElements()) {
226 default:
227 llvm_unreachable("unexpected element count for vector");
228 case 2:
229 return MVT::nxv2i64;
230 case 4:
231 return MVT::nxv4i32;
232 case 8:
233 return MVT::nxv8i16;
234 case 16:
235 return MVT::nxv16i8;
236 }
237}
238
239/// Returns true if VT's elements occupy the lowest bit positions of its
240/// associated register class without any intervening space.
241///
242/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
243/// same register class, but only nxv8f16 can be treated as a packed vector.
244static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
246 "Expected legal vector type!");
247 return VT.isFixedLengthVector() ||
249}
250
251static inline bool isPackedPredicateType(EVT VT, SelectionDAG &DAG) {
253 "Expected legal type!");
254 return VT == MVT::nxv16i1;
255}
256
257/// Returns true if the conceptual representation for \p VT does not map
258/// directly to its physical register representation, meaning there are gaps
259/// between elements in the register. In practice, the vector elements will be
260/// strided by a power of two and placed starting from lane 0. For example,
261/// nxv8i1 or nxv2f32 are unpacked types.
262///
263///\pre VT is a legal type.
264static inline bool isUnpackedType(EVT VT, SelectionDAG &DAG) {
265 bool Res = !isPackedVectorType(VT, DAG) && !isPackedPredicateType(VT, DAG);
266 assert((!Res || VT.isScalableVector()) &&
267 "Unexpected fixed-size unpacked type.");
268 return Res;
269}
270
271// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
272// predicate and end with a passthru value matching the result type.
273static bool isMergePassthruOpcode(unsigned Opc) {
274 switch (Opc) {
275 default:
276 return false;
277 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
278 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
279 case AArch64ISD::REVH_MERGE_PASSTHRU:
280 case AArch64ISD::REVW_MERGE_PASSTHRU:
281 case AArch64ISD::REVD_MERGE_PASSTHRU:
282 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
283 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
284 case AArch64ISD::DUP_MERGE_PASSTHRU:
285 case AArch64ISD::ABS_MERGE_PASSTHRU:
286 case AArch64ISD::NEG_MERGE_PASSTHRU:
287 case AArch64ISD::FNEG_MERGE_PASSTHRU:
288 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
289 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
290 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
291 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
292 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
293 case AArch64ISD::FRINT_MERGE_PASSTHRU:
294 case AArch64ISD::FRINT32_MERGE_PASSTHRU:
295 case AArch64ISD::FRINT64_MERGE_PASSTHRU:
296 case AArch64ISD::FROUND_MERGE_PASSTHRU:
297 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
298 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
299 case AArch64ISD::FTRUNC32_MERGE_PASSTHRU:
300 case AArch64ISD::FTRUNC64_MERGE_PASSTHRU:
301 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
302 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
303 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
304 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
305 case AArch64ISD::FCVTX_MERGE_PASSTHRU:
306 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
307 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
308 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
309 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
310 case AArch64ISD::FABS_MERGE_PASSTHRU:
311 return true;
312 }
313}
314
315// Returns true if inactive lanes are known to be zeroed by construction.
317 switch (Op.getOpcode()) {
318 default:
319 return false;
320 // We guarantee i1 splat_vectors to zero the other lanes
323 case AArch64ISD::PTRUE:
324 case AArch64ISD::SETCC_MERGE_ZERO:
325 return true;
327 switch (Op.getConstantOperandVal(0)) {
328 default:
329 return false;
330 case Intrinsic::aarch64_sve_ptrue:
331 case Intrinsic::aarch64_sve_pnext:
332 case Intrinsic::aarch64_sve_cmpeq:
333 case Intrinsic::aarch64_sve_cmpne:
334 case Intrinsic::aarch64_sve_cmpge:
335 case Intrinsic::aarch64_sve_cmpgt:
336 case Intrinsic::aarch64_sve_cmphs:
337 case Intrinsic::aarch64_sve_cmphi:
338 case Intrinsic::aarch64_sve_cmpeq_wide:
339 case Intrinsic::aarch64_sve_cmpne_wide:
340 case Intrinsic::aarch64_sve_cmpge_wide:
341 case Intrinsic::aarch64_sve_cmpgt_wide:
342 case Intrinsic::aarch64_sve_cmplt_wide:
343 case Intrinsic::aarch64_sve_cmple_wide:
344 case Intrinsic::aarch64_sve_cmphs_wide:
345 case Intrinsic::aarch64_sve_cmphi_wide:
346 case Intrinsic::aarch64_sve_cmplo_wide:
347 case Intrinsic::aarch64_sve_cmpls_wide:
348 case Intrinsic::aarch64_sve_fcmpeq:
349 case Intrinsic::aarch64_sve_fcmpne:
350 case Intrinsic::aarch64_sve_fcmpge:
351 case Intrinsic::aarch64_sve_fcmpgt:
352 case Intrinsic::aarch64_sve_fcmpuo:
353 case Intrinsic::aarch64_sve_facgt:
354 case Intrinsic::aarch64_sve_facge:
355 case Intrinsic::aarch64_sve_whilege:
356 case Intrinsic::aarch64_sve_whilegt:
357 case Intrinsic::aarch64_sve_whilehi:
358 case Intrinsic::aarch64_sve_whilehs:
359 case Intrinsic::aarch64_sve_whilele:
360 case Intrinsic::aarch64_sve_whilelo:
361 case Intrinsic::aarch64_sve_whilels:
362 case Intrinsic::aarch64_sve_whilelt:
363 case Intrinsic::aarch64_sve_match:
364 case Intrinsic::aarch64_sve_nmatch:
365 case Intrinsic::aarch64_sve_whilege_x2:
366 case Intrinsic::aarch64_sve_whilegt_x2:
367 case Intrinsic::aarch64_sve_whilehi_x2:
368 case Intrinsic::aarch64_sve_whilehs_x2:
369 case Intrinsic::aarch64_sve_whilele_x2:
370 case Intrinsic::aarch64_sve_whilelo_x2:
371 case Intrinsic::aarch64_sve_whilels_x2:
372 case Intrinsic::aarch64_sve_whilelt_x2:
373 return true;
374 }
375 }
376}
377
378static std::tuple<SDValue, SDValue>
380 SDLoc DL(Disc);
381 SDValue AddrDisc;
382 SDValue ConstDisc;
383
384 // If this is a blend, remember the constant and address discriminators.
385 // Otherwise, it's either a constant discriminator, or a non-blended
386 // address discriminator.
387 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
388 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
389 AddrDisc = Disc->getOperand(1);
390 ConstDisc = Disc->getOperand(2);
391 } else {
392 ConstDisc = Disc;
393 }
394
395 // If the constant discriminator (either the blend RHS, or the entire
396 // discriminator value) isn't a 16-bit constant, bail out, and let the
397 // discriminator be computed separately.
398 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
399 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
400 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
401
402 // If there's no address discriminator, use NoRegister, which we'll later
403 // replace with XZR, or directly use a Z variant of the inst. when available.
404 if (!AddrDisc)
405 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
406
407 return std::make_tuple(
408 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
409 AddrDisc);
410}
411
413 const AArch64Subtarget &STI)
414 : TargetLowering(TM, STI), Subtarget(&STI) {
415 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
416 // we have to make something up. Arbitrarily, choose ZeroOrOne.
418 // When comparing vectors the result sets the different elements in the
419 // vector to all-one or all-zero.
421
422 // Set up the register classes.
423 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
424 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
425
426 if (Subtarget->hasLS64()) {
427 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
428 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
430 }
431
432 if (Subtarget->hasFPARMv8()) {
433 addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
434 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
435 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
436 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
437 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
438 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
439 }
440
441 if (Subtarget->hasNEON()) {
442 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
443 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
444
445 addDRType(MVT::v2f32);
446 addDRType(MVT::v8i8);
447 addDRType(MVT::v4i16);
448 addDRType(MVT::v2i32);
449 addDRType(MVT::v1i64);
450 addDRType(MVT::v1f64);
451 addDRType(MVT::v4f16);
452 addDRType(MVT::v4bf16);
453
454 addQRType(MVT::v4f32);
455 addQRType(MVT::v2f64);
456 addQRType(MVT::v16i8);
457 addQRType(MVT::v8i16);
458 addQRType(MVT::v4i32);
459 addQRType(MVT::v2i64);
460 addQRType(MVT::v8f16);
461 addQRType(MVT::v8bf16);
462 }
463
464 if (Subtarget->isSVEorStreamingSVEAvailable()) {
465 // Add legal sve predicate types
466 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
467 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
468 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
469 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
470 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
471
472 // Add sve predicate as counter type
473 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
474
475 // Add legal sve data types
476 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
477 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
478 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
479 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
480
481 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
482 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
483 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
484 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
485 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
486 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
487
488 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
489 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
490 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
491
492 if (Subtarget->useSVEForFixedLengthVectors()) {
495 addRegisterClass(VT, &AArch64::ZPRRegClass);
496
499 addRegisterClass(VT, &AArch64::ZPRRegClass);
500 }
501 }
502
503 // Compute derived properties from the register classes
504 computeRegisterProperties(Subtarget->getRegisterInfo());
505
506 // Provide all sorts of operation actions
534 if (Subtarget->hasFPARMv8()) {
537 }
550
552
556
559
561
562 // Custom lowering hooks are needed for XOR
563 // to fold it into CSINC/CSINV.
566
569
570 // Virtually no operation on f128 is legal, but LLVM can't expand them when
571 // there's a valid register class, so we need custom operations in most cases.
596 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
597 // aren't handled.
598
599 // Lowering for many of the conversions is actually specified by the non-f128
600 // type. The LowerXXX function will be trivial when f128 isn't involved.
625 if (Subtarget->hasFPARMv8()) {
628 }
631 if (Subtarget->hasFPARMv8()) {
634 }
637
642
643 // Variable arguments.
648
649 // Variable-sized objects.
652
653 // Lowering Funnel Shifts to EXTR
658
660
661 // Constant pool entries
663
664 // BlockAddress
666
667 // AArch64 lacks both left-rotate and popcount instructions.
673 }
674
675 // AArch64 doesn't have i32 MULH{S|U}.
678
679 // AArch64 doesn't have {U|S}MUL_LOHI.
684
685 if (Subtarget->hasCSSC()) {
689
691
695
698
703
708 } else {
712
715
718 }
719
725 }
732
733 // Custom lower Add/Sub/Mul with overflow.
746
755
764 if (Subtarget->hasFullFP16()) {
767 } else {
770 }
771
772 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
786 setOperationAction(Op, MVT::f16, Promote);
787 setOperationAction(Op, MVT::v4f16, Expand);
788 setOperationAction(Op, MVT::v8f16, Expand);
789 setOperationAction(Op, MVT::bf16, Promote);
790 setOperationAction(Op, MVT::v4bf16, Expand);
791 setOperationAction(Op, MVT::v8bf16, Expand);
792 }
793
794 // Legalize fcanonicalize to circumvent default expansion
795 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
796 if (Subtarget->hasFullFP16()) {
798 }
799
800 // fpextend from f16 or bf16 to f32 is legal
805 // fpextend from bf16 to f64 needs to be split into two fpextends
808
809 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
810 for (auto Op : {
814 ISD::FADD,
815 ISD::FSUB,
816 ISD::FMUL,
817 ISD::FDIV,
818 ISD::FMA,
851 })
852 setOperationAction(Op, ScalarVT, Promote);
853
854 for (auto Op : {ISD::FNEG, ISD::FABS})
855 setOperationAction(Op, ScalarVT, Legal);
856
857 // Round-to-integer need custom lowering for fp16, as Promote doesn't work
858 // because the result type is integer.
862 setOperationAction(Op, ScalarVT, Custom);
863
864 // promote v4f16 to v4f32 when that is known to be safe.
865 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
866 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
867 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
868 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
869 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
870 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
871 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
872 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
873 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
874 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
875 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
876 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
877 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
878 setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
879
888
889 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
890 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
891 setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
892
913 };
914
915 if (!Subtarget->hasFullFP16()) {
916 LegalizeNarrowFP(MVT::f16);
917 }
918 LegalizeNarrowFP(MVT::bf16);
921
922 // AArch64 has implementations of a lot of rounding-like FP operations.
923 // clang-format off
924 for (auto Op :
936 for (MVT Ty : {MVT::f32, MVT::f64})
938 if (Subtarget->hasFullFP16())
939 setOperationAction(Op, MVT::f16, Legal);
940 }
941 // clang-format on
942
943 // Basic strict FP operations are legal
946 for (MVT Ty : {MVT::f32, MVT::f64})
948 if (Subtarget->hasFullFP16())
949 setOperationAction(Op, MVT::f16, Legal);
950 }
951
953
959
961 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
964 } else {
967 }
970
971 // Generate outline atomics library calls only if LSE was not specified for
972 // subtarget
973 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
999 }
1000
1001 if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
1006
1011
1016
1021
1026 }
1027
1028 if (Subtarget->hasLSE128()) {
1029 // Custom lowering because i128 is not legal. Must be replaced by 2x64
1030 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
1034 }
1035
1036 // 128-bit loads and stores can be done without expanding
1037 setOperationAction(ISD::LOAD, MVT::i128, Custom);
1039
1040 // Aligned 128-bit loads and stores are single-copy atomic according to the
1041 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
1042 if (Subtarget->hasLSE2()) {
1045 }
1046
1047 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
1048 // custom lowering, as there are no un-paired non-temporal stores and
1049 // legalization will break up 256 bit inputs.
1050 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
1051 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
1052 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
1053 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1054 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1055 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1056 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1057 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1058
1059 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1060 // custom lowering, as there are no un-paired non-temporal loads legalization
1061 // will break up 256 bit inputs.
1062 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1063 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1064 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1065 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1066 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1067 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1068 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1069 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1070
1071 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1073
1074 // Issue __sincos_stret if available.
1077
1078 // Make floating-point constants legal for the large code model, so they don't
1079 // become loads from the constant pool.
1080 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1083 }
1084
1085 // AArch64 does not have floating-point extending loads, i1 sign-extending
1086 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1087 for (MVT VT : MVT::fp_valuetypes()) {
1088 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1089 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1090 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1091 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1092 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1093 }
1094 for (MVT VT : MVT::integer_valuetypes())
1095 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1096
1097 for (MVT WideVT : MVT::fp_valuetypes()) {
1098 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1099 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1100 setTruncStoreAction(WideVT, NarrowVT, Expand);
1101 }
1102 }
1103 }
1104
1105 if (Subtarget->hasFPARMv8()) {
1109 }
1110
1111 // Indexed loads and stores are supported.
1112 for (unsigned im = (unsigned)ISD::PRE_INC;
1114 setIndexedLoadAction(im, MVT::i8, Legal);
1115 setIndexedLoadAction(im, MVT::i16, Legal);
1116 setIndexedLoadAction(im, MVT::i32, Legal);
1117 setIndexedLoadAction(im, MVT::i64, Legal);
1118 setIndexedLoadAction(im, MVT::f64, Legal);
1119 setIndexedLoadAction(im, MVT::f32, Legal);
1120 setIndexedLoadAction(im, MVT::f16, Legal);
1121 setIndexedLoadAction(im, MVT::bf16, Legal);
1122 setIndexedStoreAction(im, MVT::i8, Legal);
1123 setIndexedStoreAction(im, MVT::i16, Legal);
1124 setIndexedStoreAction(im, MVT::i32, Legal);
1125 setIndexedStoreAction(im, MVT::i64, Legal);
1126 setIndexedStoreAction(im, MVT::f64, Legal);
1127 setIndexedStoreAction(im, MVT::f32, Legal);
1128 setIndexedStoreAction(im, MVT::f16, Legal);
1129 setIndexedStoreAction(im, MVT::bf16, Legal);
1130 }
1131
1132 // Trap.
1133 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1136
1137 // We combine OR nodes for ccmp operations.
1139 // Try to create BICs for vector ANDs.
1141
1142 // llvm.init.trampoline and llvm.adjust.trampoline
1145
1146 // Vector add and sub nodes may conceal a high-half opportunity.
1147 // Also, try to fold ADD into CSINC/CSINV..
1150
1153
1154 // Try and combine setcc with csel
1156
1158
1166
1168
1170
1172
1176
1179
1181
1183
1185
1187
1191
1193
1197
1198 // In case of strict alignment, avoid an excessive number of byte wide stores.
1201 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1202
1206 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1207
1210 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1211
1214 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1215
1217
1219
1220 EnableExtLdPromotion = true;
1221
1222 // Set required alignment.
1224 // Set preferred alignments.
1225
1226 // Don't align loops on Windows. The SEH unwind info generation needs to
1227 // know the exact length of functions before the alignments have been
1228 // expanded.
1229 if (!Subtarget->isTargetWindows())
1233
1234 // Only change the limit for entries in a jump table if specified by
1235 // the sub target, but not at the command line.
1236 unsigned MaxJT = STI.getMaximumJumpTableSize();
1237 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1239
1241
1243
1245 if (Subtarget->hasSME())
1247
1248 if (Subtarget->isNeonAvailable()) {
1249 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1250 // silliness like this:
1251 // clang-format off
1252 for (auto Op :
1273 setOperationAction(Op, MVT::v1f64, Expand);
1274 // clang-format on
1275
1276 for (auto Op :
1281 setOperationAction(Op, MVT::v1i64, Expand);
1282
1283 // AArch64 doesn't have a direct vector ->f32 conversion instructions for
1284 // elements smaller than i32, so promote the input to i32 first.
1285 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1286 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1287
1288 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1289 // Or, direct i32 -> f16 vector conversion. Set it so custom, so the
1290 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1293 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1295
1296 if (Subtarget->hasFullFP16()) {
1299
1308 } else {
1309 // when AArch64 doesn't have fullfp16 support, promote the input
1310 // to i32 first.
1311 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1312 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1313 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1314 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1315 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1316 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1317 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1318 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1319 }
1320
1321 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1322 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1323 // CTLS (Count Leading Sign bits) - Legal for BHS types (8/16/32-bit
1324 // elements) No hardware support for 64-bit element vectors
1325 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1326 MVT::v4i32})
1334 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1339 }
1340
1341 // Custom handling for some quad-vector types to detect MULL.
1342 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1343 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1344 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1345 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1346 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1347 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1348
1349 // Saturates
1350 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1351 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1356 }
1357
1358 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1359 MVT::v4i32}) {
1366 }
1367
1368 // Vector reductions
1369 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1370 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1371 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1376
1378 }
1379 }
1380 if (Subtarget->hasFullFP16())
1382
1383 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1384 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1393 }
1398
1400 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1401 // Likewise, narrowing and extending vector loads/stores aren't handled
1402 // directly.
1405
1406 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1409 } else {
1412 }
1415
1418
1419 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1420 setTruncStoreAction(VT, InnerVT, Expand);
1421 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1422 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1423 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1424 }
1425 }
1426
1427 for (auto Op :
1433 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1435 if (Subtarget->hasFullFP16())
1436 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1438 }
1439
1440 // LRINT and LLRINT.
1441 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1442 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1444 if (Subtarget->hasFullFP16())
1445 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1447 }
1448
1449 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1450
1455
1459
1460 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1461 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1462 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1463 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1464 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1465 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1466 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1467 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1468 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1469 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1470 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1471 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1472 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1473 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1474 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1475 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1476 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1477 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1478
1479 // ADDP custom lowering
1480 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1482 // FADDP custom lowering
1483 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1485
1486 if (Subtarget->hasDotProd()) {
1487 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1489
1490 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
1491 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
1492 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v16i8, Custom);
1493 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1494
1495 if (Subtarget->hasMatMulInt8()) {
1497 MVT::v16i8, Legal);
1499 MVT::v16i8, Custom);
1500
1502 MVT::v8i8, Legal);
1503 }
1504 }
1505
1506 setOperationAction(ISD::CLMUL, MVT::v8i8, Legal);
1507 setOperationAction(ISD::CLMUL, MVT::v16i8, Legal);
1508
1509 } else /* !isNeonAvailable */ {
1511 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1513
1514 if (VT.is128BitVector() || VT.is64BitVector()) {
1518 Subtarget->isLittleEndian() ? Legal : Expand);
1519 }
1520 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1521 setTruncStoreAction(VT, InnerVT, Expand);
1522 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1523 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1524 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1525 }
1526 }
1527 }
1528
1529 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1533 }
1534
1535 if (Subtarget->hasSME()) {
1537 }
1538
1539 // FIXME: Move lowering for more nodes here if those are common between
1540 // SVE and SME.
1541 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1542 for (auto VT :
1543 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1548 }
1549 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1552 }
1553
1554 if (Subtarget->isSVEorStreamingSVEAvailable() &&
1555 (Subtarget->hasSVE2p1() || Subtarget->hasSME2()))
1557
1558 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1560
1561 for (auto VT : {MVT::v8f16, MVT::v4f32, MVT::v2f64})
1563 }
1564
1565 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1566 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1610
1616
1625
1630
1634
1635 if (!Subtarget->isLittleEndian())
1637
1638 if (Subtarget->hasSVE2() ||
1639 (Subtarget->hasSME() && Subtarget->isStreaming()))
1640 // For SLI/SRI.
1642 }
1643
1644 // Illegal unpacked integer vector types.
1645 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1648 }
1649
1650 // Type legalize unpacked bitcasts.
1651 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1653
1654 for (auto VT :
1655 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1656 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1658
1659 // Promote predicate as counter load/stores to standard predicates.
1660 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
1661 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
1662
1663 // Predicate as counter legalization actions.
1664 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
1665 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
1666
1667 for (auto VT :
1668 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1676
1680
1681 // There are no legal MVT::nxv16f## based types.
1682 if (VT != MVT::nxv16i1) {
1687 }
1688 }
1689
1690 // NEON doesn't support masked loads/stores, but SME and SVE do.
1691 for (auto VT :
1692 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1693 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1694 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1697 }
1698
1699 // Firstly, exclude all scalable vector extending loads/truncating stores,
 2700 // including both integer and floating-point scalable vectors.
1702 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1703 setTruncStoreAction(VT, InnerVT, Expand);
1704 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1705 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1706 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1707 }
1708 }
1709
1710 // Then, selectively enable those which we directly support.
1711 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1712 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1713 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1714 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1715 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1716 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1717 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1718 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1719 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1720 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1721 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1722 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1723 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1724 }
1725
1726 // SVE supports truncating stores of 64 and 128-bit vectors
1727 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1728 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1729 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1730 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1731 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1732
1733 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1734 MVT::nxv4f32, MVT::nxv2f64}) {
1776
1798
1810 }
1811
1812 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1829 }
1830
1831 if (Subtarget->hasSVEB16B16() &&
1832 Subtarget->isNonStreamingSVEorSME2Available()) {
1833 // Note: Use SVE for bfloat16 operations when +sve-b16b16 is available.
1834 for (auto VT : {MVT::v4bf16, MVT::v8bf16, MVT::nxv2bf16, MVT::nxv4bf16,
1835 MVT::nxv8bf16}) {
1844 }
1845 }
1846
1847 for (auto Opcode :
1852 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1853 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1854 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1855 }
1856
1857 if (!Subtarget->hasSVEB16B16() ||
1858 !Subtarget->isNonStreamingSVEorSME2Available()) {
1859 for (MVT VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1860 MVT PromotedVT = VT.changeVectorElementType(MVT::f32);
1861 setOperationPromotedToType(ISD::FADD, VT, PromotedVT);
1862 setOperationPromotedToType(ISD::FMA, VT, PromotedVT);
1867 setOperationPromotedToType(ISD::FSUB, VT, PromotedVT);
1868
1869 if (VT != MVT::nxv2bf16 && Subtarget->hasBF16())
1871 else
1872 setOperationPromotedToType(ISD::FMUL, VT, PromotedVT);
1873 }
1874
1875 if (Subtarget->hasBF16() && Subtarget->isNeonAvailable())
1876 setOperationAction(ISD::FMUL, MVT::v8bf16, Custom);
1877 }
1878
1881
1882 // A number of operations like MULH and integer divides are not supported by
1883 // NEON but are available in SVE.
1884 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1885 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1890 }
1891
1892 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1893 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1894 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1895
1896 // NOTE: Currently this has to happen after computeRegisterProperties rather
1897 // than the preferred option of combining it with the addRegisterClass call.
1898 if (Subtarget->useSVEForFixedLengthVectors()) {
1901 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1902 addTypeForFixedLengthSVE(VT);
1903 }
1906 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1907 addTypeForFixedLengthSVE(VT);
1908 }
1909
1910 // 64bit results can mean a bigger than NEON input.
1911 for (auto VT : {MVT::v8i8, MVT::v4i16})
1914
1915 // 128bit results imply a bigger than NEON input.
1916 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1918 for (auto VT : {MVT::v8f16, MVT::v4f32, MVT::v8bf16})
1920
1921 // These operations are not supported on NEON but SVE can do them.
1923 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1924 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1925 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1926 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1927 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1928 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1929 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1930 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1931 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1932 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1933 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1938
1939 // Int operations with no NEON support.
1940 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1941 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1947 }
1948
1949 // Use SVE for vectors with more than 2 elements.
1950 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1952 }
1953
1955 MVT::nxv2i64);
1957 MVT::nxv2i64);
1959 MVT::nxv4i32);
1961 MVT::nxv4i32);
1963 MVT::nxv8i16);
1965 MVT::nxv8i16);
1967 MVT::nxv16i8);
1969 MVT::nxv16i8);
1970
1972
1973 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1975 }
1976
1977 // Handle partial reduction operations
1978 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1979 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1980 // Other pairs will default to 'Expand'.
1981 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1983 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
1984 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
1985
1986 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
1987
1988 if (Subtarget->hasMatMulInt8()) {
1990 MVT::nxv16i8, Legal);
1992 MVT::nxv16i8, Custom);
1993 }
1994
1995 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
1996 // Wide add types
1997 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv4i32, Legal);
1998 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
1999 setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
2000
2001 setOperationAction(ISD::CLMUL, MVT::nxv16i8, Legal);
2002 }
2003
2004 // Handle floating-point partial reduction
2005 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
2007 MVT::nxv8f16, Legal);
2008 // We can use SVE2p1 fdot to emulate the fixed-length variant.
2010 MVT::v8f16, Custom);
2011 }
2012 }
2013
2014 // Handle non-aliasing elements mask
2015 if (Subtarget->hasSVE2() ||
2016 (Subtarget->hasSME() && Subtarget->isStreaming())) {
2017 // FIXME: Support wider fixed-length types when msve-vector-bits is used.
2018 for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
2021 }
2022 for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
2025 }
2026 }
2027
2028 // Handle operations that are only available in non-streaming SVE mode.
2029 if (Subtarget->isSVEAvailable()) {
2030 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
2031 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
2032 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
2033 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
2034 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
2035 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
2036 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
2039 }
2040
2041 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
2042 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
2043 MVT::v2f32, MVT::v4f32, MVT::v2f64})
2045
2046 // We can lower types that have <vscale x {2|4}> elements to compact.
2047 for (auto VT :
2048 {MVT::nxv4i32, MVT::nxv2i64, MVT::nxv2f32, MVT::nxv4f32, MVT::nxv2f64})
2050
2051 // If we have SVE, we can use SVE logic for legal NEON vectors in the lowest
2052 // bits of the SVE register.
2053 for (auto VT : {MVT::v2i32, MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32,
2054 MVT::v2f64})
2056
2057 // Promote v4i16/f16 to v4i32/f32 as the SVE container for v4i16 is nxv8,
 2058 // which is not supported for compact (with only +sve).
2059 setOperationPromotedToType(ISD::VECTOR_COMPRESS, MVT::v4bf16, MVT::v4i16);
2060 setOperationPromotedToType(ISD::VECTOR_COMPRESS, MVT::v4f16, MVT::v4i16);
2061 setOperationPromotedToType(ISD::VECTOR_COMPRESS, MVT::v4i16, MVT::v4i32);
2062
2063 for (auto VT : {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64,
2064 MVT::nxv2f32, MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16,
2065 MVT::nxv4i32, MVT::nxv4f32}) {
2066 // Use a custom lowering for masked stores that could be a supported
2067 // compressing store. Note: These types still use the normal (Legal)
2068 // lowering for non-compressing masked stores.
2070 }
2071
2072 // Histcnt is SVE2 only
2073 if (Subtarget->hasSVE2()) {
2075 Custom);
2077 Custom);
2078
2079 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2081 // Must be lowered to SVE instructions.
2082 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
2083 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
2084 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
2085 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
2086 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
2087 setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
2088 }
2089 }
2090
2091 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
2092 // Only required for llvm.aarch64.mops.memset.tag
2094 }
2095
2097
2098 if (Subtarget->hasSVE()) {
2103 }
2104
2105 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
2106
2107 IsStrictFPEnabled = true;
2109
2110 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2111 // it, but it's just a wrapper around ldexp.
2112 if (Subtarget->isTargetWindows()) {
2114 if (isOperationExpand(Op, MVT::f32))
2115 setOperationAction(Op, MVT::f32, Promote);
2116 }
2117
2118 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
2119 // isn't legal.
2121 if (isOperationExpand(Op, MVT::f16))
2122 setOperationAction(Op, MVT::f16, Promote);
2123}
2124
2126 return static_cast<const AArch64TargetMachine &>(getTargetMachine());
2127}
2128
2129void AArch64TargetLowering::addTypeForNEON(MVT VT) {
2130 assert(VT.isVector() && "VT should be a vector type");
2131
2132 if (VT.isFloatingPoint()) {
2134 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
2135 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
2136 }
2137
2138 // Mark vector float intrinsics as expand.
2139 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
2157 }
2158
2159 // But we do support custom-lowering for FCOPYSIGN.
2160 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2161 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2162 VT == MVT::v8f16) &&
2163 Subtarget->hasFullFP16()))
2165
2180
2184 for (MVT InnerVT : MVT::all_valuetypes())
2185 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
2186
2187 // CNT supports only B element sizes, then use UADDLP to widen.
2188 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2190
2196
2197 for (unsigned Opcode :
2200 setOperationAction(Opcode, VT, Custom);
2201
2202 if (!VT.isFloatingPoint())
2204
2205 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2206 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2207 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2208 setOperationAction(Opcode, VT, Legal);
2209
2210 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2211 // NEON types.
2212 if (VT.isFloatingPoint() &&
2213 VT.getVectorElementType() != MVT::bf16 &&
2214 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2215 for (unsigned Opcode :
2221 setOperationAction(Opcode, VT, Legal);
2222
2223 // Strict fp extend and trunc are legal
2224 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2226 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2228
2229 // FIXME: We could potentially make use of the vector comparison instructions
2230 // for STRICT_FSETCC and STRICT_FSETCSS, but there's a number of
2231 // complications:
2232 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2233 // so we would need to expand when the condition code doesn't match the
2234 // kind of comparison.
2235 // * Some kinds of comparison require more than one FCMXY instruction so
2236 // would need to be expanded instead.
2237 // * The lowering of the non-strict versions involves target-specific ISD
2238 // nodes so we would likely need to add strict versions of all of them and
2239 // handle them appropriately.
2242
2243 // When little-endian we can use ordinary d and q register loads/stores for
2244 // vector types, but when big-endian we need to use structure load/store which
2245 // only allow post-index addressing.
2246 if (Subtarget->isLittleEndian()) {
2247 for (unsigned im = (unsigned)ISD::PRE_INC;
2248 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
2251 }
2252 } else {
2255 }
2256
2257 if (Subtarget->hasD128()) {
2260 }
2261
2262 if (VT.isInteger()) {
2263 // Let common code emit inverted variants of compares we do support.
2269 }
2270}
2271
2273 EVT OpVT) const {
2274 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2275 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2276 ResVT.getVectorElementType() != MVT::i1)
2277 return true;
2278
2279 // Only support illegal types if the result is scalable and min elements > 1.
2280 if (ResVT.getVectorMinNumElements() == 1 ||
2281 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2282 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2283 return true;
2284
2285 // 32 & 64 bit operands are supported. We can promote anything < 64 bits,
2286 // but anything larger should be expanded.
2287 if (OpVT.getFixedSizeInBits() > 64)
2288 return true;
2289
2290 return false;
2291}
2292
2294 if (!Subtarget->isSVEorStreamingSVEAvailable())
2295 return true;
2296
2297 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2298 // also support fixed-width predicates.
2299 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2300 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2301 VT != MVT::v4i1 && VT != MVT::v2i1;
2302}
2303
2305 unsigned SearchSize) const {
2306 // MATCH is SVE2 and only available in non-streaming mode.
2307 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2308 return true;
2309 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2310 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2311 return SearchSize != 8;
2312 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2313 return SearchSize != 8 && SearchSize != 16;
2314 return true;
2315}
2316
2317void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2318 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2319
2320 // By default everything must be expanded.
2321 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2323
2324 if (VT.isFloatingPoint()) {
2334 }
2335
2337 VT == MVT::v1f64 ? Expand : Custom;
2338
2339 // Mark integer truncating stores/extending loads as having custom lowering
2340 if (VT.isInteger()) {
2341 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2342 while (InnerVT != VT) {
2343 setTruncStoreAction(VT, InnerVT, Default);
2344 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2345 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2346 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2347 InnerVT = InnerVT.changeVectorElementType(
2348 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2349 }
2350 }
2351
2352 // Mark floating-point truncating stores/extending loads as having custom
2353 // lowering
2354 if (VT.isFloatingPoint()) {
2355 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2356 while (InnerVT != VT) {
2357 setTruncStoreAction(VT, InnerVT, Custom);
2358 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2359 InnerVT = InnerVT.changeVectorElementType(
2361 }
2362 }
2363
2364 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2365 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2366
2367 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2369 unsigned NumElts = VT.getVectorNumElements();
2370 if (VT.getVectorElementType() == MVT::i64) {
2371 setPartialReduceMLAAction(MLAOps, VT,
2372 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2373 setPartialReduceMLAAction(MLAOps, VT,
2374 MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
2375 setPartialReduceMLAAction(MLAOps, VT,
2376 MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
2377 } else if (VT.getVectorElementType() == MVT::i32) {
2378 setPartialReduceMLAAction(MLAOps, VT,
2379 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2380 setPartialReduceMLAAction(MLAOps, VT,
2381 MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
2382 } else if (VT.getVectorElementType() == MVT::i16) {
2383 setPartialReduceMLAAction(MLAOps, VT,
2384 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
2385 }
2386 if (Subtarget->hasMatMulInt8()) {
2387 if (VT.getVectorElementType() == MVT::i32)
2389 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2390 else if (VT.getVectorElementType() == MVT::i64)
2392 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2393 }
2394
2395 if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) {
2397 MVT::getVectorVT(MVT::f16, NumElts * 2), Custom);
2398 }
2399
2400 // Lower fixed length vector operations to scalable equivalents.
2407 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2445 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2446 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2448 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2467 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2494}
2495
2496void AArch64TargetLowering::addDRType(MVT VT) {
2497 addRegisterClass(VT, &AArch64::FPR64RegClass);
2498 if (Subtarget->isNeonAvailable())
2499 addTypeForNEON(VT);
2500}
2501
2502void AArch64TargetLowering::addQRType(MVT VT) {
2503 addRegisterClass(VT, &AArch64::FPR128RegClass);
2504 if (Subtarget->isNeonAvailable())
2505 addTypeForNEON(VT);
2506}
2507
2509 LLVMContext &C, EVT VT) const {
2510 if (!VT.isVector())
2511 return MVT::i32;
2512 if (VT.isScalableVector())
2513 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2515}
2516
2517// isIntImmediate - This method tests to see if the node is a constant
2518// operand. If so Imm will receive the value.
2519static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2521 Imm = C->getZExtValue();
2522 return true;
2523 }
2524 return false;
2525}
2526
2527bool isVectorizedBinOp(unsigned Opcode) {
2528 switch (Opcode) {
2529 case AArch64ISD::SQDMULH:
2530 return true;
2531 default:
2532 return false;
2533 }
2534}
2535
2536// isOpcWithIntImmediate - This method tests to see if the node is a specific
 2537 // opcode and that it has an immediate integer right operand.
2538// If so Imm will receive the value.
2539static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2540 uint64_t &Imm) {
2541 return N->getOpcode() == Opc &&
2542 isIntImmediate(N->getOperand(1).getNode(), Imm);
2543}
2544
2545static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2546 const APInt &Demanded,
2548 unsigned NewOpc) {
2549 uint64_t OldImm = Imm, NewImm, Enc;
2550 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2551
2552 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2553 // bimm64.
2554 if (Imm == 0 || Imm == Mask ||
2556 return false;
2557
2558 unsigned EltSize = Size;
2559 uint64_t DemandedBits = Demanded.getZExtValue();
2560
2561 // Clear bits that are not demanded.
2562 Imm &= DemandedBits;
2563
2564 while (true) {
2565 // The goal here is to set the non-demanded bits in a way that minimizes
2566 // the number of switching between 0 and 1. In order to achieve this goal,
2567 // we set the non-demanded bits to the value of the preceding demanded bits.
2568 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2569 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2570 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2571 // The final result is 0b11000011.
2572 uint64_t NonDemandedBits = ~DemandedBits;
2573 uint64_t InvertedImm = ~Imm & DemandedBits;
2574 uint64_t RotatedImm =
2575 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2576 NonDemandedBits;
2577 uint64_t Sum = RotatedImm + NonDemandedBits;
2578 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2579 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2580 NewImm = (Imm | Ones) & Mask;
2581
2582 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2583 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2584 // we halve the element size and continue the search.
2585 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2586 break;
2587
2588 // We cannot shrink the element size any further if it is 2-bits.
2589 if (EltSize == 2)
2590 return false;
2591
2592 EltSize /= 2;
2593 Mask >>= EltSize;
2594 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2595
2596 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2597 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2598 return false;
2599
2600 // Merge the upper and lower halves of Imm and DemandedBits.
2601 Imm |= Hi;
2602 DemandedBits |= DemandedBitsHi;
2603 }
2604
2605 ++NumOptimizedImms;
2606
2607 // Replicate the element across the register width.
2608 while (EltSize < Size) {
2609 NewImm |= NewImm << EltSize;
2610 EltSize *= 2;
2611 }
2612
2613 (void)OldImm;
2614 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2615 "demanded bits should never be altered");
2616 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2617
2618 // Create the new constant immediate node.
2619 EVT VT = Op.getValueType();
2620 SDLoc DL(Op);
2621 SDValue New;
2622
2623 // If the new constant immediate is all-zeros or all-ones, let the target
2624 // independent DAG combine optimize this node.
2625 if (NewImm == 0 || NewImm == OrigMask) {
2626 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2627 TLO.DAG.getConstant(NewImm, DL, VT));
2628 // Otherwise, create a machine node so that target independent DAG combine
2629 // doesn't undo this optimization.
2630 } else {
2632 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2633 New = SDValue(
2634 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2635 }
2636
2637 return TLO.CombineTo(Op, New);
2638}
2639
2641 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2642 TargetLoweringOpt &TLO) const {
2643 // Delay this optimization to as late as possible.
2644 if (!TLO.LegalOps)
2645 return false;
2646
2648 return false;
2649
2650 EVT VT = Op.getValueType();
2651 if (VT.isVector())
2652 return false;
2653
2654 unsigned Size = VT.getSizeInBits();
2655
2656 if (Size != 32 && Size != 64)
2657 return false;
2658
2659 // Exit early if we demand all bits.
2660 if (DemandedBits.isAllOnes())
2661 return false;
2662
2663 unsigned NewOpc;
2664 switch (Op.getOpcode()) {
2665 default:
2666 return false;
2667 case ISD::AND:
2668 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2669 break;
2670 case ISD::OR:
2671 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2672 break;
2673 case ISD::XOR:
2674 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2675 break;
2676 }
2677 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2678 if (!C)
2679 return false;
2680 uint64_t Imm = C->getZExtValue();
2681 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2682}
2683
2684/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2685/// Mask are known to be either zero or one and return them Known.
2687 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2688 const SelectionDAG &DAG, unsigned Depth) const {
2689 switch (Op.getOpcode()) {
2690 default:
2691 break;
2692 case AArch64ISD::DUP: {
2693 SDValue SrcOp = Op.getOperand(0);
2694 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2695 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2696 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2697 "Expected DUP implicit truncation");
2698 Known = Known.trunc(Op.getScalarValueSizeInBits());
2699 }
2700 break;
2701 }
2702 case AArch64ISD::CSEL: {
2703 KnownBits Known2;
2704 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2705 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2706 Known = Known.intersectWith(Known2);
2707 break;
2708 }
2709 case AArch64ISD::CSNEG:
2710 case AArch64ISD::CSINC:
2711 case AArch64ISD::CSINV: {
2712 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2713 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2714
2715 // The result is either:
2716 // CSINC: KnownOp0 or KnownOp1 + 1
2717 // CSINV: KnownOp0 or ~KnownOp1
2718 // CSNEG: KnownOp0 or KnownOp1 * -1
2719 if (Op.getOpcode() == AArch64ISD::CSINC)
2720 KnownOp1 = KnownBits::add(
2721 KnownOp1,
2722 KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
2723 else if (Op.getOpcode() == AArch64ISD::CSINV)
2724 std::swap(KnownOp1.Zero, KnownOp1.One);
2725 else if (Op.getOpcode() == AArch64ISD::CSNEG)
2726 KnownOp1 =
2728 Op.getScalarValueSizeInBits())));
2729
2730 Known = KnownOp0.intersectWith(KnownOp1);
2731 break;
2732 }
2733 case AArch64ISD::BICi: {
2734 // Compute the bit cleared value.
2735 APInt Mask =
2736 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2737 .trunc(Known.getBitWidth());
2738 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2739 Known &= KnownBits::makeConstant(Mask);
2740 break;
2741 }
2742 case AArch64ISD::VLSHR: {
2743 KnownBits Known2;
2744 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2745 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2746 Known = KnownBits::lshr(Known, Known2);
2747 break;
2748 }
2749 case AArch64ISD::VASHR: {
2750 KnownBits Known2;
2751 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2752 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2753 Known = KnownBits::ashr(Known, Known2);
2754 break;
2755 }
2756 case AArch64ISD::VSHL: {
2757 KnownBits Known2;
2758 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2759 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2760 Known = KnownBits::shl(Known, Known2);
2761 break;
2762 }
2763 case AArch64ISD::MOVI: {
2765 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2766 break;
2767 }
2768 case AArch64ISD::MOVIshift: {
2770 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)
2771 << Op->getConstantOperandVal(1)));
2772 break;
2773 }
2774 case AArch64ISD::MOVImsl: {
2775 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2777 Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
2778 break;
2779 }
2780 case AArch64ISD::MOVIedit: {
2782 Known.getBitWidth(),
2783 AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
2784 break;
2785 }
2786 case AArch64ISD::MVNIshift: {
2788 APInt(Known.getBitWidth(),
2789 ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
2790 /*isSigned*/ false, /*implicitTrunc*/ true));
2791 break;
2792 }
2793 case AArch64ISD::MVNImsl: {
2794 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2796 APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
2797 /*isSigned*/ false, /*implicitTrunc*/ true));
2798 break;
2799 }
2800 case AArch64ISD::LOADgot:
2801 case AArch64ISD::ADDlow: {
2802 if (!Subtarget->isTargetILP32())
2803 break;
2804 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2805 Known.Zero = APInt::getHighBitsSet(64, 32);
2806 break;
2807 }
2808 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2809 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2810 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2811 break;
2812 }
2814 Intrinsic::ID IntID =
2815 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2816 switch (IntID) {
2817 default: return;
2818 case Intrinsic::aarch64_ldaxr:
2819 case Intrinsic::aarch64_ldxr: {
2820 unsigned BitWidth = Known.getBitWidth();
2821 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2822 unsigned MemBits = VT.getScalarSizeInBits();
2823 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2824 return;
2825 }
2826 }
2827 break;
2828 }
2830 case ISD::INTRINSIC_VOID: {
2831 unsigned IntNo = Op.getConstantOperandVal(0);
2832 switch (IntNo) {
2833 default:
2834 break;
2835 case Intrinsic::aarch64_neon_uaddlv: {
2836 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2837 unsigned BitWidth = Known.getBitWidth();
2838 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2839 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2840 assert(BitWidth >= Bound && "Unexpected width!");
2842 Known.Zero |= Mask;
2843 }
2844 break;
2845 }
2846 case Intrinsic::aarch64_neon_umaxv:
2847 case Intrinsic::aarch64_neon_uminv: {
2848 // Figure out the datatype of the vector operand. The UMINV instruction
2849 // will zero extend the result, so we can mark as known zero all the
2850 // bits larger than the element datatype. 32-bit or larget doesn't need
2851 // this as those are legal types and will be handled by isel directly.
2852 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2853 unsigned BitWidth = Known.getBitWidth();
2854 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2855 assert(BitWidth >= 8 && "Unexpected width!");
2857 Known.Zero |= Mask;
2858 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2859 assert(BitWidth >= 16 && "Unexpected width!");
2861 Known.Zero |= Mask;
2862 }
2863 break;
2864 } break;
2865 }
2866 }
2867 }
2868}
2869
// NOTE(review): the opening line of this signature (return type + function
// name) was stripped by the extraction; from the parameter list and body this
// is the target hook ComputeNumSignBitsForTargetNode — confirm upstream.
2871 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2872 unsigned Depth) const {
2873 EVT VT = Op.getValueType();
2874 unsigned VTBits = VT.getScalarSizeInBits();
2875 unsigned Opcode = Op.getOpcode();
2876 switch (Opcode) {
2877 case AArch64ISD::FCMEQ:
2878 case AArch64ISD::FCMGE:
2879 case AArch64ISD::FCMGT:
2880 // Compares return either 0 or all-ones
2881 return VTBits;
2882 case AArch64ISD::VASHR: {
// Arithmetic shift right replicates the sign bit, so the result has at
// least (sign bits of the source + shift amount) sign bits, capped at the
// element width.
2883 unsigned Tmp =
2884 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2885 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2886 }
2887 }
2888
// Conservative default: we can always guarantee one sign bit.
2889 return 1;
2890}
2891
// Shift amounts are always represented as i64 on AArch64, regardless of the
// type being shifted. (Signature line stripped by extraction; presumably
// getScalarShiftAmountTy — confirm upstream.)
2893 EVT) const {
2894 return MVT::i64;
2895}
2896
// Returns true if a misaligned access of type VT is permitted; *Fast (if
// non-null) is set when the access is also expected to be fast.
2898 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2899 unsigned *Fast) const {
2900
2901 // Allow SVE loads/stores where the alignment >= the size of the element type,
2902 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2903 // for stores that come from IR, only require element-size alignment (even if
2904 // unaligned accesses are disabled). Without this, these will be forced to
2905 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2906 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2907 if (VT.isScalableVector()) {
2908 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2909 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2910 return true;
2911 }
2912
// With +strict-align every other misaligned access is disallowed.
2913 if (Subtarget->requiresStrictAlign())
2914 return false;
2915
2916 if (Fast) {
2917 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2918 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2919 // See comments in performSTORECombine() for more details about
2920 // these conditions.
2921
2922 // Code that uses clang vector extensions can mark that it
2923 // wants unaligned accesses to be treated as fast by
2924 // underspecifying alignment to be 1 or 2.
2925 Alignment <= 2 ||
2926
2927 // Disregard v2i64. Memcpy lowering produces those and splitting
2928 // them regresses performance on micro-benchmarks and olden/bh.
2929 VT == MVT::v2i64;
2930 }
2931 return true;
2932}
2933
2934// Same as above but handling LLTs instead.
// GlobalISel variant: identical policy to the EVT overload, minus the
// scalable-vector carve-out (LLTs here are compared against a fixed vector).
2936 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2937 unsigned *Fast) const {
2938 if (Subtarget->requiresStrictAlign())
2939 return false;
2940
2941 if (Fast) {
2942 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2943 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2944 Ty.getSizeInBytes() != 16 ||
2945 // See comments in performSTORECombine() for more details about
2946 // these conditions.
2947
2948 // Code that uses clang vector extensions can mark that it
2949 // wants unaligned accesses to be treated as fast by
2950 // underspecifying alignment to be 1 or 2.
2951 Alignment <= 2 ||
2952
2953 // Disregard v2i64. Memcpy lowering produces those and splitting
2954 // them regresses performance on micro-benchmarks and olden/bh.
2955 Ty == LLT::fixed_vector(2, 64);
2956 }
2957 return true;
2958}
2959
// Factory hook: delegates FastISel construction to the AArch64 backend.
2961 FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo,
2962 const LibcallLoweringInfo *libcallLowering) const {
2963 return AArch64::createFastISel(funcInfo, libInfo, libcallLowering);
2964}
2965
// Expands the F128CSEL pseudo into a conditional-branch diamond plus a PHI,
// since there is no f128 conditional-select instruction.
2968 MachineBasicBlock *MBB) const {
2969 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2970 // phi node:
2971
2972 // OrigBB:
2973 // [... previous instrs leading to comparison ...]
2974 // b.ne TrueBB
2975 // b EndBB
2976 // TrueBB:
2977 // ; Fallthrough
2978 // EndBB:
2979 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2980
2981 MachineFunction *MF = MBB->getParent();
2982 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2983 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2984 DebugLoc DL = MI.getDebugLoc();
2985 MachineFunction::iterator It = ++MBB->getIterator();
2986
2987 Register DestReg = MI.getOperand(0).getReg();
2988 Register IfTrueReg = MI.getOperand(1).getReg();
2989 Register IfFalseReg = MI.getOperand(2).getReg();
2990 unsigned CondCode = MI.getOperand(3).getImm();
2991 bool NZCVKilled = MI.getOperand(4).isKill();
2992
2993 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2994 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2995 MF->insert(It, TrueBB);
2996 MF->insert(It, EndBB);
2997
2998 // Transfer rest of current basic-block to EndBB
2999 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
3000 MBB->end());
// NOTE(review): a line was stripped here by the extraction (original line
// 3001) — presumably the successor-transfer call for EndBB; confirm upstream.
3002
3003 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
3004 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
3005 MBB->addSuccessor(TrueBB);
3006 MBB->addSuccessor(EndBB);
3007
3008 // TrueBB falls through to the end.
3009 TrueBB->addSuccessor(EndBB);
3010
// If NZCV is still live past the pseudo, it must be marked live-in on the
// new blocks so the verifier accepts the expansion.
3011 if (!NZCVKilled) {
3012 TrueBB->addLiveIn(AArch64::NZCV);
3013 EndBB->addLiveIn(AArch64::NZCV);
3014 }
3015
3016 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
3017 .addReg(IfTrueReg)
3018 .addMBB(TrueBB)
3019 .addReg(IfFalseReg)
3020 .addMBB(MBB);
3021
3022 MI.eraseFromParent();
3023 return EndBB;
3024}
3025
3033
// Expands the dynamic probed-stack-allocation pseudo by delegating to
// AArch64InstrInfo::probedStackAlloc. (Signature line stripped by the
// extraction; dispatched from PROBED_STACKALLOC_DYN below.)
3036 MachineBasicBlock *MBB) const {
3037 MachineFunction &MF = *MBB->getParent();
3038 MachineBasicBlock::iterator MBBI = MI.getIterator();
3039 const AArch64InstrInfo &TII =
3040 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
3041 Register TargetReg = MI.getOperand(0).getReg();
// NOTE(review): the line defining NextInst (original 3042) was stripped;
// it presumably captures the iterator returned by probedStackAlloc.
3043 TII.probedStackAlloc(MBBI, TargetReg, false);
3044
3045 MI.eraseFromParent();
3046 return NextInst->getParent();
3047}
3048
// Expands CHECK_MATCHING_VL_PSEUDO: computes RDVL - ADDSVL-adjusted value and
// traps (BRK #1) if the streaming and non-streaming vector lengths differ.
3051 MachineBasicBlock *MBB) const {
3052 MachineFunction *MF = MBB->getParent();
// NOTE(review): the MRI definition line (original 3053) was stripped by the
// extraction; MRI below is presumably MF->getRegInfo().
3054
3055 const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass;
3056 const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass;
3057
3058 Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR);
3059 Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src
3060 Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR);
3061 Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst
3062
3063 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3064 DebugLoc DL = MI.getDebugLoc();
3065
3066 // RDVL requires GPR64, ADDSVL requires GPR64sp
3067 // We need to insert COPY instructions, these will later be removed by the
3068 // RegisterCoalescer
3069 BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1);
3070 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp)
3071 .addReg(RegVL_GPR);
3072
// RegSVL = VL + (-1 * SVL): zero exactly when the vector lengths match.
3073 BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp)
3074 .addReg(RegVL_GPRsp)
3075 .addImm(-1);
3076 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR)
3077 .addReg(RegSVL_GPRsp);
3078
3079 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
3080 MachineFunction::iterator It = ++MBB->getIterator();
3081 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
3082 MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
3083 MF->insert(It, TrapBB);
3084 MF->insert(It, PassBB);
3085
3086 // Continue if vector lengths match
3087 BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
3088 .addReg(RegSVL_GPR)
3089 .addMBB(PassBB);
3090
3091 // Transfer rest of current BB to PassBB
3092 PassBB->splice(PassBB->begin(), MBB,
3093 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
// NOTE(review): a line was stripped here (original 3094) — presumably the
// successor-transfer call for PassBB; confirm upstream.
3095
3096 // Trap if vector lengths mismatch
3097 BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
3098
3099 MBB->addSuccessor(TrapBB);
3100 MBB->addSuccessor(PassBB);
3101
3102 MI.eraseFromParent();
3103 return PassBB;
3104}
3105
// Lowers an SME tile-load pseudo to the real instruction Opc, resolving the
// tile operand: BaseReg plus the pseudo's constant tile index selects the
// concrete ZA tile register.
3107AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
3109 MachineBasicBlock *BB) const {
3110 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3111 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3112
3113 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3114 MIB.add(MI.getOperand(1)); // slice index register
3115 MIB.add(MI.getOperand(2)); // slice index offset
3116 MIB.add(MI.getOperand(3)); // pg
3117 MIB.add(MI.getOperand(4)); // base
3118 MIB.add(MI.getOperand(5)); // offset
3119
3120 MI.eraseFromParent(); // The pseudo is gone now.
3121 return BB;
3122}
3123
// Lowers the LDR_ZA_PSEUDO fill to LDR_ZA. The immediate offset is
// intentionally the same operand as the vector-select offset (operand 1),
// reused for both positions of the real instruction.
3126 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
// NOTE(review): the MIB declaration line (original 3127) was stripped by the
// extraction.
3128 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3129
3130 MIB.addReg(AArch64::ZA, RegState::Define);
3131 MIB.add(MI.getOperand(0)); // Vector select register
3132 MIB.add(MI.getOperand(1)); // Vector select offset
3133 MIB.add(MI.getOperand(2)); // Base
3134 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3135
3136 MI.eraseFromParent(); // The pseudo is gone now.
3137 return BB;
3138}
3139
// Lowers a ZT0 pseudo to the real instruction Opcode, copying all operands
// through. Op0IsDef controls whether operand 0 is a definition (e.g. LDR_TX
// defines ZT0 while STR_TX only reads it).
3142 unsigned Opcode,
3143 bool Op0IsDef) const {
3144 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
// NOTE(review): the MIB declaration line (original 3145) was stripped by the
// extraction.
3146
3147 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3148 .addReg(MI.getOperand(0).getReg(), getDefRegState(Op0IsDef));
3149 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3150 MIB.add(MI.getOperand(I));
3151
3152 MI.eraseFromParent(); // The pseudo is gone now.
3153 return BB;
3154}
3155
// Lowers an SME ZA pseudo to the real instruction Opc, materialising the ZA
// tile (or whole-ZA) operands. BaseReg == AArch64::ZA means the instruction
// operates on all of ZA rather than an individual tile.
3157AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3159 MachineBasicBlock *BB) const {
3160 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3161 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3162 unsigned StartIdx = 0;
3163
3164 bool HasTile = BaseReg != AArch64::ZA;
3165 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3166 if (HasZPROut) {
3167 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3168 ++StartIdx;
3169 }
3170 if (HasTile) {
// The tile is both written and read, so it is added twice: once as a def,
// once as a use.
3171 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3172 RegState::Define); // Output ZA Tile
3173 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3174 StartIdx++;
3175 } else {
3176 // Avoids all instructions with mnemonic za.<sz>[Reg, Imm,
3177 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3178 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3179 ++StartIdx;
3180 }
3181 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3182 }
// Copy the remaining pseudo operands through unchanged.
3183 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3184 MIB.add(MI.getOperand(I));
3185
3186 MI.eraseFromParent(); // The pseudo is gone now.
3187 return BB;
3188}
3189
// Lowers ZERO_M_PSEUDO to ZERO_M, adding an implicit def for every ZA
// double-word tile selected by the 8-bit mask so liveness is modelled.
3192 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
// NOTE(review): the MIB declaration line (original 3193) was stripped by the
// extraction.
3194 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3195 MIB.add(MI.getOperand(0)); // Mask
3196
3197 unsigned Mask = MI.getOperand(0).getImm();
3198 for (unsigned I = 0; I < 8; I++) {
3199 if (Mask & (1 << I))
3200 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3201 }
3202
3203 MI.eraseFromParent(); // The pseudo is gone now.
3204 return BB;
3205}
3206
// Initialises the TPIDR2 lazy-save block: stores the save-buffer pointer and
// num_za_save_slices into the TPIDR2 stack object with a single STP. If the
// object is unused, the stack slot is removed instead.
3209 MachineBasicBlock *BB) const {
3210 MachineFunction *MF = BB->getParent();
3211 MachineFrameInfo &MFI = MF->getFrameInfo();
// NOTE(review): the FuncInfo definition line (original 3212) was stripped by
// the extraction; presumably MF->getInfo<AArch64FunctionInfo>().
3213 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3214 if (TPIDR2.Uses > 0) {
3215 // Note: This case just needs to do `SVL << 48`. It is not implemented as we
3216 // generally don't support big-endian SVE/SME.
3217 if (!Subtarget->isLittleEndian())
// NOTE(review): the report_fatal_error/reportFatalUsageError call line
// (original 3218) was stripped; only its message argument remains.
3219 "TPIDR2 block initialization is not supported on big-endian targets");
3220
3221 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3222 // Store buffer pointer and num_za_save_slices.
3223 // Bytes 10-15 are implicitly zeroed.
3224 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
3225 .addReg(MI.getOperand(0).getReg())
3226 .addReg(MI.getOperand(1).getReg())
3227 .addFrameIndex(TPIDR2.FrameIndex)
3228 .addImm(0);
3229 } else
3230 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3231
3232 BB->remove_instr(&MI);
3233 return BB;
3234}
3235
// Allocates the ZA lazy-save buffer on the stack: SP -= Size * Size via
// MSUBXrrr (normally SVL * SVL bytes), publishing the new SP as the buffer
// pointer. Skipped entirely when the TPIDR2 object is unused.
3238 MachineBasicBlock *BB) const {
3239 MachineFunction *MF = BB->getParent();
3240 MachineFrameInfo &MFI = MF->getFrameInfo();
// NOTE(review): the FuncInfo definition line (original 3241) was stripped by
// the extraction; presumably MF->getInfo<AArch64FunctionInfo>().
3242 // TODO This function grows the stack with a subtraction, which doesn't work
3243 // on Windows. Some refactoring to share the functionality in
3244 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3245 // supports SME
// NOTE(review): the assert/error-report line (original 3246) was stripped;
// only its message argument remains.
3247 "Lazy ZA save is not yet supported on Windows");
3248
3249 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3250
3251 if (TPIDR2.Uses > 0) {
3252 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
// NOTE(review): the MRI definition line (original 3253) was stripped.
3254
3255 // The SUBXrs below won't always be emitted in a form that accepts SP
3256 // directly
3257 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3258 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3259 .addReg(AArch64::SP);
3260
3261 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3262 auto Size = MI.getOperand(1).getReg();
3263 auto Dest = MI.getOperand(0).getReg();
// Dest = SP - Size * Size (MSUB computes Ra - Rn * Rm).
3264 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3265 .addReg(Size)
3266 .addReg(Size)
3267 .addReg(SP);
3268 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3269 AArch64::SP)
3270 .addReg(Dest);
3271
3272 // We have just allocated a variable sized object, tell this to PEI.
3273 MFI.CreateVariableSizedObject(Align(16), nullptr);
3274 }
3275
3276 BB->remove_instr(&MI);
3277 return BB;
3278}
3279
3280// TODO: Find a way to merge this with EmitAllocateZABuffer.
// Allocates the SME state save buffer (SP -= Size) when the buffer is used;
// otherwise the result register is defined as IMPLICIT_DEF.
3283 MachineBasicBlock *BB) const {
3284 MachineFunction *MF = BB->getParent();
3285 MachineFrameInfo &MFI = MF->getFrameInfo();
// NOTE(review): two lines were stripped here (original 3286-3287) —
// presumably the FuncInfo definition and the assert/error-report call whose
// message argument follows.
3288 "Lazy ZA save is not yet supported on Windows");
3289
3290 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3291 if (FuncInfo->isSMESaveBufferUsed()) {
3292 // Allocate a buffer object of the size given by MI.getOperand(1).
3293 auto Size = MI.getOperand(1).getReg();
3294 auto Dest = MI.getOperand(0).getReg();
3295 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3296 .addReg(AArch64::SP)
3297 .addReg(Size)
// NOTE(review): the extend-kind immediate operand line (original 3298) was
// stripped by the extraction.
3299 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3300 .addReg(AArch64::SP);
3301
3302 // We have just allocated a variable sized object, tell this to PEI.
3303 MFI.CreateVariableSizedObject(Align(16), nullptr);
3304 } else
3305 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3306 MI.getOperand(0).getReg());
3307
3308 BB->remove_instr(&MI);
3309 return BB;
3310}
3311
// Materialises the SME save-buffer size: calls __arm_sme_state_size (result
// in X0) when the buffer is used, otherwise copies XZR (size 0).
3314 MachineBasicBlock *BB) const {
3315 // If the buffer is used, emit a call to __arm_sme_state_size()
3316 MachineFunction *MF = BB->getParent();
// NOTE(review): the FuncInfo definition line (original 3317) was stripped by
// the extraction.
3318 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3319 if (FuncInfo->isSMESaveBufferUsed()) {
3320 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
3321 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3322 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
// NOTE(review): the callee-symbol operand line (original 3323) was stripped.
3324 .addReg(AArch64::X0, RegState::ImplicitDefine)
3325 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3326 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3327 MI.getOperand(0).getReg())
3328 .addReg(AArch64::X0);
3329 } else
3330 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3331 MI.getOperand(0).getReg())
3332 .addReg(AArch64::XZR);
3333 BB->remove_instr(&MI);
3334 return BB;
3335}
3336
// Reads the streaming-mode state at function entry: via MRS SVCR when SME is
// available, otherwise via the __arm_sme_state libcall (result in X0). If the
// result is unused, nothing is emitted and the pseudo is simply erased.
3339 MachineBasicBlock *BB) const {
3340 MachineFunction *MF = BB->getParent();
3341 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3342 const DebugLoc &DL = MI.getDebugLoc();
3343 Register ResultReg = MI.getOperand(0).getReg();
3344 if (MF->getRegInfo().use_empty(ResultReg)) {
3345 // Nothing to do. Pseudo erased below.
3346 } else if (Subtarget->hasSME()) {
3347 BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg)
3348 .addImm(AArch64SysReg::SVCR)
3349 .addReg(AArch64::VG, RegState::Implicit);
3350 } else {
3351 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
3352 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3353 BuildMI(*BB, MI, DL, TII->get(AArch64::BL))
// NOTE(review): the callee-symbol operand line (original 3354) was stripped.
3355 .addReg(AArch64::X0, RegState::ImplicitDefine)
3356 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3357 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg)
3358 .addReg(AArch64::X0);
3359 }
3360 MI.eraseFromParent();
3361 return BB;
3362}
3363
3364// Helper function to find the instruction that defined a virtual register.
3365// If unable to find such instruction, returns nullptr.
// Walks backwards through COPY and SUBREG_TO_REG chains; stops at the first
// "real" defining instruction, or at a COPY from a physical register.
3367 Register Reg) {
3368 while (Reg.isVirtual()) {
3369 MachineInstr *DefMI = MRI.getVRegDef(Reg);
3370 assert(DefMI && "Virtual register definition not found");
3371 unsigned Opcode = DefMI->getOpcode();
3372
3373 if (Opcode == AArch64::COPY) {
3374 Reg = DefMI->getOperand(1).getReg();
3375 // Vreg is defined by copying from physreg.
3376 if (Reg.isPhysical())
3377 return DefMI;
3378 continue;
3379 }
3380 if (Opcode == AArch64::SUBREG_TO_REG) {
// Operand 2 of SUBREG_TO_REG is the inserted source register.
3381 Reg = DefMI->getOperand(2).getReg();
3382 continue;
3383 }
3384
3385 return DefMI;
3386 }
// Reg was (or became) physical without a visible def.
3387 return nullptr;
3388}
3389
// Folds a pointer-authentication discriminator into its (IntDisc, AddrDisc)
// components when the address-discriminator vreg is really a blend
// (MOVK #imm, lsl #48) or a small immediate constant, and normalises the
// AddrDisc operand to the register class required by MI.
3392 MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
3393 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3394 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3395 const DebugLoc &DL = MI.getDebugLoc();
3396
3397 Register AddrDisc = AddrDiscOp.getReg();
3398 int64_t IntDisc = IntDiscOp.getImm();
3399 assert(IntDisc == 0 && "Blend components are already expanded");
3400
3401 const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
3402 if (DiscMI) {
3403 switch (DiscMI->getOpcode()) {
3404 case AArch64::MOVKXi:
3405 // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
3406 // #imm should be an immediate and not a global symbol, for example.
3407 if (DiscMI->getOperand(2).isImm() &&
3408 DiscMI->getOperand(3).getImm() == 48) {
3409 AddrDisc = DiscMI->getOperand(1).getReg();
3410 IntDisc = DiscMI->getOperand(2).getImm();
3411 }
3412 break;
3413 case AArch64::MOVi32imm:
3414 case AArch64::MOVi64imm:
3415 // Small immediate integer constant passed via VReg.
3416 if (DiscMI->getOperand(1).isImm() &&
3417 isUInt<16>(DiscMI->getOperand(1).getImm())) {
3418 AddrDisc = AArch64::NoRegister;
3419 IntDisc = DiscMI->getOperand(1).getImm();
3420 }
3421 break;
3422 }
3423 }
3424
3425 // For uniformity, always use NoRegister, as XZR is not necessarily contained
3426 // in the requested register class.
3427 if (AddrDisc == AArch64::XZR)
3428 AddrDisc = AArch64::NoRegister;
3429
3430 // Make sure AddrDisc operand respects the register class imposed by MI.
3431 if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
3432 Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
3433 BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
3434 AddrDisc = TmpReg;
3435 }
3436
// Write the (possibly simplified) components back into MI's operands.
3437 AddrDiscOp.setReg(AddrDisc);
3438 IntDiscOp.setImm(IntDisc);
3439}
3440
// Central dispatcher for pseudos marked usesCustomInserter: routes each
// pseudo to its dedicated Emit* expansion helper. SME pseudos are handled
// first via the SME pseudo map and their matrix-type TSFlags.
3442 MachineInstr &MI, MachineBasicBlock *BB) const {
3443
3444 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3445 if (SMEOrigInstr != -1) {
3446 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3447 uint64_t SMEMatrixType =
3448 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
// NOTE(review): the case labels of this switch (original lines 3450, 3452,
// 3454, 3456, 3458, 3460 — presumably the AArch64::SMEMatrix* enumerators)
// were stripped by the extraction; only the return statements remain. The
// base registers (ZA, ZAB0, ZAH0, ZAS0, ZAD0, ZAQ0) identify the intended
// tile element sizes.
3449 switch (SMEMatrixType) {
3451 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3453 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3455 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3457 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3459 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3461 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3462 }
3463 }
3464
3465 switch (MI.getOpcode()) {
3466 default:
3467#ifndef NDEBUG
3468 MI.dump();
3469#endif
3470 llvm_unreachable("Unexpected instruction for custom inserter!");
3471 case AArch64::InitTPIDR2Obj:
3472 return EmitInitTPIDR2Object(MI, BB);
3473 case AArch64::AllocateZABuffer:
3474 return EmitAllocateZABuffer(MI, BB);
3475 case AArch64::AllocateSMESaveBuffer:
3476 return EmitAllocateSMESaveBuffer(MI, BB);
3477 case AArch64::GetSMESaveSize:
3478 return EmitGetSMESaveSize(MI, BB);
3479 case AArch64::EntryPStateSM:
3480 return EmitEntryPStateSM(MI, BB);
3481 case AArch64::F128CSEL:
3482 return EmitF128CSEL(MI, BB);
3483 case TargetOpcode::STATEPOINT:
3484 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3485 // while bl call instruction (where statepoint will be lowered at the end)
3486 // has implicit def. This def is early-clobber as it will be set at
3487 // the moment of the call and earlier than any use is read.
3488 // Add this implicit dead def here as a workaround.
3489 MI.addOperand(*MI.getMF(),
// NOTE(review): the MachineOperand::CreateReg( line (original 3490) was
// stripped by the extraction.
3491 AArch64::LR, /*isDef*/ true,
3492 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3493 /*isUndef*/ false, /*isEarlyClobber*/ true));
3494 [[fallthrough]];
3495 case TargetOpcode::STACKMAP:
3496 case TargetOpcode::PATCHPOINT:
3497 return emitPatchPoint(MI, BB);
3498
3499 case TargetOpcode::PATCHABLE_EVENT_CALL:
3500 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3501 return BB;
3502
3503 case AArch64::CATCHRET:
3504 return EmitLoweredCatchRet(MI, BB);
3505
3506 case AArch64::PROBED_STACKALLOC_DYN:
3507 return EmitDynamicProbedAlloc(MI, BB);
3508
3509 case AArch64::CHECK_MATCHING_VL_PSEUDO:
3510 return EmitCheckMatchingVL(MI, BB);
3511
3512 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3513 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3514 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3515 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3516 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3517 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3518 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3519 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3520 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3521 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3522 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3523 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3524 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3525 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3526 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3527 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3528 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3529 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3530 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3531 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3532 case AArch64::LDR_ZA_PSEUDO:
3533 return EmitFill(MI, BB);
3534 case AArch64::LDR_TX_PSEUDO:
3535 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3536 case AArch64::STR_TX_PSEUDO:
3537 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3538 case AArch64::ZERO_M_PSEUDO:
3539 return EmitZero(MI, BB);
3540 case AArch64::ZERO_T_PSEUDO:
3541 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3542 case AArch64::MOVT_TIZ_PSEUDO:
3543 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3544
3545 case AArch64::PAC:
// Operands 3/4 of PAC are the integer and address discriminator components.
3546 fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
3547 &AArch64::GPR64noipRegClass);
3548 return BB;
3549 }
3550}
3551
3552//===----------------------------------------------------------------------===//
3553// AArch64 Lowering private implementation.
3554//===----------------------------------------------------------------------===//
3555
3556//===----------------------------------------------------------------------===//
3557// Lowering Code
3558//===----------------------------------------------------------------------===//
3559
3560// Forward declarations of SVE fixed length lowering helpers
3565 SelectionDAG &DAG);
3568 EVT VT);
3569
3570/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3571static bool isZerosVector(const SDNode *N) {
3572 // Look through a bit convert.
3573 while (N->getOpcode() == ISD::BITCAST)
3574 N = N->getOperand(0).getNode();
3575
// NOTE(review): the condition line (original 3576) was stripped by the
// extraction — presumably a check for an all-zeros constant splat.
3577 return true;
3578
// Otherwise the only remaining zero form is a DUP of a zero scalar.
3579 if (N->getOpcode() != AArch64ISD::DUP)
3580 return false;
3581
3582 auto Opnd0 = N->getOperand(0);
3583 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3584}
3585
3586/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3587/// CC
// NOTE(review): the signature's first line (original 3588) was stripped by
// the extraction; the trailing parameter with a default RHS remains below.
3589 SDValue RHS = {}) {
3590 switch (CC) {
3591 default:
3592 llvm_unreachable("Unknown condition code!");
3593 case ISD::SETNE:
3594 return AArch64CC::NE;
3595 case ISD::SETEQ:
3596 return AArch64CC::EQ;
3597 case ISD::SETGT:
3598 return AArch64CC::GT;
// NOTE(review): the return statements for SETGE and SETLT (original lines
// 3600 and 3602) were stripped — presumably returning GE and LT.
3599 case ISD::SETGE:
3601 case ISD::SETLT:
3603 case ISD::SETLE:
3604 return AArch64CC::LE;
3605 case ISD::SETUGT:
3606 return AArch64CC::HI;
3607 case ISD::SETUGE:
3608 return AArch64CC::HS;
3609 case ISD::SETULT:
3610 return AArch64CC::LO;
3611 case ISD::SETULE:
3612 return AArch64CC::LS;
3613 }
3614}
3615
3616/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
// Some FP conditions need two AArch64 conditions OR'ed together; CondCode2 is
// AL (always, i.e. unused) when a single condition suffices.
3618 AArch64CC::CondCode &CondCode,
3619 AArch64CC::CondCode &CondCode2) {
3620 CondCode2 = AArch64CC::AL;
3621 switch (CC) {
3622 default:
3623 llvm_unreachable("Unknown FP condition!");
3624 case ISD::SETEQ:
3625 case ISD::SETOEQ:
3626 CondCode = AArch64CC::EQ;
3627 break;
3628 case ISD::SETGT:
3629 case ISD::SETOGT:
3630 CondCode = AArch64CC::GT;
3631 break;
3632 case ISD::SETGE:
3633 case ISD::SETOGE:
3634 CondCode = AArch64CC::GE;
3635 break;
3636 case ISD::SETOLT:
3637 CondCode = AArch64CC::MI;
3638 break;
3639 case ISD::SETOLE:
3640 CondCode = AArch64CC::LS;
3641 break;
3642 case ISD::SETONE:
// ONE = OLT || OGT, expressed as MI || GT.
3643 CondCode = AArch64CC::MI;
3644 CondCode2 = AArch64CC::GT;
3645 break;
3646 case ISD::SETO:
3647 CondCode = AArch64CC::VC;
3648 break;
3649 case ISD::SETUO:
3650 CondCode = AArch64CC::VS;
3651 break;
3652 case ISD::SETUEQ:
// UEQ = OEQ || unordered, expressed as EQ || VS.
3653 CondCode = AArch64CC::EQ;
3654 CondCode2 = AArch64CC::VS;
3655 break;
3656 case ISD::SETUGT:
3657 CondCode = AArch64CC::HI;
3658 break;
3659 case ISD::SETUGE:
3660 CondCode = AArch64CC::PL;
3661 break;
3662 case ISD::SETLT:
3663 case ISD::SETULT:
3664 CondCode = AArch64CC::LT;
3665 break;
3666 case ISD::SETLE:
3667 case ISD::SETULE:
3668 CondCode = AArch64CC::LE;
3669 break;
3670 case ISD::SETNE:
3671 case ISD::SETUNE:
3672 CondCode = AArch64CC::NE;
3673 break;
3674 }
3675}
3676
3677/// Convert a DAG fp condition code to an AArch64 CC.
3678/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3679/// should be AND'ed instead of OR'ed.
// NOTE(review): the signature's first line (original 3680) was stripped by
// the extraction; presumably changeFPCCToANDAArch64CC — confirm upstream.
3681 AArch64CC::CondCode &CondCode,
3682 AArch64CC::CondCode &CondCode2) {
3683 CondCode2 = AArch64CC::AL;
3684 switch (CC) {
3685 default:
// All single-condition cases coincide with the OR'ed form; assert that the
// delegate really produced only one condition.
3686 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3687 assert(CondCode2 == AArch64CC::AL);
3688 break;
3689 case ISD::SETONE:
3690 // (a one b)
3691 // == ((a olt b) || (a ogt b))
3692 // == ((a ord b) && (a une b))
3693 CondCode = AArch64CC::VC;
3694 CondCode2 = AArch64CC::NE;
3695 break;
3696 case ISD::SETUEQ:
3697 // (a ueq b)
3698 // == ((a uno b) || (a oeq b))
3699 // == ((a ule b) && (a uge b))
3700 CondCode = AArch64CC::PL;
3701 CondCode2 = AArch64CC::LE;
3702 break;
3703 }
3704}
3705
3706/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3707/// CC usable with the vector instructions. Fewer operations are available
3708/// without a real NZCV register, so we have to use less efficient combinations
3709/// to get the same effect.
// On success, the caller must invert the final mask when Invert is set.
3711 AArch64CC::CondCode &CondCode,
3712 AArch64CC::CondCode &CondCode2,
3713 bool &Invert) {
3714 Invert = false;
3715 switch (CC) {
3716 default:
3717 // Mostly the scalar mappings work fine.
3718 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3719 break;
3720 case ISD::SETUO:
3721 Invert = true;
3722 [[fallthrough]];
3723 case ISD::SETO:
// Ordered = (a olt b) || (a oge b), i.e. MI || GE on the compare results.
3724 CondCode = AArch64CC::MI;
3725 CondCode2 = AArch64CC::GE;
3726 break;
3727 case ISD::SETUEQ:
3728 case ISD::SETULT:
3729 case ISD::SETULE:
3730 case ISD::SETUGT:
3731 case ISD::SETUGE:
3732 // All of the compare-mask comparisons are ordered, but we can switch
3733 // between the two by a double inversion. E.g. ULE == !OGT.
3734 Invert = true;
3735 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3736 CondCode, CondCode2);
3737 break;
3738 }
3739}
3740
3741/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
// Wraps an AArch64CC::CondCode as a constant node of CondCodeVT. (Signature
// line stripped by the extraction.)
3742 // TODO: Should be TargetConstant (need to s/imm/timm in patterns).
3743 return DAG.getConstant(CC, SDLoc(), CondCodeVT);
3744}
3746
// Returns true if C can be encoded as an arithmetic immediate: a 12-bit
// value, optionally shifted left by 12. (Signature line stripped by the
// extraction.)
3748 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3749 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3750 LLVM_DEBUG(dbgs() << "Is imm " << C
3751 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3752 return IsLegal;
3753}
3754
// Returns true if C is usable as a compare immediate; negative values are
// handled by negating into an ADDS/SUBS form. (Signature line stripped by
// the extraction.)
3756 // Works for negative immediates too, as it can be written as an ADDS
3757 // instruction with a negated immediate.
3758 return isLegalArithImmed(C.abs().getZExtValue());
3759}
3760
// Counts how many instructions a MOV-immediate expansion of C needs.
// (Signature line and the SmallVector declaration — original lines 3761 and
// 3763 — were stripped by the extraction.)
3762 uint64_t Imm = C.getZExtValue();
3764 AArch64_IMM::expandMOVImm(Imm, 32, Insn);
3765 return Insn.size();
3766}
3767
// Determines whether (sub 0, x) can safely become the RHS of a CMN for a
// signed comparison: safe when the subtraction cannot wrap, i.e. x is
// provably not INT_MIN. (Signature line stripped by the extraction.)
3769 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3770 if (Op->getFlags().hasNoSignedWrap())
3771 return true;
3772
3773 // We can still figure out if the second operand is safe to use
3774 // in a CMN instruction by checking if it is known to be not the minimum
3775 // signed value. If it is not, then we can safely use CMN.
3776 // Note: We can eventually remove this check and simply rely on
3777 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3778 // consistently sets them appropriately when making said nodes.
3779
3780 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
3781 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3782}
3783
3784// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
3785// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3786// can be set differently by this operation. It comes down to whether
3787// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3788// everything is fine. If not then the optimization is wrong. Thus general
3789// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3790//
3791// So, finally, the only LLVM-native comparisons that don't mention C or V
3792// are the ones that aren't unsigned comparisons. They're the only ones we can
3793// safely use CMN for in the absence of information about op2.
// (Signature line stripped by the extraction; takes Op, the setcc condition
// CC, and the DAG.)
3795 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3796 (isIntEqualitySetCC(CC) ||
3797 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3798 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
3799}
3800
// Emits a strict (chained) FP comparison, promoting f16-without-FullFP16 and
// bf16 operands to f32 first. IsSignaling selects FCMPE over FCMP.
// (Signature's first line stripped by the extraction.)
3802 SelectionDAG &DAG, SDValue Chain,
3803 bool IsSignaling) {
3804 EVT VT = LHS.getValueType();
3805 assert(VT != MVT::f128);
3806
3807 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3808
3809 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
// The extends are chained through each other so the strict ordering of the
// FP operations is preserved.
3810 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3811 {Chain, LHS});
3812 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3813 {LHS.getValue(1), RHS});
3814 Chain = RHS.getValue(1);
3815 }
3816 unsigned Opcode =
3817 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3818 return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
3819}
3820
3822 const SDLoc &DL, SelectionDAG &DAG) {
3823 EVT VT = LHS.getValueType();
3824 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3825
3826 if (VT.isFloatingPoint()) {
3827 assert(VT != MVT::f128);
3828 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3829 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3830 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3831 }
3832 return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
3833 }
3834
3835 // The CMP instruction is just an alias for SUBS, and representing it as
3836 // SUBS means that it's possible to get CSE with subtract operations.
3837 // A later phase can perform the optimization of setting the destination
3838 // register to WZR/XZR if it ends up being unused.
3839 unsigned Opcode = AArch64ISD::SUBS;
3840
3841 if (isCMN(RHS, CC, DAG)) {
3842 // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
3843 Opcode = AArch64ISD::ADDS;
3844 RHS = RHS.getOperand(1);
3845 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3846 isIntEqualitySetCC(CC)) {
3847 // As we are looking for EQ/NE compares, the operands can be commuted ; can
3848 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
3849 Opcode = AArch64ISD::ADDS;
3850 LHS = LHS.getOperand(1);
3851 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3852 if (LHS.getOpcode() == ISD::AND) {
3853 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3854 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3855 // of the signed comparisons.
3856 const SDValue ANDSNode =
3857 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
3858 LHS.getOperand(0), LHS.getOperand(1));
3859 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3860 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3861 return ANDSNode.getValue(1);
3862 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3863 // Use result of ANDS
3864 return LHS.getValue(1);
3865 }
3866 }
3867
3868 return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
3869 .getValue(1);
3870}
3871
3872/// \defgroup AArch64CCMP CMP;CCMP matching
3873///
3874/// These functions deal with the formation of CMP;CCMP;... sequences.
3875/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3876/// a comparison. They set the NZCV flags to a predefined value if their
3877/// predicate is false. This allows to express arbitrary conjunctions, for
3878/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3879/// expressed as:
3880/// cmp A
3881/// ccmp B, inv(CB), CA
3882/// check for CB flags
3883///
3884/// This naturally lets us implement chains of AND operations with SETCC
3885/// operands. And we can even implement some other situations by transforming
3886/// them:
3887/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3888/// negating the flags used in a CCMP/FCCMP operations.
3889/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3890/// by negating the flags we test for afterwards. i.e.
3891/// NEG (CMP CCMP CCCMP ...) can be implemented.
3892/// - Note that we can only ever negate all previously processed results.
3893/// What we can not implement by flipping the flags to test is a negation
3894/// of two sub-trees (because the negation affects all sub-trees emitted so
3895/// far, so the 2nd sub-tree we emit would also affect the first).
3896/// With those tools we can implement some OR operations:
3897/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3898/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3899/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3900/// elimination rules from earlier to implement the whole thing as a
3901/// CCMP/FCCMP chain.
3902///
3903/// As complete example:
3904/// or (or (setCA (cmp A)) (setCB (cmp B)))
3905/// (and (setCC (cmp C)) (setCD (cmp D)))"
3906/// can be reassociated to:
///    or (and (setCC (cmp C)) (setCD (cmp D)))
///       (or (setCA (cmp A)) (setCB (cmp B)))
3909/// can be transformed to:
3910/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3911/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3912/// which can be implemented as:
3913/// cmp C
3914/// ccmp D, inv(CD), CC
3915/// ccmp A, CA, inv(CD)
3916/// ccmp B, CB, inv(CA)
3917/// check for CB flags
3918///
3919/// A counterexample is "or (and A B) (and C D)" which translates to
3920/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3921/// can only implement 1 of the inner (not) operations, but not both!
3922/// @{
3923
3924/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3926 ISD::CondCode CC, SDValue CCOp,
3928 AArch64CC::CondCode OutCC,
3929 const SDLoc &DL, SelectionDAG &DAG) {
3930 unsigned Opcode = 0;
3931 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3932
3933 if (LHS.getValueType().isFloatingPoint()) {
3934 assert(LHS.getValueType() != MVT::f128);
3935 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3936 LHS.getValueType() == MVT::bf16) {
3937 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3938 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3939 }
3940 Opcode = AArch64ISD::FCCMP;
3941 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3942 APInt Imm = Const->getAPIntValue();
3943 if (Imm.isNegative() && Imm.sgt(-32)) {
3944 Opcode = AArch64ISD::CCMN;
3945 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3946 }
3947 } else if (isCMN(RHS, CC, DAG)) {
3948 Opcode = AArch64ISD::CCMN;
3949 RHS = RHS.getOperand(1);
3950 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3951 isIntEqualitySetCC(CC)) {
3952 // As we are looking for EQ/NE compares, the operands can be commuted ; can
3953 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction ?
3954 Opcode = AArch64ISD::CCMN;
3955 LHS = LHS.getOperand(1);
3956 }
3957 if (Opcode == 0)
3958 Opcode = AArch64ISD::CCMP;
3959
3960 SDValue Condition = getCondCode(DAG, Predicate);
3962 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3963 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3964 return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
3965}
3966
3967/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3968/// expressed as a conjunction. See \ref AArch64CCMP.
3969/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3970/// changing the conditions on the SETCC tests.
3971/// (this means we can call emitConjunctionRec() with
3972/// Negate==true on this sub-tree)
3973/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3974/// cannot do the negation naturally. We are required to
3975/// emit the subtree first in this case.
3976/// \param PreferFirst Set to true if processing this subtree first may
3977/// result in more efficient code.
3978/// \param WillNegate Is true if are called when the result of this
3979/// subexpression must be negated. This happens when the
3980/// outer expression is an OR. We can use this fact to know
3981/// that we have a double negation (or (or ...) ...) that
3982/// can be implemented for free.
3983static bool canEmitConjunction(SelectionDAG &DAG, const SDValue Val,
3984 bool &CanNegate, bool &MustBeFirst,
3985 bool &PreferFirst, bool WillNegate,
3986 unsigned Depth = 0) {
3987 if (!Val.hasOneUse())
3988 return false;
3989 unsigned Opcode = Val->getOpcode();
3990 if (Opcode == ISD::SETCC) {
3991 EVT VT = Val->getOperand(0).getValueType();
3992 if (VT == MVT::f128)
3993 return false;
3994 CanNegate = true;
3995 MustBeFirst = false;
3996 // Designate this operation as a preferred first operation if the result
3997 // of a SUB operation can be reused.
3998 PreferFirst = DAG.doesNodeExist(ISD::SUB, DAG.getVTList(VT),
3999 {Val->getOperand(0), Val->getOperand(1)});
4000 return true;
4001 }
4002 // Protect against exponential runtime and stack overflow.
4003 if (Depth > 6)
4004 return false;
4005 if (Opcode == ISD::AND || Opcode == ISD::OR) {
4006 bool IsOR = Opcode == ISD::OR;
4007 SDValue O0 = Val->getOperand(0);
4008 SDValue O1 = Val->getOperand(1);
4009 bool CanNegateL;
4010 bool MustBeFirstL;
4011 bool PreferFirstL;
4012 if (!canEmitConjunction(DAG, O0, CanNegateL, MustBeFirstL, PreferFirstL,
4013 IsOR, Depth + 1))
4014 return false;
4015 bool CanNegateR;
4016 bool MustBeFirstR;
4017 bool PreferFirstR;
4018 if (!canEmitConjunction(DAG, O1, CanNegateR, MustBeFirstR, PreferFirstR,
4019 IsOR, Depth + 1))
4020 return false;
4021
4022 if (MustBeFirstL && MustBeFirstR)
4023 return false;
4024
4025 if (IsOR) {
4026 // For an OR expression we need to be able to naturally negate at least
4027 // one side or we cannot do the transformation at all.
4028 if (!CanNegateL && !CanNegateR)
4029 return false;
4030 // If we the result of the OR will be negated and we can naturally negate
4031 // the leafs, then this sub-tree as a whole negates naturally.
4032 CanNegate = WillNegate && CanNegateL && CanNegateR;
4033 // If we cannot naturally negate the whole sub-tree, then this must be
4034 // emitted first.
4035 MustBeFirst = !CanNegate;
4036 } else {
4037 assert(Opcode == ISD::AND && "Must be OR or AND");
4038 // We cannot naturally negate an AND operation.
4039 CanNegate = false;
4040 MustBeFirst = MustBeFirstL || MustBeFirstR;
4041 }
4042 PreferFirst = PreferFirstL || PreferFirstR;
4043 return true;
4044 }
4045 return false;
4046}
4047
4048/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
4049/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
4050/// Tries to transform the given i1 producing node @p Val to a series compare
4051/// and conditional compare operations. @returns an NZCV flags producing node
4052/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
4053/// transformation was not possible.
4054/// \p Negate is true if we want this sub-tree being negated just by changing
4055/// SETCC conditions.
4057 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
4059 // We're at a tree leaf, produce a conditional comparison operation.
4060 unsigned Opcode = Val->getOpcode();
4061 if (Opcode == ISD::SETCC) {
4062 SDValue LHS = Val->getOperand(0);
4063 SDValue RHS = Val->getOperand(1);
4064 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
4065 bool isInteger = LHS.getValueType().isInteger();
4066 if (Negate)
4067 CC = getSetCCInverse(CC, LHS.getValueType());
4068 SDLoc DL(Val);
4069 // Determine OutCC and handle FP special case.
4070 if (isInteger) {
4071 OutCC = changeIntCCToAArch64CC(CC, RHS);
4072 } else {
4073 assert(LHS.getValueType().isFloatingPoint());
4074 AArch64CC::CondCode ExtraCC;
4075 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
4076 // Some floating point conditions can't be tested with a single condition
4077 // code. Construct an additional comparison in this case.
4078 if (ExtraCC != AArch64CC::AL) {
4079 SDValue ExtraCmp;
4080 if (!CCOp.getNode())
4081 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
4082 else
4083 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
4084 ExtraCC, DL, DAG);
4085 CCOp = ExtraCmp;
4086 Predicate = ExtraCC;
4087 }
4088 }
4089
4090 // Produce a normal comparison if we are first in the chain
4091 if (!CCOp)
4092 return emitComparison(LHS, RHS, CC, DL, DAG);
4093 // Otherwise produce a ccmp.
4094 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
4095 DAG);
4096 }
4097 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
4098
4099 bool IsOR = Opcode == ISD::OR;
4100
4101 SDValue LHS = Val->getOperand(0);
4102 bool CanNegateL;
4103 bool MustBeFirstL;
4104 bool PreferFirstL;
4105 bool ValidL = canEmitConjunction(DAG, LHS, CanNegateL, MustBeFirstL,
4106 PreferFirstL, IsOR);
4107 assert(ValidL && "Valid conjunction/disjunction tree");
4108 (void)ValidL;
4109
4110 SDValue RHS = Val->getOperand(1);
4111 bool CanNegateR;
4112 bool MustBeFirstR;
4113 bool PreferFirstR;
4114 bool ValidR = canEmitConjunction(DAG, RHS, CanNegateR, MustBeFirstR,
4115 PreferFirstR, IsOR);
4116 assert(ValidR && "Valid conjunction/disjunction tree");
4117 (void)ValidR;
4118
4119 bool ShouldFirstL = PreferFirstL && !PreferFirstR && !MustBeFirstR;
4120
4121 // Swap sub-tree that must or should come first to the right side.
4122 if (MustBeFirstL || ShouldFirstL) {
4123 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4124 std::swap(LHS, RHS);
4125 std::swap(CanNegateL, CanNegateR);
4126 std::swap(MustBeFirstL, MustBeFirstR);
4127 }
4128
4129 bool NegateR;
4130 bool NegateAfterR;
4131 bool NegateL;
4132 bool NegateAfterAll;
4133 if (Opcode == ISD::OR) {
4134 // Swap the sub-tree that we can negate naturally to the left.
4135 if (!CanNegateL) {
4136 assert(CanNegateR && "at least one side must be negatable");
4137 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4138 assert(!Negate);
4139 std::swap(LHS, RHS);
4140 NegateR = false;
4141 NegateAfterR = true;
4142 } else {
4143 // Negate the left sub-tree if possible, otherwise negate the result.
4144 NegateR = CanNegateR;
4145 NegateAfterR = !CanNegateR;
4146 }
4147 NegateL = true;
4148 NegateAfterAll = !Negate;
4149 } else {
4150 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
4151 assert(!Negate && "Valid conjunction/disjunction tree");
4152
4153 NegateL = false;
4154 NegateR = false;
4155 NegateAfterR = false;
4156 NegateAfterAll = false;
4157 }
4158
4159 // Emit sub-trees.
4160 AArch64CC::CondCode RHSCC;
4161 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
4162 if (NegateAfterR)
4163 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
4164 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
4165 if (NegateAfterAll)
4166 OutCC = AArch64CC::getInvertedCondCode(OutCC);
4167 return CmpL;
4168}
4169
4170/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
4171/// In some cases this is even possible with OR operations in the expression.
4172/// See \ref AArch64CCMP.
4173/// \see emitConjunctionRec().
4175 AArch64CC::CondCode &OutCC) {
4176 bool DummyCanNegate;
4177 bool DummyMustBeFirst;
4178 bool DummyPreferFirst;
4179 if (!canEmitConjunction(DAG, Val, DummyCanNegate, DummyMustBeFirst,
4180 DummyPreferFirst, false))
4181 return SDValue();
4182
4183 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
4184}
4185
4186/// @}
4187
4188/// Returns how profitable it is to fold a comparison's operand's shift and/or
4189/// extension operations.
4191 auto isSupportedExtend = [&](SDValue V) {
4192 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
4193 return true;
4194
4195 if (V.getOpcode() == ISD::AND)
4196 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
4197 uint64_t Mask = MaskCst->getZExtValue();
4198 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
4199 }
4200
4201 return false;
4202 };
4203
4204 if (!Op.hasOneUse())
4205 return 0;
4206
4207 if (isSupportedExtend(Op))
4208 return 1;
4209
4210 unsigned Opc = Op.getOpcode();
4211 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4212 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4213 uint64_t Shift = ShiftCst->getZExtValue();
4214 if (isSupportedExtend(Op.getOperand(0)))
4215 return (Shift <= 4) ? 2 : 1;
4216 EVT VT = Op.getValueType();
4217 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4218 return 1;
4219 }
4220
4221 return 0;
4222}
4223
4224// emitComparison() converts comparison with one or negative one to comparison
4225// with 0. Note that this only works for signed comparisons because of how ANDS
4226// works.
4228 // Only works for ANDS and AND.
4229 if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
4230 return false;
4231
4232 if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
4233 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4234 return true;
4235 }
4236
4237 if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
4238 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4239 return true;
4240 }
4241
4242 return false;
4243}
4244
4246 SDValue &AArch64cc, SelectionDAG &DAG,
4247 const SDLoc &DL) {
4248 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4249 EVT VT = RHS.getValueType();
4250 APInt C = RHSC->getAPIntValue();
4251 // shouldBeAdjustedToZero is a special case to better fold with
4252 // emitComparison().
4253 if (shouldBeAdjustedToZero(LHS, C, CC)) {
4254 // Adjust the constant to zero.
4255 // CC has already been adjusted.
4256 RHS = DAG.getConstant(0, DL, VT);
4257 } else if (!isLegalCmpImmed(C)) {
4258 unsigned NumImmForC = numberOfInstrToLoadImm(C);
4259 // Constant does not fit, try adjusting it by one?
4260 switch (CC) {
4261 default:
4262 break;
4263 case ISD::SETLT:
4264 case ISD::SETGE:
4265 if (!C.isMinSignedValue()) {
4266 APInt CMinusOne = C - 1;
4267 if (isLegalCmpImmed(CMinusOne) ||
4268 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4269 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4270 RHS = DAG.getConstant(CMinusOne, DL, VT);
4271 }
4272 }
4273 break;
4274 case ISD::SETULT:
4275 case ISD::SETUGE: {
4276 // C is not 0 because it is a legal immediate.
4277 assert(!C.isZero() && "C should not be zero here");
4278 APInt CMinusOne = C - 1;
4279 if (isLegalCmpImmed(CMinusOne) ||
4280 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4281 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4282 RHS = DAG.getConstant(CMinusOne, DL, VT);
4283 }
4284 break;
4285 }
4286 case ISD::SETLE:
4287 case ISD::SETGT:
4288 if (!C.isMaxSignedValue()) {
4289 APInt CPlusOne = C + 1;
4290 if (isLegalCmpImmed(CPlusOne) ||
4291 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4292 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4293 RHS = DAG.getConstant(CPlusOne, DL, VT);
4294 }
4295 }
4296 break;
4297 case ISD::SETULE:
4298 case ISD::SETUGT: {
4299 if (!C.isAllOnes()) {
4300 APInt CPlusOne = C + 1;
4301 if (isLegalCmpImmed(CPlusOne) ||
4302 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4303 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4304 RHS = DAG.getConstant(CPlusOne, DL, VT);
4305 }
4306 }
4307 break;
4308 }
4309 }
4310 }
4311 }
4312
4313 // Comparisons are canonicalized so that the RHS operand is simpler than the
4314 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4315 // can fold some shift+extend operations on the RHS operand, so swap the
4316 // operands if that can be done.
4317 //
4318 // For example:
4319 // lsl w13, w11, #1
4320 // cmp w13, w12
4321 // can be turned into:
4322 // cmp w12, w11, lsl #1
4323 if (!isa<ConstantSDNode>(RHS) || !isLegalCmpImmed(RHS->getAsAPIntVal())) {
4324 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4325 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4326 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4327 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4328
4329 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4330 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4331 std::swap(LHS, RHS);
4333 }
4334 }
4335
4336 SDValue Cmp;
4338 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4340
4341 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4342 // For the i8 operand, the largest immediate is 255, so this can be easily
4343 // encoded in the compare instruction. For the i16 operand, however, the
4344 // largest immediate cannot be encoded in the compare.
4345 // Therefore, use a sign extending load and cmn to avoid materializing the
4346 // -1 constant. For example,
4347 // movz w1, #65535
4348 // ldrh w0, [x0, #0]
4349 // cmp w0, w1
4350 // >
4351 // ldrsh w0, [x0, #0]
4352 // cmn w0, #1
4353 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
4354 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4355 // ensure both the LHS and RHS are truly zero extended and to make sure the
4356 // transformation is profitable.
4357 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4358 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4359 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4360 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4361 int16_t ValueofRHS = RHS->getAsZExtVal();
4362 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4363 SDValue SExt =
4364 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, LHS.getValueType(), LHS,
4365 DAG.getValueType(MVT::i16));
4366 Cmp = emitComparison(
4367 SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
4368 DL, DAG);
4370 }
4371 }
4372
4373 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4374 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4375 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4377 }
4378 }
4379 }
4380
4381 if (!Cmp) {
4382 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
4384 }
4385 AArch64cc = getCondCode(DAG, AArch64CC);
4386 return Cmp;
4387}
4388
4389static std::pair<SDValue, SDValue>
4391 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4392 "Unsupported value type");
4393 SDValue Value, Overflow;
4394 SDLoc DL(Op);
4395 SDValue LHS = Op.getOperand(0);
4396 SDValue RHS = Op.getOperand(1);
4397 unsigned Opc = 0;
4398 switch (Op.getOpcode()) {
4399 default:
4400 llvm_unreachable("Unknown overflow instruction!");
4401 case ISD::SADDO:
4402 Opc = AArch64ISD::ADDS;
4403 CC = AArch64CC::VS;
4404 break;
4405 case ISD::UADDO:
4406 Opc = AArch64ISD::ADDS;
4407 CC = AArch64CC::HS;
4408 break;
4409 case ISD::SSUBO:
4410 Opc = AArch64ISD::SUBS;
4411 CC = AArch64CC::VS;
4412 break;
4413 case ISD::USUBO:
4414 Opc = AArch64ISD::SUBS;
4415 CC = AArch64CC::LO;
4416 break;
4417 // Multiply needs a little bit extra work.
4418 case ISD::SMULO:
4419 case ISD::UMULO: {
4420 CC = AArch64CC::NE;
4421 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4422 if (Op.getValueType() == MVT::i32) {
4423 // Extend to 64-bits, then perform a 64-bit multiply.
4424 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4425 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4426 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4427 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4428 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4429
4430 // Check that the result fits into a 32-bit integer.
4431 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4432 if (IsSigned) {
4433 // cmp xreg, wreg, sxtw
4434 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4435 Overflow =
4436 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4437 } else {
4438 // tst xreg, #0xffffffff00000000
4439 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4440 Overflow =
4441 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4442 }
4443 break;
4444 }
4445 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4446 // For the 64 bit multiply
4447 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4448 if (IsSigned) {
4449 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4450 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4451 DAG.getConstant(63, DL, MVT::i64));
4452 // It is important that LowerBits is last, otherwise the arithmetic
4453 // shift will not be folded into the compare (SUBS).
4454 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4455 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4456 .getValue(1);
4457 } else {
4458 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4459 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4460 Overflow =
4461 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4462 DAG.getConstant(0, DL, MVT::i64),
4463 UpperBits).getValue(1);
4464 }
4465 break;
4466 }
4467 } // switch (...)
4468
4469 if (Opc) {
4470 SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
4471
4472 // Emit the AArch64 operation with overflow check.
4473 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4474 Overflow = Value.getValue(1);
4475 }
4476 return std::make_pair(Value, Overflow);
4477}
4478
4479SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4480 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4481 !Subtarget->isNeonAvailable()))
4482 return LowerToScalableOp(Op, DAG);
4483
4484 SDValue Sel = Op.getOperand(0);
4485 SDValue Other = Op.getOperand(1);
4486 SDLoc DL(Sel);
4487
4488 // If the operand is an overflow checking operation, invert the condition
4489 // code and kill the Not operation. I.e., transform:
4490 // (xor (overflow_op_bool, 1))
4491 // -->
4492 // (csel 1, 0, invert(cc), overflow_op_bool)
4493 // ... which later gets transformed to just a cset instruction with an
4494 // inverted condition code, rather than a cset + eor sequence.
4496 // Only lower legal XALUO ops.
4498 return SDValue();
4499
4500 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4501 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4503 SDValue Value, Overflow;
4504 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4505 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4506 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
4507 CCVal, Overflow);
4508 }
4509 // If neither operand is a SELECT_CC, give up.
4510 if (Sel.getOpcode() != ISD::SELECT_CC)
4511 std::swap(Sel, Other);
4512 if (Sel.getOpcode() != ISD::SELECT_CC)
4513 return Op;
4514
4515 // The folding we want to perform is:
4516 // (xor x, (select_cc a, b, cc, 0, -1) )
4517 // -->
4518 // (csel x, (xor x, -1), cc ...)
4519 //
4520 // The latter will get matched to a CSINV instruction.
4521
4522 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4523 SDValue LHS = Sel.getOperand(0);
4524 SDValue RHS = Sel.getOperand(1);
4525 SDValue TVal = Sel.getOperand(2);
4526 SDValue FVal = Sel.getOperand(3);
4527
4528 // FIXME: This could be generalized to non-integer comparisons.
4529 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4530 return Op;
4531
4532 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4533 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4534
4535 // The values aren't constants, this isn't the pattern we're looking for.
4536 if (!CFVal || !CTVal)
4537 return Op;
4538
4539 // We can commute the SELECT_CC by inverting the condition. This
4540 // might be needed to make this fit into a CSINV pattern.
4541 if (CTVal->isAllOnes() && CFVal->isZero()) {
4542 std::swap(TVal, FVal);
4543 std::swap(CTVal, CFVal);
4544 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4545 }
4546
4547 // If the constants line up, perform the transform!
4548 if (CTVal->isZero() && CFVal->isAllOnes()) {
4549 SDValue CCVal;
4550 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
4551
4552 FVal = Other;
4553 TVal = DAG.getNode(ISD::XOR, DL, Other.getValueType(), Other,
4554 DAG.getAllOnesConstant(DL, Other.getValueType()));
4555
4556 return DAG.getNode(AArch64ISD::CSEL, DL, Sel.getValueType(), FVal, TVal,
4557 CCVal, Cmp);
4558 }
4559
4560 return Op;
4561}
4562
4563// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4564// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4565// sets 'C' bit to 0.
4567 SDLoc DL(Value);
4568 EVT VT = Value.getValueType();
4569 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4570 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4571 SDValue Cmp =
4572 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
4573 return Cmp.getValue(1);
4574}
4575
4576// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4577// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4579 bool Invert) {
4580 assert(Glue.getResNo() == 1);
4581 SDLoc DL(Glue);
4582 SDValue Zero = DAG.getConstant(0, DL, VT);
4583 SDValue One = DAG.getConstant(1, DL, VT);
4585 SDValue CC = getCondCode(DAG, Cond);
4586 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4587}
4588
4589// Value is 1 if 'V' bit of NZCV is 1, else 0
4591 assert(Glue.getResNo() == 1);
4592 SDLoc DL(Glue);
4593 SDValue Zero = DAG.getConstant(0, DL, VT);
4594 SDValue One = DAG.getConstant(1, DL, VT);
4596 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4597}
4598
4599// This lowering is inefficient, but it will get cleaned up by
4600// `foldOverflowCheck`
4602 unsigned Opcode, bool IsSigned) {
4603 EVT VT0 = Op.getValue(0).getValueType();
4604 EVT VT1 = Op.getValue(1).getValueType();
4605
4606 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4607 return SDValue();
4608
4609 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4610 SDValue OpLHS = Op.getOperand(0);
4611 SDValue OpRHS = Op.getOperand(1);
4612 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4613
4614 SDLoc DL(Op);
4615
4616 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
4617 OpRHS, OpCarryIn);
4618
4619 SDValue OutFlag =
4620 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4621 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4622
4623 return DAG.getMergeValues({Sum, OutFlag}, DL);
4624}
4625
4626static SDValue lowerIntNeonIntrinsic(SDValue Op, unsigned Opcode,
4627 SelectionDAG &DAG,
4628 bool LastOperandIsImm = false) {
4629 if (Op.getValueType().isVector())
4630 return SDValue();
4631
4632 SDLoc DL(Op);
4634 const unsigned NumOperands = Op.getNumOperands();
4635 auto getFloatVT = [](EVT VT) {
4636 assert((VT == MVT::i32 || VT == MVT::i64) && "Unexpected VT");
4637 return VT == MVT::i32 ? MVT::f32 : MVT::f64;
4638 };
4639 auto bitcastToFloat = [&](SDValue Val) {
4640 return DAG.getBitcast(getFloatVT(Val.getValueType()), Val);
4641 };
4642
4643 // Skip first operand as it is intrinsic ID.
4644 for (unsigned I = 1; I < NumOperands; ++I) {
4645 SDValue Val = Op.getOperand(I);
4646 const bool KeepInt = LastOperandIsImm && (I == NumOperands - 1);
4647 NewOps.push_back(KeepInt ? Val : bitcastToFloat(Val));
4648 }
4649 EVT OrigVT = Op.getValueType();
4650 SDValue OpNode = DAG.getNode(Opcode, DL, getFloatVT(OrigVT), NewOps);
4651 return DAG.getBitcast(OrigVT, OpNode);
4652}
4653
4655 // Let legalize expand this if it isn't a legal type yet.
4656 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4657 return SDValue();
4658
4659 SDLoc DL(Op);
4661 // The actual operation that sets the overflow or carry flag.
4662 SDValue Value, Overflow;
4663 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4664
4665 // We use 0 and 1 as false and true values.
4666 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4667 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4668
4669 // We use an inverted condition, because the conditional select is inverted
4670 // too. This will allow it to be selected to a single instruction:
4671 // CSINC Wd, WZR, WZR, invert(cond).
4672 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4673 Overflow =
4674 DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
4675
4676 return DAG.getMergeValues({Value, Overflow}, DL);
4677}
4678
// Prefetch operands are:
// 1: Address to prefetch
// 2: bool isWrite
// 3: int locality (0 = no locality ... 3 = extreme locality)
// 4: bool isDataCache
  SDLoc DL(Op);
  unsigned IsWrite = Op.getConstantOperandVal(2);
  unsigned Locality = Op.getConstantOperandVal(3);
  unsigned IsData = Op.getConstantOperandVal(4);

  // Locality 0 maps to the streaming (non-temporal, PSTL1STRM-style) hint.
  bool IsStream = !Locality;
  // When the locality number is set
  if (Locality) {
    // The front-end should have filtered out the out-of-range values
    assert(Locality <= 3 && "Prefetch locality out-of-range");
    // The locality degree is the opposite of the cache speed.
    // Put the number the other way around.
    // The encoding starts at 0 for level 1
    Locality = 3 - Locality;
  }

  // built the mask value encoding the expected behavior.
  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
                   (!IsData << 3) |     // IsDataCache bit
                   (Locality << 1) |    // Cache level bits
                   (unsigned)IsStream;  // Stream bit
  // Emit the target prefetch node: chain, PRFM immediate, address.
  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
                     DAG.getTargetConstant(PrfOp, DL, MVT::i32),
                     Op.getOperand(1));
}
4710
// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1)) 0 EQ when Y is
// a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of SUBS
// (AND X Y) Z which produces a better opt with EmitComparison
                                 SelectionDAG &DAG, const SDLoc DL) {
  // Only fires for an unsigned-less-than compare of a single-use AND against
  // a constant; CC/LHS/RHS are rewritten in place when the fold applies.
  if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
    ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    if (LHSConstOp && RHSConst) {
      uint64_t LHSConstValue = LHSConstOp->getZExtValue();
      uint64_t RHSConstant = RHSConst->getZExtValue();
      if (isPowerOf2_64(RHSConstant)) {
        // (X & Y) u< 2^k  <=>  (X & (Y & ~(2^k - 1))) == 0: clearing the low
        // k bits of the mask makes the compare a test against zero.
        uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
        LHS =
            DAG.getNode(ISD::AND, DL, LHS.getValueType(), LHS.getOperand(0),
                        DAG.getConstant(NewMaskValue, DL, LHS.getValueType()));
        RHS = DAG.getConstant(0, DL, RHS.getValueType());
        CC = ISD::SETEQ;
      }
    }
  }
}
4733
4734SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4735 SelectionDAG &DAG) const {
4736 EVT VT = Op.getValueType();
4737 if (VT.isScalableVector()) {
4738 SDValue SrcVal = Op.getOperand(0);
4739
4740 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4741 // Break conversion in two with the first part converting to f32 and the
4742 // second using native f32->VT instructions.
4743 SDLoc DL(Op);
4744 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4745 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4746 }
4747
4748 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4749 }
4750
4751 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4752 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4753
4754 bool IsStrict = Op->isStrictFPOpcode();
4755 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4756 EVT Op0VT = Op0.getValueType();
4757 if (VT == MVT::f64) {
4758 // FP16->FP32 extends are legal for v32 and v4f32.
4759 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4760 return Op;
4761 // Split bf16->f64 extends into two fpextends.
4762 if (Op0VT == MVT::bf16 && IsStrict) {
4763 SDValue Ext1 =
4764 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4765 {Op0, Op.getOperand(0)});
4766 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4767 {Ext1, Ext1.getValue(1)});
4768 }
4769 if (Op0VT == MVT::bf16)
4770 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4771 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4772 return SDValue();
4773 }
4774
4775 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4776 return SDValue();
4777}
4778
SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = SrcVal.getValueType();
  // The trailing operand of FP_ROUND is 1 when the result is known exact
  // (truncating round), which lets us skip the round-to-nearest-even fixup.
  bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;

  if (VT.isScalableVector()) {
    // Let common code split the operation.
    if (SrcVT == MVT::nxv8f32)
      return Op;

    if (VT.getScalarType() != MVT::bf16)
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);

    // Software bf16 rounding on scalable vectors: operate on the i32 bit
    // pattern, apply round-to-nearest-even manually, then take the high half.
    SDLoc DL(Op);
    constexpr EVT I32 = MVT::nxv4i32;
    auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };

    SDValue NaN;
    SDValue Narrow;

    if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
      if (Subtarget->hasBF16())
        return LowerToPredicatedOp(Op, DAG,
                                   AArch64ISD::FP_ROUND_MERGE_PASSTHRU);

      Narrow = getSVESafeBitCast(I32, SrcVal, DAG);

      // Set the quiet bit.
      if (!DAG.isKnownNeverSNaN(SrcVal))
        NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
    } else if (SrcVT == MVT::nxv2f64 &&
               (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
      // Round to float without introducing rounding errors and try again.
      SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
      Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
                           Pg, SrcVal, DAG.getPOISON(MVT::nxv2f32));

      // Re-emit the same FP_ROUND on the f32 intermediate value.
      if (IsStrict)
        NewOps.push_back(Op.getOperand(0));
      NewOps.push_back(Narrow);
      NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
      return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
    } else
      return SDValue();

    if (!Trunc) {
      // Round-to-nearest-even bias: add 0x7fff plus the lsb of the kept part.
      SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
      Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
      SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
      Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
    }

    // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
    // 0x80000000.
    if (NaN) {
      EVT I1 = I32.changeElementType(*DAG.getContext(), MVT::i1);
      EVT CondVT = VT.changeElementType(*DAG.getContext(), MVT::i1);
      SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
      IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
      Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
    }

    // Now that we have rounded, shift the bits into position.
    Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
    return getSVESafeBitCast(VT, Narrow, DAG);
  }

  if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
    return LowerFixedLengthFPRoundToSVE(Op, DAG);

  // Expand cases where the result type is BF16 but we don't have hardware
  // instructions to lower it.
  if (VT.getScalarType() == MVT::bf16 &&
      !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
        Subtarget->hasBF16())) {
    SDLoc DL(Op);
    SDValue Narrow = SrcVal;
    SDValue NaN;
    EVT I32 = SrcVT.changeElementType(*DAG.getContext(), MVT::i32);
    EVT F32 = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
    if (SrcVT.getScalarType() == MVT::f32) {
      bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
      Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
      if (!NeverSNaN) {
        // Set the quiet bit.
        NaN = DAG.getNode(ISD::OR, DL, I32, Narrow,
                          DAG.getConstant(0x400000, DL, I32));
      }
    } else if (SrcVT.getScalarType() == MVT::f64) {
      // FCVTXN rounds f64->f32 with "round to odd", avoiding double rounding.
      Narrow = DAG.getNode(AArch64ISD::FCVTXN, DL, F32, Narrow);
      Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
    } else {
      return SDValue();
    }
    if (!Trunc) {
      // Same round-to-nearest-even bias as the scalable path above.
      SDValue One = DAG.getConstant(1, DL, I32);
      SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow,
                                DAG.getShiftAmountConstant(16, I32, DL));
      Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, One);
      SDValue RoundingBias =
          DAG.getNode(ISD::ADD, DL, I32, DAG.getConstant(0x7fff, DL, I32), Lsb);
      Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
    }

    // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
    // 0x80000000.
    if (NaN) {
      SDValue IsNaN = DAG.getSetCC(
          DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
          SrcVal, SrcVal, ISD::SETUO);
      Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
    }

    // Now that we have rounded, shift the bits into position.
    Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow,
                         DAG.getShiftAmountConstant(16, I32, DL));
    if (VT.isVector()) {
      EVT I16 = I32.changeVectorElementType(*DAG.getContext(), MVT::i16);
      Narrow = DAG.getNode(ISD::TRUNCATE, DL, I16, Narrow);
      return DAG.getNode(ISD::BITCAST, DL, VT, Narrow);
    }
    // Scalar result: reinterpret as f32 and extract the h subregister.
    Narrow = DAG.getNode(ISD::BITCAST, DL, F32, Narrow);
    SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Narrow);
    return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, DL)
                    : Result;
  }

  if (SrcVT != MVT::f128) {
    // Expand cases where the input is a vector bigger than NEON.
      return SDValue();

    // It's legal except when f128 is involved
    return Op;
  }

  return SDValue();
}
4921
SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
  // Any additional optimization in this function should be recorded
  // in the cost tables.
  bool IsStrict = Op->isStrictFPOpcode();
  EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
  EVT VT = Op.getValueType();

  assert(!(IsStrict && VT.isScalableVector()) &&
         "Unimplemented SVE support for STRICT_FP_to_INT!");

  // f16 conversions are promoted to f32 when full fp16 is not supported.
  if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
      InVT.getVectorElementType() == MVT::bf16) {
    EVT NewVT = VT.changeElementType(*DAG.getContext(), MVT::f32);
    SDLoc DL(Op);
    if (IsStrict) {
      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {NewVT, MVT::Other},
                                {Op.getOperand(0), Op.getOperand(1)});
      return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
                         {Ext.getValue(1), Ext.getValue(0)});
    }
    return DAG.getNode(
        Op.getOpcode(), DL, Op.getValueType(),
        DAG.getNode(ISD::FP_EXTEND, DL, NewVT, Op.getOperand(0)));
  }

  if (VT.isScalableVector()) {
    // i1 results: convert to a promoted integer type and compare against 0.
    if (VT.getVectorElementType() == MVT::i1) {
      SDLoc DL(Op);
      EVT CvtVT = getPromotedVTForPredicate(VT);
      SDValue Cvt = DAG.getNode(Op.getOpcode(), DL, CvtVT, Op.getOperand(0));
      SDValue Zero = DAG.getConstant(0, DL, CvtVT);
      return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
    }

    // Let common code split the operation.
    if (InVT == MVT::nxv8f32)
      return Op;

    unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
                          ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
                          : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
    return LowerToPredicatedOp(Op, DAG, Opcode);
  }

  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
      useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
    return LowerFixedLengthFPToIntToSVE(Op, DAG);

  uint64_t VTSize = VT.getFixedSizeInBits();
  uint64_t InVTSize = InVT.getFixedSizeInBits();
  // Narrowing conversion: convert at the source width, then truncate.
  if (VTSize < InVTSize) {
    SDLoc DL(Op);
    if (IsStrict) {
      SDValue Cv = DAG.getNode(Op.getOpcode(), DL, {InVT, MVT::Other},
                               {Op.getOperand(0), Op.getOperand(1)});
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
      return DAG.getMergeValues({Trunc, Cv.getValue(1)}, DL);
    }
    SDValue Cv =
        DAG.getNode(Op.getOpcode(), DL, InVT.changeVectorElementTypeToInteger(),
                    Op.getOperand(0));
    return DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
  }

  // Widening conversion: extend the FP source first, then convert.
  if (VTSize > InVTSize) {
    SDLoc DL(Op);
    MVT ExtVT =
    if (IsStrict) {
      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {ExtVT, MVT::Other},
                                {Op.getOperand(0), Op.getOperand(1)});
      return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
                         {Ext.getValue(1), Ext.getValue(0)});
    }
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, Op.getOperand(0));
    return DAG.getNode(Op.getOpcode(), DL, VT, Ext);
  }

  // Use a scalar operation for conversions between single-element vectors of
  // the same size.
  if (InVT.getVectorNumElements() == 1) {
    SDLoc DL(Op);
    SDValue Extract = DAG.getNode(
        Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, DL, MVT::i64));
    EVT ScalarVT = VT.getScalarType();
    if (IsStrict)
      return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
                         {Op.getOperand(0), Extract});
    return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
  }

  // Type changing conversions are illegal.
  return Op;
}
5022
5023SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
5024 SelectionDAG &DAG) const {
5025 bool IsStrict = Op->isStrictFPOpcode();
5026 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5027
5028 if (SrcVal.getValueType().isVector())
5029 return LowerVectorFP_TO_INT(Op, DAG);
5030
5031 // f16 conversions are promoted to f32 when full fp16 is not supported.
5032 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
5033 SrcVal.getValueType() == MVT::bf16) {
5034 SDLoc DL(Op);
5035 if (IsStrict) {
5036 SDValue Ext =
5037 DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
5038 {Op.getOperand(0), SrcVal});
5039 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
5040 {Ext.getValue(1), Ext.getValue(0)});
5041 }
5042 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
5043 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, SrcVal));
5044 }
5045
5046 if (SrcVal.getValueType() != MVT::f128) {
5047 // It's legal except when f128 is involved
5048 return Op;
5049 }
5050
5051 return SDValue();
5052}
5053
SDValue
AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
                                                SelectionDAG &DAG) const {
  // AArch64 FP-to-int conversions saturate to the destination element size, so
  // we can lower common saturating conversions to simple instructions.
  SDValue SrcVal = Op.getOperand(0);
  EVT SrcVT = SrcVal.getValueType();
  EVT DstVT = Op.getValueType();
  EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();

  uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
  uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
  uint64_t SatWidth = SatVT.getScalarSizeInBits();
  assert(SatWidth <= DstElementWidth &&
         "Saturation width cannot exceed result width");

  // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
  // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
  // types, so this is hard to reach.
  if (DstVT.isScalableVector())
    return SDValue();

  EVT SrcElementVT = SrcVT.getVectorElementType();

  // In the absence of FP16 support, promote f16 to f32 and saturate the result.
  SDLoc DL(Op);
  // SrcVal2 holds the second half when the promoted source had to be split.
  SDValue SrcVal2;
  if ((SrcElementVT == MVT::f16 &&
       (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
      SrcElementVT == MVT::bf16) {
    MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
    SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
    // If we are extending to a v8f32, split into two v4f32 to produce legal
    // types.
    if (F32VT.getSizeInBits() > 128) {
      std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
      F32VT = F32VT.getHalfNumVectorElementsVT();
    }
    SrcVT = F32VT;
    SrcElementVT = MVT::f32;
    SrcElementWidth = 32;
  } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
             SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
    return SDValue();

  // Expand to f64 if we are saturating to i64, to help keep the lanes the same
  // width and produce a fcvtzu.
  if (SatWidth == 64 && SrcElementWidth < 64) {
    MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
    SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
    SrcVT = F64VT;
    SrcElementVT = MVT::f64;
    SrcElementWidth = 64;
  }
  // Cases that we can emit directly.
  if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
    SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
                              DAG.getValueType(DstVT.getScalarType()));
    if (SrcVal2) {
      SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
                                 DAG.getValueType(DstVT.getScalarType()));
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
    }
    return Res;
  }

  // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
  // result. This is only valid if the legal cvt is larger than the saturate
  // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
  // (at least until sqxtn is selected).
  if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
    return SDValue();

  // Convert at the full source element width, then clamp to the saturation
  // range with SMIN/SMAX (signed) or UMIN (unsigned) before truncating.
  EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
  SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
                                  DAG.getValueType(IntVT.getScalarType()));
  SDValue NativeCvt2 =
      SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
                            DAG.getValueType(IntVT.getScalarType()))
              : SDValue();
  SDValue Sat, Sat2;
  if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
    SDValue MinC = DAG.getConstant(
        APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
    SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
    SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
    SDValue MaxC = DAG.getConstant(
        APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
    Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
    Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
  } else {
    SDValue MinC = DAG.getConstant(
        APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
    Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
    Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
  }

  // Re-join the two halves if the source was split above.
  if (SrcVal2)
    Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
                      Sat, Sat2);

  return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
}
5158
5159SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
5160 SelectionDAG &DAG) const {
5161 // AArch64 FP-to-int conversions saturate to the destination register size, so
5162 // we can lower common saturating conversions to simple instructions.
5163 SDValue SrcVal = Op.getOperand(0);
5164 EVT SrcVT = SrcVal.getValueType();
5165
5166 if (SrcVT.isVector())
5167 return LowerVectorFP_TO_INT_SAT(Op, DAG);
5168
5169 EVT DstVT = Op.getValueType();
5170 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5171 uint64_t SatWidth = SatVT.getScalarSizeInBits();
5172 uint64_t DstWidth = DstVT.getScalarSizeInBits();
5173 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
5174
5175 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
5176 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
5177 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
5178 SrcVT = MVT::f32;
5179 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
5180 SrcVT != MVT::bf16)
5181 return SDValue();
5182
5183 SDLoc DL(Op);
5184 // Cases that we can emit directly.
5185 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
5186 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
5187 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
5188 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5189 DAG.getValueType(DstVT));
5190
5191 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5192 // result. This is only valid if the legal cvt is larger than the saturate
5193 // width.
5194 if (DstWidth < SatWidth)
5195 return SDValue();
5196
5197 if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
5198 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5199 SDValue CVTf32 =
5200 DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
5201 SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
5202 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
5203 DAG.getValueType(SatVT));
5204 }
5205 SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
5206 return DAG.getBitcast(DstVT, CVTf32);
5207 }
5208
5209 SDValue NativeCvt =
5210 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
5211 SDValue Sat;
5212 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5213 SDValue MinC = DAG.getConstant(
5214 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
5215 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
5216 SDValue MaxC = DAG.getConstant(
5217 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
5218 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
5219 } else {
5220 SDValue MinC = DAG.getConstant(
5221 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
5222 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
5223 }
5224
5225 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5226}
5227
SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
                                                SelectionDAG &DAG) const {
  // Lower a vector round-to-integer-in-current-mode operation as FRINT
  // followed by a saturating FP->int conversion.
  EVT VT = Op.getValueType();
  SDValue Src = Op.getOperand(0);
  SDLoc DL(Op);

  assert(VT.isVector() && "Expected vector type");

  // Same lane count as the result, but with the source's FP element type.
  EVT CastVT = VT.changeVectorElementType(
      *DAG.getContext(), Src.getValueType().getVectorElementType());

  // Round the floating-point value into a floating-point register with the
  // current rounding mode.
  SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);

  // Truncate the rounded floating point to an integer.
  return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
}
5247
SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
  // Any additional optimization in this function should be recorded
  // in the cost tables.
  bool IsStrict = Op->isStrictFPOpcode();
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  SDValue In = Op.getOperand(IsStrict ? 1 : 0);
  EVT InVT = In.getValueType();
  unsigned Opc = Op.getOpcode();
  bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;

  assert(!(IsStrict && VT.isScalableVector()) &&
         "Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");

  // NOTE: i1->bf16 does not require promotion to f32.
  // i1 sources become a select between 0.0 and +/-1.0 (signed i1 is -1).
  if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
    SDValue FalseVal = DAG.getConstantFP(0.0, DL, VT);
    SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, DL, VT)
                               : DAG.getConstantFP(1.0, DL, VT);
    return DAG.getNode(ISD::VSELECT, DL, VT, In, TrueVal, FalseVal);
  }

  // Promote bf16 conversions to f32.
  if (VT.getVectorElementType() == MVT::bf16) {
    EVT F32 = VT.changeElementType(*DAG.getContext(), MVT::f32);
    if (IsStrict) {
      SDValue Val = DAG.getNode(Op.getOpcode(), DL, {F32, MVT::Other},
                                {Op.getOperand(0), In});
      return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
                         {Op.getValueType(), MVT::Other},
                         {Val.getValue(1), Val.getValue(0),
                          DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
    }
    return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
                       DAG.getNode(Op.getOpcode(), DL, F32, In),
                       DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
  }

  if (VT.isScalableVector()) {
    // Let common code split the operation.
    if (VT == MVT::nxv8f32)
      return Op;

    unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
                               : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
    return LowerToPredicatedOp(Op, DAG, Opcode);
  }

  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
      useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
    return LowerFixedLengthIntToFPToSVE(Op, DAG);

  uint64_t VTSize = VT.getFixedSizeInBits();
  uint64_t InVTSize = InVT.getFixedSizeInBits();
  if (VTSize < InVTSize) {
    // AArch64 doesn't have a direct vector instruction to convert
    // fixed point to floating point AND narrow it at the same time.
    // Additional rounding when the target is f32/f64 causes double
    // rounding issues. Conversion to f16 is fine due to narrow width.
    bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
    bool IsTargetf16 = false;
    if (Op.hasOneUse() &&
        Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
      // Some vector types are split during legalization into half, followed by
      // concatenation, followed by rounding to the original vector type. If we
      // end up resolving to f16 type, we shouldn't worry about rounding errors.
      SDNode *U = *Op->user_begin();
      if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
        EVT TmpVT = U->user_begin()->getValueType(0);
        if (TmpVT.getScalarType() == MVT::f16)
          IsTargetf16 = true;
      }
    }

    if (IsTargetf32 && !IsTargetf16) {
      // Scalarize to avoid double rounding (strict nodes can't be unrolled).
      return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
    }

    // Convert at the source width and round the FP result down to VT.
    MVT CastVT =
                     InVT.getVectorNumElements());
    if (IsStrict) {
      In = DAG.getNode(Opc, DL, {CastVT, MVT::Other}, {Op.getOperand(0), In});
      return DAG.getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
                         {In.getValue(1), In.getValue(0),
                          DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
    }
    In = DAG.getNode(Opc, DL, CastVT, In);
    return DAG.getNode(ISD::FP_ROUND, DL, VT, In,
                       DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
  }

  if (VTSize > InVTSize) {
    // Widen the integer source to the result's lane width, then convert.
    unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    EVT CastVT = VT.changeVectorElementTypeToInteger();
    In = DAG.getNode(CastOpc, DL, CastVT, In);
    if (IsStrict)
      return DAG.getNode(Opc, DL, {VT, MVT::Other}, {Op.getOperand(0), In});
    return DAG.getNode(Opc, DL, VT, In);
  }

  // Use a scalar operation for conversions between single-element vectors of
  // the same size.
  if (VT.getVectorNumElements() == 1) {
    SDValue Extract =
                    DAG.getConstant(0, DL, MVT::i64));
    EVT ScalarVT = VT.getScalarType();
    if (IsStrict)
      return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
                         {Op.getOperand(0), Extract});
    return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
  }

  return Op;
}
5366
SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  if (Op.getValueType().isVector())
    return LowerVectorINT_TO_FP(Op, DAG);

  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

  bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
                  Op->getOpcode() == ISD::SINT_TO_FP;

  // Convert via a wider FP type and round down to the requested result type,
  // threading the chain through in the strict case.
  auto IntToFpViaPromotion = [&](EVT PromoteVT) {
    SDLoc DL(Op);
    if (IsStrict) {
      SDValue Val = DAG.getNode(Op.getOpcode(), DL, {PromoteVT, MVT::Other},
                                {Op.getOperand(0), SrcVal});
      return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
                         {Op.getValueType(), MVT::Other},
                         {Val.getValue(1), Val.getValue(0),
                          DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
    }
    return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
                       DAG.getNode(Op.getOpcode(), DL, PromoteVT, SrcVal),
                       DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
  };

  if (Op.getValueType() == MVT::bf16) {
    unsigned MaxWidth = IsSigned
                            ? DAG.ComputeMaxSignificantBits(SrcVal)
                            : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
    // bf16 conversions are promoted to f32 when converting from i16.
    if (MaxWidth <= 24) {
      return IntToFpViaPromotion(MVT::f32);
    }

    // bf16 conversions are promoted to f64 when converting from i32.
    if (MaxWidth <= 53) {
      return IntToFpViaPromotion(MVT::f64);
    }

    // We need to be careful about i64 -> bf16.
    // Consider an i32 22216703.
    // This number cannot be represented exactly as an f32 and so a itofp will
    // turn it into 22216704.0 fptrunc to bf16 will turn this into 22282240.0
    // However, the correct bf16 was supposed to be 22151168.0
    // We need to use sticky rounding to get this correct.
    if (SrcVal.getValueType() == MVT::i64) {
      SDLoc DL(Op);
      // This algorithm is equivalent to the following:
      // uint64_t SrcHi = SrcVal & ~0xfffull;
      // uint64_t SrcLo = SrcVal & 0xfffull;
      // uint64_t Highest = SrcVal >> 53;
      // bool HasHighest = Highest != 0;
      // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
      // double Rounded = static_cast<double>(ToRound);
      // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
      // uint64_t HasLo = SrcLo != 0;
      // bool NeedsAdjustment = HasHighest & HasLo;
      // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
      // double Adjusted = std::bit_cast<double>(AdjustedBits);
      // return static_cast<__bf16>(Adjusted);
      //
      // Essentially, what happens is that SrcVal either fits perfectly in a
      // double-precision value or it is too big. If it is sufficiently small,
      // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
      // ensure that u64 -> double has no rounding error by only using the 52
      // MSB of the input. The low order bits will get merged into a sticky bit
      // which will avoid issues incurred by double rounding.

      // Signed conversion is more or less like so:
      // copysign((__bf16)abs(SrcVal), SrcVal)
      SDValue SignBit;
      if (IsSigned) {
        SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
                              DAG.getConstant(1ull << 63, DL, MVT::i64));
        SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
      }
      SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
                                  DAG.getConstant(~0xfffull, DL, MVT::i64));
      SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
                                  DAG.getConstant(0xfffull, DL, MVT::i64));
      SDValue Highest =
          DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
                      DAG.getShiftAmountConstant(53, MVT::i64, DL));
      SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
      SDValue ToRound =
          DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
      SDValue Rounded =
          IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
                                 {Op.getOperand(0), ToRound})
                   : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);

      SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
      if (SignBit) {
        // Re-attach the original sign to the rounded magnitude.
        RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
      }

      SDValue HasHighest = DAG.getSetCC(
          DL,
          getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
          Highest, Zero64, ISD::SETNE);

      SDValue HasLo = DAG.getSetCC(
          DL,
          getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
          SrcLo, Zero64, ISD::SETNE);

      // OR-ing a 1 into the lsb of the f64 bit pattern acts as the sticky bit.
      SDValue NeedsAdjustment =
          DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
      NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);

      SDValue AdjustedBits =
          DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
      SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
      return IsStrict
                 ? DAG.getNode(
                       {Op.getValueType(), MVT::Other},
                       {Rounded.getValue(1), Adjusted,
                        DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
                 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
                               DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
    }
  }

  // f16 conversions are promoted to f32 when full fp16 is not supported.
  if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
    return IntToFpViaPromotion(MVT::f32);
  }

  // i128 conversions are libcalls.
  if (SrcVal.getValueType() == MVT::i128)
    return SDValue();

  // Other conversions are legal, unless it's to the completely software-based
  // fp128.
  if (Op.getValueType() != MVT::f128)
    return Op;
  return SDValue();
}
5507
5508static MVT getSVEContainerType(EVT ContentTy);
5509
5510SDValue
5511AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
5512 SelectionDAG &DAG) const {
5513 assert((Subtarget->hasSVE2() ||
5514 (Subtarget->hasSME() && Subtarget->isStreaming())) &&
5515 "Lowering loop_dependence_raw_mask or loop_dependence_war_mask "
5516 "requires SVE or SME");
5517
5518 SDLoc DL(Op);
5519 EVT VT = Op.getValueType();
5520 unsigned LaneOffset = Op.getConstantOperandVal(3);
5521 unsigned NumElements = VT.getVectorMinNumElements();
5522 uint64_t EltSizeInBytes = Op.getConstantOperandVal(2);
5523
5524 // Lane offsets and other element sizes are not supported by whilewr/rw.
5525 if (LaneOffset != 0 || !is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes))
5526 return SDValue();
5527
5528 EVT EltVT = MVT::getIntegerVT(EltSizeInBytes * 8);
5529 EVT PredVT =
5530 getPackedSVEVectorVT(EltVT).changeElementType(*DAG.getContext(), MVT::i1);
5531
5532 // Legal whilewr/rw (lowered by tablegen matcher).
5533 if (PredVT == VT)
5534 return Op;
5535
5536 // Expand if this mask needs splitting (this will produce a whilelo).
5537 if (NumElements > PredVT.getVectorMinNumElements())
5538 return SDValue();
5539
5540 SDValue Mask =
5541 DAG.getNode(Op.getOpcode(), DL, PredVT, to_vector(Op->op_values()));
5542
5543 if (VT.isFixedLengthVector()) {
5544 EVT WidePredVT =
5545 PredVT.changeElementType(*DAG.getContext(), VT.getScalarType());
5546 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, WidePredVT, Mask);
5547 return convertFromScalableVector(DAG, VT, MaskAsInt);
5548 }
5549
5550 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Mask,
5551 DAG.getConstant(0, DL, MVT::i64));
5552}
5553
// Custom-lowers ISD::BITCAST. Handles SVE-backed fixed-length vectors,
// scalable vectors (including int->fp casts created during type
// legalisation), and the scalar f16/bf16 <-> i16 cases; everything else
// is rejected (SDValue()) or returned unchanged.
5554 SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5555 SelectionDAG &DAG) const {
5556 EVT OpVT = Op.getValueType();
5557 EVT ArgVT = Op.getOperand(0).getValueType();
5558
// NOTE(review): the guard for the next return (rendered line 5559) is
// elided in this view -- presumably a "use SVE for this fixed-length
// vector type" check; confirm against the original source.
5560 return LowerFixedLengthBitcastToSVE(Op, DAG);
5561
5562 if (OpVT.isScalableVector()) {
5563 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5564
5565 // Handle type legalisation first.
5566 if (!isTypeLegal(ArgVT)) {
5567 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5568 "Expected int->fp bitcast!");
5569
5570 // Bitcasting between unpacked vector types of different element counts is
5571 // not a NOP because the live elements are laid out differently.
5572 // 01234567
5573 // e.g. nxv2i32 = XX??XX??
5574 // nxv4f16 = X?X?X?X?
5575 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5576 return SDValue();
5577
// Widen the illegal integer operand to its SVE container type, then do a
// safe SVE bitcast to the floating-point result type.
5578 SDValue ExtResult =
5579 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
5580 Op.getOperand(0));
5581 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5582 }
5583
5584 // Bitcasts between legal types with the same element count are legal.
5585 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5586 return Op;
5587
5588 // getSVESafeBitCast does not support casting between unpacked types.
5589 if (!isPackedVectorType(OpVT, DAG))
5590 return SDValue();
5591
5592 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5593 }
5594
// Scalar path below: only f16/bf16 results are custom-handled.
5595 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5596 return SDValue();
5597
5598 // Bitcasts between f16 and bf16 are legal.
5599 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5600 return Op;
5601
5602 assert(ArgVT == MVT::i16);
5603 SDLoc DL(Op);
5604
// i16 -> f16/bf16: extend to i32, bitcast to f32, and extract the
// 16-bit 'hsub' subregister of the FP register.
5605 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5606 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5607 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5608 }
5609
5610 // Returns lane if Op extracts from a two-element vector and lane is constant
5611 // (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5612 static std::optional<uint64_t>
// NOTE(review): the declarator line (rendered line 5613) with the function
// name and its SDValue parameter is elided in this view.
5614 SDNode *OpNode = Op.getNode();
5615 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5616 return std::nullopt;
5617
5618 EVT VT = OpNode->getOperand(0).getValueType();
// NOTE(review): the line defining `C` (rendered line 5619) is elided --
// presumably a dyn_cast<ConstantSDNode> of the lane-index operand; confirm.
5620 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5621 return std::nullopt;
5622
5623 return C->getZExtValue();
5624 }
5625
// Returns true if N is a BUILD_VECTOR whose elements all fit in half the
// vector's scalar width (signed or unsigned, per `isSigned`), i.e. the
// vector can be treated as an extended narrower vector for MULL formation.
// NOTE(review): the first line of the signature (rendered line 5626) is
// elided in this view -- presumably `(SDValue N, SelectionDAG &DAG,`.
5627 bool isSigned) {
5628 EVT VT = N.getValueType();
5629
5630 if (N.getOpcode() != ISD::BUILD_VECTOR)
5631 return false;
5632
5633 for (const SDValue &Elt : N->op_values()) {
// NOTE(review): the line defining `C` (rendered line 5634) is elided --
// presumably `if (auto *C = dyn_cast<ConstantSDNode>(Elt)) {`; confirm.
5635 unsigned EltSize = VT.getScalarSizeInBits();
5636 unsigned HalfSize = EltSize / 2;
5637 if (isSigned) {
5638 if (!isIntN(HalfSize, C->getSExtValue()))
5639 return false;
5640 } else {
5641 if (!isUIntN(HalfSize, C->getZExtValue()))
5642 return false;
5643 }
5644 continue;
5645 }
// Non-constant element: cannot prove it fits in the half width.
5646 return false;
5647 }
5648
5649 return true;
5650 }
5651
// Truncates a 128-bit vector to its half-width counterpart so that the
// operand feeds a S/UMULL directly instead of going through an extend.
// NOTE(review): the signature line (rendered line 5652) is elided in this
// view -- presumably `static SDValue skipExtensionForVectorMULL(SDValue N,
// SelectionDAG &DAG) {`.
5653 EVT VT = N.getValueType();
5654 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5655 EVT HalfVT = EVT::getVectorVT(
5656 *DAG.getContext(),
// NOTE(review): the arguments on rendered lines 5657-5658 are elided --
// presumably the half-width integer element type and the same element
// count; confirm against the original source.
5659 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5660 }
5661
// True if N can be treated as sign-extended: an explicit SIGN_EXTEND /
// ANY_EXTEND node, or a BUILD_VECTOR of constants that fit signed in half
// the element width.
// NOTE(review): the signature line (rendered line 5662) is elided --
// presumably `static bool isSignExtended(SDValue N, SelectionDAG &DAG) {`.
5663 return N.getOpcode() == ISD::SIGN_EXTEND ||
5664 N.getOpcode() == ISD::ANY_EXTEND ||
5665 isExtendedBUILD_VECTOR(N, DAG, true);
5666 }
5667
// True if N can be treated as zero-extended: an explicit ZERO_EXTEND /
// ANY_EXTEND node, or a BUILD_VECTOR of constants that fit unsigned in
// half the element width.
// NOTE(review): the signature line (rendered line 5668) is elided --
// presumably `static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {`.
5669 return N.getOpcode() == ISD::ZERO_EXTEND ||
5670 N.getOpcode() == ISD::ANY_EXTEND ||
5671 isExtendedBUILD_VECTOR(N, DAG, false);
5672 }
5673
// True if N is an ADD/SUB whose two single-use operands are both
// sign-extended -- a candidate for distributing an SMULL over the add/sub.
// NOTE(review): the signature line (rendered line 5674) is elided --
// presumably `static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {`.
5675 unsigned Opcode = N.getOpcode();
5676 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5677 SDValue N0 = N.getOperand(0);
5678 SDValue N1 = N.getOperand(1);
5679 return N0->hasOneUse() && N1->hasOneUse() &&
5680 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5681 }
5682 return false;
5683 }
5684
// True if N is an ADD/SUB whose two single-use operands are both
// zero-extended -- a candidate for distributing a UMULL over the add/sub.
// NOTE(review): the signature line (rendered line 5685) is elided --
// presumably `static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {`.
5686 unsigned Opcode = N.getOpcode();
5687 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5688 SDValue N0 = N.getOperand(0);
5689 SDValue N1 = N.getOperand(1);
5690 return N0->hasOneUse() && N1->hasOneUse() &&
5691 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5692 }
5693 return false;
5694 }
5695
5696SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5697 SelectionDAG &DAG) const {
5698 // The rounding mode is in bits 23:22 of the FPSCR.
5699 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
5700 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
5701 // so that the shift + and get folded into a bitfield extract.
5702 SDLoc DL(Op);
5703
5704 SDValue Chain = Op.getOperand(0);
5705 SDValue FPCR_64 =
5706 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other},
5707 {Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL,
5708 MVT::i64)});
5709 Chain = FPCR_64.getValue(1);
5710 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64);
5711 SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32,
5712 DAG.getConstant(1U << 22, DL, MVT::i32));
5713 SDValue RMODE = DAG.getNode(ISD::SRL, DL, MVT::i32, FltRounds,
5714 DAG.getConstant(22, DL, MVT::i32));
5715 SDValue AND = DAG.getNode(ISD::AND, DL, MVT::i32, RMODE,
5716 DAG.getConstant(3, DL, MVT::i32));
5717 return DAG.getMergeValues({AND, Chain}, DL);
5718}
5719
5720SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5721 SelectionDAG &DAG) const {
5722 SDLoc DL(Op);
5723 SDValue Chain = Op->getOperand(0);
5724 SDValue RMValue = Op->getOperand(1);
5725
5726 // The rounding mode is in bits 23:22 of the FPCR.
5727 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5728 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5729 // ((arg - 1) & 3) << 22).
5730 //
5731 // The argument of llvm.set.rounding must be within the segment [0, 3], so
5732 // NearestTiesToAway (4) is not handled here. It is responsibility of the code
5733 // generated llvm.set.rounding to ensure this condition.
5734
5735 // Calculate new value of FPCR[23:22].
5736 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5737 DAG.getConstant(1, DL, MVT::i32));
5738 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5739 DAG.getConstant(0x3, DL, MVT::i32));
5740 RMValue =
5741 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5742 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5743 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5744
5745 // Get current value of FPCR.
5746 SDValue Ops[] = {
5747 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5748 SDValue FPCR =
5749 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5750 Chain = FPCR.getValue(1);
5751 FPCR = FPCR.getValue(0);
5752
5753 // Put new rounding mode into FPSCR[23:22].
5754 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5755 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5756 DAG.getConstant(RMMask, DL, MVT::i64));
5757 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5758 SDValue Ops2[] = {
5759 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5760 FPCR};
5761 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5762}
5763
5764SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5765 SelectionDAG &DAG) const {
5766 SDLoc DL(Op);
5767 SDValue Chain = Op->getOperand(0);
5768
5769 // Get current value of FPCR.
5770 SDValue Ops[] = {
5771 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5772 SDValue FPCR =
5773 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5774 Chain = FPCR.getValue(1);
5775 FPCR = FPCR.getValue(0);
5776
5777 // Truncate FPCR to 32 bits.
5778 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5779
5780 return DAG.getMergeValues({Result, Chain}, DL);
5781}
5782
5783SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5784 SelectionDAG &DAG) const {
5785 SDLoc DL(Op);
5786 SDValue Chain = Op->getOperand(0);
5787 SDValue Mode = Op->getOperand(1);
5788
5789 // Extend the specified value to 64 bits.
5790 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5791
5792 // Set new value of FPCR.
5793 SDValue Ops2[] = {
5794 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5795 FPCR};
5796 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5797}
5798
// Resets the dynamic FP environment: reads FPCR, clears the non-reserved
// control bits, and writes the masked value back.
5799 SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5800 SelectionDAG &DAG) const {
5801 SDLoc DL(Op);
5802 SDValue Chain = Op->getOperand(0);
5803
5804 // Get current value of FPCR.
5805 SDValue Ops[] = {
5806 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5807 SDValue FPCR =
5808 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5809 Chain = FPCR.getValue(1);
5810 FPCR = FPCR.getValue(0);
5811
5812 // Clear bits that are not reserved.
5813 SDValue FPSCRMasked = DAG.getNode(
5814 ISD::AND, DL, MVT::i64, FPCR,
// NOTE(review): the mask operand (rendered line 5815) is elided in this
// view -- presumably a constant of the FPCR reserved-bits mask; confirm
// against the original source.
5816
5817 // Set new value of FPCR.
5818 SDValue Ops2[] = {
5819 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5820 FPSCRMasked};
5821 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5822 }
5823
5824static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5825 SDLoc DL, bool &IsMLA) {
5826 bool IsN0SExt = isSignExtended(N0, DAG);
5827 bool IsN1SExt = isSignExtended(N1, DAG);
5828 if (IsN0SExt && IsN1SExt)
5829 return AArch64ISD::SMULL;
5830
5831 bool IsN0ZExt = isZeroExtended(N0, DAG);
5832 bool IsN1ZExt = isZeroExtended(N1, DAG);
5833
5834 if (IsN0ZExt && IsN1ZExt)
5835 return AArch64ISD::UMULL;
5836
5837 // Select UMULL if we can replace the other operand with an extend.
5838 EVT VT = N0.getValueType();
5839 unsigned EltSize = VT.getScalarSizeInBits();
5840 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5841 if (IsN0ZExt || IsN1ZExt) {
5842 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5843 return AArch64ISD::UMULL;
5844 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5845 DAG.MaskedValueIsZero(N1, Mask)) {
5846 // For v2i64 we look more aggressively at both operands being zero, to avoid
5847 // scalarization.
5848 return AArch64ISD::UMULL;
5849 }
5850
5851 if (IsN0SExt || IsN1SExt) {
5852 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5853 return AArch64ISD::SMULL;
5854 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5855 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5856 return AArch64ISD::SMULL;
5857 }
5858
5859 if (!IsN1SExt && !IsN1ZExt)
5860 return 0;
5861
5862 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5863 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5864 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5865 IsMLA = true;
5866 return AArch64ISD::SMULL;
5867 }
5868 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5869 IsMLA = true;
5870 return AArch64ISD::UMULL;
5871 }
5872 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5873 std::swap(N0, N1);
5874 IsMLA = true;
5875 return AArch64ISD::UMULL;
5876 }
5877 return 0;
5878}
5879
// Custom lowering for ISD::MUL: routes scalable/SVE-eligible vectors to a
// predicated MUL, and tries to form S/UMULL (optionally the distributed
// MLA shape) for 64/128-bit NEON integer vectors.
5880 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5881 EVT VT = Op.getValueType();
5882
5883 bool OverrideNEON = !Subtarget->isNeonAvailable();
5884 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5885 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5886
5887 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5888 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5889 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5890 "unexpected type for custom-lowering ISD::MUL");
5891 SDValue N0 = Op.getOperand(0);
5892 SDValue N1 = Op.getOperand(1);
5893 bool isMLA = false;
5894 EVT OVT = VT;
5895 if (VT.is64BitVector()) {
// 64-bit multiply of the low halves of wider vectors: peel off the
// EXTRACT_SUBVECTORs and work on the wider operands.
5896 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5897 isNullConstant(N0.getOperand(1)) &&
// NOTE(review): rendered lines 5898-5899 and 5901 are elided in this view
// -- presumably the matching checks that N0's and N1's sources are
// 128-bit vectors and that N1 is also an EXTRACT_SUBVECTOR; confirm.
5900 isNullConstant(N1.getOperand(1)) &&
5902 N0 = N0.getOperand(0);
5903 N1 = N1.getOperand(0);
5904 VT = N0.getValueType();
5905 } else {
5906 if (VT == MVT::v1i64) {
5907 if (Subtarget->hasSVE())
5908 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5909 // Fall through to expand this. It is not legal.
5910 return SDValue();
5911 } else
5912 // Other vector multiplications are legal.
5913 return Op;
5914 }
5915 }
5916
5917 SDLoc DL(Op);
5918 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5919
5920 if (!NewOpc) {
5921 if (VT.getVectorElementType() == MVT::i64) {
5922 // If SVE is available then i64 vector multiplications can also be made
5923 // legal.
5924 if (Subtarget->hasSVE())
5925 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5926 // Fall through to expand this. It is not legal.
5927 return SDValue();
5928 } else
5929 // Other vector multiplications are legal.
5930 return Op;
5931 }
5932
5933 // Legalize to a S/UMULL instruction
5934 SDValue Op0;
5935 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5936 if (!isMLA) {
5937 Op0 = skipExtensionForVectorMULL(N0, DAG);
// NOTE(review): the start of this assert (rendered line 5938) is elided
// -- presumably `assert(Op0.getValueType().is64BitVector() &&`; confirm.
5939 Op1.getValueType().is64BitVector() &&
5940 "unexpected types for extended operands to VMULL");
5941 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5942 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5943 DAG.getConstant(0, DL, MVT::i64));
5944 }
5945 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5946 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5947 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
// NOTE(review): rendered lines 5948-5949 and 5952 are elided -- presumably
// the definitions of N00/N01 (operands of the add/sub in N0) and the
// `ISD::EXTRACT_SUBVECTOR, DL, OVT,` arguments of the outer node; confirm.
5950 EVT Op1VT = Op1.getValueType();
5951 return DAG.getNode(
5953 DAG.getNode(N0.getOpcode(), DL, VT,
5954 DAG.getNode(NewOpc, DL, VT,
5955 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5956 DAG.getNode(NewOpc, DL, VT,
5957 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5958 DAG.getConstant(0, DL, MVT::i64));
5959 }
5960
5961static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5962 int Pattern) {
5963 if (Pattern == AArch64SVEPredPattern::all)
5964 return DAG.getConstant(1, DL, VT);
5965
5966 // When the number of active elements of a pattern matches the scalable vector
5967 // length, we can upgrade the pattern to ALL and emit a splat instead.
5968 if (unsigned PatNumElts = getNumElementsFromSVEPredPattern(Pattern)) {
5969 const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
5970 unsigned NumElts = VT.getVectorMinNumElements();
5971 unsigned VScale = Subtarget.getSVEVectorSizeInBits() / 128;
5972 if (PatNumElts == (NumElts * VScale))
5973 return DAG.getConstant(1, DL, VT);
5974 }
5975
5976 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5977 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5978}
5979
// Folds a "while" comparison with constant bounds into a PTRUE with a
// fixed predicate pattern when the number of active elements is known and
// fits within the minimum SVE vector length.
// NOTE(review): the first signature line (rendered line 5980) is elided in
// this view -- presumably `static SDValue optimizeIncrementingWhile(
// SDNode *N, SelectionDAG &DAG,`; confirm against the original source.
5981 bool IsSigned, bool IsEqual) {
// Intrinsic nodes carry the intrinsic ID in operand 0, so the two compared
// values shift by one position.
5982 unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
5983 unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
5984
5985 if (!N->getValueType(0).isScalableVector() ||
5986 !isa<ConstantSDNode>(N->getOperand(Op1)))
5987 return SDValue();
5988
5989 SDLoc DL(N);
5990 APInt Y = N->getConstantOperandAPInt(Op1);
5991
5992 // When the second operand is the maximum value, comparisons that include
5993 // equality can never fail and thus we can return an all active predicate.
5994 if (IsEqual)
5995 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5996 return DAG.getConstant(1, DL, N->getValueType(0));
5997
5998 if (!isa<ConstantSDNode>(N->getOperand(Op0)))
5999 return SDValue();
6000
6001 APInt X = N->getConstantOperandAPInt(Op0);
6002
// Active element count is Y - X (with overflow checked); inclusive
// comparisons add one more.
6003 bool Overflow;
6004 APInt NumActiveElems =
6005 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
6006
6007 if (Overflow)
6008 return SDValue();
6009
6010 if (IsEqual) {
6011 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
6012 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
6013 : NumActiveElems.uadd_ov(One, Overflow);
6014 if (Overflow)
6015 return SDValue();
6016 }
6017
6018 std::optional<unsigned> PredPattern =
// NOTE(review): rendered lines 6019 and 6021 are elided -- presumably the
// call mapping the element count to an SVE predicate pattern and the
// subtarget's minimum SVE vector-size query; confirm.
6020 unsigned MinSVEVectorSize = std::max(
6022 unsigned ElementSize = 128 / N->getValueType(0).getVectorMinNumElements();
6023 if (PredPattern != std::nullopt &&
6024 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
6025 return getPTrue(DAG, DL, N->getValueType(0), *PredPattern);
6026
6027 return SDValue();
6028 }
6029
6030 // Returns a safe bitcast between two scalable vector predicates, where
6031 // any newly created lanes from a widening bitcast are defined as zero.
// NOTE(review): the signature line (rendered line 6032) is elided in this
// view -- presumably `static SDValue getSVEPredicateBitCast(EVT VT,
// SDValue Op, SelectionDAG &DAG) {`; confirm against the original source.
6033 SDLoc DL(Op);
6034 EVT InVT = Op.getValueType();
6035
6036 assert(InVT.getVectorElementType() == MVT::i1 &&
6037 VT.getVectorElementType() == MVT::i1 &&
6038 "Expected a predicate-to-predicate bitcast");
// NOTE(review): the first operand of this assert (rendered line 6039) is
// elided -- presumably `assert(VT.isScalableVector() &&` plus legality of
// VT; confirm.
6040 InVT.isScalableVector() &&
6041 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
6042 "Only expect to cast between legal scalable predicate types!");
6043
6044 // Return the operand if the cast isn't changing type,
6045 if (InVT == VT)
6046 return Op;
6047
6048 // Look through casts to <vscale x 16 x i1> when their input has more lanes
6049 // than VT. This will increase the chances of removing casts that introduce
6050 // new lanes, which have to be explicitly zero'd.
6051 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
6052 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
6053 Op.getOperand(1).getValueType().bitsGT(VT))
6054 Op = Op.getOperand(1);
6055
6056 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
6057
6058 // We only have to zero the lanes if new lanes are being defined, e.g. when
6059 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
6060 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
6061 // we can return here.
6062 if (InVT.bitsGT(VT))
6063 return Reinterpret;
6064
6065 // Check if the other lanes are already known to be zeroed by
6066 // construction.
// NOTE(review): the condition line (rendered line 6067) is elided --
// presumably a check that Op's inactive lanes are known zero; confirm.
6068 return Reinterpret;
6069
6070 // Zero the newly introduced lanes.
6071 SDValue Mask = DAG.getConstant(1, DL, InVT);
6072 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
6073 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
6074 }
6075
// Emits a call to the SME ABI __arm_sme_state libcall and masks the first
// returned i64 down to the PSTATE.SM bit (bit 0).
6076 SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
6077 SDValue Chain, SDLoc DL,
6078 EVT VT) const {
6079 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
6080 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
6081 SDValue Callee =
6082 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
// The libcall returns a pair of i64 values ({i64, i64} struct).
6083 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
6084 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
6085 TargetLowering::CallLoweringInfo CLI(DAG);
// NOTE(review): rendered line 6086 is elided in this view -- presumably
// the declaration of the (empty) `Args` argument list; confirm against
// the original source.
6087 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
6088 DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy, Callee,
6089 std::move(Args));
6090 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
6091 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
6092 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
6093 Mask);
6094 }
6095
6096 // Lower an SME LDR/STR ZA intrinsic
6097 // Case 1: If the vector number (vecnum) is an immediate in range, it gets
6098 // folded into the instruction
6099 // ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
6100 // Case 2: If the vecnum is not an immediate, then it is used to modify the base
6101 // and tile slice registers
6102 // ldr(%tileslice, %ptr, %vecnum)
6103 // ->
6104 // %svl = rdsvl
6105 // %ptr2 = %ptr + %svl * %vecnum
6106 // %tileslice2 = %tileslice + %vecnum
6107 // ldr [%tileslice2, 0], [%ptr2, 0]
6108 // Case 3: If the vecnum is an immediate out of range, then the same is done as
6109 // case 2, but the base and slice registers are modified by the greatest
6110 // multiple of 15 lower than the vecnum and the remainder is folded into the
6111 // instruction. This means that successive loads and stores that are offset from
6112 // each other can share the same base and slice register updates.
6113 // ldr(%tileslice, %ptr, 22)
6114 // ldr(%tileslice, %ptr, 23)
6115 // ->
6116 // %svl = rdsvl
6117 // %ptr2 = %ptr + %svl * 15
6118 // %tileslice2 = %tileslice + 15
6119 // ldr [%tileslice2, 7], [%ptr2, 7]
6120 // ldr [%tileslice2, 8], [%ptr2, 8]
6121 // Case 4: If the vecnum is an add of an immediate, then the non-immediate
6122 // operand and the immediate can be folded into the instruction, like case 2.
6123 // ldr(%tileslice, %ptr, %vecnum + 7)
6124 // ldr(%tileslice, %ptr, %vecnum + 8)
6125 // ->
6126 // %svl = rdsvl
6127 // %ptr2 = %ptr + %svl * %vecnum
6128 // %tileslice2 = %tileslice + %vecnum
6129 // ldr [%tileslice2, 7], [%ptr2, 7]
6130 // ldr [%tileslice2, 8], [%ptr2, 8]
6131 // Case 5: The vecnum being an add of an immediate out of range is also handled,
6132 // in which case the same remainder logic as case 3 is used.
// NOTE(review): the signature line (rendered line 6133) is elided in this
// view -- presumably `SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG,
// bool IsLoad) {` (N is used as an SDValue at the final getNode below, and
// the caller passes Op plus an IsLoad flag); confirm.
6134 SDLoc DL(N);
6135
6136 SDValue TileSlice = N->getOperand(2);
6137 SDValue Base = N->getOperand(3);
6138 SDValue VecNum = N->getOperand(4);
6139 int32_t ConstAddend = 0;
6140 SDValue VarAddend = VecNum;
6141
6142 // If the vnum is an add of an immediate, we can fold it into the instruction
6143 if (VecNum.getOpcode() == ISD::ADD &&
6144 isa<ConstantSDNode>(VecNum.getOperand(1))) {
6145 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
6146 VarAddend = VecNum.getOperand(0);
6147 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
6148 ConstAddend = ImmNode->getSExtValue();
6149 VarAddend = SDValue();
6150 }
6151
// Split the constant addend into an in-range immediate (mod 16) plus a
// multiple-of-16 remainder that is folded into the variable addend.
6152 int32_t ImmAddend = ConstAddend % 16;
6153 if (int32_t C = (ConstAddend - ImmAddend)) {
6154 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
6155 VarAddend = VarAddend
6156 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
6157 : CVal;
6158 }
6159
6160 if (VarAddend) {
6161 // Get the vector length that will be multiplied by vnum
6162 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6163 DAG.getConstant(1, DL, MVT::i32));
6164
6165 // Multiply SVL and vnum then add it to the base
6166 SDValue Mul = DAG.getNode(
6167 ISD::MUL, DL, MVT::i64,
6168 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
6169 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
6170 // Just add vnum to the tileslice
6171 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
6172 }
6173
6174 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
6175 DL, MVT::Other,
6176 {/*Chain=*/N.getOperand(0), TileSlice, Base,
6177 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
6178 }
6179
// Lowers a vector-match operation to the SVE MATCH intrinsic, wrapping
// fixed-length operands into scalable containers and broadcasting the
// needle (Op2) across the register where needed.
// NOTE(review): the signature line (rendered line 6180) is elided in this
// view -- presumably `SDValue LowerVectorMatch(SDValue Op, SelectionDAG
// &DAG) {`; confirm against the original source.
6181 SDLoc DL(Op);
6182 SDValue ID =
6183 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, DL, MVT::i64);
6184
6185 auto Op1 = Op.getOperand(1);
6186 auto Op2 = Op.getOperand(2);
6187 auto Mask = Op.getOperand(3);
6188
6189 EVT Op1VT = Op1.getValueType();
6190 EVT Op2VT = Op2.getValueType();
6191 EVT ResVT = Op.getValueType();
6192
6193 assert((Op1VT.getVectorElementType() == MVT::i8 ||
6194 Op1VT.getVectorElementType() == MVT::i16) &&
6195 "Expected 8-bit or 16-bit characters.");
6196
6197 // Scalable vector type used to wrap operands.
6198 // A single container is enough for both operands because ultimately the
6199 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
6200 EVT OpContainerVT = Op1VT.isScalableVector()
6201 ? Op1VT
// NOTE(review): the else-branch of this ternary (rendered line 6202) is
// elided -- presumably the scalable container type for the fixed-length
// Op1VT; confirm.
6203
6204 if (Op2VT.is128BitVector()) {
6205 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
6206 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
6207 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
6208 if (ResVT.isScalableVector())
6209 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, DL, OpContainerVT, Op2,
6210 DAG.getTargetConstant(0, DL, MVT::i64));
6211 } else {
6212 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
6213 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
6214 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
6215 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
// Reinterpret Op2 as a single wide integer, splat it across a scalable
// vector, and view the result as the common container type.
6216 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
6217 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op2IntVT, Op2,
6218 DAG.getConstant(0, DL, MVT::i64));
6219 Op2 = DAG.getSplatVector(Op2PromotedVT, DL, Op2);
6220 Op2 = DAG.getBitcast(OpContainerVT, Op2);
6221 }
6222
6223 // If the result is scalable, we just need to carry out the MATCH.
6224 if (ResVT.isScalableVector())
6225 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResVT, ID, Mask, Op1, Op2);
6226
6227 // If the result is fixed, we can still use MATCH but we need to wrap the
6228 // first operand and the mask in scalable vectors before doing so.
6229
6230 // Wrap the operands.
6231 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
6232 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, Op1VT, Mask);
6233 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6234
6235 // Carry out the match.
6236 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Mask.getValueType(),
6237 ID, Mask, Op1, Op2);
6238
6239 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
6240 // (v16i8/v8i8).
6241 Match = DAG.getNode(ISD::SIGN_EXTEND, DL, OpContainerVT, Match);
6242 Match = convertFromScalableVector(DAG, Op1VT, Match);
6243 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Match);
6244 }
6245
6246SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6247 SelectionDAG &DAG) const {
6248 unsigned IntNo = Op.getConstantOperandVal(1);
6249 SDLoc DL(Op);
6250 switch (IntNo) {
6251 default:
6252 return SDValue(); // Don't custom lower most intrinsics.
6253 case Intrinsic::aarch64_prefetch: {
6254 SDValue Chain = Op.getOperand(0);
6255 SDValue Addr = Op.getOperand(2);
6256
6257 unsigned IsWrite = Op.getConstantOperandVal(3);
6258 unsigned Locality = Op.getConstantOperandVal(4);
6259 unsigned IsStream = Op.getConstantOperandVal(5);
6260 unsigned IsData = Op.getConstantOperandVal(6);
6261 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
6262 (!IsData << 3) | // IsDataCache bit
6263 (Locality << 1) | // Cache level bits
6264 (unsigned)IsStream; // Stream bit
6265
6266 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
6267 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
6268 }
6269 case Intrinsic::aarch64_range_prefetch: {
6270 SDValue Chain = Op.getOperand(0);
6271 SDValue Addr = Op.getOperand(2);
6272
6273 unsigned IsWrite = Op.getConstantOperandVal(3);
6274 unsigned IsStream = Op.getConstantOperandVal(4);
6275 unsigned PrfOp = (IsStream << 2) | IsWrite;
6276
6277 SDValue Metadata = Op.getOperand(5);
6278 return DAG.getNode(AArch64ISD::RANGE_PREFETCH, DL, MVT::Other, Chain,
6279 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr,
6280 Metadata);
6281 }
6282 case Intrinsic::aarch64_sme_str:
6283 case Intrinsic::aarch64_sme_ldr: {
6284 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
6285 }
6286 case Intrinsic::aarch64_sme_za_enable:
6287 return DAG.getNode(
6288 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6289 Op->getOperand(0), // Chain
6290 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6291 case Intrinsic::aarch64_sme_za_disable:
6292 return DAG.getNode(
6293 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6294 Op->getOperand(0), // Chain
6295 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6296 }
6297}
6298
6299SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
6300 SelectionDAG &DAG) const {
6301 unsigned IntNo = Op.getConstantOperandVal(1);
6302 SDLoc DL(Op);
6303 switch (IntNo) {
6304 default:
6305 return SDValue(); // Don't custom lower most intrinsics.
6306 case Intrinsic::aarch64_mops_memset_tag: {
6307 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
6308 SDValue Chain = Node->getChain();
6309 SDValue Dst = Op.getOperand(2);
6310 SDValue Val = Op.getOperand(3);
6311 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
6312 SDValue Size = Op.getOperand(4);
6313 auto Alignment = Node->getMemOperand()->getAlign();
6314 bool IsVol = Node->isVolatile();
6315 auto DstPtrInfo = Node->getPointerInfo();
6316
6317 const auto &SDI =
6318 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
6319 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
6320 Chain, Dst, Val, Size, Alignment, IsVol,
6321 DstPtrInfo, MachinePointerInfo{});
6322
6323 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6324 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6325 // LowerOperationWrapper will complain that the number of results has
6326 // changed.
6327 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6328 }
6329 }
6330}
6331
// Lower ISD::INTRINSIC_WO_CHAIN for AArch64-specific intrinsics. Each handled
// case rewrites the target intrinsic into a generic ISD node or an AArch64ISD
// node so that later DAG combines and isel patterns can reason about it.
// Returning SDValue() leaves the intrinsic untouched (most are matched
// directly by tablegen patterns instead).
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                       SelectionDAG &DAG) const {
  unsigned IntNo = Op.getConstantOperandVal(0);
  SDLoc DL(Op);
  switch (IntNo) {
  default: return SDValue(); // Don't custom lower most intrinsics.
  case Intrinsic::thread_pointer: {
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
  }
  // SVE WHILEWR/WHILERW produce loop-dependence masks. The first constant
  // operand is the element size in bytes (b/h/s/d -> 1/2/4/8); the meaning of
  // the trailing zero constant operand is defined by the
  // LOOP_DEPENDENCE_*_MASK node - see ISDOpcodes.h (not visible here).
  case Intrinsic::aarch64_sve_whilewr_b:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(1, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_sve_whilewr_h:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(2, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_sve_whilewr_s:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(4, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_sve_whilewr_d:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(8, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_sve_whilerw_b:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(1, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_sve_whilerw_h:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(2, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_sve_whilerw_s:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(4, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_sve_whilerw_d:
    return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(8, DL, MVT::i64),
                       DAG.getConstant(0, DL, MVT::i64));
  case Intrinsic::aarch64_neon_abs: {
    EVT Ty = Op.getValueType();
    if (Ty == MVT::i64) {
      // Scalar i64 abs executes on the SIMD side: round-trip through v1i64.
      SDValue Result =
          DAG.getNode(ISD::BITCAST, DL, MVT::v1i64, Op.getOperand(1));
      Result = DAG.getNode(ISD::ABS, DL, MVT::v1i64, Result);
      return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Result);
    } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
      return DAG.getNode(ISD::ABS, DL, Ty, Op.getOperand(1));
    } else {
      report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
    }
  }
  case Intrinsic::aarch64_neon_pmull64: {
    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);

    // NOTE(review): the initializer expressions for LHSLane/RHSLane are
    // truncated in this rendering of the file (original lines 6400/6402 were
    // dropped); the surrounding code is left unchanged. Presumably each
    // queries whether the operand is an extract of lane 0/1 of a v2i64 -
    // verify against the full source.
    std::optional<uint64_t> LHSLane =
    std::optional<uint64_t> RHSLane =

    assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
    assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");

    // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
    // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
    // which ISel recognizes better. For example, generate a ldr into d*
    // registers as opposed to a GPR load followed by a fmov.
    auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
                                  std::optional<uint64_t> OtherLane,
                                  const SDLoc &DL,
                                  SelectionDAG &DAG) -> SDValue {
      // If the operand is an higher half itself, rewrite it to
      // extract_high_v2i64; this way aarch64_neon_pmull64 could
      // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
      if (NLane == 1)
        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
                           N.getOperand(0), DAG.getConstant(1, DL, MVT::i64));

      // Operand N is not a higher half but the other operand is.
      if (OtherLane == 1) {
        // If this operand is a lower half, rewrite it to
        // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
        // align lanes of two operands. A roundtrip sequence (to move from lane
        // 1 to lane 0) is like this:
        //   mov x8, v0.d[1]
        //   fmov d0, x8
        if (NLane == 0)
          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
                             DAG.getNode(AArch64ISD::DUPLANE64, DL, MVT::v2i64,
                                         N.getOperand(0),
                                         DAG.getConstant(0, DL, MVT::i64)),
                             DAG.getConstant(1, DL, MVT::i64));

        // Otherwise just dup from main to all lanes.
        return DAG.getNode(AArch64ISD::DUP, DL, MVT::v1i64, N);
      }

      // Neither operand is an extract of higher half, so codegen may just use
      // the non-high version of PMULL instruction. Use v1i64 to represent i64.
      assert(N.getValueType() == MVT::i64 &&
             "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
      return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, N);
    };

    LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
    RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);

    return DAG.getNode(AArch64ISD::PMULL, DL, Op.getValueType(), LHS, RHS);
  }
  // NEON min/max map directly onto the generic ISD nodes.
  case Intrinsic::aarch64_neon_smax:
    return DAG.getNode(ISD::SMAX, DL, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::aarch64_neon_umax:
    return DAG.getNode(ISD::UMAX, DL, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::aarch64_neon_smin:
    return DAG.getNode(ISD::SMIN, DL, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::aarch64_neon_umin:
    return DAG.getNode(ISD::UMIN, DL, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::aarch64_neon_scalar_sqxtn:
  case Intrinsic::aarch64_neon_scalar_sqxtun:
  case Intrinsic::aarch64_neon_scalar_uqxtn: {
    // Canonicalize the i32-result form to the f32 form (via bitcasts) so only
    // one variant needs isel patterns; the f32 form itself is left alone.
    assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
    if (Op.getValueType() == MVT::i32)
      return DAG.getNode(ISD::BITCAST, DL, MVT::i32,
                         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
                                     Op.getOperand(0),
                                     DAG.getNode(ISD::BITCAST, DL, MVT::f64,
                                                 Op.getOperand(1))));
    return SDValue();
  }
  // Saturating narrows map onto the generic saturating-truncate nodes.
  case Intrinsic::aarch64_neon_sqxtn:
    return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_neon_sqxtun:
    return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_neon_uqxtn:
    return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
                       Op.getOperand(1));
  // Saturating shift-right-narrow: the vector forms decompose into
  // shift-right followed by a saturating truncate; scalar forms go through
  // lowerIntNeonIntrinsic with the shift amount kept as an immediate.
  case Intrinsic::aarch64_neon_sqshrn:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
                         DAG.getNode(AArch64ISD::VASHR, DL,
                                     Op.getOperand(1).getValueType(),
                                     Op.getOperand(1), Op.getOperand(2)));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHRN, DAG,
                                 /*LastOperandIsImm=*/true);
  case Intrinsic::aarch64_neon_sqshrun:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
                         DAG.getNode(AArch64ISD::VASHR, DL,
                                     Op.getOperand(1).getValueType(),
                                     Op.getOperand(1), Op.getOperand(2)));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHRUN, DAG,
                                 /*LastOperandIsImm=*/true);
  case Intrinsic::aarch64_neon_uqshrn:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
                         DAG.getNode(AArch64ISD::VLSHR, DL,
                                     Op.getOperand(1).getValueType(),
                                     Op.getOperand(1), Op.getOperand(2)));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSHRN, DAG,
                                 /*LastOperandIsImm=*/true);
  case Intrinsic::aarch64_neon_sqrshrn:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
                         DAG.getNode(AArch64ISD::SRSHR_I, DL,
                                     Op.getOperand(1).getValueType(),
                                     Op.getOperand(1), Op.getOperand(2)));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHRN, DAG,
                                 /*LastOperandIsImm=*/true);
  case Intrinsic::aarch64_neon_sqrshrun:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
                         DAG.getNode(AArch64ISD::SRSHR_I, DL,
                                     Op.getOperand(1).getValueType(),
                                     Op.getOperand(1), Op.getOperand(2)));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHRUN, DAG,
                                 /*LastOperandIsImm=*/true);
  case Intrinsic::aarch64_neon_uqrshrn:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
                         DAG.getNode(AArch64ISD::URSHR_I, DL,
                                     Op.getOperand(1).getValueType(),
                                     Op.getOperand(1), Op.getOperand(2)));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::UQRSHRN, DAG,
                                 /*LastOperandIsImm=*/true);
  // Saturating multiplies/shifts lower 1:1 onto AArch64ISD nodes.
  case Intrinsic::aarch64_neon_sqdmulh:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQDMULH, DAG);
  case Intrinsic::aarch64_neon_sqrdmulh:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMULH, DAG);
  case Intrinsic::aarch64_neon_sqrdmlah:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMLAH, DAG);
  case Intrinsic::aarch64_neon_sqrdmlsh:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMLSH, DAG);
  case Intrinsic::aarch64_neon_sqrshl:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHL, DAG);
  case Intrinsic::aarch64_neon_sqshl:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHL, DAG);
  case Intrinsic::aarch64_neon_uqrshl:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::UQRSHL, DAG);
  case Intrinsic::aarch64_neon_uqshl:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSHL, DAG);
  // Vector saturating add/sub become the generic *SAT nodes; scalar forms use
  // the AArch64-specific nodes instead.
  case Intrinsic::aarch64_neon_sqadd:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQADD, DAG);

  case Intrinsic::aarch64_neon_sqsub:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSUB, DAG);

  case Intrinsic::aarch64_neon_uqadd:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::UQADD, DAG);
  case Intrinsic::aarch64_neon_uqsub:
    if (Op.getValueType().isVector())
      return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2));
    return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSUB, DAG);
  case Intrinsic::aarch64_neon_sqdmulls_scalar:
    return lowerIntNeonIntrinsic(Op, AArch64ISD::SQDMULL, DAG);
  // SVE WHILE* comparisons with incrementing counters.
  case Intrinsic::aarch64_sve_whilelt:
    return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
                                     /*IsEqual=*/false);
  case Intrinsic::aarch64_sve_whilels:
    return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/false,
                                     /*IsEqual=*/true);
  case Intrinsic::aarch64_sve_whilele:
    return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
                                     /*IsEqual=*/true);
  case Intrinsic::aarch64_sve_sunpkhi:
    return DAG.getNode(AArch64ISD::SUNPKHI, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_sunpklo:
    return DAG.getNode(AArch64ISD::SUNPKLO, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_uunpkhi:
    return DAG.getNode(AArch64ISD::UUNPKHI, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_uunpklo:
    return DAG.getNode(AArch64ISD::UUNPKLO, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_clasta_n:
    return DAG.getNode(AArch64ISD::CLASTA_N, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::aarch64_sve_clastb_n:
    return DAG.getNode(AArch64ISD::CLASTB_N, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::aarch64_sve_lasta:
    return DAG.getNode(AArch64ISD::LASTA, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_lastb:
    return DAG.getNode(AArch64ISD::LASTB, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_tbl:
    return DAG.getNode(AArch64ISD::TBL, DL, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::aarch64_sve_trn1:
    return DAG.getNode(AArch64ISD::TRN1, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_trn2:
    return DAG.getNode(AArch64ISD::TRN2, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_uzp1:
    return DAG.getNode(AArch64ISD::UZP1, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_uzp2:
    return DAG.getNode(AArch64ISD::UZP2, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_zip1:
    return DAG.getNode(AArch64ISD::ZIP1, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_zip2:
    return DAG.getNode(AArch64ISD::ZIP2, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_splice:
    return DAG.getNode(AArch64ISD::SPLICE, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::aarch64_sve_ptrue:
    return getPTrue(DAG, DL, Op.getValueType(), Op.getConstantOperandVal(1));
  // Merging SVE unary ops: intrinsic operands are (passthru, pred, data) but
  // the *_MERGE_PASSTHRU nodes take (pred, data, passthru) - hence (2, 3, 1).
  case Intrinsic::aarch64_sve_clz:
    return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sme_cntsd: {
    // cntsd = streaming vector length in bytes (RDSVL #1) divided by 8.
    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
                                DAG.getConstant(1, DL, MVT::i32));
    return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
                       DAG.getConstant(3, DL, MVT::i32), SDNodeFlags::Exact);
  }
  case Intrinsic::aarch64_sve_cnt: {
    SDValue Data = Op.getOperand(3);
    // CTPOP only supports integer operands.
    if (Data.getValueType().isFloatingPoint())
      Data = DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Data);
    return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Data, Op.getOperand(1));
  }
  case Intrinsic::aarch64_sve_dupq_lane:
    return LowerDUPQLane(Op, DAG);
  // svcount <-> svbool conversions are plain bitcasts; other predicate types
  // go through getSVEPredicateBitCast, which widens/narrows via nxv16i1.
  case Intrinsic::aarch64_sve_convert_from_svbool:
    if (Op.getValueType() == MVT::aarch64svcount)
      return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Op.getOperand(1));
    return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
  case Intrinsic::aarch64_sve_convert_to_svbool:
    if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
      return DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, Op.getOperand(1));
    return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
  case Intrinsic::aarch64_sve_fneg:
    return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintp:
    return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintm:
    return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frinti:
    return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintx:
    return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frint32x:
    return DAG.getNode(AArch64ISD::FRINT32_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frint64x:
    return DAG.getNode(AArch64ISD::FRINT64_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frinta:
    return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintn:
    return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintz:
    return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frint32z:
    return DAG.getNode(AArch64ISD::FTRUNC32_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frint64z:
    return DAG.getNode(AArch64ISD::FTRUNC64_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_ucvtf:
    return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_scvtf:
    return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_fcvtzu:
    return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_fcvtzs:
    return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_fsqrt:
    return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frecpx:
    return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frecpe_x:
    return DAG.getNode(AArch64ISD::FRECPE, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frecps_x:
    return DAG.getNode(AArch64ISD::FRECPS, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_frsqrte_x:
    return DAG.getNode(AArch64ISD::FRSQRTE, DL, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frsqrts_x:
    return DAG.getNode(AArch64ISD::FRSQRTS, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_fabs:
    return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_abs:
    return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_neg:
    return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_insr: {
    SDValue Scalar = Op.getOperand(2);
    EVT ScalarTy = Scalar.getValueType();
    // Sub-i32 scalars have no GPR form; widen so INSR can use a w-register.
    if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
      Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);

    return DAG.getNode(AArch64ISD::INSR, DL, Op.getValueType(),
                       Op.getOperand(1), Scalar);
  }
  case Intrinsic::aarch64_sve_rbit:
    return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_revb:
    return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_revh:
    return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_revw:
    return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_revd:
    return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  // In-register sign/zero extensions: the extra VTSN operand records the
  // source element width (i8/i16/i32) being extended from.
  case Intrinsic::aarch64_sve_sxtb:
    return DAG.getNode(
        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(
            *DAG.getContext(), MVT::i8)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_sxth:
    return DAG.getNode(
        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(
            *DAG.getContext(), MVT::i16)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_sxtw:
    return DAG.getNode(
        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(
            *DAG.getContext(), MVT::i32)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_uxtb:
    return DAG.getNode(
        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(
            *DAG.getContext(), MVT::i8)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_uxth:
    return DAG.getNode(
        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(
            *DAG.getContext(), MVT::i16)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_uxtw:
    return DAG.getNode(
        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(
            *DAG.getContext(), MVT::i32)),
        Op.getOperand(1));
  case Intrinsic::localaddress: {
    const auto &MF = DAG.getMachineFunction();
    const auto *RegInfo = Subtarget->getRegisterInfo();
    unsigned Reg = RegInfo->getLocalAddressRegister(MF);
    return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg,
                              Op.getSimpleValueType());
  }

  case Intrinsic::eh_recoverfp: {
    // FIXME: This needs to be implemented to correctly handle highly aligned
    // stack objects. For now we simply return the incoming FP. Refer D53541
    // for more details.
    SDValue FnOp = Op.getOperand(1);
    SDValue IncomingFPOp = Op.getOperand(2);
    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
    // NOTE(review): the line opening this diagnostic call (presumably
    // report_fatal_error, original line 6827) is truncated in this rendering
    // of the file; left unchanged here.
    if (!Fn)
          "llvm.eh.recoverfp must take a function as the first argument");
    return IncomingFPOp;
  }
  case Intrinsic::aarch64_neon_vsri:
  case Intrinsic::aarch64_neon_vsli:
  case Intrinsic::aarch64_sve_sri:
  case Intrinsic::aarch64_sve_sli: {
    EVT Ty = Op.getValueType();

    if (!Ty.isVector())
      report_fatal_error("Unexpected type for aarch64_neon_vsli");

    assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());

    bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
                        IntNo == Intrinsic::aarch64_sve_sri;
    unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
    return DAG.getNode(Opcode, DL, Ty, Op.getOperand(1), Op.getOperand(2),
                       Op.getOperand(3));
  }

  case Intrinsic::aarch64_neon_srhadd:
  case Intrinsic::aarch64_neon_urhadd:
  case Intrinsic::aarch64_neon_shadd:
  case Intrinsic::aarch64_neon_uhadd: {
    // Halving adds map onto the generic averaging nodes:
    // rounding -> AVGCEIL, truncating -> AVGFLOOR.
    bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
                        IntNo == Intrinsic::aarch64_neon_shadd);
    bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
                          IntNo == Intrinsic::aarch64_neon_urhadd);
    unsigned Opcode = IsSignedAdd
                          ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
                          : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
    return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  }
  case Intrinsic::aarch64_neon_saddlp:
  case Intrinsic::aarch64_neon_uaddlp: {
    unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
                          ? AArch64ISD::UADDLP
                          : AArch64ISD::SADDLP;
    return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1));
  }
  case Intrinsic::aarch64_neon_sdot:
  case Intrinsic::aarch64_neon_udot:
  case Intrinsic::aarch64_sve_sdot:
  case Intrinsic::aarch64_sve_udot: {
    unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
                       IntNo == Intrinsic::aarch64_sve_udot)
                          ? AArch64ISD::UDOT
                          : AArch64ISD::SDOT;
    return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  }
  case Intrinsic::aarch64_neon_usdot:
  case Intrinsic::aarch64_sve_usdot: {
    return DAG.getNode(AArch64ISD::USDOT, DL, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  }
  case Intrinsic::aarch64_neon_saddlv:
  case Intrinsic::aarch64_neon_uaddlv: {
    EVT OpVT = Op.getOperand(1).getValueType();
    EVT ResVT = Op.getValueType();
    assert(
        ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
                                OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
         (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
        "Unexpected aarch64_neon_u/saddlv type");
    (void)OpVT;
    // In order to avoid insert_subvector, use v4i32 rather than v2i32.
    SDValue ADDLV = DAG.getNode(
        IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
                                                : AArch64ISD::SADDLV,
        DL, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
    SDValue EXTRACT_VEC_ELT = DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, DL, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
        ADDLV, DAG.getConstant(0, DL, MVT::i64));
    return EXTRACT_VEC_ELT;
  }
  case Intrinsic::experimental_cttz_elts: {
    SDValue CttzOp = Op.getOperand(1);
    EVT VT = CttzOp.getValueType();
    assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");

    if (VT.isFixedLengthVector()) {
      // We can use SVE instructions to lower this intrinsic by first creating
      // an SVE predicate register mask from the fixed-width vector.
      EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
      SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, CttzOp);
      CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
    }

    SDValue NewCttzElts =
        DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, CttzOp);
    return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
  }
  case Intrinsic::experimental_vector_match: {
    return LowerVectorMatch(Op, DAG);
  }
  case Intrinsic::aarch64_cls:
  case Intrinsic::aarch64_cls64: {
    SDValue Res = DAG.getNode(ISD::CTLS, DL, Op.getOperand(1).getValueType(),
                              Op.getOperand(1));
    return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Res);
  }
  case Intrinsic::aarch64_neon_cls: {
    // Lower NEON CLS intrinsic to ISD::CTLS
    return DAG.getNode(ISD::CTLS, DL, Op.getValueType(), Op.getOperand(1));
  }
  case Intrinsic::aarch64_sve_pmul:
  case Intrinsic::aarch64_neon_pmul:
    return DAG.getNode(ISD::CLMUL, DL, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  }
}
6942
6943bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6944 if (VT.getVectorElementType() == MVT::i8 ||
6945 VT.getVectorElementType() == MVT::i16) {
6946 EltTy = MVT::i32;
6947 return true;
6948 }
6949 return false;
6950}
6951
6952bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6953 EVT DataVT) const {
6954 const EVT IndexVT = Extend.getOperand(0).getValueType();
6955 // SVE only supports implicit extension of 32-bit indices.
6956 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6957 return false;
6958
6959 // Indices cannot be smaller than the main data type.
6960 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6961 return false;
6962
6963 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6964 // element container type, which would violate the previous clause.
6965 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6966}
6967
6968/// Helper function to check if a small vector load can be optimized.
6970 const AArch64Subtarget &Subtarget) {
6971 if (!Subtarget.isNeonAvailable())
6972 return false;
6973 if (LD->isVolatile())
6974 return false;
6975
6976 EVT MemVT = LD->getMemoryVT();
6977 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && MemVT != MVT::v2i16)
6978 return false;
6979
6980 Align Alignment = LD->getAlign();
6981 Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue());
6982 if (Subtarget.requiresStrictAlign() && Alignment < RequiredAlignment)
6983 return false;
6984
6985 return true;
6986}
6987
6988bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6989 EVT ExtVT = ExtVal.getValueType();
6990 // Small, illegal vectors can be extended inreg.
6991 if (auto *Load = dyn_cast<LoadSDNode>(ExtVal.getOperand(0))) {
6992 if (ExtVT.isFixedLengthVector() && ExtVT.getStoreSizeInBits() <= 128 &&
6993 isEligibleForSmallVectorLoadOpt(Load, *Subtarget))
6994 return true;
6995 }
6996 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6997 return false;
6998
6999 // It may be worth creating extending masked loads if there are multiple
7000 // masked loads using the same predicate. That way we'll end up creating
7001 // extending masked loads that may then get split by the legaliser. This
7002 // results in just one set of predicate unpacks at the start, instead of
7003 // multiple sets of vector unpacks after each load.
7004 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
7005 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
7006 // Disable extending masked loads for fixed-width for now, since the code
7007 // quality doesn't look great.
7008 if (!ExtVT.isScalableVector())
7009 return false;
7010
7011 unsigned NumExtMaskedLoads = 0;
7012 for (auto *U : Ld->getMask()->users())
7013 if (isa<MaskedLoadSDNode>(U))
7014 NumExtMaskedLoads++;
7015
7016 if (NumExtMaskedLoads <= 1)
7017 return false;
7018 }
7019 }
7020
7021 EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
7022 return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
7023 PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
7024}
7025
7026unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
7027 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
7028 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
7029 AArch64ISD::GLD1_MERGE_ZERO},
7030 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
7031 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
7032 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
7033 AArch64ISD::GLD1_MERGE_ZERO},
7034 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
7035 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
7036 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
7037 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
7038 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
7039 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
7040 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
7041 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
7042 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
7043 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
7044 };
7045 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
7046 return AddrModes.find(Key)->second;
7047}
7048
/// Map an unsigned (zero-extending) SVE gather opcode to its sign-extending
/// (GLD1S*) equivalent. Must only be called with one of the GLD1* opcodes
/// handled below; anything else is a programming error.
unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
  switch (Opcode) {
  default:
    llvm_unreachable("unimplemented opcode");
    // Unreachable, but keeps no-assert builds well-formed.
    return Opcode;
  case AArch64ISD::GLD1_MERGE_ZERO:
    return AArch64ISD::GLD1S_MERGE_ZERO;
  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
    return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
    return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
    return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
    return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
    return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
    return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
  }
}
7070
/// Custom lowering for ISD::MGATHER. Rewrites gathers into forms SVE supports
/// directly: zero/undef passthrough only, index scaled by the memory element
/// size only, and scalable vector types (fixed-length gathers are widened to
/// scalable equivalents).
SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
                                            SelectionDAG &DAG) const {
  MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);

  SDLoc DL(Op);
  SDValue Chain = MGT->getChain();
  SDValue PassThru = MGT->getPassThru();
  SDValue Mask = MGT->getMask();
  SDValue BasePtr = MGT->getBasePtr();
  SDValue Index = MGT->getIndex();
  SDValue Scale = MGT->getScale();
  EVT VT = Op.getValueType();
  EVT MemVT = MGT->getMemoryVT();
  ISD::LoadExtType ExtType = MGT->getExtensionType();
  ISD::MemIndexType IndexType = MGT->getIndexType();

  // SVE supports zero (and so undef) passthrough values only, everything else
  // must be handled manually by an explicit select on the load's output.
  if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
    // Re-issue the gather with an undef passthrough, then blend in the real
    // passthrough with a select keyed off the original mask.
    SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
    SDValue Load =
        DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
                            MGT->getMemOperand(), IndexType, ExtType);
    SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
    return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
  }

  bool IsScaled = MGT->isIndexScaled();
  bool IsSigned = MGT->isIndexSigned();

  // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
  // must be calculated before hand.
  uint64_t ScaleVal = Scale->getAsZExtVal();
  if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
    assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
    // Fold the scale into the index with a shift and re-issue unscaled.
    EVT IndexVT = Index.getValueType();
    Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
                        DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
    Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());

    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
    return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
                               MGT->getMemOperand(), IndexType, ExtType);
  }

  // Lower fixed length gather to a scalable equivalent.
  if (VT.isFixedLengthVector()) {
    assert(Subtarget->useSVEForFixedLengthVectors() &&
           "Cannot lower when not using SVE for fixed vectors!");

    // NOTE: Handle floating-point as if integer then bitcast the result.
    EVT DataVT = VT.changeVectorElementTypeToInteger();
    MemVT = MemVT.changeVectorElementTypeToInteger();

    // Find the smallest integer fixed length vector we can use for the gather.
    EVT PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i32);
    if (DataVT.getVectorElementType() == MVT::i64 ||
        Index.getValueType().getVectorElementType() == MVT::i64 ||
        Mask.getValueType().getVectorElementType() == MVT::i64)
      PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i64);

    // Promote vector operands except for passthrough, which we know is either
    // undef or zero, and thus best constructed directly.
    unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
    Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);

    // A promoted result type forces the need for an extending load.
    if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
      ExtType = ISD::EXTLOAD;

    EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);

    // Convert fixed length vector operands to scalable.
    MemVT = ContainerVT.changeVectorElementType(*DAG.getContext(),
                                                MemVT.getVectorElementType());
    Index = convertToScalableVector(DAG, ContainerVT, Index);
    // NOTE(review): Mask is used as a scalable-gather operand below but no
    // fixed->scalable conversion of Mask is visible in this excerpt - confirm
    // a conversion step was not lost.
    PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
                                   : DAG.getConstant(0, DL, ContainerVT);

    // Emit equivalent scalable vector gather.
    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
    SDValue Load =
        DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
                            Ops, MGT->getMemOperand(), IndexType, ExtType);

    // Extract fixed length data then convert to the required result type.
    SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
    Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
    if (VT.isFloatingPoint())
      Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);

    return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
  }

  // Everything else is legal.
  return Op;
}
7170
/// Custom lowering for ISD::MSCATTER. Mirrors LowerMGATHER: rewrites scatters
/// into forms SVE supports directly (element-size index scaling only) and
/// widens fixed-length scatters to scalable equivalents.
SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
                                             SelectionDAG &DAG) const {
  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);

  SDLoc DL(Op);
  SDValue Chain = MSC->getChain();
  SDValue StoreVal = MSC->getValue();
  SDValue Mask = MSC->getMask();
  SDValue BasePtr = MSC->getBasePtr();
  SDValue Index = MSC->getIndex();
  SDValue Scale = MSC->getScale();
  EVT VT = StoreVal.getValueType();
  EVT MemVT = MSC->getMemoryVT();
  ISD::MemIndexType IndexType = MSC->getIndexType();
  bool Truncating = MSC->isTruncatingStore();

  bool IsScaled = MSC->isIndexScaled();
  bool IsSigned = MSC->isIndexSigned();

  // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
  // must be calculated before hand.
  uint64_t ScaleVal = Scale->getAsZExtVal();
  if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
    assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
    // Fold the scale into the index with a shift and re-issue unscaled.
    EVT IndexVT = Index.getValueType();
    Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
                        DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
    Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());

    SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
    return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
                                MSC->getMemOperand(), IndexType, Truncating);
  }

  // Lower fixed length scatter to a scalable equivalent.
  if (VT.isFixedLengthVector()) {
    assert(Subtarget->useSVEForFixedLengthVectors() &&
           "Cannot lower when not using SVE for fixed vectors!");

    // Once bitcast we treat floating-point scatters as if integer.
    if (VT.isFloatingPoint()) {
      // NOTE(review): VT is not visibly switched to its integer equivalent
      // before this bitcast in this excerpt - confirm a
      // VT.changeVectorElementTypeToInteger() step was not lost.
      MemVT = MemVT.changeVectorElementTypeToInteger();
      StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
    }

    // Find the smallest integer fixed length vector we can use for the scatter.
    EVT PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i32);
    if (VT.getVectorElementType() == MVT::i64 ||
        Index.getValueType().getVectorElementType() == MVT::i64 ||
        Mask.getValueType().getVectorElementType() == MVT::i64)
      PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i64);

    // Promote vector operands.
    unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
    Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
    StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);

    // A promoted value type forces the need for a truncating store.
    if (PromotedVT != VT)
      Truncating = true;

    EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);

    // Convert fixed length vector operands to scalable.
    MemVT = ContainerVT.changeVectorElementType(*DAG.getContext(),
                                                MemVT.getVectorElementType());
    Index = convertToScalableVector(DAG, ContainerVT, Index);
    // NOTE(review): no fixed->scalable conversion of Mask is visible in this
    // excerpt - confirm against the full source.
    StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);

    // Emit equivalent scalable vector scatter.
    SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
    return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
                                MSC->getMemOperand(), IndexType, Truncating);
  }

  // Everything else is legal.
  return Op;
}
7252
/// Custom lowering for ISD::MLOAD. Fixed-length vectors are routed to the SVE
/// lowering; otherwise a non-zero/non-undef passthrough is handled by loading
/// with an undef passthrough and selecting the real passthrough explicitly.
SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
  assert(LoadNode && "Expected custom lowering of a masked load node");
  EVT VT = Op->getValueType(0);

  if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
    return LowerFixedLengthVectorMLoadToSVE(Op, DAG);

  SDValue PassThru = LoadNode->getPassThru();
  SDValue Mask = LoadNode->getMask();

  // Zero or undef passthrough values are supported directly; nothing to do.
  if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
    return Op;

  // NOTE(review): the line that creates the replacement masked load (binding
  // the value named Load used below) is not visible in this excerpt - confirm
  // against the full source.
      VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
      LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
      LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
      LoadNode->getExtensionType());

  // Blend in the original passthrough for the inactive lanes.
  SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);

  return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
}
7278
// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
// NOTE(review): the first line of this function's signature (the return type,
// name, and the SDLoc/StoreSDNode parameters) is not visible in this excerpt
// - confirm against the full source.
                                       EVT VT, EVT MemVT,
                                       SelectionDAG &DAG) {
  assert(VT.isVector() && "VT should be a vector type");
  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);

  SDValue Value = ST->getValue();

  // It first extend the promoted v4i16 to v8i16, truncate to v8i8, and extract
  // the word lane which represent the v4i8 subvector. It optimizes the store
  // to:
  //
  //   xtn  v0.8b, v0.8h
  //   str  s0, [x0]

  // Pad the v4i16 value out to v8i16 with poison lanes so the truncate
  // produces a legal v8i8 result.
  SDValue Poison = DAG.getPOISON(MVT::i16);
  SDValue PoisonVec =
      DAG.getBuildVector(MVT::v4i16, DL, {Poison, Poison, Poison, Poison});

  SDValue TruncExt =
      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Value, PoisonVec);
  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);

  // Reinterpret as v2i32 and take lane 0, i.e. the low 32 bits holding the
  // four truncated bytes.
  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
                                     Trunc, DAG.getConstant(0, DL, MVT::i64));

  // Store the i32 scalar, which selection turns into "str s0, [x0]".
  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
                      ST->getBasePtr(), ST->getMemOperand());
}
7310
  // NOTE(review): the enclosing function signature (LowerADDRSPACECAST) and
  // the line binding N (presumably cast<AddrSpaceCastSDNode>(Op)) are not
  // visible in this excerpt - confirm against the full source.
  SDLoc DL(Op);
  SDValue Src = Op.getOperand(0);
  MVT DestVT = Op.getSimpleValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  unsigned SrcAS = N->getSrcAddressSpace();
  unsigned DestAS = N->getDestAddressSpace();
  assert(SrcAS != DestAS &&
         "addrspacecast must be between different address spaces");
  assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
             TLI.getTargetMachine().getPointerSize(DestAS) &&
         "addrspacecast must be between different ptr sizes");
  // TLI is only used by the asserts above.
  (void)TLI;

  // Widening a 32-bit pointer: signed pointers sign-extend, unsigned pointers
  // zero-extend.
  if (SrcAS == ARM64AS::PTR32_SPTR) {
    return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, Src,
                       DAG.getTargetConstant(0, DL, DestVT));
  } else if (SrcAS == ARM64AS::PTR32_UPTR) {
    return DAG.getNode(ISD::ZERO_EXTEND, DL, DestVT, Src,
                       DAG.getTargetConstant(0, DL, DestVT));
  } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
             (DestAS == ARM64AS::PTR32_UPTR)) {
    // Narrowing to a 32-bit pointer: truncate/extend, then clear the bits
    // above the 32-bit range in-register.
    SDValue Ext = DAG.getAnyExtOrTrunc(Src, DL, DestVT);
    SDValue Trunc = DAG.getZeroExtendInReg(Ext, DL, DestVT);
    return Trunc;
  } else {
    // Same pointer size on both sides: a no-op cast.
    return Src;
  }
}
7342
// Lower non-temporal stores that would otherwise be broken by legalization.
//
// Coordinated with STNP constraints in
// `llvm/lib/Target/AArch64/AArch64InstrInfo.td` and
// `AArch64TargetLowering::ReplaceNodeResults`
static SDValue LowerNTStore(StoreSDNode *StoreNode, EVT VT, EVT MemVT,
                            const SDLoc &DL, SelectionDAG &DAG) {
  assert(StoreNode && "Expected a store operation");
  assert(StoreNode->isNonTemporal() && "Expected a non-temporal store");

  // Currently, STNP lowering can only either keep or increase code size, thus
  // we predicate it to not apply when optimizing for code size.
  if (DAG.shouldOptForSize())
    return SDValue();

  // Currently we only support NT stores lowering for little-endian targets.
  if (!DAG.getDataLayout().isLittleEndian())
    return SDValue();

  if (VT.isVector()) {
    // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
    // the custom lowering, as there are no un-paired non-temporal stores and
    // legalization will break up 256 bit inputs.
    // NOTE(review): the line binding EC (presumably the vector element count
    // of MemVT) and the EXTRACT_SUBVECTOR calls producing Lo/Hi are only
    // partially visible in this excerpt - confirm against the full source.
    if (VT.isVector() && MemVT.getSizeInBits() == 256u && EC.isKnownEven() &&
        (MemVT.getScalarSizeInBits() == 8u ||
         MemVT.getScalarSizeInBits() == 16u ||
         MemVT.getScalarSizeInBits() == 32u ||
         MemVT.getScalarSizeInBits() == 64u)) {
      SDValue Lo =
          StoreNode->getValue(), DAG.getConstant(0, DL, MVT::i64));
      SDValue Hi =
          StoreNode->getValue(),
          DAG.getConstant(EC.getKnownMinValue() / 2, DL, MVT::i64));
      // Emit the two halves as a single STNP of a pair of v2i64 values.
      SDValue Result = DAG.getMemIntrinsicNode(
          AArch64ISD::STNP, DL, DAG.getVTList(MVT::Other),
          {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
           DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
          StoreNode->getMemoryVT(), StoreNode->getMemOperand());
      return Result;
    }
  }
  // No applicable STNP form; let normal store lowering proceed.
  return SDValue();
}
7391
// Custom lowering for any store, vector or scalar and/or default or with
// a truncate operations. Currently only custom lower truncate operation
// from vector v4i16 to v4i8 or volatile stores of i128.
SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc Dl(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  assert (StoreNode && "Can only custom lower store nodes");

  SDValue Value = StoreNode->getValue();

  EVT VT = Value.getValueType();
  EVT MemVT = StoreNode->getMemoryVT();

  // Try the STNP lowering first; it returns no value when not applicable.
  if (StoreNode->isNonTemporal()) {
    if (auto MaybeSTNP = LowerNTStore(StoreNode, VT, MemVT, Dl, DAG))
      return MaybeSTNP;
  }

  if (VT.isVector()) {
    // NOTE(review): the call guarding this branch (presumably
    // useSVEForFixedLengthVectorVT) is not fully visible in this excerpt -
    // confirm against the full source.
        VT,
        /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
      return LowerFixedLengthVectorStoreToSVE(Op, DAG);

    // Scalarize under-aligned vector stores that the target cannot perform
    // directly.
    unsigned AS = StoreNode->getAddressSpace();
    Align Alignment = StoreNode->getAlign();
    if (Alignment < MemVT.getStoreSize() &&
        !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
                                        StoreNode->getMemOperand()->getFlags(),
                                        nullptr)) {
      return scalarizeVectorStore(StoreNode, DAG);
    }

    if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
        MemVT == MVT::v4i8) {
      return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
    }
  } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
    return LowerStore128(Op, DAG);
  } else if (MemVT == MVT::i64x8) {
    // LS64 value: split into eight chained i64 stores at consecutive offsets.
    SDValue Value = StoreNode->getValue();
    assert(Value->getValueType(0) == MVT::i64x8);
    SDValue Chain = StoreNode->getChain();
    SDValue Base = StoreNode->getBasePtr();
    EVT PtrVT = Base.getValueType();
    for (unsigned i = 0; i < 8; i++) {
      SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64, Value,
                                 DAG.getConstant(i, Dl, MVT::i32));
      SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
                                DAG.getConstant(i * 8, Dl, PtrVT));
      Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
                           StoreNode->getBaseAlign());
    }
    return Chain;
  }

  // Anything else falls through to default legalization.
  return SDValue();
}
7451
/// Lower atomic or volatile 128-bit stores to a single STP instruction.
/// Release-ordered atomic stores use STILP instead where available.
SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
                                             SelectionDAG &DAG) const {
  MemSDNode *StoreNode = cast<MemSDNode>(Op);
  assert(StoreNode->getMemoryVT() == MVT::i128);
  assert(StoreNode->isVolatile() || StoreNode->isAtomic());

  // NOTE(review): the initializer of IsStoreRelease, part of the atomic
  // feature assertion below, and the line binding Result (presumably
  // DAG.getMemIntrinsicNode) are not visible in this excerpt - confirm
  // against the full source.
  bool IsStoreRelease =
  if (StoreNode->isAtomic())
    assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
            Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||

  // The value operand position differs between plain/atomic stores and other
  // memory nodes.
  SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
                   StoreNode->getOpcode() == ISD::ATOMIC_STORE)
                      ? StoreNode->getOperand(1)
                      : StoreNode->getOperand(2);
  SDLoc DL(Op);
  // Split the i128 value into a lo/hi pair of i64s for the pair store.
  auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
  unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
  if (DAG.getDataLayout().isBigEndian())
    std::swap(StoreValue.first, StoreValue.second);
      Opcode, DL, DAG.getVTList(MVT::Other),
      {StoreNode->getChain(), StoreValue.first, StoreValue.second,
       StoreNode->getBasePtr()},
      StoreNode->getMemoryVT(), StoreNode->getMemOperand());
  return Result;
}
7483
/// Helper function to optimize loads of extended small vectors.
/// These patterns would otherwise get scalarized into inefficient sequences.
/// NOTE(review): the signature line (presumably taking LoadSDNode *Load and
/// SelectionDAG &DAG) is not visible in this excerpt - confirm against the
/// full source.
  const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
  if (!isEligibleForSmallVectorLoadOpt(Load, Subtarget))
    return SDValue();

  EVT MemVT = Load->getMemoryVT();
  EVT ResVT = Load->getValueType(0);
  unsigned NumElts = ResVT.getVectorNumElements();
  unsigned DstEltBits = ResVT.getScalarSizeInBits();
  unsigned SrcEltBits = MemVT.getScalarSizeInBits();

  // Map the load's extension kind onto the vector extend opcode we will emit.
  unsigned ExtOpcode;
  switch (Load->getExtensionType()) {
  case ISD::EXTLOAD:
  case ISD::ZEXTLOAD:
    ExtOpcode = ISD::ZERO_EXTEND;
    break;
  case ISD::SEXTLOAD:
    ExtOpcode = ISD::SIGN_EXTEND;
    break;
  case ISD::NON_EXTLOAD:
    return SDValue();
  }

  SDLoc DL(Load);
  SDValue Chain = Load->getChain();
  SDValue BasePtr = Load->getBasePtr();
  const MachinePointerInfo &PtrInfo = Load->getPointerInfo();
  Align Alignment = Load->getAlign();

  // Load the data as an FP scalar to avoid issues with integer loads.
  unsigned LoadBits = MemVT.getStoreSizeInBits();
  MVT ScalarLoadType = MVT::getFloatingPointVT(LoadBits);
  SDValue ScalarLoad =
      DAG.getLoad(ScalarLoadType, DL, Chain, BasePtr, PtrInfo, Alignment);

  // Insert the scalar into lane 0 of a 128-bit vector, then reinterpret it as
  // a full vector of the source element type.
  MVT ScalarToVecTy = MVT::getVectorVT(ScalarLoadType, 128 / LoadBits);
  SDValue ScalarToVec =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarToVecTy, ScalarLoad);
  MVT BitcastTy =
      MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits), 128 / SrcEltBits);
  SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, BitcastTy, ScalarToVec);

  // Repeatedly double the element width until it reaches the destination
  // width, halving the element count (via a subvector extract) whenever the
  // widened value would exceed 128 bits.
  SDValue Res = Bitcast;
  unsigned CurrentEltBits = Res.getValueType().getScalarSizeInBits();
  unsigned CurrentNumElts = Res.getValueType().getVectorNumElements();
  while (CurrentEltBits < DstEltBits) {
    if (Res.getValueSizeInBits() >= 128) {
      CurrentNumElts = CurrentNumElts / 2;
      MVT ExtractVT =
          MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Res,
                        DAG.getConstant(0, DL, MVT::i64));
    }
    CurrentEltBits = CurrentEltBits * 2;
    MVT ExtVT =
        MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
    Res = DAG.getNode(ExtOpcode, DL, ExtVT, Res);
  }

  // Trim any excess elements left over from the widening sequence.
  if (CurrentNumElts != NumElts) {
    MVT FinalVT = MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), NumElts);
    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FinalVT, Res,
                      DAG.getConstant(0, DL, MVT::i64));
  }

  // Return the result together with the load's chain.
  return DAG.getMergeValues({Res, ScalarLoad.getValue(1)}, DL);
}
7554
/// Custom lowering for ISD::LOAD: tries the small-vector extending-load
/// optimization first, then handles LS64 (i64x8) loads by assembling eight
/// chained i64 loads.
SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
                                         SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  assert(LoadNode && "Expected custom lowering of a load node");

  if (SDValue Result = tryLowerSmallVectorExtLoad(LoadNode, DAG))
    return Result;

  if (LoadNode->getMemoryVT() == MVT::i64x8) {
    // NOTE(review): the declaration of Ops (presumably a SmallVector of
    // SDValue) is not visible in this excerpt - confirm against the full
    // source.
    SDValue Base = LoadNode->getBasePtr();
    SDValue Chain = LoadNode->getChain();
    EVT PtrVT = Base.getValueType();
    for (unsigned i = 0; i < 8; i++) {
      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
                                DAG.getConstant(i * 8, DL, PtrVT));
      SDValue Part =
          DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
                      LoadNode->getBaseAlign());
      Ops.push_back(Part);
      // Chain each partial load so ordering is preserved.
      Chain = SDValue(Part.getNode(), 1);
    }
    SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
    return DAG.getMergeValues({Loaded, Chain}, DL);
  }

  // Anything else falls through to default legalization.
  return SDValue();
}
7584
7585SDValue AArch64TargetLowering::LowerFixedLengthVectorCompressToSVE(
7586 SDValue Op, SelectionDAG &DAG) const {
7587 SDLoc DL(Op);
7588 EVT VT = Op.getValueType();
7589
7590 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
7591 SDValue Vec = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
7592 SDValue Mask = convertFixedMaskToScalableVector(Op.getOperand(1), DAG);
7593 SDValue Passthru =
7594 convertToScalableVector(DAG, ContainerVT, Op.getOperand(2));
7595
7596 SDValue Result =
7597 DAG.getNode(ISD::VECTOR_COMPRESS, DL, ContainerVT, Vec, Mask, Passthru);
7598 return convertFromScalableVector(DAG, VT, Result);
7599}
7600
/// Custom lowering for ISD::VECTOR_COMPRESS using the SVE COMPACT/CNTP
/// instructions; handles a non-trivial passthrough with a trailing select.
SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  // Only implementable when SVE is available.
  if (!Subtarget->isSVEAvailable())
    return SDValue();

  if (VT.isFixedLengthVector())
    return LowerFixedLengthVectorCompressToSVE(Op, DAG);

  SDLoc DL(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Mask = Op.getOperand(1);
  SDValue Passthru = Op.getOperand(2);
  EVT MaskVT = Mask.getValueType();

  // NOTE(review): the leading operands of this intrinsic call (presumably
  // ISD::INTRINSIC_WO_CHAIN, DL, VT) are not visible in this excerpt -
  // confirm against the full source.
  SDValue Compressed = DAG.getNode(
      DAG.getTargetConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask,
      Vec);

  // compact fills with 0s, so if our passthru is all 0s, do nothing here.
  // NOTE(review): the second half of this condition (presumably an all-zeros
  // splat check on Passthru) is not visible in this excerpt.
  if (Passthru.isUndef() ||
    return Compressed;

  // Count the active mask lanes to learn how many compacted elements are
  // valid.
  SDValue CntActive = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
      DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask,
      Mask);

  // Build a mask that is true for lanes [0, CntActive) and use it to select
  // the passthrough for the trailing lanes.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  SDValue CompressedMask =
      DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT, Zero, CntActive);

  return DAG.getNode(ISD::VSELECT, DL, VT, CompressedMask, Compressed,
                     Passthru);
}
7638
7639// Generate SUBS and CSEL for integer abs.
7640SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7641 MVT VT = Op.getSimpleValueType();
7642
7643 if (VT.isVector())
7644 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7645
7646 SDLoc DL(Op);
7647 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, VT);
7648
7649 // Generate SUBS & CSEL.
7650 SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7651 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7652 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7653 getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
7654}
7655
  // NOTE(review): the LowerBRCOND signature line and the declaration of CC
  // (presumably an AArch64CC::CondCode out-parameter for emitConjunction)
  // are not visible in this excerpt - confirm against the full source.
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);

  // Try to lower the condition as a conjunction (ccmp chain); on success emit
  // the target branch-on-condition node directly.
  if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
    SDLoc DL(Op);
    SDValue CCVal = getCondCode(DAG, CC);
    return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
                       Cmp);
  }

  // Otherwise fall back to generic expansion.
  return SDValue();
}
7671
// Treat FSHR with constant shifts as legal operation, otherwise it is expanded
// FSHL is converted to FSHR before deciding what to do with it
// NOTE(review): the signature line of this function is not visible in this
// excerpt - confirm against the full source.
  SDValue Shifts = Op.getOperand(2);
  // Check if the shift amount is a constant and normalise to [0, SrcBitLen)
  // If opcode is FSHL, convert it to FSHR
  if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
    SDLoc DL(Op);
    MVT VT = Op.getSimpleValueType();
    unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();

    if (Op.getOpcode() == ISD::FSHL) {
      // fshl(a, b, 0) == a.
      if (NewShiftNo == 0)
        return Op.getOperand(0);

      // fshl(a, b, n) == fshr(a, b, bitwidth - n).
      NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
      return DAG.getNode(
          ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
          DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
    }

    if (Op.getOpcode() == ISD::FSHR) {
      // fshr(a, b, 0) == b.
      if (NewShiftNo == 0)
        return Op.getOperand(1);

      // Shift amount already normalised: the node is legal as-is.
      if (ShiftNo->getZExtValue() == NewShiftNo)
        return Op;

      // Rewrite using the normalised shift amount.
      return DAG.getNode(
          ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
          DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
    }
  }

  // Non-constant shift amounts are handled by generic expansion.
  return SDValue();
}
7709
  // NOTE(review): the enclosing function signature (LowerFLDEXP) is not
  // visible in this excerpt - confirm against the full source.
  SDValue X = Op.getOperand(0);
  EVT XScalarTy = X.getValueType();
  SDValue Exp = Op.getOperand(1);

  SDLoc DL(Op);
  EVT XVT, ExpVT;
  // Pick the scalable container types used to perform the scale; half types
  // are first extended to f32.
  switch (Op.getSimpleValueType().SimpleTy) {
  default:
    return SDValue();
  case MVT::bf16:
  case MVT::f16:
    X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
    [[fallthrough]];
  case MVT::f32:
    XVT = MVT::nxv4f32;
    ExpVT = MVT::nxv4i32;
    break;
  case MVT::f64:
    XVT = MVT::nxv2f64;
    ExpVT = MVT::nxv2i64;
    Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
    break;
  }

  // Insert the scalar operands into lane 0 of scalable vectors.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  SDValue VX =
      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getPOISON(XVT), X, Zero);
  SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
                             DAG.getPOISON(ExpVT), Exp, Zero);
  // Predicate with only the first lane active.
  SDValue VPg = DAG.getConstant(
      1, DL, XVT.changeVectorElementType(*DAG.getContext(), MVT::i1));
  // NOTE(review): the leading operands of this intrinsic call (presumably
  // ISD::INTRINSIC_WO_CHAIN, DL, XVT) are not visible in this excerpt.
  SDValue FScale = DAG.getNode(
      DAG.getTargetConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), VPg,
      VX, VExp);
  SDValue Final =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
  // Round back down if the input was extended from a half-precision type.
  if (X.getValueType() != XScalarTy)
    Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
                        DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
  return Final;
}
7753
7754SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7755 SelectionDAG &DAG) const {
7756 return Op.getOperand(0);
7757}
7758
/// Lower INIT_TRAMPOLINE: materialize a small code+data trampoline in memory
/// that loads the 'nest' value into the expected register and tail-jumps to
/// the nested function, then flush the written instructions from the caches.
SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

  // Trampoline layout:
  // ldr NestReg, .+16
  // ldr x17, .+20
  // br x17
  // .word 0
  // .nest: .qword nest
  // .fptr: .qword fptr
  SDValue OutChains[5];

  const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
  CallingConv::ID CC = Func->getCallingConv();
  unsigned NestReg;

  // NOTE(review): the non-default case label of this switch is not visible
  // in this excerpt - confirm against the full source.
  switch (CC) {
  default:
    NestReg = 0x0f; // X15
    break;
    // Must be kept in sync with AArch64CallingConv.td
    NestReg = 0x04; // X4
    break;
  }

  const char FptrReg = 0x11; // X17

  SDValue Addr = Trmp;

  SDLoc DL(Op);
  // 0x58000080 encodes "ldr <reg>, .+16" per the layout above; OR in the
  // destination register number.
  OutChains[0] = DAG.getStore(
      Chain, DL, DAG.getConstant(0x58000080u | NestReg, DL, MVT::i32), Addr,
      MachinePointerInfo(TrmpAddr));

  // 0x580000b0 encodes "ldr x17, .+20" once FptrReg is OR'd in.
  Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
                     DAG.getConstant(4, DL, MVT::i64));
  OutChains[1] = DAG.getStore(
      Chain, DL, DAG.getConstant(0x580000b0u | FptrReg, DL, MVT::i32), Addr,
      MachinePointerInfo(TrmpAddr, 4));

  // 0xd61f0220 encodes "br x17" per the layout above.
  Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
                     DAG.getConstant(8, DL, MVT::i64));
  OutChains[2] =
      DAG.getStore(Chain, DL, DAG.getConstant(0xd61f0220u, DL, MVT::i32), Addr,
                   MachinePointerInfo(TrmpAddr, 8));

  // Store the 'nest' parameter value at offset 16 (.nest slot).
  Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
                     DAG.getConstant(16, DL, MVT::i64));
  OutChains[3] =
      DAG.getStore(Chain, DL, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));

  // Store the nested function pointer at offset 24 (.fptr slot).
  Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
                     DAG.getConstant(24, DL, MVT::i64));
  OutChains[4] =
      DAG.getStore(Chain, DL, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));

  SDValue StoreToken = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);

  // Only the first 12 bytes hold instructions, so that is the range that
  // needs cache maintenance.
  SDValue EndOfTrmp = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
                                  DAG.getConstant(12, DL, MVT::i64));

  // Call clear cache on the trampoline instructions.
  return DAG.getNode(ISD::CLEAR_CACHE, DL, MVT::Other, StoreToken, Trmp,
                     EndOfTrmp);
}
7831
/// Custom lowering for bf16 FMUL. Without native bf16 arithmetic, the
/// multiply is performed as a pair of widening BFMLAL top/bottom
/// multiply-accumulates into f32 and then converted back to bf16.
SDValue AArch64TargetLowering::LowerFMUL(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  // Non-bf16 types, or targets with native bf16 arithmetic, use the generic
  // predicated lowering.
  if (VT.getScalarType() != MVT::bf16 ||
      (Subtarget->hasSVEB16B16() &&
       Subtarget->isNonStreamingSVEorSME2Available()))
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);

  assert(Subtarget->hasBF16() && "Expected +bf16 for custom FMUL lowering");
  assert((VT == MVT::nxv4bf16 || VT == MVT::nxv8bf16 || VT == MVT::v8bf16) &&
         "Unexpected FMUL VT");

  // Returns a callable that emits the given intrinsic with the supplied
  // result type and operands.
  auto MakeGetIntrinsic = [&](Intrinsic::ID IID) {
    return [&, IID](EVT VT, auto... Ops) {
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(IID, DL, MVT::i32), Ops...);
    };
  };

  // Convert between fixed/scalable vector types (or reinterpret between
  // scalable types) as needed.
  auto Reinterpret = [&](SDValue Value, EVT VT) {
    EVT SrcVT = Value.getValueType();
    if (VT == SrcVT)
      return Value;
    if (SrcVT.isFixedLengthVector())
      return convertToScalableVector(DAG, VT, Value);
    if (VT.isFixedLengthVector())
      return convertFromScalableVector(DAG, VT, Value);
    return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Value);
  };

  bool UseSVEBFMLAL = VT.isScalableVector();
  auto FCVT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvt_bf16f32_v2);
  auto FCVTNT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2);

  // Note: The NEON BFMLAL[BT] reads even/odd lanes like the SVE variant.
  // This does not match BFCVTN[2], so we use SVE to convert back to bf16.
  auto BFMLALB =
      MakeGetIntrinsic(UseSVEBFMLAL ? Intrinsic::aarch64_sve_bfmlalb
                                    : Intrinsic::aarch64_neon_bfmlalb);
  auto BFMLALT =
      MakeGetIntrinsic(UseSVEBFMLAL ? Intrinsic::aarch64_sve_bfmlalt
                                    : Intrinsic::aarch64_neon_bfmlalt);

  EVT AccVT = UseSVEBFMLAL ? MVT::nxv4f32 : MVT::v4f32;
  // Accumulate onto -0.0 so a -0.0 product is not rewritten to +0.0, unless
  // the sign of zero is known not to matter for this node.
  bool IgnoreZeroSign = DAG.canIgnoreSignBitOfZero(Op);
  SDValue Zero = DAG.getConstantFP(IgnoreZeroSign ? +0.0F : -0.0F, DL, AccVT);
  SDValue Pg = getPredicateForVector(DAG, DL, AccVT);

  // Lower bf16 FMUL as a pair (VT == [nx]v8bf16) of BFMLAL top/bottom
  // instructions. These result in two f32 vectors, which can be converted back
  // to bf16 with FCVT and FCVTNT.
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  // All SVE intrinsics expect to operate on full bf16 vector types.
  if (UseSVEBFMLAL) {
    LHS = Reinterpret(LHS, MVT::nxv8bf16);
    RHS = Reinterpret(RHS, MVT::nxv8bf16);
  }

  SDValue BottomF32 = Reinterpret(BFMLALB(AccVT, Zero, LHS, RHS), MVT::nxv4f32);
  SDValue BottomBF16 =
      FCVT(MVT::nxv8bf16, DAG.getPOISON(MVT::nxv8bf16), Pg, BottomF32);
  // Note: nxv4bf16 only uses even lanes.
  if (VT == MVT::nxv4bf16)
    return Reinterpret(BottomBF16, VT);

  SDValue TopF32 = Reinterpret(BFMLALT(AccVT, Zero, LHS, RHS), MVT::nxv4f32);
  SDValue TopBF16 = FCVTNT(MVT::nxv8bf16, BottomBF16, Pg, TopF32);
  return Reinterpret(TopBF16, VT);
}
7903
7904SDValue AArch64TargetLowering::LowerFMA(SDValue Op, SelectionDAG &DAG) const {
7905 SDValue OpA = Op->getOperand(0);
7906 SDValue OpB = Op->getOperand(1);
7907 SDValue OpC = Op->getOperand(2);
7908 EVT VT = Op.getValueType();
7909 SDLoc DL(Op);
7910
7911 assert(VT.isVector() && "Scalar fma lowering should be handled by patterns");
7912
7913 // Bail early if we're definitely not looking to merge FNEGs into the FMA.
7914 if (VT != MVT::v8f16 && VT != MVT::v4f32 && VT != MVT::v2f64)
7915 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7916
7917 if (OpC.getOpcode() != ISD::FNEG)
7918 return useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())
7919 ? LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED)
7920 : Op; // Fallback to NEON lowering.
7921
7922 // Convert FMA/FNEG nodes to SVE to enable the following patterns:
7923 // fma(a, b, neg(c)) -> fnmls(a, b, c)
7924 // fma(neg(a), b, neg(c)) -> fnmla(a, b, c)
7925 // fma(a, neg(b), neg(c)) -> fnmla(a, b, c)
7926 SDValue Pg = getPredicateForVector(DAG, DL, VT);
7927 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
7928
7929 auto ConvertToScalableFnegMt = [&](SDValue Op) {
7930 if (Op.getOpcode() == ISD::FNEG)
7931 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7932 return convertToScalableVector(DAG, ContainerVT, Op);
7933 };
7934
7935 OpA = ConvertToScalableFnegMt(OpA);
7936 OpB = ConvertToScalableFnegMt(OpB);
7937 OpC = ConvertToScalableFnegMt(OpC);
7938
7939 SDValue ScalableRes =
7940 DAG.getNode(AArch64ISD::FMA_PRED, DL, ContainerVT, Pg, OpA, OpB, OpC);
7941 return convertFromScalableVector(DAG, VT, ScalableRes);
7942}
7943
7945 SelectionDAG &DAG) const {
7946 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7947 LLVM_DEBUG(Op.dump());
7948
7949 switch (Op.getOpcode()) {
7950 default:
7951 llvm_unreachable("unimplemented operand");
7952 return SDValue();
7955 return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
7956 case ISD::BITCAST:
7957 return LowerBITCAST(Op, DAG);
7958 case ISD::GlobalAddress:
7959 return LowerGlobalAddress(Op, DAG);
7961 return LowerGlobalTLSAddress(Op, DAG);
7963 return LowerPtrAuthGlobalAddress(Op, DAG);
7965 return LowerADJUST_TRAMPOLINE(Op, DAG);
7967 return LowerINIT_TRAMPOLINE(Op, DAG);
7968 case ISD::SETCC:
7969 case ISD::STRICT_FSETCC:
7971 return LowerSETCC(Op, DAG);
7972 case ISD::SETCCCARRY:
7973 return LowerSETCCCARRY(Op, DAG);
7974 case ISD::BRCOND:
7975 return LowerBRCOND(Op, DAG);
7976 case ISD::BR_CC:
7977 return LowerBR_CC(Op, DAG);
7978 case ISD::SELECT:
7979 return LowerSELECT(Op, DAG);
7980 case ISD::SELECT_CC:
7981 return LowerSELECT_CC(Op, DAG);
7982 case ISD::JumpTable:
7983 return LowerJumpTable(Op, DAG);
7984 case ISD::BR_JT:
7985 return LowerBR_JT(Op, DAG);
7986 case ISD::BRIND:
7987 return LowerBRIND(Op, DAG);
7988 case ISD::ConstantPool:
7989 return LowerConstantPool(Op, DAG);
7990 case ISD::BlockAddress:
7991 return LowerBlockAddress(Op, DAG);
7992 case ISD::VASTART:
7993 return LowerVASTART(Op, DAG);
7994 case ISD::VACOPY:
7995 return LowerVACOPY(Op, DAG);
7996 case ISD::VAARG:
7997 return LowerVAARG(Op, DAG);
7998 case ISD::UADDO_CARRY:
7999 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
8000 case ISD::USUBO_CARRY:
8001 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
8002 case ISD::SADDO_CARRY:
8003 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
8004 case ISD::SSUBO_CARRY:
8005 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
8006 case ISD::SADDO:
8007 case ISD::UADDO:
8008 case ISD::SSUBO:
8009 case ISD::USUBO:
8010 case ISD::SMULO:
8011 case ISD::UMULO:
8012 return LowerXALUO(Op, DAG);
8013 case ISD::FADD:
8014 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
8015 case ISD::FSUB:
8016 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
8017 case ISD::FMUL:
8018 return LowerFMUL(Op, DAG);
8019 case ISD::FMA:
8020 return LowerFMA(Op, DAG);
8021 case ISD::FDIV:
8022 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
8023 case ISD::FNEG:
8024 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
8025 case ISD::FCEIL:
8026 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
8027 case ISD::FFLOOR:
8028 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
8029 case ISD::FNEARBYINT:
8030 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
8031 case ISD::FRINT:
8032 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
8033 case ISD::FROUND:
8034 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
8035 case ISD::FROUNDEVEN:
8036 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
8037 case ISD::FTRUNC:
8038 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
8039 case ISD::FSQRT:
8040 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
8041 case ISD::FABS:
8042 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
8043 case ISD::FP_ROUND:
8045 return LowerFP_ROUND(Op, DAG);
8046 case ISD::FP_EXTEND:
8048 return LowerFP_EXTEND(Op, DAG);
8049 case ISD::FRAMEADDR:
8050 return LowerFRAMEADDR(Op, DAG);
8051 case ISD::SPONENTRY:
8052 return LowerSPONENTRY(Op, DAG);
8053 case ISD::RETURNADDR:
8054 return LowerRETURNADDR(Op, DAG);
8056 return LowerADDROFRETURNADDR(Op, DAG);
8058 return LowerCONCAT_VECTORS(Op, DAG);
8060 return LowerINSERT_VECTOR_ELT(Op, DAG);
8062 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
8063 case ISD::BUILD_VECTOR:
8064 return LowerBUILD_VECTOR(Op, DAG);
8067 return LowerEXTEND_VECTOR_INREG(Op, DAG);
8069 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
8071 return LowerVECTOR_SHUFFLE(Op, DAG);
8072 case ISD::SPLAT_VECTOR:
8073 return LowerSPLAT_VECTOR(Op, DAG);
8075 return LowerEXTRACT_SUBVECTOR(Op, DAG);
8077 return LowerINSERT_SUBVECTOR(Op, DAG);
8078 case ISD::SDIV:
8079 case ISD::UDIV:
8080 return LowerDIV(Op, DAG);
8081 case ISD::SMIN:
8082 case ISD::UMIN:
8083 case ISD::SMAX:
8084 case ISD::UMAX:
8085 return LowerMinMax(Op, DAG);
8086 case ISD::SRA:
8087 case ISD::SRL:
8088 case ISD::SHL:
8089 return LowerVectorSRA_SRL_SHL(Op, DAG);
8090 case ISD::SHL_PARTS:
8091 case ISD::SRL_PARTS:
8092 case ISD::SRA_PARTS:
8093 return LowerShiftParts(Op, DAG);
8094 case ISD::CTPOP:
8095 case ISD::PARITY:
8096 return LowerCTPOP_PARITY(Op, DAG);
8097 case ISD::FCOPYSIGN:
8098 return LowerFCOPYSIGN(Op, DAG);
8099 case ISD::OR:
8100 return LowerVectorOR(Op, DAG);
8101 case ISD::XOR:
8102 return LowerXOR(Op, DAG);
8103 case ISD::PREFETCH:
8104 return LowerPREFETCH(Op, DAG);
8105 case ISD::SINT_TO_FP:
8106 case ISD::UINT_TO_FP:
8109 return LowerINT_TO_FP(Op, DAG);
8110 case ISD::FP_TO_SINT:
8111 case ISD::FP_TO_UINT:
8114 return LowerFP_TO_INT(Op, DAG);
8117 return LowerFP_TO_INT_SAT(Op, DAG);
8118 case ISD::GET_ROUNDING:
8119 return LowerGET_ROUNDING(Op, DAG);
8120 case ISD::SET_ROUNDING:
8121 return LowerSET_ROUNDING(Op, DAG);
8122 case ISD::GET_FPMODE:
8123 return LowerGET_FPMODE(Op, DAG);
8124 case ISD::SET_FPMODE:
8125 return LowerSET_FPMODE(Op, DAG);
8126 case ISD::RESET_FPMODE:
8127 return LowerRESET_FPMODE(Op, DAG);
8128 case ISD::MUL:
8129 return LowerMUL(Op, DAG);
8130 case ISD::MULHS:
8131 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
8132 case ISD::MULHU:
8133 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
8135 return LowerINTRINSIC_W_CHAIN(Op, DAG);
8137 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
8139 return LowerINTRINSIC_VOID(Op, DAG);
8140 case ISD::ATOMIC_STORE:
8141 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
8142 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
8143 return LowerStore128(Op, DAG);
8144 }
8145 return SDValue();
8146 case ISD::STORE:
8147 return LowerSTORE(Op, DAG);
8148 case ISD::MSTORE:
8149 return LowerMSTORE(Op, DAG);
8150 case ISD::MGATHER:
8151 return LowerMGATHER(Op, DAG);
8152 case ISD::MSCATTER:
8153 return LowerMSCATTER(Op, DAG);
8155 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
8156 case ISD::VECREDUCE_ADD:
8157 case ISD::VECREDUCE_AND:
8158 case ISD::VECREDUCE_OR:
8159 case ISD::VECREDUCE_XOR:
8169 return LowerVECREDUCE(Op, DAG);
8170 case ISD::VECREDUCE_MUL:
8172 return LowerVECREDUCE_MUL(Op, DAG);
8174 return LowerATOMIC_LOAD_AND(Op, DAG);
8176 return LowerDYNAMIC_STACKALLOC(Op, DAG);
8177 case ISD::VSCALE:
8178 return LowerVSCALE(Op, DAG);
8180 return LowerVECTOR_COMPRESS(Op, DAG);
8181 case ISD::ANY_EXTEND:
8182 case ISD::SIGN_EXTEND:
8183 case ISD::ZERO_EXTEND:
8184 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
8185 case ISD::ADDRSPACECAST:
8186 return LowerADDRSPACECAST(Op, DAG);
8188 // Only custom lower when ExtraVT has a legal byte based element type.
8189 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
8190 EVT ExtraEltVT = ExtraVT.getVectorElementType();
8191 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
8192 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
8193 return SDValue();
8194
8195 return LowerToPredicatedOp(Op, DAG,
8196 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
8197 }
8198 case ISD::TRUNCATE:
8199 return LowerTRUNCATE(Op, DAG);
8200 case ISD::MLOAD:
8201 return LowerMLOAD(Op, DAG);
8202 case ISD::LOAD:
8203 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
8204 !Subtarget->isNeonAvailable()))
8205 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
8206 return LowerLOAD(Op, DAG);
8207 case ISD::ADD:
8208 case ISD::AND:
8209 case ISD::SUB:
8210 return LowerToScalableOp(Op, DAG);
8211 case ISD::FMAXIMUM:
8212 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
8213 case ISD::FMAXNUM:
8214 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
8215 case ISD::FMINIMUM:
8216 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
8217 case ISD::FMINNUM:
8218 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
8219 case ISD::VSELECT:
8220 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
8221 case ISD::ABS:
8222 return LowerABS(Op, DAG);
8223 case ISD::ABDS:
8224 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
8225 case ISD::ABDU:
8226 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
8227 case ISD::AVGFLOORS:
8228 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
8229 case ISD::AVGFLOORU:
8230 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
8231 case ISD::AVGCEILS:
8232 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
8233 case ISD::AVGCEILU:
8234 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
8235 case ISD::BITREVERSE:
8236 return LowerBitreverse(Op, DAG);
8237 case ISD::BSWAP:
8238 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
8239 case ISD::CTLZ:
8240 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
8241 case ISD::CTTZ:
8242 return LowerCTTZ(Op, DAG);
8245 return LowerVECTOR_SPLICE(Op, DAG);
8247 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
8249 return LowerVECTOR_INTERLEAVE(Op, DAG);
8251 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
8252 case ISD::LRINT:
8253 case ISD::LLRINT:
8254 if (Op.getValueType().isVector())
8255 return LowerVectorXRINT(Op, DAG);
8256 [[fallthrough]];
8257 case ISD::LROUND:
8258 case ISD::LLROUND: {
8259 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
8260 Op.getOperand(0).getValueType() == MVT::bf16) &&
8261 "Expected custom lowering of rounding operations only for f16");
8262 SDLoc DL(Op);
8263 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
8264 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
8265 }
8266 case ISD::STRICT_LROUND:
8268 case ISD::STRICT_LRINT:
8269 case ISD::STRICT_LLRINT: {
8270 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
8271 Op.getOperand(1).getValueType() == MVT::bf16) &&
8272 "Expected custom lowering of rounding operations only for f16");
8273 SDLoc DL(Op);
8274 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
8275 {Op.getOperand(0), Op.getOperand(1)});
8276 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
8277 {Ext.getValue(1), Ext.getValue(0)});
8278 }
8279 case ISD::WRITE_REGISTER: {
8280 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
8281 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
8282 SDLoc DL(Op);
8283
8284 SDValue Chain = Op.getOperand(0);
8285 SDValue SysRegName = Op.getOperand(1);
8286 std::pair<SDValue, SDValue> Pair =
8287 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
8288
8289 // chain = MSRR(chain, sysregname, lo, hi)
8290 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
8291 SysRegName, Pair.first, Pair.second);
8292
8293 return Result;
8294 }
8295 case ISD::FSHL:
8296 case ISD::FSHR:
8297 return LowerFunnelShift(Op, DAG);
8298 case ISD::FLDEXP:
8299 return LowerFLDEXP(Op, DAG);
8301 return LowerVECTOR_HISTOGRAM(Op, DAG);
8306 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
8307 }
8308}
8309
8311 return !Subtarget->useSVEForFixedLengthVectors();
8312}
8313
8315 EVT VT, bool OverrideNEON) const {
8316 if (!VT.isFixedLengthVector() || !VT.isSimple())
8317 return false;
8318
8319 // Don't use SVE for vectors we cannot scalarize if required.
8320 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
8321 // Fixed length predicates should be promoted to i8.
8322 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
8323 case MVT::i1:
8324 default:
8325 return false;
8326 case MVT::i8:
8327 case MVT::i16:
8328 case MVT::i32:
8329 case MVT::i64:
8330 case MVT::f16:
8331 case MVT::f32:
8332 case MVT::f64:
8333 break;
8334 }
8335
8336 // NEON-sized vectors can be emulated using SVE instructions.
8337 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
8338 return Subtarget->isSVEorStreamingSVEAvailable();
8339
8340 // Ensure NEON MVTs only belong to a single register class.
8341 if (VT.getFixedSizeInBits() <= 128)
8342 return false;
8343
8344 // Ensure wider than NEON code generation is enabled.
8345 if (!Subtarget->useSVEForFixedLengthVectors())
8346 return false;
8347
8348 // Don't use SVE for types that don't fit.
8349 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
8350 return false;
8351
8352 // TODO: Perhaps an artificial restriction, but worth having whilst getting
8353 // the base fixed length SVE support in place.
8354 if (!VT.isPow2VectorType())
8355 return false;
8356
8357 return true;
8358}
8359
8360//===----------------------------------------------------------------------===//
8361// Calling Convention Implementation
8362//===----------------------------------------------------------------------===//
8363
8364static unsigned getIntrinsicID(const SDNode *N) {
8365 unsigned Opcode = N->getOpcode();
8366 switch (Opcode) {
8367 default:
8370 unsigned IID = N->getConstantOperandVal(0);
8371 if (IID < Intrinsic::num_intrinsics)
8372 return IID;
8374 }
8375 }
8376}
8377
8379 SDValue N1) const {
8380 if (!N0.hasOneUse())
8381 return false;
8382
8383 unsigned IID = getIntrinsicID(N1.getNode());
8384 // Avoid reassociating expressions that can be lowered to smlal/umlal.
8385 if (IID == Intrinsic::aarch64_neon_umull ||
8386 N1.getOpcode() == AArch64ISD::UMULL ||
8387 IID == Intrinsic::aarch64_neon_smull ||
8388 N1.getOpcode() == AArch64ISD::SMULL)
8389 return N0.getOpcode() != ISD::ADD;
8390
8391 return true;
8392}
8393
8394/// Selects the correct CCAssignFn for a given CallingConvention value.
8396 bool IsVarArg) const {
8397 switch (CC) {
8398 default:
8399 reportFatalUsageError("unsupported calling convention");
8400 case CallingConv::GHC:
8401 return CC_AArch64_GHC;
8403 // The VarArg implementation makes assumptions about register
8404 // argument passing that do not hold for preserve_none, so we
8405 // instead fall back to C argument passing.
8406 // The non-vararg case is handled in the CC function itself.
8407 if (!IsVarArg)
8409 [[fallthrough]];
8410 case CallingConv::C:
8411 case CallingConv::Fast:
8415 case CallingConv::Swift:
8417 case CallingConv::Tail:
8418 case CallingConv::GRAAL:
8419 if (Subtarget->isTargetWindows()) {
8420 if (IsVarArg) {
8421 if (Subtarget->isWindowsArm64EC())
8424 }
8425 return CC_AArch64_Win64PCS;
8426 }
8427 if (!Subtarget->isTargetDarwin())
8428 return CC_AArch64_AAPCS;
8429 if (!IsVarArg)
8430 return CC_AArch64_DarwinPCS;
8431 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
8433 case CallingConv::Win64:
8434 if (IsVarArg) {
8435 if (Subtarget->isWindowsArm64EC())
8438 }
8439 return CC_AArch64_Win64PCS;
8441 if (Subtarget->isWindowsArm64EC())
8449 return CC_AArch64_AAPCS;
8454 }
8455}
8456
8457CCAssignFn *
8459 switch (CC) {
8460 default:
8461 return RetCC_AArch64_AAPCS;
8465 if (Subtarget->isWindowsArm64EC())
8467 return RetCC_AArch64_AAPCS;
8468 }
8469}
8470
8471static bool isPassedInFPR(EVT VT) {
8472 return VT.isFixedLengthVector() ||
8473 (VT.isFloatingPoint() && !VT.isScalableVector());
8474}
8475
8477 AArch64FunctionInfo &FuncInfo,
8478 SelectionDAG &DAG) {
8479 if (!FuncInfo.hasZT0SpillSlotIndex())
8480 FuncInfo.setZT0SpillSlotIndex(MFI.CreateSpillStackObject(64, Align(16)));
8481
8482 return DAG.getFrameIndex(
8483 FuncInfo.getZT0SpillSlotIndex(),
8485}
8486
8487// Emit a call to __arm_sme_save or __arm_sme_restore.
8489 SelectionDAG &DAG,
8491 SDValue Chain, bool IsSave) {
8494 FuncInfo->setSMESaveBufferUsed();
8496 Args.emplace_back(
8497 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
8499
8500 RTLIB::Libcall LC =
8501 IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
8502 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
8503 SDValue Callee =
8504 DAG.getExternalSymbol(LCImpl, TLI.getPointerTy(DAG.getDataLayout()));
8505 auto *RetTy = Type::getVoidTy(*DAG.getContext());
8507 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8508 DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy, Callee,
8509 std::move(Args));
8510 return TLI.LowerCallTo(CLI).second;
8511}
8512
8514 const AArch64TargetLowering &TLI,
8515 const AArch64RegisterInfo &TRI,
8516 AArch64FunctionInfo &FuncInfo,
8517 SelectionDAG &DAG) {
8518 // Conditionally restore the lazy save using a pseudo node.
8519 RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
8520 TPIDR2Object &TPIDR2 = FuncInfo.getTPIDR2Obj();
8521
8522 RTLIB::LibcallImpl LibcallImpl = DAG.getLibcalls().getLibcallImpl(LC);
8523 SDValue RegMask = DAG.getRegisterMask(TRI.getCallPreservedMask(
8524 DAG.getMachineFunction(),
8525 DAG.getLibcalls().getLibcallImplCallingConv(LibcallImpl)));
8526 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8527 LibcallImpl, TLI.getPointerTy(DAG.getDataLayout()));
8528 SDValue TPIDR2_EL0 = DAG.getNode(
8529 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain,
8530 DAG.getTargetConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8531 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8532 // RESTORE_ZA pseudo.
8533 SDValue Glue;
8534 SDValue TPIDR2Block = DAG.getFrameIndex(
8535 TPIDR2.FrameIndex,
8537 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, TPIDR2Block, Glue);
8538 Chain =
8539 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8540 {Chain, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8541 RestoreRoutine, RegMask, Chain.getValue(1)});
8542 // Finally reset the TPIDR2_EL0 register to 0.
8543 Chain = DAG.getNode(
8544 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8545 DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8546 DAG.getConstant(0, DL, MVT::i64));
8547 TPIDR2.Uses++;
8548 return Chain;
8549}
8550
8551SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
8552 SelectionDAG &DAG) const {
8553 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8554 SDValue Glue = Chain.getValue(1);
8555
8556 MachineFunction &MF = DAG.getMachineFunction();
8557 auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>();
8558 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
8559 const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
8560
8561 SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs();
8562
8563 // The following conditions are true on entry to an exception handler:
8564 // - PSTATE.SM is 0.
8565 // - PSTATE.ZA is 0.
8566 // - TPIDR2_EL0 is null.
8567 // See:
8568 // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
8569 //
8570 // Therefore, if the function that contains this exception handler is a
8571 // streaming[-compatible] function, we must re-enable streaming mode.
8572 //
8573 // These mode changes are usually optimized away in catch blocks as they
8574 // occur before the __cxa_begin_catch (which is a non-streaming function),
8575 // but are necessary in some cases (such as for cleanups).
8576 //
8577 // Additionally, if the function has ZA or ZT0 state, we must restore it.
8578
8579 // [COND_]SMSTART SM
8580 if (SMEFnAttrs.hasStreamingInterfaceOrBody())
8581 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
8582 /*Glue*/ Glue, AArch64SME::Always);
8583 else if (SMEFnAttrs.hasStreamingCompatibleInterface())
8584 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
8586
8587 if (getTM().useNewSMEABILowering())
8588 return Chain;
8589
8590 if (SMEFnAttrs.hasAgnosticZAInterface()) {
8591 // Restore full ZA
8592 Chain = emitSMEStateSaveRestore(*this, DAG, &FuncInfo, DL, Chain,
8593 /*IsSave=*/false);
8594 } else if (SMEFnAttrs.hasZAState() || SMEFnAttrs.hasZT0State()) {
8595 // SMSTART ZA
8596 Chain = DAG.getNode(
8597 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
8598 DAG.getTargetConstant(int32_t(AArch64SVCR::SVCRZA), DL, MVT::i32));
8599
8600 // Restore ZT0
8601 if (SMEFnAttrs.hasZT0State()) {
8602 SDValue ZT0FrameIndex =
8603 getZT0FrameIndex(MF.getFrameInfo(), FuncInfo, DAG);
8604 Chain =
8605 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8606 {Chain, DAG.getConstant(0, DL, MVT::i32), ZT0FrameIndex});
8607 }
8608
8609 // Restore ZA
8610 if (SMEFnAttrs.hasZAState())
8611 Chain = emitRestoreZALazySave(Chain, DL, *this, TRI, FuncInfo, DAG);
8612 }
8613
8614 return Chain;
8615}
8616
8617SDValue AArch64TargetLowering::LowerFormalArguments(
8618 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
8619 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
8620 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
8621 MachineFunction &MF = DAG.getMachineFunction();
8622 const Function &F = MF.getFunction();
8623 MachineFrameInfo &MFI = MF.getFrameInfo();
8624 bool IsWin64 =
8625 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8626 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
8627 (isVarArg && Subtarget->isWindowsArm64EC());
8628 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8629
8631 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
8633 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
8634 FuncInfo->setIsSVECC(true);
8635
8636 // Assign locations to all of the incoming arguments.
8638 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
8639
8640 // At this point, Ins[].VT may already be promoted to i32. To correctly
8641 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
8642 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
8643 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
8644 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
8645 // LocVT.
8646 unsigned NumArgs = Ins.size();
8647 Function::const_arg_iterator CurOrigArg = F.arg_begin();
8648 unsigned CurArgIdx = 0;
8649 bool UseVarArgCC = false;
8650 if (IsWin64)
8651 UseVarArgCC = isVarArg;
8652
8653 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
8654
8655 for (unsigned i = 0; i != NumArgs; ++i) {
8656 MVT ValVT = Ins[i].VT;
8657 if (Ins[i].isOrigArg()) {
8658 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
8659 CurArgIdx = Ins[i].getOrigArgIndex();
8660
8661 // Get type of the original argument.
8662 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
8663 /*AllowUnknown*/ true);
8664 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
8665 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8666 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8667 ValVT = MVT::i8;
8668 else if (ActualMVT == MVT::i16)
8669 ValVT = MVT::i16;
8670 }
8671 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags,
8672 Ins[i].OrigTy, CCInfo);
8673 assert(!Res && "Call operand has unhandled type");
8674 (void)Res;
8675 }
8676
8677 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
8678 bool IsLocallyStreaming =
8679 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
8680 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8681 SDValue Glue = Chain.getValue(1);
8682
8683 unsigned ExtraArgLocs = 0;
8684 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
8685 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8686
8687 if (Ins[i].Flags.isByVal()) {
8688 // Byval is used for HFAs in the PCS, but the system should work in a
8689 // non-compliant manner for larger structs.
8690 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8691 int Size = Ins[i].Flags.getByValSize();
8692 unsigned NumRegs = (Size + 7) / 8;
8693
8694 // FIXME: This works on big-endian for composite byvals, which are the common
8695 // case. It should also work for fundamental types too.
8696 unsigned FrameIdx =
8697 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
8698 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
8699 InVals.push_back(FrameIdxN);
8700
8701 continue;
8702 }
8703
8704 if (Ins[i].Flags.isSwiftAsync())
8705 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
8706
8707 SDValue ArgValue;
8708 if (VA.isRegLoc()) {
8709 // Arguments stored in registers.
8710 EVT RegVT = VA.getLocVT();
8711 const TargetRegisterClass *RC;
8712
8713 if (RegVT == MVT::i32)
8714 RC = &AArch64::GPR32RegClass;
8715 else if (RegVT == MVT::i64)
8716 RC = &AArch64::GPR64RegClass;
8717 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
8718 RC = &AArch64::FPR16RegClass;
8719 else if (RegVT == MVT::f32)
8720 RC = &AArch64::FPR32RegClass;
8721 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
8722 RC = &AArch64::FPR64RegClass;
8723 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
8724 RC = &AArch64::FPR128RegClass;
8725 else if (RegVT.isScalableVector() &&
8726 RegVT.getVectorElementType() == MVT::i1) {
8727 FuncInfo->setIsSVECC(true);
8728 RC = &AArch64::PPRRegClass;
8729 } else if (RegVT == MVT::aarch64svcount) {
8730 FuncInfo->setIsSVECC(true);
8731 RC = &AArch64::PPRRegClass;
8732 } else if (RegVT.isScalableVector()) {
8733 FuncInfo->setIsSVECC(true);
8734 RC = &AArch64::ZPRRegClass;
8735 } else
8736 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
8737
8738 // Transform the arguments in physical registers into virtual ones.
8739 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
8740
8741 if (IsLocallyStreaming) {
8742 // LocallyStreamingFunctions must insert the SMSTART in the correct
8743 // position, so we use Glue to ensure no instructions can be scheduled
8744 // between the chain of:
8745 // t0: ch,glue = EntryNode
8746 // t1: res,ch,glue = CopyFromReg
8747 // ...
8748 // tn: res,ch,glue = CopyFromReg t(n-1), ..
8749 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
8750 // ^^^^^^
8751 // This will be the new Chain/Root node.
8752 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
8753 Glue = ArgValue.getValue(2);
8754 if (isPassedInFPR(ArgValue.getValueType())) {
8755 ArgValue =
8756 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8757 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
8758 {ArgValue, Glue});
8759 Glue = ArgValue.getValue(1);
8760 }
8761 } else
8762 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
8763
8764 // If this is an 8, 16 or 32-bit value, it is really passed promoted
8765 // to 64 bits. Insert an assert[sz]ext to capture this, then
8766 // truncate to the right size.
8767 switch (VA.getLocInfo()) {
8768 default:
8769 llvm_unreachable("Unknown loc info!");
8770 case CCValAssign::Full:
8771 break;
8773 assert(
8774 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8775 "Indirect arguments should be scalable on most subtargets");
8776 break;
8777 case CCValAssign::BCvt:
8778 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
8779 break;
8780 case CCValAssign::AExt:
8781 case CCValAssign::SExt:
8782 case CCValAssign::ZExt:
8783 break;
8785 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
8786 DAG.getConstant(32, DL, RegVT));
8787 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
8788 break;
8789 }
8790 } else { // VA.isRegLoc()
8791 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8792 unsigned ArgOffset = VA.getLocMemOffset();
8793 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8794 ? VA.getLocVT().getSizeInBits()
8795 : VA.getValVT().getSizeInBits()) / 8;
8796
8797 uint32_t BEAlign = 0;
8798 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8799 !Ins[i].Flags.isInConsecutiveRegs())
8800 BEAlign = 8 - ArgSize;
8801
8802 SDValue FIN;
8803 MachinePointerInfo PtrInfo;
8804 if (StackViaX4) {
8805 // In both the ARM64EC varargs convention and the thunk convention,
8806 // arguments on the stack are accessed relative to x4, not sp. In
8807 // the thunk convention, there's an additional offset of 32 bytes
8808 // to account for the shadow store.
8809 unsigned ObjOffset = ArgOffset + BEAlign;
8810 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8811 ObjOffset += 32;
8812 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8813 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8814 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8815 DAG.getConstant(ObjOffset, DL, MVT::i64));
8817 } else {
8818 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8819
8820 // Create load nodes to retrieve arguments from the stack.
8821 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8822 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8823 }
8824
8825 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
8827 MVT MemVT = VA.getValVT();
8828
8829 switch (VA.getLocInfo()) {
8830 default:
8831 break;
8832 case CCValAssign::Trunc:
8833 case CCValAssign::BCvt:
8834 MemVT = VA.getLocVT();
8835 break;
8837 assert(
8838 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8839 "Indirect arguments should be scalable on most subtargets");
8840 MemVT = VA.getLocVT();
8841 break;
8842 case CCValAssign::SExt:
8843 ExtType = ISD::SEXTLOAD;
8844 break;
8845 case CCValAssign::ZExt:
8846 ExtType = ISD::ZEXTLOAD;
8847 break;
8848 case CCValAssign::AExt:
8849 ExtType = ISD::EXTLOAD;
8850 break;
8851 }
8852
8853 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8854 MemVT);
8855 }
8856
8857 if (VA.getLocInfo() == CCValAssign::Indirect) {
8858 assert((VA.getValVT().isScalableVT() ||
8859 Subtarget->isWindowsArm64EC()) &&
8860 "Indirect arguments should be scalable on most subtargets");
8861
8862 TypeSize PartSize = VA.getValVT().getStoreSize();
8863 unsigned NumParts = 1;
8864 if (Ins[i].Flags.isInConsecutiveRegs()) {
8865 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8866 ++NumParts;
8867 }
8868
8869 MVT PartLoad = VA.getValVT();
8870 SDValue Ptr = ArgValue;
8871
8872 // Ensure we generate all loads for each tuple part, whilst updating the
8873 // pointer after each load correctly using vscale.
8874 while (NumParts > 0) {
8875 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8876 InVals.push_back(ArgValue);
8877 NumParts--;
8878 if (NumParts > 0) {
8879 SDValue BytesIncrement =
8880 DAG.getTypeSize(DL, Ptr.getValueType(), PartSize);
8881 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8882 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8883 ExtraArgLocs++;
8884 i++;
8885 }
8886 }
8887 } else {
8888 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8889 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8890 ArgValue, DAG.getValueType(MVT::i32));
8891
8892 // i1 arguments are zero-extended to i8 by the caller. Emit a
8893 // hint to reflect this.
8894 if (Ins[i].isOrigArg()) {
8895 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8896 if (OrigArg->getType()->isIntegerTy(1)) {
8897 if (!Ins[i].Flags.isZExt()) {
8898 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8899 ArgValue.getValueType(), ArgValue);
8900 }
8901 }
8902 }
8903
8904 InVals.push_back(ArgValue);
8905 }
8906 }
8907 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8908
8909 if (Attrs.hasStreamingCompatibleInterface()) {
8910 SDValue EntryPStateSM =
8911 DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
8912 DAG.getVTList(MVT::i64, MVT::Other), {Chain});
8913
8914 // Copy the value to a virtual register, and save that in FuncInfo.
8915 Register EntryPStateSMReg =
8916 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8917 Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
8918 EntryPStateSM);
8919 FuncInfo->setPStateSMReg(EntryPStateSMReg);
8920 }
8921
8922 // Insert the SMSTART if this is a locally streaming function and
8923 // make sure it is Glued to the last CopyFromReg value.
8924 if (IsLocallyStreaming) {
8925 if (Attrs.hasStreamingCompatibleInterface())
8926 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8928 else
8929 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8931
8932 // Ensure that the SMSTART happens after the CopyWithChain such that its
8933 // chain result is used.
8934 for (unsigned I=0; I<InVals.size(); ++I) {
8937 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8938 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8939 InVals[I].getValueType());
8940 }
8941 }
8942
8943 // varargs
8944 if (isVarArg) {
8946 if (!Subtarget->isTargetDarwin() || IsWin64) {
8947 // The AAPCS variadic function ABI is identical to the non-variadic
8948 // one. As a result there may be more arguments in registers and we
8949 // should save them for future reference.
8950 // Win64 variadic functions also pass arguments in registers, but all
8951 // float arguments are passed in integer registers.
8952 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8953 }
8954
8955 // This will point to the next argument passed via stack.
8956 unsigned VarArgsOffset = CCInfo.getStackSize();
8957 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8958 VarArgsOffset =
8959 alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8960 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8961 FuncInfo->setVarArgsStackIndex(
8962 MFI.CreateFixedObject(4, VarArgsOffset, true));
8963 }
8964
8965 if (MFI.hasMustTailInVarArgFunc()) {
8966 SmallVector<MVT, 2> RegParmTypes;
8967 RegParmTypes.push_back(MVT::i64);
8968 RegParmTypes.push_back(MVT::f128);
8969 // Compute the set of forwarded registers. The rest are scratch.
8970 SmallVectorImpl<ForwardedRegister> &Forwards =
8971 FuncInfo->getForwardedMustTailRegParms();
8972 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8974
8975 // Conservatively forward X8, since it might be used for aggregate return.
8976 if (!CCInfo.isAllocated(AArch64::X8)) {
8977 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8978 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8979 }
8980 }
8981 }
8982
8983 // On Windows, InReg pointers must be returned, so record the pointer in a
8984 // virtual register at the start of the function so it can be returned in the
8985 // epilogue.
8986 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8987 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8988 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8989 Ins[I].Flags.isInReg()) &&
8990 Ins[I].Flags.isSRet()) {
8991 assert(!FuncInfo->getSRetReturnReg());
8992
8993 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8994 Register Reg =
8996 FuncInfo->setSRetReturnReg(Reg);
8997
8998 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8999 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
9000 break;
9001 }
9002 }
9003 }
9004
9005 unsigned StackArgSize = CCInfo.getStackSize();
9006 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9007 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
9008 // This is a non-standard ABI so by fiat I say we're allowed to make full
9009 // use of the stack area to be popped, which must be aligned to 16 bytes in
9010 // any case:
9011 StackArgSize = alignTo(StackArgSize, 16);
9012
9013 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
9014 // a multiple of 16.
9015 FuncInfo->setArgumentStackToRestore(StackArgSize);
9016
9017 // This realignment carries over to the available bytes below. Our own
9018 // callers will guarantee the space is free by giving an aligned value to
9019 // CALLSEQ_START.
9020 }
9021 // Even if we're not expected to free up the space, it's useful to know how
9022 // much is there while considering tail calls (because we can reuse it).
9023 FuncInfo->setBytesInStackArgArea(StackArgSize);
9024
9025 if (Subtarget->hasCustomCallingConv())
9026 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
9027
9028 if (getTM().useNewSMEABILowering()) {
9029 if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
9030 SDValue Size;
9031 if (Attrs.hasZAState()) {
9032 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9033 DAG.getConstant(1, DL, MVT::i32));
9034 Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
9035 } else if (Attrs.hasAgnosticZAInterface()) {
9036 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
9037 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
9038
9039 SDValue Callee =
9040 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
9041 auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
9042 TargetLowering::CallLoweringInfo CLI(DAG);
9043 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
9044 DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy, Callee,
9045 {});
9046 std::tie(Size, Chain) = LowerCallTo(CLI);
9047 }
9048 if (Size) {
9049 SDValue Buffer = DAG.getNode(
9050 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
9051 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
9052 Chain = Buffer.getValue(1);
9053
9054 Register BufferPtr =
9055 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
9056 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
9057 Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
9058 DAG.getVTList(MVT::Other), Chain);
9059 FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
9060 MFI.CreateVariableSizedObject(Align(16), nullptr);
9061 }
9062 }
9063 } else {
9064 // Old SME ABI lowering (deprecated):
9065 // Create a 16 Byte TPIDR2 object. The dynamic buffer
9066 // will be expanded and stored in the static object later using a
9067 // pseudonode.
9068 if (Attrs.hasZAState()) {
9069 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9070 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
9071 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9072 DAG.getConstant(1, DL, MVT::i32));
9073 SDValue Buffer;
9074 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
9075 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
9076 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
9077 } else {
9078 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
9079 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
9080 DAG.getVTList(MVT::i64, MVT::Other),
9081 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
9082 MFI.CreateVariableSizedObject(Align(16), nullptr);
9083 }
9084 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9085 DAG.getConstant(1, DL, MVT::i32));
9086 Chain = DAG.getNode(
9087 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
9088 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
9089 /*Num save slices*/ NumZaSaveSlices});
9090 } else if (Attrs.hasAgnosticZAInterface()) {
9091 // Call __arm_sme_state_size().
9092 SDValue BufferSize =
9093 DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL,
9094 DAG.getVTList(MVT::i64, MVT::Other), Chain);
9095 Chain = BufferSize.getValue(1);
9096 SDValue Buffer;
9097 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
9098 Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
9099 DAG.getVTList(MVT::i64, MVT::Other),
9100 {Chain, BufferSize});
9101 } else {
9102 // Allocate space dynamically.
9103 Buffer = DAG.getNode(
9104 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
9105 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
9106 MFI.CreateVariableSizedObject(Align(16), nullptr);
9107 }
9108 // Copy the value to a virtual register, and save that in FuncInfo.
9109 Register BufferPtr =
9110 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
9111 FuncInfo->setSMESaveBufferAddr(BufferPtr);
9112 Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer);
9113 }
9114 }
9115
9116 if (CallConv == CallingConv::PreserveNone) {
9117 for (const ISD::InputArg &I : Ins) {
9118 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
9119 I.Flags.isSwiftAsync()) {
9120 MachineFunction &MF = DAG.getMachineFunction();
9121 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9122 MF.getFunction(),
9123 "Swift attributes can't be used with preserve_none",
9124 DL.getDebugLoc()));
9125 break;
9126 }
9127 }
9128 }
9129
9130 return Chain;
9131}
9132
/// Spill all still-unallocated variadic GPR (and, off-Win64, FPR) argument
/// registers into stack save areas so va_arg can later walk them, recording
/// the save-area frame indices and sizes in AArch64FunctionInfo.  The spill
/// stores are merged into \p Chain via a TokenFactor.
/// NOTE(review): some interior lines of this listing were elided by
/// extraction (e.g. the MemOps / GPRArgRegs / FPRArgRegs declarations).
9133void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
9134                                                SelectionDAG &DAG,
9135                                                const SDLoc &DL,
9136                                                SDValue &Chain) const {
9137  MachineFunction &MF = DAG.getMachineFunction();
9138  MachineFrameInfo &MFI = MF.getFrameInfo();
9139  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9140  auto PtrVT = getPointerTy(DAG.getDataLayout());
9141  Function &F = MF.getFunction();
9142  bool IsWin64 =
9143      Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
9144
9146
9148  unsigned NumGPRArgRegs = GPRArgRegs.size();
9149  if (Subtarget->isWindowsArm64EC()) {
9150    // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
9151    // functions.
9152    NumGPRArgRegs = 4;
9153  }
9154  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
9155
  // Eight bytes per GPR argument register not consumed by named arguments.
9156  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
9157  int GPRIdx = 0;
9158  if (GPRSaveSize != 0) {
9159    if (IsWin64) {
      // Win64 keeps the save area at a fixed offset just below the incoming
      // SP so it is contiguous with stack-passed variadic arguments.
9160      GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
9161      if (GPRSaveSize & 15)
9162        // The extra size here, if triggered, will always be 8.
9163        MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
9164    } else
9165      GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
9166
9167    SDValue FIN;
9168    if (Subtarget->isWindowsArm64EC()) {
9169      // With the Arm64EC ABI, we reserve the save area as usual, but we
9170      // compute its address relative to x4. For a normal AArch64->AArch64
9171      // call, x4 == sp on entry, but calls from an entry thunk can pass in a
9172      // different address.
9173      Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
9174      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
9175      FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
9176                        DAG.getConstant(GPRSaveSize, DL, MVT::i64));
9177    } else {
9178      FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
9179    }
9180
    // Copy each unallocated GPR out of its physreg and store it 8 bytes
    // further into the save area than the previous one.
9181    for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
9182      Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
9183      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
9184      SDValue Store =
9185          DAG.getStore(Val.getValue(1), DL, Val, FIN,
9187                                 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
9188                           : MachinePointerInfo::getStack(MF, i * 8));
9189      MemOps.push_back(Store);
9190      FIN =
9191          DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
9192    }
9193  }
9194  FuncInfo->setVarArgsGPRIndex(GPRIdx);
9195  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
9196
  // No FPR save area on Win64: float varargs are passed in integer registers.
9197  if (Subtarget->hasFPARMv8() && !IsWin64) {
9199    const unsigned NumFPRArgRegs = FPRArgRegs.size();
9200    unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
9201
    // Sixteen bytes per FPR register: the full q-register is saved.
9202    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
9203    int FPRIdx = 0;
9204    if (FPRSaveSize != 0) {
9205      FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
9206
9207      SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
9208
9209      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
9210        Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
9211        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
9212
9213        SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
9214                                     MachinePointerInfo::getStack(MF, i * 16));
9215        MemOps.push_back(Store);
9216        FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
9217                          DAG.getConstant(16, DL, PtrVT));
9218      }
9219    }
9220    FuncInfo->setVarArgsFPRIndex(FPRIdx);
9221    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
9222  }
9223
  // Merge every spill store into the incoming chain.
9224  if (!MemOps.empty()) {
9225    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
9226  }
9227}
9228
9229/// LowerCallResult - Lower the result values of a call into the
9230/// appropriate copies out of appropriate physical registers.
/// Appends one SDValue per result location to \p InVals and returns the
/// updated chain.  \p isThisReturn / \p ThisVal implement the "returned this"
/// optimization; \p RequiresSMChange marks calls with a streaming-mode switch.
/// NOTE(review): one case label of the LocInfo switch (original line 9269)
/// was elided by extraction.
9231SDValue AArch64TargetLowering::LowerCallResult(
9232    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
9233    const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
9234    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
9235    SDValue ThisVal, bool RequiresSMChange) const {
9236  DenseMap<unsigned, SDValue> CopiedRegs;
9237  // Copy all of the result registers out of their specified physreg.
9238  for (unsigned i = 0; i != RVLocs.size(); ++i) {
9239    CCValAssign VA = RVLocs[i];
9240
9241    // Pass 'this' value directly from the argument to return value, to avoid
9242    // reg unit interference
9243    if (i == 0 && isThisReturn) {
9244      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
9245             "unexpected return calling convention register assignment");
9246      InVals.push_back(ThisVal);
9247      continue;
9248    }
9249
9250    // Avoid copying a physreg twice since RegAllocFast is incompetent and only
9251    // allows one use of a physreg per block.
9252    SDValue Val = CopiedRegs.lookup(VA.getLocReg());
9253    if (!Val) {
9254      Val =
9255          DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
9256      Chain = Val.getValue(1);
9257      InGlue = Val.getValue(2);
9258      CopiedRegs[VA.getLocReg()] = Val;
9259    }
9260
    // Undo whatever promotion/conversion the calling convention applied to
    // the value so the caller sees the original value type.
9261    switch (VA.getLocInfo()) {
9262    default:
9263      llvm_unreachable("Unknown loc info!");
9264    case CCValAssign::Full:
9265      break;
9266    case CCValAssign::BCvt:
9267      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
9268      break;
9270      Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
9271                        DAG.getConstant(32, DL, VA.getLocVT()));
9272      [[fallthrough]];
9273    case CCValAssign::AExt:
9274      [[fallthrough]];
9275    case CCValAssign::ZExt:
9276      Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
9277      break;
9278    }
9279
    // Presumably this keeps FPR results from being coalesced across the
    // streaming-mode switch — TODO confirm COALESCER_BARRIER semantics.
9280    if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
9281      Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9282                        DAG.getVTList(Val.getValueType(), MVT::Glue), Val);
9283
9284    InVals.push_back(Val);
9285  }
9286
9287  return Chain;
9288}
9289
9290/// Return true if the calling convention is one that we can guarantee TCO for.
/// \p GuaranteeTailCalls is TargetOptions::GuaranteedTailCallOpt, as passed by
/// the callers in this file (see isEligibleForTailCallOptimization).
/// NOTE(review): the second half of the return expression (original line 9293)
/// was elided by extraction.
9291static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
9292  return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
9294}
9295
9296/// Return true if we might ever do TCO for calls with this calling convention.
/// This is only a coarse filter; the final decision is made by
/// isEligibleForTailCallOptimization, which calls this first.
/// NOTE(review): the function signature and several case labels (original
/// lines 9297, 9300-9303, 9305) were elided by extraction.
9298  switch (CC) {
9299  case CallingConv::C:
9304  case CallingConv::Swift:
9306  case CallingConv::Tail:
9307  case CallingConv::Fast:
9308    return true;
9309  default:
9310    return false;
9311  }
9312}
9313
9314/// Return true if the call convention supports varargs
9315/// Currently only those that pass varargs like the C
9316/// calling convention does are eligible
9317/// Calling conventions listed in this function must also
9318/// be properly handled in AArch64Subtarget::isCallingConvWin64
/// NOTE(review): the signature and some case labels (original lines 9319,
/// 9322, 9326) were elided by extraction.
9320  switch (CC) {
9321  case CallingConv::C:
9323    // SVE vector call is only partially supported, but it should
9324    // support named arguments being passed. Any arguments being passed
9325    // as varargs, are still unsupported.
9327    return true;
9328  default:
9329    return false;
9330  }
9331}
9332
/// Run the calling-convention assignment functions over every outgoing call
/// operand in \p CLI, populating \p CCInfo with the resulting locations.
/// Mirrors the per-argument logic of LowerFormalArguments (see FIXME below).
/// NOTE(review): the first signature line (original line 9333) was elided by
/// extraction; \p TLI and \p CLI come from that line.
9334                                const AArch64Subtarget *Subtarget,
9336                                CCState &CCInfo) {
9337  const SelectionDAG &DAG = CLI.DAG;
9338  CallingConv::ID CalleeCC = CLI.CallConv;
9339  bool IsVarArg = CLI.IsVarArg;
9340  const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9341  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
9342
9343  // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
9344  // for the shadow store.
9345  if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
9346    CCInfo.AllocateStack(32, Align(16));
9347
9348  unsigned NumArgs = Outs.size();
9349  for (unsigned i = 0; i != NumArgs; ++i) {
9350    MVT ArgVT = Outs[i].VT;
9351    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
9352
9353    bool UseVarArgCC = false;
9354    if (IsVarArg) {
9355      // On Windows, the fixed arguments in a vararg call are passed in GPRs
9356      // too, so use the vararg CC to force them to integer registers.
9357      if (IsCalleeWin64) {
9358        UseVarArgCC = true;
9359      } else {
9360        UseVarArgCC = ArgFlags.isVarArg();
9361      }
9362    }
9363
9364    if (!UseVarArgCC) {
9365      // Get type of the original argument.
9366      EVT ActualVT =
9367          TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
9368                           /*AllowUnknown*/ true);
9369      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
9370      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
9371      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
9372        ArgVT = MVT::i8;
9373      else if (ActualMVT == MVT::i16)
9374        ArgVT = MVT::i16;
9375    }
9376
9377    // FIXME: CCAssignFnForCall should be called once, for the call and not per
9378    // argument. This logic should exactly mirror LowerFormalArguments.
9379    CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
9380    bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
9381                        Outs[i].OrigTy, CCInfo);
9382    assert(!Res && "Call operand has unhandled type");
9383    (void)Res;
9384  }
9385}
9386
/// Build the SMECallAttrs (caller + callee SME attributes) for the call in
/// \p CLI: prefer the CallBase when one exists; otherwise derive the callee
/// attributes from the external-symbol name via the runtime-libcall info.
/// NOTE(review): the parameter lines and the final fallback return (original
/// lines 9388-9389, 9394) were elided by extraction.
9387static SMECallAttrs
9390  if (CLI.CB)
9391    return SMECallAttrs(*CLI.CB, &RTLCI);
9392  if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9393    return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), RTLCI));
9395}
9396
/// Decide whether the call described by \p CLI may be emitted as a tail call.
/// Checks, in order: the calling convention, SME streaming/ZA constraints,
/// byval/inreg caller arguments, guaranteed-TCO conventions, weak external
/// callees, result/CSR compatibility, and finally whether the callee's stack
/// arguments fit in the caller's own argument area.
9397bool AArch64TargetLowering::isEligibleForTailCallOptimization(
9398    const CallLoweringInfo &CLI) const {
9399  CallingConv::ID CalleeCC = CLI.CallConv;
9400  if (!mayTailCallThisCC(CalleeCC))
9401    return false;
9402
9403  SDValue Callee = CLI.Callee;
9404  bool IsVarArg = CLI.IsVarArg;
9405  const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9406  const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9407  const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9408  const SelectionDAG &DAG = CLI.DAG;
9409  MachineFunction &MF = DAG.getMachineFunction();
9410  const Function &CallerF = MF.getFunction();
9411  CallingConv::ID CallerCC = CallerF.getCallingConv();
9412
9413  // SME Streaming functions are not eligible for TCO as they may require
9414  // the streaming mode or ZA/ZT0 to be restored after returning from the call.
9415  SMECallAttrs CallAttrs =
9416      getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);
9417  if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
9418      CallAttrs.requiresPreservingAllZAState() ||
9419      CallAttrs.requiresPreservingZT0() ||
9420      CallAttrs.caller().hasStreamingBody() || CallAttrs.caller().isNewZA() ||
9421      CallAttrs.caller().isNewZT0())
9422    return false;
9423
9424  // Functions using the C or Fast calling convention that have an SVE signature
9425  // preserve more registers and should assume the SVE_VectorCall CC.
9426  // The check for matching callee-saved regs will determine whether it is
9427  // eligible for TCO.
  // NOTE(review): the CallerCC reassignment on original line 9430 was elided
  // by extraction.
9428  if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
9429      MF.getInfo<AArch64FunctionInfo>()->isSVECC())
9431
9432  bool CCMatch = CallerCC == CalleeCC;
9433
9434  // When using the Windows calling convention on a non-windows OS, we want
9435  // to back up and restore X18 in such functions; we can't do a tail call
9436  // from those functions.
9437  if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
9438      CalleeCC != CallingConv::Win64)
9439    return false;
9440
9441  // Byval parameters hand the function a pointer directly into the stack area
9442  // we want to reuse during a tail call. Working around this *is* possible (see
9443  // X86) but less efficient and uglier in LowerCall.
9444  for (Function::const_arg_iterator i = CallerF.arg_begin(),
9445                                    e = CallerF.arg_end();
9446       i != e; ++i) {
9447    if (i->hasByValAttr())
9448      return false;
9449
9450    // On Windows, "inreg" attributes signify non-aggregate indirect returns.
9451    // In this case, it is necessary to save X0/X1 in the callee and return it
9452    // in X0. Tail call opt may interfere with this, so we disable tail call
9453    // opt when the caller has an "inreg" attribute -- except if the callee
9454    // also has that attribute on the same argument, and the same value is
9455    // passed.
9456    if (i->hasInRegAttr()) {
9457      unsigned ArgIdx = i - CallerF.arg_begin();
9458      if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
9459        return false;
9460      AttributeSet Attrs = CLI.CB->getParamAttributes(ArgIdx);
9461      if (!Attrs.hasAttribute(Attribute::InReg) ||
9462          !Attrs.hasAttribute(Attribute::StructRet) || !i->hasStructRetAttr() ||
9463          CLI.CB->getArgOperand(ArgIdx) != i) {
9464        return false;
9465      }
9466    }
9467  }
9468
  // Under guaranteed TCO, a matching convention is all that is required.
9469  if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
9470    return CCMatch;
9471
9472  // Externally-defined functions with weak linkage should not be
9473  // tail-called on AArch64 when the OS does not support dynamic
9474  // pre-emption of symbols, as the AAELF spec requires normal calls
9475  // to undefined weak functions to be replaced with a NOP or jump to the
9476  // next instruction. The behaviour of branch instructions in this
9477  // situation (as used for tail calls) is implementation-defined, so we
9478  // cannot rely on the linker replacing the tail call with a return.
9479  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9480    const GlobalValue *GV = G->getGlobal();
9481    const Triple &TT = getTargetMachine().getTargetTriple();
9482    if (GV->hasExternalWeakLinkage() &&
9483        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
9484      return false;
9485  }
9486
9487  // Now we search for cases where we can use a tail call without changing the
9488  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
9489  // concept.
9490
9491  // I want anyone implementing a new calling convention to think long and hard
9492  // about this assert.
9493  if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
9494    report_fatal_error("Unsupported variadic calling convention");
9495
9496  LLVMContext &C = *DAG.getContext();
9497  // Check that the call results are passed in the same way.
9498  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
9499                                  CCAssignFnForCall(CalleeCC, IsVarArg),
9500                                  CCAssignFnForCall(CallerCC, IsVarArg)))
9501    return false;
9502  // The callee has to preserve all registers the caller needs to preserve.
9503  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9504  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
9505  if (!CCMatch) {
9506    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
9507    if (Subtarget->hasCustomCallingConv()) {
9508      TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
9509      TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
9510    }
9511    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
9512      return false;
9513  }
9514
9515  // Nothing more to check if the callee is taking no arguments
9516  if (Outs.empty())
9517    return true;
9518
  // NOTE(review): the ArgLocs declaration (original line 9519) was elided by
  // extraction.
9520  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
9521
9522  analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9523
9524  if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
9525    // When we are musttail, additional checks have been done and we can safely ignore this check
9526    // At least two cases here: if caller is fastcc then we can't have any
9527    // memory arguments (we'd be expected to clean up the stack afterwards). If
9528    // caller is C then we could potentially use its argument area.
9529
9530    // FIXME: for now we take the most conservative of these in both cases:
9531    // disallow all variadic memory operands.
9532    for (const CCValAssign &ArgLoc : ArgLocs)
9533      if (!ArgLoc.isRegLoc())
9534        return false;
9535  }
9536
9537  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9538
9539  // If any of the arguments is passed indirectly, it must be SVE, so the
9540  // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
9541  // allocate space on the stack. That is why we determine this explicitly here
9542  // the call cannot be a tailcall.
9543  if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
9544        assert((A.getLocInfo() != CCValAssign::Indirect ||
9545                A.getValVT().isScalableVector() ||
9546                Subtarget->isWindowsArm64EC()) &&
9547               "Expected value to be scalable");
9548        return A.getLocInfo() == CCValAssign::Indirect;
9549      }))
9550    return false;
9551
9552  // If the stack arguments for this call do not fit into our own save area then
9553  // the call cannot be made tail.
9554  if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
9555    return false;
9556
9557  const MachineRegisterInfo &MRI = MF.getRegInfo();
9558  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
9559    return false;
9560
9561  return true;
9562}
9563
9564SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
9565 SelectionDAG &DAG,
9566 MachineFrameInfo &MFI,
9567 int ClobberedFI) const {
9568 SmallVector<SDValue, 8> ArgChains;
9569 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
9570 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
9571
9572 // Include the original chain at the beginning of the list. When this is
9573 // used by target LowerCall hooks, this helps legalize find the
9574 // CALLSEQ_BEGIN node.
9575 ArgChains.push_back(Chain);
9576
9577 // Add a chain value for each stack argument corresponding
9578 for (SDNode *U : DAG.getEntryNode().getNode()->users())
9579 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
9580 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
9581 if (FI->getIndex() < 0) {
9582 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
9583 int64_t InLastByte = InFirstByte;
9584 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
9585
9586 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
9587 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
9588 ArgChains.push_back(SDValue(L, 1));
9589 }
9590
9591 // Build a tokenfactor for all the chains.
9592 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
9593}
9594
9595bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
9596 bool TailCallOpt) const {
9597 return (CallCC == CallingConv::Fast && TailCallOpt) ||
9598 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
9599}
9600
9601// Check if the value is zero-extended from i1 to i8
9602static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
9603 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
9604 if (SizeInBits < 8)
9605 return false;
9606
9607 APInt RequiredZero(SizeInBits, 0xFE);
9608 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
9609 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
9610 return ZExtBool;
9611}
9612
/// Post-ISel fixups: strip the fake implicit GPR defs glued onto SMSTART/
/// SMSTOP pseudos, add implicit VG/FPMR operands when the pseudo toggles
/// streaming mode, and add an implicit VG use to ADDXri/SUBXri that
/// materialise scalable frame addresses in functions with streaming-mode
/// changes.
9613void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9614                                                          SDNode *Node) const {
9615  // Live-in physreg copies that are glued to SMSTART are applied as
9616  // implicit-def's in the InstrEmitter. Here we remove them, allowing the
9617  // register allocator to pass call args in callee saved regs, without extra
9618  // copies to avoid these fake clobbers of actually-preserved GPRs.
9619  if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
9620      MI.getOpcode() == AArch64::MSRpstatePseudo) {
    // Walk backwards so removeOperand(I) does not shift the indices of the
    // operands still to be visited.  Operand 0 (the SVCR immediate) is kept.
9621    for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
9622      if (MachineOperand &MO = MI.getOperand(I);
9623          MO.isReg() && MO.isImplicit() && MO.isDef() &&
9624          (AArch64::GPR32RegClass.contains(MO.getReg()) ||
9625           AArch64::GPR64RegClass.contains(MO.getReg())))
9626        MI.removeOperand(I);
9627
9628    // The SVE vector length can change when entering/leaving streaming mode.
9629    // FPMR is set to 0 when entering/leaving streaming mode.
9630    if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
9631        MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
      // Model this as an implicit read-modify-write of VG plus a def of FPMR.
9632      MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9633                                              /*IsImplicit=*/true));
9634      MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
9635                                              /*IsImplicit=*/true));
9636      MI.addOperand(MachineOperand::CreateReg(AArch64::FPMR, /*IsDef=*/true,
9637                                              /*IsImplicit=*/true));
9638    }
9639  }
9640
9641  // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
9642  // have nothing to do with VG, were it not that they are used to materialise a
9643  // frame-address. If they contain a frame-index to a scalable vector, this
9644  // will likely require an ADDVL instruction to materialise the address, thus
9645  // reading VG.
9646  const MachineFunction &MF = *MI.getMF();
9647  if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
9648      (MI.getOpcode() == AArch64::ADDXri ||
9649       MI.getOpcode() == AArch64::SUBXri)) {
9650    const MachineOperand &MO = MI.getOperand(1);
9651    if (MO.isFI() && MF.getFrameInfo().hasScalableStackID(MO.getIndex()))
9652      MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9653                                              /*IsImplicit=*/true));
9654  }
9655}
9656
9658    SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
9659    unsigned Condition, bool InsertVectorLengthCheck) const {
  // NOTE(review): the first signature line and the MF/FuncInfo declarations
  // (original lines 9657, 9660-9661) were elided by extraction.
  // Record that this function switches streaming mode; later passes key off
  // this flag (see AdjustInstrPostInstrSelection above).
9662  FuncInfo->setHasStreamingModeChanges(true);
9663
  // Helper emitting a CHECK_MATCHING_VL node chained (and optionally glued)
  // into the sequence.
9664  auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
9665    SmallVector<SDValue, 2> Ops = {Chain};
9666    if (InGlue)
9667      Ops.push_back(InGlue);
9668    return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
9669                       DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9670  };
9671
9672  if (InsertVectorLengthCheck && Enable) {
9673    // Non-streaming -> Streaming
9674    // Insert vector length check before smstart
9675    SDValue CheckVL = GetCheckVL(Chain, InGlue);
9676    Chain = CheckVL.getValue(0);
9677    InGlue = CheckVL.getValue(1);
9678  }
9679
9680  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9681  SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
9682  SDValue MSROp =
9683      DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
9684  SmallVector<SDValue> Ops = {Chain, MSROp};
9685  unsigned Opcode;
  // Conditional toggles read the entry PSTATE.SM value saved in FuncInfo and
  // become COND_SMSTART/COND_SMSTOP; unconditional ones use SMSTART/SMSTOP.
9686  if (Condition != AArch64SME::Always) {
9687    Register PStateReg = FuncInfo->getPStateSMReg();
9688    assert(PStateReg.isValid() && "PStateSM Register is invalid");
9689    SDValue PStateSM =
9690        DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
9691    // Use chain and glue from the CopyFromReg.
9692    Ops[0] = PStateSM.getValue(1);
9693    InGlue = PStateSM.getValue(2);
9694    SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
9695    Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
9696    Ops.push_back(ConditionOp);
9697    Ops.push_back(PStateSM);
9698  } else {
9699    Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
9700  }
9701  Ops.push_back(RegMask);
9702
9703  if (InGlue)
9704    Ops.push_back(InGlue);
9705
9706  SDValue SMChange =
9707      DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9708
9709  if (!InsertVectorLengthCheck || Enable)
9710    return SMChange;
9711
9712  // Streaming -> Non-streaming
9713  // Insert vector length check after smstop since we cannot read VL
9714  // in streaming mode
9715  return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
9716}
9717
// NOTE(review): doxygen-extracted listing -- the function's declaration
// lines (originals 9718-9719, the getSMToggleCondition signature) and both
// return-statement lines (9724 and 9726) were dropped by the extractor;
// restore them from the original source before relying on this text.
// Selects the AArch64SME condition under which the SMSTART/SMSTOP emitted
// around a call must actually toggle PSTATE.SM: a caller that is not
// streaming-compatible (or that has a streaming body) has a statically
// known mode, so the toggle is unconditional.
9720 if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
9721 CallAttrs.caller().hasStreamingBody())
9722 return AArch64SME::Always;
9723 if (CallAttrs.callee().hasNonStreamingInterface())
// NOTE(review): dropped line 9724 presumably returns the condition used
// when the caller is currently streaming -- confirm.
9725 if (CallAttrs.callee().hasStreamingInterface())
// NOTE(review): dropped line 9726 presumably returns the condition used
// when the caller is currently non-streaming -- confirm.
9727
9728 llvm_unreachable("Unsupported attributes");
9729}
9730
9731/// Check whether a stack argument requires lowering in a tail call.
///
/// Returns true when the argument must be stored into the callee's argument
/// area, and false when the caller's existing (immutable) stack slot can be
/// reused as-is by the tail call.
// NOTE(review): doxygen-extraction gap -- the first signature line
// (original 9732, presumably
// "static bool shouldLowerTailCallStackArg(const MachineFunction &MF,")
// was dropped; the two lines below are the remainder of the parameter list.
9733 const CCValAssign &VA, SDValue Arg,
9734 ISD::ArgFlagsTy Flags, int CallOffset) {
9735 // FIXME: We should be able to handle this case, but it's not clear how to.
9736 if (Flags.isZExt() || Flags.isSExt())
9737 return true;
9738
9739 for (;;) {
9740 // Look through nodes that don't alter the bits of the incoming value.
9741 unsigned Op = Arg.getOpcode();
9742 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
9743 Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
9744 Arg = Arg.getOperand(0);
9745 continue;
9746 }
9747 break;
9748 }
9749
9750 // If the argument is a load from the same immutable stack slot, we can reuse
9751 // it.
9752 if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
9753 if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
9754 const MachineFrameInfo &MFI = MF.getFrameInfo();
9755 int FI = FINode->getIndex();
// The slot is reusable only if it cannot be written to (immutable),
// sits at exactly the offset the callee expects, and the loaded width
// matches the value's type -- otherwise a store is required.
9756 if (!MFI.isImmutableObjectIndex(FI))
9757 return true;
9758 if (CallOffset != MFI.getObjectOffset(FI))
9759 return true;
9760 uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
9761 if (SizeInBits != VA.getValVT().getSizeInBits())
9762 return true;
9763 return false;
9764 }
9765 }
9766
// Conservative default: anything we could not prove reusable is lowered.
9767 return true;
9768}
9769
// NOTE(review): doxygen-extracted listing -- every line carries its original
// source-line number, and gaps in that numbering mark lines the extractor
// dropped (typically declarations or hyperlink-only statements). Targeted
// notes below flag the gaps that matter most for reading; restore all
// dropped lines from the original source before relying on this text.
9770/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
9771/// and add input and output parameter nodes.
9772SDValue
9773AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
9774 SmallVectorImpl<SDValue> &InVals) const {
9775 SelectionDAG &DAG = CLI.DAG;
9776 SDLoc &DL = CLI.DL;
9777 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9778 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9779 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9780 SDValue Chain = CLI.Chain;
9781 SDValue Callee = CLI.Callee;
9782 bool &IsTailCall = CLI.IsTailCall;
9783 CallingConv::ID &CallConv = CLI.CallConv;
9784 bool IsVarArg = CLI.IsVarArg;
9785 const CallBase *CB = CLI.CB;
9786
9787 MachineFunction &MF = DAG.getMachineFunction();
9788 MachineFunction::CallSiteInfo CSInfo;
9789 bool IsThisReturn = false;
9790
9791 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9792 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9793 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
9794 bool IsSibCall = false;
9795 bool GuardWithBTI = false;
9796
// returns_twice callees (e.g. setjmp) may re-enter at the instruction after
// the call, so a BTI landing pad is needed unless explicitly disabled.
9797 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
9798 !Subtarget->noBTIAtReturnTwice()) {
9799 GuardWithBTI = FuncInfo->branchTargetEnforcement();
9800 }
9801
9802 // Analyze operands of the call, assigning locations to each operand.
// NOTE(review): dropped line 9803 presumably declares ArgLocs
// (SmallVector<CCValAssign, 16> ArgLocs;) -- confirm.
9804 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
9805
9806 if (IsVarArg) {
9807 unsigned NumArgs = Outs.size();
9808
9809 for (unsigned i = 0; i != NumArgs; ++i) {
9810 if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
9811 report_fatal_error("Passing SVE types to variadic functions is "
9812 "currently not supported");
9813 }
9814 }
9815
9816 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9817
9818 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9819 // Assign locations to each value returned by this call.
// NOTE(review): dropped line 9820 presumably declares RVLocs
// (SmallVector<CCValAssign, 16> RVLocs;) -- confirm.
9821 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
9822 *DAG.getContext());
9823 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
9824
9825 // Set type id for call site info.
9826 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
9827 CSInfo = MachineFunction::CallSiteInfo(*CB);
9828
9829 // Check callee args/returns for SVE registers and set calling convention
9830 // accordingly.
9831 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
9832 auto HasSVERegLoc = [](CCValAssign &Loc) {
9833 if (!Loc.isRegLoc())
9834 return false;
9835 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
9836 AArch64::PPRRegClass.contains(Loc.getLocReg());
9837 };
9838 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
// NOTE(review): dropped line 9839 presumably switches CallConv to the
// SVE vector-call convention -- confirm.
9840 }
9841
9842 // Determine whether we need any streaming mode changes.
9843 SMECallAttrs CallAttrs =
// NOTE(review): dropped line 9844 holds the initializer of CallAttrs
// (presumably built from the caller function and CLI) -- confirm.
9845
9846 std::optional<unsigned> ZAMarkerNode;
9847 bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
9848
// Under the new SME ABI lowering, ZA state handling is expressed as marker
// nodes consumed by a later pass instead of being expanded inline here.
9849 if (UseNewSMEABILowering) {
9850 if (CallAttrs.requiresLazySave() ||
9851 CallAttrs.requiresPreservingAllZAState())
9852 ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
9853 else if (CallAttrs.requiresPreservingZT0())
9854 ZAMarkerNode = AArch64ISD::REQUIRES_ZT0_SAVE;
9855 else if (CallAttrs.caller().hasZAState() ||
9856 CallAttrs.caller().hasZT0State())
9857 ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
9858 }
9859
9860 if (IsTailCall) {
9861 // Check if it's really possible to do a tail call.
9862 IsTailCall = isEligibleForTailCallOptimization(CLI);
9863
9864 // A sibling call is one where we're under the usual C ABI and not planning
9865 // to change that but can still do a tail call:
9866 if (!ZAMarkerNode && !TailCallOpt && IsTailCall &&
9867 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
9868 IsSibCall = true;
9869
9870 if (IsTailCall)
9871 ++NumTailCalls;
9872 }
9873
9874 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9875 report_fatal_error("failed to perform tail call elimination on a call "
9876 "site marked musttail");
9877
9878 // Get a count of how many bytes are to be pushed on the stack.
9879 unsigned NumBytes = CCInfo.getStackSize();
9880
9881 if (IsSibCall) {
9882 // Since we're not changing the ABI to make this a tail call, the memory
9883 // operands are already available in the caller's incoming argument space.
9884 NumBytes = 0;
9885 }
9886
9887 // FPDiff is the byte offset of the call's argument area from the callee's.
9888 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9889 // by this amount for a tail call. In a sibling call it must be 0 because the
9890 // caller will deallocate the entire stack and the callee still expects its
9891 // arguments to begin at SP+0. Completely unused for non-tail calls.
9892 int FPDiff = 0;
9893
9894 if (IsTailCall && !IsSibCall) {
9895 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9896
9897 // Since callee will pop argument stack as a tail call, we must keep the
9898 // popped size 16-byte aligned.
9899 NumBytes = alignTo(NumBytes, 16);
9900
9901 // FPDiff will be negative if this tail call requires more space than we
9902 // would automatically have in our incoming argument space. Positive if we
9903 // can actually shrink the stack.
9904 FPDiff = NumReusableBytes - NumBytes;
9905
9906 // Update the required reserved area if this is the tail call requiring the
9907 // most argument stack space.
9908 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9909 FuncInfo->setTailCallReservedStack(-FPDiff);
9910
9911 // The stack pointer must be 16-byte aligned at all times it's used for a
9912 // memory operation, which in practice means at *all* times and in
9913 // particular across call boundaries. Therefore our own arguments started at
9914 // a 16-byte aligned SP and the delta applied for the tail call should
9915 // satisfy the same constraint.
9916 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9917 }
9918
// Shared helper for the optimization remarks emitted below: appends a
// human-readable "call from 'caller' to 'callee'" description to a remark.
9919 auto DescribeCallsite =
9920 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9921 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9922 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9923 R << ore::NV("Callee", ES->getSymbol());
9924 else if (CLI.CB && CLI.CB->getCalledFunction())
9925 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9926 else
9927 R << "unknown callee";
9928 R << "'";
9929 return R;
9930 };
9931
9932 bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
9933 bool RequiresSaveAllZA =
9934 !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
9935 if (RequiresLazySave) {
// Point TPIDR2_EL0 at the TPIDR2 object so the callee's smstart can
// trigger the lazy save of ZA if needed.
9936 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9937 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9938 TPIDR2.FrameIndex,
// NOTE(review): dropped line 9939 presumably supplies the pointer type
// argument to getFrameIndex -- confirm.
9940 Chain = DAG.getNode(
9941 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9942 DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9943 TPIDR2ObjAddr);
9944 OptimizationRemarkEmitter ORE(&MF.getFunction());
9945 ORE.emit([&]() {
9946 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9947 CLI.CB)
9948 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9949 &MF.getFunction());
9950 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9951 });
9952 } else if (RequiresSaveAllZA) {
9953 assert(!CallAttrs.callee().hasSharedZAInterface() &&
9954 "Cannot share state that may not exist");
9955 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9956 /*IsSave=*/true);
9957 }
9958
9959 bool RequiresSMChange = CallAttrs.requiresSMChange();
9960 if (RequiresSMChange) {
9961 OptimizationRemarkEmitter ORE(&MF.getFunction());
9962 ORE.emit([&]() {
9963 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9964 CLI.CB)
9965 : OptimizationRemarkAnalysis("sme", "SMETransition",
9966 &MF.getFunction());
9967 DescribeCallsite(R) << " requires a streaming mode transition";
9968 return R;
9969 });
9970 }
9971
9972 SDValue ZTFrameIdx;
9973 MachineFrameInfo &MFI = MF.getFrameInfo();
9974 bool ShouldPreserveZT0 =
9975 !UseNewSMEABILowering && CallAttrs.requiresPreservingZT0();
9976
9977 // If the caller has ZT0 state which will not be preserved by the callee,
9978 // spill ZT0 before the call.
9979 if (ShouldPreserveZT0) {
9980 ZTFrameIdx = getZT0FrameIndex(MFI, *FuncInfo, DAG);
9981
9982 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9983 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9984 }
9985
9986 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
9987 // PSTATE.ZA before the call if there is no lazy-save active.
9988 bool DisableZA =
9989 !UseNewSMEABILowering && CallAttrs.requiresDisablingZABeforeCall();
9990 assert((!DisableZA || !RequiresLazySave) &&
9991 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9992
9993 if (DisableZA)
9994 Chain = DAG.getNode(
9995 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
9996 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
9997
9998 // Adjust the stack pointer for the new arguments... and mark ZA uses.
9999 // These operations are automatically eliminated by the prolog/epilog pass
10000 assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START");
10001 if (!IsSibCall) {
10002 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
10003 if (ZAMarkerNode) {
10004 // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to, simply
10005 // using a chain can result in incorrect scheduling. The markers refer to
10006 // the position just before the CALLSEQ_START (though occur after as
10007 // CALLSEQ_START lacks in-glue).
10008 Chain =
10009 DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other, MVT::Glue),
10010 {Chain, Chain.getValue(1)});
10011 }
10012 }
10013
10014 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
// NOTE(review): dropped line 10015 presumably supplies the pointer type
// argument to getCopyFromReg -- confirm.
10016
// NOTE(review): dropped line 10017 presumably declares RegsToPass
// (SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;), which is
// used extensively below -- confirm.
10018 SmallSet<unsigned, 8> RegsUsed;
10019 SmallVector<SDValue, 8> MemOpChains;
10020 auto PtrVT = getPointerTy(DAG.getDataLayout());
10021
// musttail varargs: forward the registers the caller saved for this
// purpose straight through to the callee.
10022 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
10023 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
10024 for (const auto &F : Forwards) {
10025 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
10026 RegsToPass.emplace_back(F.PReg, Val);
10027 }
10028 }
10029
10030 // Walk the register/memloc assignments, inserting copies/loads.
10031 unsigned ExtraArgLocs = 0;
10032 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
10033 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
10034 SDValue Arg = OutVals[i];
10035 ISD::ArgFlagsTy Flags = Outs[i].Flags;
10036
10037 // Promote the value if needed.
10038 switch (VA.getLocInfo()) {
10039 default:
10040 llvm_unreachable("Unknown loc info!");
10041 case CCValAssign::Full:
10042 break;
10043 case CCValAssign::SExt:
10044 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
10045 break;
10046 case CCValAssign::ZExt:
10047 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
10048 break;
10049 case CCValAssign::AExt:
10050 if (Outs[i].ArgVT == MVT::i1) {
10051 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
10052 //
10053 // Check if we actually have to do this, because the value may
10054 // already be zero-extended.
10055 //
10056 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
10057 // and rely on DAGCombiner to fold this, because the following
10058 // (anyext i32) is combined with (zext i8) in DAG.getNode:
10059 //
10060 // (ext (zext x)) -> (zext x)
10061 //
10062 // This will give us (zext i32), which we cannot remove, so
10063 // try to check this beforehand.
10064 if (!checkZExtBool(Arg, DAG)) {
10065 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
10066 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
10067 }
10068 }
10069 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
10070 break;
// NOTE(review): dropped line 10071 is the case label for this arm
// (presumably CCValAssign::AExtUpper) -- confirm.
10072 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
10073 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
10074 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
10075 DAG.getConstant(32, DL, VA.getLocVT()));
10076 break;
10077 case CCValAssign::BCvt:
10078 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
10079 break;
10080 case CCValAssign::Trunc:
10081 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10082 break;
10083 case CCValAssign::FPExt:
10084 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
10085 break;
// NOTE(review): dropped line 10086 is the case label that opens this
// braced arm (presumably "case CCValAssign::Indirect: {") -- confirm.
10087 bool isScalable = VA.getValVT().isScalableVT();
10088 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
10089 "Indirect arguments should be scalable on most subtargets");
10090
10091 TypeSize StoreSize = VA.getValVT().getStoreSize();
10092 TypeSize PartSize = StoreSize;
10093 unsigned NumParts = 1;
10094 if (Outs[i].Flags.isInConsecutiveRegs()) {
10095 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
10096 ++NumParts;
10097 StoreSize *= NumParts;
10098 }
10099
10100 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
10101 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
10102 MachineFrameInfo &MFI = MF.getFrameInfo();
10103 int FI =
10104 MFI.CreateStackObject(StoreSize.getKnownMinValue(), Alignment, false);
10105 if (isScalable) {
10106 bool IsPred = VA.getValVT() == MVT::aarch64svcount ||
10107 VA.getValVT().getVectorElementType() == MVT::i1;
// NOTE(review): dropped lines 10108-10109 presumably set the stack ID
// of FI to the appropriate scalable (vector/predicate) stack ID based
// on IsPred -- confirm.
10110 }
10111
10112 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
10113 SDValue Ptr = DAG.getFrameIndex(
// NOTE(review): dropped line 10114 presumably supplies FI and the frame
// pointer type to getFrameIndex -- confirm.
10115 SDValue SpillSlot = Ptr;
10116
10117 // Ensure we generate all stores for each tuple part, whilst updating the
10118 // pointer after each store correctly using vscale.
10119 while (NumParts) {
10120 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
10121 MemOpChains.push_back(Store);
10122
10123 NumParts--;
10124 if (NumParts > 0) {
10125 SDValue BytesIncrement =
10126 DAG.getTypeSize(DL, Ptr.getValueType(), PartSize);
10127 MPI = MachinePointerInfo(MPI.getAddrSpace());
10128 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
10129 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
10130 ExtraArgLocs++;
10131 i++;
10132 }
10133 }
10134
// Pass the address of the spill slot instead of the value itself.
10135 Arg = SpillSlot;
10136 break;
10137 }
10138
10139 if (VA.isRegLoc()) {
10140 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
10141 Outs[0].VT == MVT::i64) {
10142 assert(VA.getLocVT() == MVT::i64 &&
10143 "unexpected calling convention register assignment");
10144 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
10145 "unexpected use of 'returned'");
10146 IsThisReturn = true;
10147 }
10148 if (RegsUsed.count(VA.getLocReg())) {
10149 // If this register has already been used then we're trying to pack
10150 // parts of an [N x i32] into an X-register. The extension type will
10151 // take care of putting the two halves in the right place but we have to
10152 // combine them.
10153 SDValue &Bits =
10154 llvm::find_if(RegsToPass,
10155 [=](const std::pair<unsigned, SDValue> &Elt) {
10156 return Elt.first == VA.getLocReg();
10157 })
10158 ->second;
10159 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10160 // Call site info is used for function's parameter entry value
10161 // tracking. For now we track only simple cases when parameter
10162 // is transferred through whole register.
// NOTE(review): dropped line 10163 presumably begins an erase_if over
// CSInfo.ArgRegPairs using the predicate below -- confirm.
10164 [&VA](MachineFunction::ArgRegPair ArgReg) {
10165 return ArgReg.Reg == VA.getLocReg();
10166 });
10167 } else {
10168 // Add an extra level of indirection for streaming mode changes by
10169 // using a pseudo copy node that cannot be rematerialised between a
10170 // smstart/smstop and the call by the simple register coalescer.
10171 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
10172 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10173 DAG.getVTList(Arg.getValueType(), MVT::Glue), Arg);
10174 RegsToPass.emplace_back(VA.getLocReg(), Arg);
10175 RegsUsed.insert(VA.getLocReg());
10176 const TargetOptions &Options = DAG.getTarget().Options;
10177 if (Options.EmitCallSiteInfo)
10178 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
10179 }
10180 } else {
10181 assert(VA.isMemLoc());
10182
10183 SDValue DstAddr;
10184 MachinePointerInfo DstInfo;
10185
10186 // FIXME: This works on big-endian for composite byvals, which are the
10187 // common case. It should also work for fundamental types too.
10188 uint32_t BEAlign = 0;
10189 unsigned OpSize;
10190 if (VA.getLocInfo() == CCValAssign::Indirect ||
// NOTE(review): dropped line 10191 holds the second half of this
// condition (presumably another LocInfo comparison) -- confirm.
10192 OpSize = VA.getLocVT().getFixedSizeInBits();
10193 else
10194 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
10195 : VA.getValVT().getSizeInBits();
10196 OpSize = (OpSize + 7) / 8;
10197 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
10198 !Flags.isInConsecutiveRegs()) {
10199 if (OpSize < 8)
10200 BEAlign = 8 - OpSize;
10201 }
10202 unsigned LocMemOffset = VA.getLocMemOffset();
10203 int32_t Offset = LocMemOffset + BEAlign;
10204
10205 if (IsTailCall) {
10206 // When the frame pointer is perfectly aligned for the tail call and the
10207 // same stack argument is passed down intact, we can reuse it.
10208 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
10209 continue;
10210
10211 Offset = Offset + FPDiff;
10212 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
10213
10214 DstAddr = DAG.getFrameIndex(FI, PtrVT);
10215 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
10216
10217 // Make sure any stack arguments overlapping with where we're storing
10218 // are loaded before this eventual operation. Otherwise they'll be
10219 // clobbered.
10220 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
10221 } else {
10222 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
10223
10224 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
10225 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
10226 }
10227
10228 if (Outs[i].Flags.isByVal()) {
10229 SDValue SizeNode =
10230 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
10231 SDValue Cpy = DAG.getMemcpy(
10232 Chain, DL, DstAddr, Arg, SizeNode,
10233 Outs[i].Flags.getNonZeroByValAlign(),
10234 /*isVol = */ false, /*AlwaysInline = */ false,
10235 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
10236
10237 MemOpChains.push_back(Cpy);
10238 } else {
10239 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
10240 // promoted to a legal register type i32, we should truncate Arg back to
10241 // i1/i8/i16.
10242 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
10243 VA.getValVT() == MVT::i16)
10244 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
10245
10246 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
10247 MemOpChains.push_back(Store);
10248 }
10249 }
10250 }
10251
10252 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
10253 !(CLI.CB && CLI.CB->isMustTailCall())) {
10254 SDValue ParamPtr = StackPtr;
10255 if (IsTailCall) {
10256 // Create a dummy object at the top of the stack that can be used to get
10257 // the SP after the epilogue
10258 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
10259 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
10260 }
10261
10262 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
10263 // describing the argument list. x4 contains the address of the
10264 // first stack parameter. x5 contains the size in bytes of all parameters
10265 // passed on the stack.
10266 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
10267 RegsToPass.emplace_back(AArch64::X5,
10268 DAG.getConstant(NumBytes, DL, MVT::i64));
10269 }
10270
10271 if (!MemOpChains.empty())
10272 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
10273
10274 SDValue InGlue;
10275 if (RequiresSMChange) {
10276 bool InsertVectorLengthCheck =
// NOTE(review): dropped line 10277 holds the initializer of
// InsertVectorLengthCheck -- confirm.
10278 Chain = changeStreamingMode(
10279 DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
10280 getSMToggleCondition(CallAttrs), InsertVectorLengthCheck);
10281 InGlue = Chain.getValue(1);
10282 }
10283
10284 // Build a sequence of copy-to-reg nodes chained together with token chain
10285 // and flag operands which copy the outgoing args into the appropriate regs.
10286 for (auto &RegToPass : RegsToPass) {
10287 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
10288 RegToPass.second, InGlue);
10289 InGlue = Chain.getValue(1);
10290 }
10291
10292 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
10293 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
10294 // node so that legalize doesn't hack it.
10295 const GlobalValue *CalledGlobal = nullptr;
10296 unsigned OpFlags = 0;
10297 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
10298 CalledGlobal = G->getGlobal();
10299 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
// NOTE(review): dropped line 10300 presumably passes the target machine
// to classifyGlobalFunctionReference -- confirm.
10301 if (OpFlags & AArch64II::MO_GOT) {
10302 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
10303 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
10304 } else {
10305 const GlobalValue *GV = G->getGlobal();
10306 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
10307 }
10308 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
10309 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
10310 Subtarget->isTargetMachO()) ||
// NOTE(review): dropped line 10311 holds the remaining UseGot condition
// -- confirm.
10312 const char *Sym = S->getSymbol();
10313 if (UseGot) {
// NOTE(review): dropped line 10314 presumably creates the
// TargetExternalSymbol with GOT flags before the LOADgot -- confirm.
10315 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
10316 } else {
10317 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
10318 }
10319 }
10320
10321 // We don't usually want to end the call-sequence here because we would tidy
10322 // the frame up *after* the call, however in the ABI-changing tail-call case
10323 // we've carefully laid out the parameters so that when sp is reset they'll be
10324 // in the correct location.
10325 if (IsTailCall && !IsSibCall) {
10326 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
10327 InGlue = Chain.getValue(1);
10328 }
10329
10330 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
10331
10332 std::vector<SDValue> Ops;
10333 Ops.push_back(Chain);
10334 Ops.push_back(Callee);
10335
10336 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
10337 // be expanded to the call, directly followed by a special marker sequence and
10338 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
10339 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
10340 assert(!IsTailCall &&
10341 "tail calls cannot be marked with clang.arc.attachedcall");
10342 Opc = AArch64ISD::CALL_RVMARKER;
10343
10344 // Add a target global address for the retainRV/claimRV runtime function
10345 // just before the call target.
10346 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
10347 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
10348 Ops.insert(Ops.begin() + 1, GA);
10349
10350 // We may or may not need to emit both the marker and the retain/claim call.
10351 // Tell the pseudo expansion using an additional boolean op.
10352 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CLI.CB);
10353 SDValue DoEmitMarker =
10354 DAG.getTargetConstant(ShouldEmitMarker, DL, MVT::i32);
10355 Ops.insert(Ops.begin() + 2, DoEmitMarker);
10356 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10357 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
10358 } else if (GuardWithBTI) {
10359 Opc = AArch64ISD::CALL_BTI;
10360 }
10361
10362 if (IsTailCall) {
10363 // Each tail call may have to adjust the stack by a different amount, so
10364 // this information must travel along with the operation for eventual
10365 // consumption by emitEpilogue.
10366 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
10367 }
10368
10369 if (CLI.PAI) {
10370 const uint64_t Key = CLI.PAI->Key;
// NOTE(review): dropped line 10371 holds the asserted condition on Key
// (presumably restricting it to the IA/IB auth keys) -- confirm.
10372 "Invalid auth call key");
10373
10374 // Split the discriminator into address/integer components.
10375 SDValue AddrDisc, IntDisc;
10376 std::tie(IntDisc, AddrDisc) =
10377 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
10378
10379 if (Opc == AArch64ISD::CALL_RVMARKER)
10380 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
10381 else
10382 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
10383 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
10384 Ops.push_back(IntDisc);
10385 Ops.push_back(AddrDisc);
10386 }
10387
10388 // Add argument registers to the end of the list so that they are known live
10389 // into the call.
10390 for (auto &RegToPass : RegsToPass)
10391 Ops.push_back(DAG.getRegister(RegToPass.first,
10392 RegToPass.second.getValueType()));
10393
10394 // Add a register mask operand representing the call-preserved registers.
10395 const uint32_t *Mask;
10396 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10397 if (IsThisReturn) {
10398 // For 'this' returns, use the X0-preserving mask if applicable
10399 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
10400 if (!Mask) {
10401 IsThisReturn = false;
10402 Mask = TRI->getCallPreservedMask(MF, CallConv);
10403 }
10404 } else
10405 Mask = TRI->getCallPreservedMask(MF, CallConv);
10406
10407 if (Subtarget->hasCustomCallingConv())
10408 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
10409
10410 if (TRI->isAnyArgRegReserved(MF))
10411 TRI->emitReservedArgRegCallError(MF);
10412
10413 assert(Mask && "Missing call preserved mask for calling convention");
10414 Ops.push_back(DAG.getRegisterMask(Mask));
10415
10416 if (InGlue.getNode())
10417 Ops.push_back(InGlue);
10418
10419 if (CLI.DeactivationSymbol)
10420 Ops.push_back(DAG.getDeactivationSymbol(CLI.DeactivationSymbol));
10421
10422 // If we're doing a tall call, use a TC_RETURN here rather than an
10423 // actual call instruction.
10424 if (IsTailCall) {
// NOTE(review): dropped line 10425 presumably records the tail call on
// the frame info (setHasTailCall) -- confirm.
10426 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
10427 if (IsCFICall)
10428 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10429
10430 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
10431 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
10432 if (CalledGlobal &&
10433 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10434 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
10435 return Ret;
10436 }
10437
10438 // Returns a chain and a flag for retval copy to use.
10439 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
10440 if (IsCFICall)
10441 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10442
10443 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
10444 InGlue = Chain.getValue(1);
10445 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
10446 if (CalledGlobal &&
10447 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10448 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
10449
10450 uint64_t CalleePopBytes =
10451 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
10452
10453 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
10454 InGlue = Chain.getValue(1);
10455
10456 // Handle result values, copying them out of physregs into vregs that we
10457 // return.
10458 SDValue Result = LowerCallResult(
10459 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
10460 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
10461
10462 if (!Ins.empty())
10463 InGlue = Result.getValue(Result->getNumValues() - 1);
10464
10465 if (RequiresSMChange) {
// NOTE(review): dropped line 10466 presumably assigns
// "Result = changeStreamingMode(" (the inverse transition back to the
// caller's mode) -- confirm.
10467 DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
10468 getSMToggleCondition(CallAttrs));
10469 }
10470
10471 if (!UseNewSMEABILowering &&
10472 (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall()))
10473 // Unconditionally resume ZA.
10474 Result = DAG.getNode(
10475 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
10476 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
10477
10478 if (ShouldPreserveZT0)
10479 Result =
10480 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
10481 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
10482
10483 if (RequiresLazySave) {
10484 Result = emitRestoreZALazySave(Result, DL, *this, *TRI, *FuncInfo, DAG);
10485 } else if (RequiresSaveAllZA) {
10486 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
10487 /*IsSave=*/false);
10488 }
10489
10490 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
10491 RequiresSaveAllZA) {
10492 for (unsigned I = 0; I < InVals.size(); ++I) {
10493 // The smstart/smstop is chained as part of the call, but when the
10494 // resulting chain is discarded (which happens when the call is not part
10495 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
10496 // smstart/smstop is chained to the result value. We can do that by doing
10497 // a vreg -> vreg copy.
// NOTE(review): dropped lines 10498-10499 presumably create the virtual
// register Reg used by the copy below -- confirm.
10500 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
10501 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
10502 InVals[I].getValueType());
10503 }
10504 }
10505
10506 if (CallConv == CallingConv::PreserveNone) {
10507 for (const ISD::OutputArg &O : Outs) {
10508 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
10509 O.Flags.isSwiftAsync()) {
10510 MachineFunction &MF = DAG.getMachineFunction();
10511 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10512 MF.getFunction(),
10513 "Swift attributes can't be used with preserve_none",
10514 DL.getDebugLoc()));
10515 break;
10516 }
10517 }
10518 }
10519
10520 return Result;
10521}
10522
// NOTE(review): doxygen-extraction gaps -- dropped line 10525 presumably
// declares the Outs and Context parameters, and dropped line 10528
// presumably declares RVLocs (SmallVector<CCValAssign, 16> RVLocs;) --
// confirm against the original source.
// Returns true if the return values described by Outs can all be assigned
// locations under this calling convention (dry-run via CCState::CheckReturn
// with the convention's return CCAssignFn).
10523bool AArch64TargetLowering::CanLowerReturn(
10524 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
10526 const Type *RetTy) const {
10527 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10529 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
10530 return CCInfo.CheckReturn(Outs, RetCC);
10531}
10532
/// Lower the outgoing return values into copies to the convention-assigned
/// physical registers and emit the terminating AArch64ISD::RET_GLUE node
/// (or a TC_RETURN-based dispatch sequence for ARM64EC entry thunks).
/// NOTE(review): this listing elides a few lines (the Outs parameter, the
/// RVLocs/RetVals local declarations, an AExtUpper case label, and some
/// call-argument continuations) — confirm against the full source.
SDValue
AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                   bool isVarArg,
                                   const SmallVectorImpl<SDValue> &OutVals,
                                   const SDLoc &DL, SelectionDAG &DAG) const {
  auto &MF = DAG.getMachineFunction();
  auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  // Assign each return value to a physical register per the convention.
  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC);

  // Copy the result values into the output registers.
  SDValue Glue;
  SmallSet<unsigned, 4> RegsUsed;
  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue Arg = OutVals[realRVLocIdx];

    // Promote/convert the value to the location type demanded by the CC.
    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      if (Outs[i].ArgVT == MVT::i1) {
        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
        // value. This is strictly redundant on Darwin (which uses "zeroext
        // i1"), but will be optimised out before ISel.
        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      }
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
    case CCValAssign::ZExt:
      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
      break;
      // NOTE(review): a case label (upper-bits assignment) is elided from
      // this listing before the following statements.
      assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
      Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
                        DAG.getConstant(32, DL, VA.getLocVT()));
      break;
    }

    // Two values may share one register (e.g. upper/lower halves); OR the
    // pieces together instead of emitting a second conflicting copy.
    if (RegsUsed.count(VA.getLocReg())) {
      SDValue &Bits =
          llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
            return Elt.first == VA.getLocReg();
          })->second;
      Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
    } else {
      RetVals.emplace_back(VA.getLocReg(), Arg);
      RegsUsed.insert(VA.getLocReg());
    }
  }

  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();

  // Emit SMSTOP before returning from a locally streaming function
  SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
  if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
    // NOTE(review): the condition argument of the first changeStreamingMode
    // call is elided from this listing.
    if (FuncAttrs.hasStreamingCompatibleInterface())
      Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
                                  /*Glue*/ SDValue(),
    else
      Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
                                  /*Glue*/ SDValue(), AArch64SME::Always);
    Glue = Chain.getValue(1);
  }

  // RetOps[0] is the chain; register/value operands follow.
  SmallVector<SDValue, 4> RetOps(1, Chain);
  for (auto &RetVal : RetVals) {
    // FPR-returned values of a locally streaming function must not be
    // coalesced across the streaming-mode change; insert a barrier.
    if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
        isPassedInFPR(RetVal.second.getValueType()))
      RetVal.second =
          DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
                      DAG.getVTList(RetVal.second.getValueType(), MVT::Glue),
                      RetVal.second);
    Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
    Glue = Chain.getValue(1);
    RetOps.push_back(
        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
  }

  // Windows AArch64 ABIs require that for returning structs by value we copy
  // the sret argument into X0 for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into X0.
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
    // NOTE(review): the trailing argument(s) of this getCopyFromReg call are
    // elided from this listing.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,

    unsigned RetValReg = AArch64::X0;
    // ARM64EC x64 entry thunks return the sret pointer in X8 instead.
    if (CallConv == CallingConv::ARM64EC_Thunk_X64)
      RetValReg = AArch64::X8;
    Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
    Glue = Chain.getValue(1);

    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
  }

  // Add callee-saved registers returned via copy as implicit return operands.
  const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
  if (I) {
    for (; *I; ++I) {
      if (AArch64::GPR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else if (AArch64::FPR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the glue if we have it.
  if (Glue.getNode())
    RetOps.push_back(Glue);

  if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
    // ARM64EC entry thunks use a special return sequence: instead of a regular
    // "ret" instruction, they need to explicitly call the emulator.
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDValue Arm64ECRetDest =
        DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
    Arm64ECRetDest =
        getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
    Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
                                 MachinePointerInfo());
    RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
    RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
    return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
  }

  return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
}
10678
10679//===----------------------------------------------------------------------===//
10680// Other Lowering Code
10681//===----------------------------------------------------------------------===//
10682
10683SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
10684 SelectionDAG &DAG,
10685 unsigned Flag) const {
10686 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
10687 N->getOffset(), Flag);
10688}
10689
10690SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
10691 SelectionDAG &DAG,
10692 unsigned Flag) const {
10693 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
10694}
10695
10696SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
10697 SelectionDAG &DAG,
10698 unsigned Flag) const {
10699 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
10700 N->getOffset(), Flag);
10701}
10702
10703SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
10704 SelectionDAG &DAG,
10705 unsigned Flag) const {
10706 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
10707}
10708
10709SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
10710 SelectionDAG &DAG,
10711 unsigned Flag) const {
10712 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
10713}
10714
10715// (loadGOT sym)
10716template <class NodeTy>
10717SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
10718 unsigned Flags) const {
10719 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
10720 SDLoc DL(N);
10721 EVT Ty = getPointerTy(DAG.getDataLayout());
10722 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
10723 // FIXME: Once remat is capable of dealing with instructions with register
10724 // operands, expand this into two nodes instead of using a wrapper node.
10725 if (DAG.getMachineFunction()
10726 .getInfo<AArch64FunctionInfo>()
10727 ->hasELFSignedGOT())
10728 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
10729 0);
10730 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
10731}
10732
10733// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
10734template <class NodeTy>
10735SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
10736 unsigned Flags) const {
10737 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
10738 SDLoc DL(N);
10739 EVT Ty = getPointerTy(DAG.getDataLayout());
10740 const unsigned char MO_NC = AArch64II::MO_NC;
10741 return DAG.getNode(
10742 AArch64ISD::WrapperLarge, DL, Ty,
10743 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
10744 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
10745 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
10746 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
10747}
10748
// (addlow (adrp %hi(sym)) %lo(sym))
// Materialize an address via the standard small-code-model pair: an ADRP of
// the symbol's 4KiB page plus an ADDlow of the low 12 bits.
// NOTE(review): the listing elides the flag-argument line of the Lo
// getTargetNode call — confirm against the full source.
template <class NodeTy>
SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
                                       unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  // Page-relative high part, resolved by ADRP.
  SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
  SDValue Lo = getTargetNode(N, Ty, DAG,
  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
  return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
}
10762
10763// (adr sym)
10764template <class NodeTy>
10765SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
10766 unsigned Flags) const {
10767 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
10768 SDLoc DL(N);
10769 EVT Ty = getPointerTy(DAG.getDataLayout());
10770 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
10771 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
10772}
10773
/// Lower a GlobalAddress node by choosing between the GOT, large, tiny and
/// small-code-model materialization strategies based on how the subtarget
/// classifies the reference.
/// NOTE(review): this listing elides several lines (the assert body, the
/// Result declaration with the large-code-model condition, and parts of the
/// trailing dso-local-equivalent load) — confirm against the full source.
SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
                                                  SelectionDAG &DAG) const {
  GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GN->getGlobal();
  unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());

  if (OpFlags != AArch64II::MO_NO_FLAG)
         "unexpected offset in global node");

  // This also catches the large code model case for Darwin, and tiny code
  // model with got relocations.
  if ((OpFlags & AArch64II::MO_GOT) != 0) {
    return getGOT(GN, DAG, OpFlags);
  }

    Result = getAddrLarge(GN, DAG, OpFlags);
  } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
    Result = getAddrTiny(GN, DAG, OpFlags);
  } else {
    Result = getAddr(GN, DAG, OpFlags);
  }
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(GN);
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
  return Result;
}
10806
10807/// Convert a TLS address reference into the correct sequence of loads
10808/// and calls to compute the variable's address (for Darwin, currently) and
10809/// return an SDValue containing the final node.
10810
10811/// Darwin only has one TLS scheme which must be capable of dealing with the
10812/// fully general situation, in the worst case. This means:
10813/// + "extern __thread" declaration.
10814/// + Defined in a possibly unknown dynamic library.
10815///
10816/// The general system is that each __thread variable has a [3 x i64] descriptor
10817/// which contains information used by the runtime to calculate the address. The
10818/// only part of this the compiler needs to know about is the first xword, which
10819/// contains a function pointer that must be called with the address of the
10820/// entire descriptor in "x0".
10821///
10822/// Since this descriptor may be in a different unit, in general even the
10823/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10824/// is:
10825/// adrp x0, _var@TLVPPAGE
10826/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10827/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10828/// ; the function pointer
10829/// blr x1 ; Uses descriptor address in x0
10830/// ; Address of _var is now in x0.
10831///
10832/// If the address of _var's descriptor *is* known to the linker, then it can
10833/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10834/// a slight efficiency gain.
/// Lower a Darwin TLS access: load the variable's TLV descriptor via the GOT,
/// load the resolver function pointer from the descriptor's first slot, and
/// call it (through a plain or authenticated call) with the descriptor
/// address in X0; the variable's address comes back in X0.
/// NOTE(review): this listing elides a few lines (the trailing arguments of
/// the descriptor load and the declaration of the Ops operand vector) —
/// confirm against the full source.
SDValue
AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
                                                   SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");

  SDLoc DL(Op);
  MVT PtrVT = getPointerTy(DAG.getDataLayout());
  // Pointer size in memory may differ from the DAG pointer type (ILP32).
  MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

  SDValue TLVPAddr =
      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      PtrMemVT, DL, Chain, DescAddr,
      // NOTE(review): the MachinePointerInfo and memory-operand-flag
      // arguments of this load are elided from this listing.
      Align(PtrMemVT.getSizeInBits() / 8),
  Chain = FuncTLVGet.getValue(1);

  // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
  FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);

  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
  // silly).
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *Mask = TRI->getTLSCallPreservedMask();
  if (Subtarget->hasCustomCallingConv())
    TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);

  // Finally, we can make the call. This is just a degenerate version of a
  // normal AArch64 call node: x0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());

  // NOTE(review): the declaration of the Ops operand vector is elided from
  // this listing.
  unsigned Opcode = AArch64ISD::CALL;
  Ops.push_back(Chain);
  Ops.push_back(FuncTLVGet);

  // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
  if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
    Opcode = AArch64ISD::AUTH_CALL;
    Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
    Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
    Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
  }

  Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
  Ops.push_back(DAG.getRegisterMask(Mask));
  Ops.push_back(Chain.getValue(1));
  Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
}
10898
10899/// Convert a thread-local variable reference into a sequence of instructions to
10900/// compute the variable's address for the local exec TLS model of ELF targets.
10901/// The sequence depends on the maximum TLS area size.
/// Lower a local-exec ELF TLS access. The instruction sequence depends on the
/// configured maximum TLS area size (TLSSize): 12/24 bits use tprel ADDs off
/// the thread pointer; 32/48 bits build the offset with MOVZ/MOVK before a
/// final ADD.
/// NOTE(review): this listing elides several getTargetGlobalAddress lines
/// (one variable declaration and a number of relocation-flag argument
/// continuations) — confirm against the full source.
SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
                                                    SDValue ThreadBase,
                                                    const SDLoc &DL,
                                                    SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue TPOff, Addr;

  switch (DAG.getTarget().Options.TLSSize) {
  default:
    llvm_unreachable("Unexpected TLS size");

  case 12: {
    // mrs x0, TPIDR_EL0
    // add x0, x0, :tprel_lo12:a
    // NOTE(review): the declaration line of Var is elided from this listing.
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
                                      Var,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
  }

  case 24: {
    // mrs x0, TPIDR_EL0
    // add x0, x0, :tprel_hi12:a
    // add x0, x0, :tprel_lo12_nc:a
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
    Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
                                      HiVar,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
                                      LoVar,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
  }

  case 32: {
    // mrs x1, TPIDR_EL0
    // movz x0, #:tprel_g1:a
    // movk x0, #:tprel_g0_nc:a
    // add x0, x1, x0
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
                                       DAG.getTargetConstant(16, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
  }

  case 48: {
    // mrs x1, TPIDR_EL0
    // movz x0, #:tprel_g2:a
    // movk x0, #:tprel_g1_nc:a
    // movk x0, #:tprel_g0_nc:a
    // add x0, x1, x0
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
    SDValue MiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
                                       DAG.getTargetConstant(32, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
                                       DAG.getTargetConstant(16, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
  }
  }
}
10989
10990/// When accessing thread-local variables under either the general-dynamic or
10991/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10992/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10993/// is a function pointer to carry out the resolution.
10994///
10995/// The sequence is:
10996/// adrp x0, :tlsdesc:var
10997/// ldr x1, [x0, #:tlsdesc_lo12:var]
10998/// add x0, x0, #:tlsdesc_lo12:var
10999/// .tlsdesccall var
11000/// blr x1
11001/// (TPIDR_EL0 offset now in x0)
11002///
11003/// The above sequence must be produced unscheduled, to enable the linker to
11004/// optimize/relax this sequence.
11005/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
11006/// above sequence, and expanded really late in the compilation flow, to ensure
11007/// the sequence is produced as per above.
/// Emit the (possibly authenticated) TLSDESC_CALLSEQ pseudo for \p SymAddr,
/// bracketing it with SME streaming-mode toggles and a lazy ZA save marker
/// when the calling function's SME attributes require them. Returns the
/// resulting TPIDR_EL0 offset, read out of X0.
SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
                                                      const SDLoc &DL,
                                                      SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  auto &MF = DAG.getMachineFunction();
  auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  SDValue Glue;
  SDValue Chain = DAG.getEntryNode();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  // Model the TLS resolver as a call to a "normal" (non-streaming) callee so
  // the usual SME state-change rules apply around it.
  SMECallAttrs TLSCallAttrs(FuncInfo->getSMEFnAttrs(), {}, SMEAttrs::Normal);
  bool RequiresSMChange = TLSCallAttrs.requiresSMChange();

  // Helper: split a glued node into its chain and glue results.
  auto ChainAndGlue = [](SDValue Chain) -> std::pair<SDValue, SDValue> {
    return {Chain, Chain.getValue(1)};
  };

  // Leave streaming mode before the resolver call if required.
  if (RequiresSMChange)
    std::tie(Chain, Glue) =
        ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/false, Chain, Glue,
                                         getSMToggleCondition(TLSCallAttrs)));

  // A signed GOT needs the authenticating variant of the call sequence.
  unsigned Opcode =
      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
          ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
          : AArch64ISD::TLSDESC_CALLSEQ;
  // Only pass the glue operand if a preceding node produced one.
  SDValue Ops[] = {Chain, SymAddr, Glue};
  std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
      Opcode, DL, NodeTys, Glue ? ArrayRef(Ops) : ArrayRef(Ops).drop_back()));

  // Mark that ZA must have been lazily saved across this call if needed.
  if (TLSCallAttrs.requiresLazySave())
    std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
        AArch64ISD::REQUIRES_ZA_SAVE, DL, NodeTys, {Chain, Chain.getValue(1)}));

  // Re-enter streaming mode after the call if we left it above.
  if (RequiresSMChange)
    std::tie(Chain, Glue) =
        ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
                                         getSMToggleCondition(TLSCallAttrs)));

  // The resolver leaves the TP-relative offset in X0.
  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
}
11050
/// Lower an ELF TLS access by dispatching on the resolved TLS model:
/// local-exec, initial-exec (GOT-loaded tprel), local-dynamic
/// (_TLS_MODULE_BASE_ descriptor call plus dtprel adds) or general-dynamic
/// (per-variable descriptor call). The result is ThreadBase + TPOff.
/// NOTE(review): this listing elides several lines (the code-model/TLS-model
/// computation, a local-dynamic bookkeeping call, and a few relocation-flag
/// argument continuations) — confirm against the full source.
SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetELF() && "This function expects an ELF target");

  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  AArch64FunctionInfo *MFI =
      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

  // NOTE(review): the lines computing the TLS model (and any forced-model
  // override) are elided from this listing.

    if (Model == TLSModel::LocalDynamic)
  }

      Model != TLSModel::LocalExec)
    report_fatal_error("ELF TLS only supported in small memory model or "
                       "in local exec TLS model");
  // Different choices can be made for the maximum size of the TLS area for a
  // module. For the small address model, the default TLS size is 16MiB and the
  // maximum TLS size is 4GiB.
  // FIXME: add tiny and large code model support for TLS access models other
  // than local exec. We currently generate the same code as small for tiny,
  // which may be larger than needed.

  SDValue TPOff;
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);
  const GlobalValue *GV = GA->getGlobal();

  // Read the thread pointer (TPIDR_EL0); every model adds an offset to it.
  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);

  if (Model == TLSModel::LocalExec) {
    return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
  } else if (Model == TLSModel::InitialExec) {
    // Initial-exec: the tprel offset is loaded from the GOT.
    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
  } else if (Model == TLSModel::LocalDynamic) {
    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
    // the beginning of the module's TLS region, followed by a DTPREL offset
    // calculation.

    // These accesses will need deduplicating if there's more than one.

    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,

    // Now we can calculate the offset from TPIDR_EL0 to this module's
    // thread-local area.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);

    // Now use :dtprel_whatever: operations to calculate this variable's offset
    // in its thread-storage area.
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0,

    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
  } else if (Model == TLSModel::GeneralDynamic) {
    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue SymAddr =
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);

    // Finally we can make a call to calculate the offset from tpidr_el0.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
  } else
    llvm_unreachable("Unsupported ELF TLS access model");

  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}
11139
/// Lower a Windows TLS access: walk TEB->ThreadLocalStoragePointer (at X18 +
/// 0x58), index that array by the module's _tls_index, then add the
/// variable's offset from the .tls section base via ADDXri/ADDlow.
/// NOTE(review): this listing elides the relocation-flag continuation line of
/// the TGALo getTargetGlobalAddress call — confirm against the full source.
SDValue
AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  // X18 holds the TEB pointer on Windows AArch64.
  SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x58 from the TEB.
  SDValue TLSArray =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
  Chain = TLSArray.getValue(1);

  // Load the TLS index from the C runtime;
  // This does the same as getAddr(), but without having a GlobalAddressSDNode.
  // This also does the same as LOADgot, but using a generic i32 load,
  // while LOADgot only loads i64.
  SDValue TLSIndexHi =
      DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
  SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
      "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
  SDValue TLSIndex =
      DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
  TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
  Chain = TLSIndex.getValue(1);

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
  // offset into the TLSArray.
  TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                             DAG.getConstant(3, DL, PtrVT));
  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
                            MachinePointerInfo());
  Chain = TLS.getValue(1);

  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GA->getGlobal();
  SDValue TGAHi = DAG.getTargetGlobalAddress(
      GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
  SDValue TGALo = DAG.getTargetGlobalAddress(
      GV, DL, PtrVT, 0,

  // Add the offset from the start of the .tls section (section base).
  SDValue Addr =
      SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
                                 DAG.getTargetConstant(0, DL, MVT::i32)),
              0);
  Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
  return Addr;
}
11198
11199SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
11200 SelectionDAG &DAG) const {
11201 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
11202 if (DAG.getTarget().useEmulatedTLS())
11203 return LowerToTLSEmulatedModel(GA, DAG);
11204
11205 if (Subtarget->isTargetDarwin())
11206 return LowerDarwinGlobalTLSAddress(Op, DAG);
11207 if (Subtarget->isTargetELF())
11208 return LowerELFGlobalTLSAddress(Op, DAG);
11209 if (Subtarget->isTargetWindows())
11210 return LowerWindowsGlobalTLSAddress(Op, DAG);
11211
11212 llvm_unreachable("Unexpected platform trying to use TLS");
11213}
11214
11215//===----------------------------------------------------------------------===//
11216// PtrAuthGlobalAddress lowering
11217//
11218// We have 3 lowering alternatives to choose from:
11219// - MOVaddrPAC: similar to MOVaddr, with added PAC.
11220// If the GV doesn't need a GOT load (i.e., is locally defined)
11221// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
11222//
11223// - LOADgotPAC: similar to LOADgot, with added PAC.
11224// If the GV needs a GOT load, materialize the pointer using the usual
11225// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
11226// section is assumed to be read-only (for example, via relro mechanism). See
11227// LowerMOVaddrPAC.
11228//
11229// - LOADauthptrstatic: similar to LOADgot, but use a
11230// special stub slot instead of a GOT slot.
11231// Load a signed pointer for symbol 'sym' from a stub slot named
11232// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
11233// resolving. This usually lowers to adrp+ldr, but also emits an entry into
11234// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
11235//
11236// All 3 are pseudos that are expand late to longer sequences: this lets us
11237// provide integrity guarantees on the to-be-signed intermediate values.
11238//
11239// LOADauthptrstatic is undesirable because it requires a large section filled
11240// with often similarly-signed pointers, making it a good harvesting target.
11241// Thus, it's only used for ptrauth references to extern_weak to avoid null
11242// checks.
11243
    // NOTE(review): the function's name/return-type line is elided from this
    // listing; these are the parameters of the LOADauthptrstatic lowering
    // helper for extern_weak ptrauth globals — confirm against full source.
    SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
    SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
  const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
  // Only extern_weak references are routed through the static-stub lowering.
  assert(TGN->getGlobal()->hasExternalWeakLinkage());

  // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
  // offset alone as a pointer if the symbol wasn't available, which would
  // probably break null checks in users. Ptrauth complicates things further:
  // error out.
  // NOTE(review): the report_fatal_error( line is elided from this listing.
  if (TGN->getOffset() != 0)
        "unsupported non-zero offset in weak ptrauth global reference");

  // Address discrimination cannot be represented in a static stub slot.
  if (!isNullConstant(AddrDiscriminator))
    report_fatal_error("unsupported weak addr-div ptrauth global");

  // Emit the pseudo that loads the pre-signed pointer from the stub slot.
  SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
  return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
                                    {TGA, Key, Discriminator}),
                 0);
}
11266
/// Lower a PtrAuthGlobalAddress node by validating its key/discriminator
/// operands and picking one of three pseudos: MOVaddrPAC (locally defined),
/// LOADgotPAC (needs a GOT load) or LOADauthptrstatic (extern_weak).
/// NOTE(review): this listing elides two lines (a report_fatal_error( call
/// head and the tail call into the extern_weak helper) — confirm against the
/// full source.
SDValue
AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDValue Ptr = Op.getOperand(0);
  uint64_t KeyC = Op.getConstantOperandVal(1);
  SDValue AddrDiscriminator = Op.getOperand(2);
  uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  if (KeyC > AArch64PACKey::LAST)
    report_fatal_error("key in ptrauth global out of range [0, " +
                       Twine((int)AArch64PACKey::LAST) + "]");

  // Blend only works if the integer discriminator is 16-bit wide.
  if (!isUInt<16>(DiscriminatorC))
        "constant discriminator in ptrauth global out of range [0, 0xffff]");

  // Choosing between 3 lowering alternatives is target-specific.
  if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
    report_fatal_error("ptrauth global lowering only supported on MachO/ELF");

  // Peel a constant offset off the pointer operand, if present.
  int64_t PtrOffsetC = 0;
  if (Ptr.getOpcode() == ISD::ADD) {
    PtrOffsetC = Ptr.getConstantOperandVal(1);
    Ptr = Ptr.getOperand(0);
  }
  const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
  const GlobalValue *PtrGV = PtrN->getGlobal();

  // Classify the reference to determine whether it needs a GOT load.
  const unsigned OpFlags =
      Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
  const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
  assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
         "unsupported non-GOT op flags on ptrauth global reference");

  // Fold any offset into the GV; our pseudos expect it there.
  PtrOffsetC += PtrN->getOffset();
  SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
                                            /*TargetFlags=*/0);
  assert(PtrN->getTargetFlags() == 0 &&
         "unsupported target flags on ptrauth global");

  SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
  SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
  // XZR stands in for "no address discriminator".
  SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
                                   ? AddrDiscriminator
                                   : DAG.getRegister(AArch64::XZR, MVT::i64);

  // No GOT load needed -> MOVaddrPAC
  if (!NeedsGOTLoad) {
    assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
    return SDValue(
        DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
                           {TPtr, Key, TAddrDiscriminator, Discriminator}),
        0);
  }

  // GOT load -> LOADgotPAC
  // Note that we disallow extern_weak refs to avoid null checks later.
  if (!PtrGV->hasExternalWeakLinkage())
    return SDValue(
        DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
                           {TPtr, Key, TAddrDiscriminator, Discriminator}),
        0);

  // extern_weak ref -> LOADauthptrstatic
  // NOTE(review): the call head of the helper invoked here is elided from
  // this listing.
      TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
      DAG);
}
11340
11341// Looks through \param Val to determine the bit that can be used to
11342// check the sign of the value. It returns the unextended value and
11343// the sign bit position.
11344std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
11345 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
11346 return {Val.getOperand(0),
11347 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
11348 1};
11349
11350 if (Val.getOpcode() == ISD::SIGN_EXTEND)
11351 return {Val.getOperand(0),
11352 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
11353
11354 return {Val, Val.getValueSizeInBits() - 1};
11355}
11356
11357 // Op is an SDValue that is being compared to 0. If the comparison is a bit
11358 // test, optimize it to a TBZ or TBNZ.
  // NOTE(review): the opening line of the signature (line 11359, declaring the
  // DL/Op/Chain parameters) is not visible in this excerpt. Callers in
  // LowerBR_CC pass Opcode as AArch64ISD::TBZ or AArch64ISD::TBNZ.
11360 SDValue Dest, unsigned Opcode,
11361 SelectionDAG &DAG) {
  // Only an AND can be turned into a bit test.
11362 if (Op.getOpcode() != ISD::AND)
11363 return SDValue();
11364
11365 // See if we can use a TBZ to fold in an AND as well.
11366 // TBZ has a smaller branch displacement than CBZ. If the offset is
11367 // out of bounds, a late MI-layer pass rewrites branches.
11368 // 403.gcc is an example that hits this case.
  // (X & (1 << N)) ==/!= 0  -->  TB(N)Z X, N  for a power-of-two mask.
11369 if (isa<ConstantSDNode>(Op.getOperand(1)) &&
11370 isPowerOf2_64(Op.getConstantOperandVal(1))) {
11371 SDValue Test = Op.getOperand(0);
11372 uint64_t Mask = Op.getConstantOperandVal(1);
11373 return DAG.getNode(Opcode, DL, MVT::Other, Chain, Test,
11374 DAG.getConstant(Log2_64(Mask), DL, MVT::i64), Dest);
11375 }
11376
  // (X & (1 << Shift)) ==/!= 0  -->  TB(N)Z (X >> Shift), 0
  // for a variable shift amount: shift the other AND operand right and test
  // bit 0 instead.
11377 if (Op.getOperand(0).getOpcode() == ISD::SHL) {
11378 auto Op00 = Op.getOperand(0).getOperand(0);
11379 if (isa<ConstantSDNode>(Op00) && Op00->getAsZExtVal() == 1) {
11380 auto Shr = DAG.getNode(ISD::SRL, DL, Op00.getValueType(),
11381 Op.getOperand(1), Op.getOperand(0).getOperand(1));
11382 return DAG.getNode(Opcode, DL, MVT::Other, Chain, Shr,
11383 DAG.getConstant(0, DL, MVT::i64), Dest);
11384 }
11385 }
11386
  // Not a recognizable bit-test pattern.
11387 return SDValue();
11388 }
11389
  // Lower ISD::BR_CC (conditional branch on a comparison) for AArch64.
  // Operands: chain, condition code, LHS, RHS, destination basic block.
11390 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
11391 SDValue Chain = Op.getOperand(0);
11392 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
11393 SDValue LHS = Op.getOperand(2);
11394 SDValue RHS = Op.getOperand(3);
11395 SDValue Dest = Op.getOperand(4);
11396 SDLoc DL(Op);
11397
11398 MachineFunction &MF = DAG.getMachineFunction();
11399 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
11400 // will not be produced, as they are conditional branch instructions that do
11401 // not set flags.
11402 bool ProduceNonFlagSettingCondBr =
11403 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
11404
11405 // Handle f128 first, since lowering it will result in comparing the return
11406 // value of a libcall against zero, which is just what the rest of LowerBR_CC
11407 // is expecting to deal with.
11408 if (LHS.getValueType() == MVT::f128) {
11409 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
11410
11411 // If softenSetCCOperands returned a scalar, we need to compare the result
11412 // against zero to select between true and false values.
11413 if (!RHS.getNode()) {
11414 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11415 CC = ISD::SETNE;
11416 }
11417 }
11418
11419 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
11420 // instruction.
  // NOTE(review): the first line of this condition (11421) is missing from
  // this view; presumably it checks that LHS is an overflow-intrinsic result
  // — confirm against the full source.
11422 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
11423 // Only lower legal XALUO ops.
11424 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
11425 return SDValue();
11426
11427 // The actual operation with overflow check.
  // NOTE(review): line 11428 (declaring the AArch64 condition code OFCC) is
  // missing from this view.
11429 SDValue Value, Overflow;
11430 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
11431
  // Branching on "overflow == 0" means branching on the inverted condition.
11432 if (CC == ISD::SETNE)
11433 OFCC = getInvertedCondCode(OFCC);
11434 SDValue CCVal = getCondCode(DAG, OFCC);
11435
11436 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11437 Overflow);
11438 }
11439
11440 if (LHS.getValueType().isInteger()) {
11441 assert((LHS.getValueType() == RHS.getValueType()) &&
11442 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11443
11444 // If the RHS of the comparison is zero, we can potentially fold this
11445 // to a specialized branch.
11446 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11447 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
11448 if (CC == ISD::SETEQ) {
  // x == 0: prefer a bit test (TBZ) if possible, otherwise CBZ.
11449 if (SDValue Result =
11450 optimizeBitTest(DL, LHS, Chain, Dest, AArch64ISD::TBZ, DAG))
11451 return Result;
11452
11453 return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
11454 } else if (CC == ISD::SETNE) {
  // x != 0: prefer a bit test (TBNZ) if possible, otherwise CBNZ.
11455 if (SDValue Result =
11456 optimizeBitTest(DL, LHS, Chain, Dest, AArch64ISD::TBNZ, DAG))
11457 return Result;
11458
11459 return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
11460 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
11461 // Don't combine AND since emitComparison converts the AND to an ANDS
11462 // (a.k.a. TST) and the test in the test bit and branch instruction
11463 // becomes redundant. This would also increase register pressure.
  // x < 0: branch on the sign bit with TBNZ.
11464 uint64_t SignBitPos;
11465 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11466 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, LHS,
11467 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11468 }
11469 }
  // x > -1 (i.e. x >= 0): branch on the sign bit being clear with TBZ.
11470 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
11471 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
11472 // Don't combine AND since emitComparison converts the AND to an ANDS
11473 // (a.k.a. TST) and the test in the test bit and branch instruction
11474 // becomes redundant. This would also increase register pressure.
11475 uint64_t SignBitPos;
11476 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11477 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, LHS,
11478 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11479 }
11480
11481 // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
11482 // larger branch displacement but do prefer CB over cmp + br.
  // NOTE(review): the continuation of this condition (11484) and the RHS of
  // the Cond initializer (11487) are missing from this view.
11483 if (Subtarget->hasCMPBR() &&
11485 ProduceNonFlagSettingCondBr) {
11486 SDValue Cond =
11488 return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS,
11489 Dest);
11490 }
11491
  // Generic integer case: materialize a compare and a conditional branch.
11492 SDValue CCVal;
11493 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11494 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11495 Cmp);
11496 }
11497
11498 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
11499 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11500
11501 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11502 // clean. Some of them require two branches to implement.
11503 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11504 AArch64CC::CondCode CC1, CC2;
11505 changeFPCCToAArch64CC(CC, CC1, CC2);
11506 SDValue CC1Val = getCondCode(DAG, CC1);
11507 SDValue BR1 =
11508 DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp);
  // When two condition codes are needed, chain a second conditional branch to
  // the same destination off the first one.
11509 if (CC2 != AArch64CC::AL) {
11510 SDValue CC2Val = getCondCode(DAG, CC2);
11511 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val,
11512 Cmp);
11513 }
11514
11515 return BR1;
11516 }
11517
  // Lower ISD::FCOPYSIGN: copy the sign bit of In2 onto the magnitude of In1.
  // Implemented as a bitwise select (BSP) between the two values using a mask
  // of all bits except the sign bit.
11518 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
11519 SelectionDAG &DAG) const {
11520 if (!Subtarget->isNeonAvailable() &&
11521 !Subtarget->useSVEForFixedLengthVectors())
11522 return SDValue();
11523
11524 EVT VT = Op.getValueType();
11525 EVT IntVT = VT.changeTypeToInteger();
11526 SDLoc DL(Op);
11527
11528 SDValue In1 = Op.getOperand(0);
11529 SDValue In2 = Op.getOperand(1);
11530 EVT SrcVT = In2.getValueType();
11531
  // The sign operand may have a different FP type; bring it to VT first.
11532 if (!SrcVT.bitsEq(VT))
11533 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
11534
11535 if (VT.isScalableVector())
11536 IntVT =
  // NOTE(review): the continuation of this assignment (line 11537) is missing
  // from this view; it computes the scalable integer VT used below.
11538
  // Fixed-length vectors lowered via SVE: operate in a scalable container.
11539 if (VT.isFixedLengthVector() &&
11540 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
11541 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
11542
11543 In1 = convertToScalableVector(DAG, ContainerVT, In1);
11544 In2 = convertToScalableVector(DAG, ContainerVT, In2);
11545
11546 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
11547 return convertFromScalableVector(DAG, VT, Res);
11548 }
11549
11550 // With SVE, but without Neon, extend the scalars to scalable vectors and use
11551 // a SVE FCOPYSIGN.
11552 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
11553 Subtarget->isSVEorStreamingSVEAvailable()) {
11554 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
11555 return SDValue();
11556 EVT SVT = getPackedSVEVectorVT(VT);
11557
  // Insert both scalars into lane 0, do the vector op, extract lane 0.
11558 SDValue Poison = DAG.getPOISON(SVT);
11559 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11560 SDValue Ins1 =
11561 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, Poison, In1, Zero);
11562 SDValue Ins2 =
11563 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, Poison, In2, Zero);
11564 SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
11565 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS, Zero);
11566 }
11567
  // Bitcast helper that goes through getSVESafeBitCast for scalable types.
11568 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
11569 if (VT.isScalableVector())
11570 return getSVESafeBitCast(VT, Op, DAG);
11571
11572 return DAG.getBitcast(VT, Op);
11573 };
11574
  // Move scalar inputs into NEON vector registers (via subregister insert)
  // or bitcast vector inputs to the integer vector type.
11575 SDValue VecVal1, VecVal2;
11576 EVT VecVT;
11577 auto SetVecVal = [&](int Idx = -1) {
11578 if (!VT.isVector()) {
11579 SDValue Poison = DAG.getPOISON(VecVT);
11580 VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, Poison, In1);
11581 VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, Poison, In2);
11582 } else {
11583 VecVal1 = BitCast(VecVT, In1, DAG);
11584 VecVal2 = BitCast(VecVT, In2, DAG);
11585 }
11586 };
11587 if (VT.isVector()) {
11588 VecVT = IntVT;
11589 SetVecVal();
11590 } else if (VT == MVT::f64) {
11591 VecVT = MVT::v2i64;
11592 SetVecVal(AArch64::dsub);
11593 } else if (VT == MVT::f32) {
11594 VecVT = MVT::v4i32;
11595 SetVecVal(AArch64::ssub);
11596 } else if (VT == MVT::f16 || VT == MVT::bf16) {
11597 VecVT = MVT::v8i16;
11598 SetVecVal(AArch64::hsub);
11599 } else {
11600 llvm_unreachable("Invalid type for copysign!");
11601 }
11602
  // Mask selecting every bit except the sign bit (i.e. the magnitude bits).
11603 unsigned BitWidth = In1.getScalarValueSizeInBits();
11604 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
11605
11606 // We want to materialize a mask with every bit but the high bit set, but the
11607 // AdvSIMD immediate moves cannot materialize that in a single instruction for
11608 // 64-bit elements. Instead, materialize all bits set and then negate that.
11609 if (VT == MVT::f64 || VT == MVT::v2f64) {
11610 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
11611 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
11612 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
11613 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
11614 }
11615
  // BSP: take magnitude bits from VecVal1 and the sign bit from VecVal2.
11616 SDValue BSP =
11617 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
  // Scalar results come back out of the vector register via subreg extract.
11618 if (VT == MVT::f16 || VT == MVT::bf16)
11619 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
11620 if (VT == MVT::f32)
11621 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
11622 if (VT == MVT::f64)
11623 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
11624
11625 return BitCast(VT, BSP, DAG);
11626 }
11627
  // Lower ISD::CTPOP and ISD::PARITY. Parity is computed as ctpop & 1.
11628 SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
11629 SelectionDAG &DAG) const {
  // NOTE(review): the start of this condition (line 11630) is missing from
  // this view; it presumably checks the current function for the
  // NoImplicitFloat attribute — confirm against the full source.
11631 Attribute::NoImplicitFloat))
11632 return SDValue();
11633
11634 EVT VT = Op.getValueType();
  // SVE-lowered vector types use the predicated CTPOP node directly.
11635 if (VT.isScalableVector() ||
11636 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
11637 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
11638
11639 bool IsParity = Op.getOpcode() == ISD::PARITY;
11640 SDValue Val = Op.getOperand(0);
11641 SDLoc DL(Op);
11642
11643 // for i32, general parity function using EORs is more efficient compared to
11644 // using floating point
11645 if (VT == MVT::i32 && IsParity)
11646 return SDValue();
11647
  // Scalar popcount via SVE: insert into lane 0, CTPOP the vector, extract.
11648 if (Subtarget->isSVEorStreamingSVEAvailable()) {
11649 if (VT == MVT::i32 || VT == MVT::i64) {
11650 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
11651 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
11652 DAG.getPOISON(ContainerVT), Val,
11653 DAG.getVectorIdxConstant(0, DL));
11654 Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
11655 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
11656 DAG.getVectorIdxConstant(0, DL));
  // Parity is the low bit of the population count.
11657 if (IsParity)
11658 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11659 return Val;
11660 }
11661
  // i128: popcount both 64-bit halves and sum them with VECREDUCE_ADD.
11662 if (VT == MVT::i128) {
11663 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
11664 Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
11665 Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
11666 Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
11667 Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
11668 Val = DAG.getZExtOrTrunc(Val, DL, VT);
11669 if (IsParity)
11670 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11671 return Val;
11672 }
11673 }
11674
11675 if (!Subtarget->isNeonAvailable())
11676 return SDValue();
11677
11678 // If there is no CNT instruction available, GPR popcount can
11679 // be more efficiently lowered to the following sequence that uses
11680 // AdvSIMD registers/instructions as long as the copies to/from
11681 // the AdvSIMD registers are cheap.
11682 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
11683 // CNT V0.8B, V0.8B // 8xbyte pop-counts
11684 // ADDV B0, V0.8B // sum 8xbyte pop-counts
11685 // FMOV X0, D0 // copy result back to integer reg
11686 if (VT == MVT::i32 || VT == MVT::i64) {
11687 if (VT == MVT::i32)
11688 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
11689 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
11690
11691 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
11692 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
11693 AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
11694 VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
11695 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
11696 DAG.getConstant(0, DL, MVT::i64));
11697 if (IsParity)
11698 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11699 return AddV;
11700 } else if (VT == MVT::i128) {
  // Same trick for i128, using the full 16-byte vector.
11701 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
11702
11703 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
11704 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
11705 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
11706 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
11707 DAG.getConstant(0, DL, MVT::i64));
11708 AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
11709 if (IsParity)
11710 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11711 return AddV;
11712 }
11713
11714 assert(!IsParity && "ISD::PARITY of vector types not supported");
11715
11716 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
11717 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
11718 "Unexpected type for custom ctpop lowering");
11719
  // Vector case: byte-wise CNT, then widen the result to VT's element size.
11720 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
11721 Val = DAG.getBitcast(VT8Bit, Val);
11722 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
11723
  // With the dot-product extension, UDOT against an all-ones vector sums the
  // byte counts into 32-bit lanes in a single instruction.
11724 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
11725 VT.getVectorNumElements() >= 2) {
11726 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
11727 SDValue Zeros = DAG.getConstant(0, DL, DT);
11728 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
11729
11730 if (VT == MVT::v2i64) {
11731 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11732 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
11733 } else if (VT == MVT::v2i32) {
11734 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11735 } else if (VT == MVT::v4i32) {
11736 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11737 } else {
11738 llvm_unreachable("Unexpected type for custom ctpop lowering");
11739 }
11740
11741 return Val;
11742 }
11743
11744 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
11745 unsigned EltSize = 8;
11746 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
11747 while (EltSize != VT.getScalarSizeInBits()) {
11748 EltSize *= 2;
11749 NumElts /= 2;
11750 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
11751 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
11752 }
11753
11754 return Val;
11755 }
11756
  // Lower ISD::CTTZ using the identity cttz(x) == ctlz(bitreverse(x)).
11757 SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
11758 EVT VT = Op.getValueType();
  // NOTE(review): the middle line of this assert (11760) is missing from this
  // view; it presumably calls useSVEForFixedLengthVectorVT — confirm against
  // the full source.
11759 assert(VT.isScalableVector() ||
11761 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
11762
11763 SDLoc DL(Op);
11764 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
11765 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
11766 }
11767
11768SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
11769 SelectionDAG &DAG) const {
11770
11771 EVT VT = Op.getValueType();
11772 SDLoc DL(Op);
11773 unsigned Opcode = Op.getOpcode();
11774 ISD::CondCode CC;
11775 switch (Opcode) {
11776 default:
11777 llvm_unreachable("Wrong instruction");
11778 case ISD::SMAX:
11779 CC = ISD::SETGT;
11780 break;
11781 case ISD::SMIN:
11782 CC = ISD::SETLT;
11783 break;
11784 case ISD::UMAX:
11785 CC = ISD::SETUGT;
11786 break;
11787 case ISD::UMIN:
11788 CC = ISD::SETULT;
11789 break;
11790 }
11791
11792 // Note: This lowering only overrides NEON for v1i64 and v2i64, where we
11793 // prefer using SVE if available.
11794 if (VT.isScalableVector() ||
11795 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) {
11796 switch (Opcode) {
11797 default:
11798 llvm_unreachable("Wrong instruction");
11799 case ISD::SMAX:
11800 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
11801 case ISD::SMIN:
11802 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
11803 case ISD::UMAX:
11804 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
11805 case ISD::UMIN:
11806 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
11807 }
11808 }
11809
11810 SDValue Op0 = Op.getOperand(0);
11811 SDValue Op1 = Op.getOperand(1);
11812 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
11813 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
11814}
11815
  // Lower ISD::BITREVERSE for NEON vectors: byte-reverse within each element
  // using REV32/REV64, then bit-reverse each byte, and cast back to VT.
11816 SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
11817 SelectionDAG &DAG) const {
11818 EVT VT = Op.getValueType();
11819
  // NOTE(review): the middle line of this condition (11821) is missing from
  // this view; it presumably calls useSVEForFixedLengthVectorVT — confirm
  // against the full source.
11820 if (VT.isScalableVector() ||
11822 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
11823 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
11824
11825 SDLoc DL(Op);
11826 SDValue REVB;
11827 MVT VST;
11828
  // Select the byte-vector type and element-wide byte reversal per input VT.
11829 switch (VT.getSimpleVT().SimpleTy) {
11830 default:
11831 llvm_unreachable("Invalid type for bitreverse!");
11832
11833 case MVT::v2i32: {
11834 VST = MVT::v8i8;
11835 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11836
11837 break;
11838 }
11839
11840 case MVT::v4i32: {
11841 VST = MVT::v16i8;
11842 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11843
11844 break;
11845 }
11846
11847 case MVT::v1i64: {
11848 VST = MVT::v8i8;
11849 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11850
11851 break;
11852 }
11853
11854 case MVT::v2i64: {
11855 VST = MVT::v16i8;
11856 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11857
11858 break;
11859 }
11860 }
11861
  // Bit-reverse each byte of the byte-reversed value, then cast back to VT.
11862 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
11863 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
11864 }
11865
11866// Check whether the continuous comparison sequence.
11867static bool
11868isOrXorChain(SDValue N, unsigned &Num,
11869 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
11870 if (Num == MaxXors)
11871 return false;
11872
11873 // Skip the one-use zext
11874 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
11875 N = N->getOperand(0);
11876
11877 // The leaf node must be XOR
11878 if (N->getOpcode() == ISD::XOR) {
11879 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
11880 Num++;
11881 return true;
11882 }
11883
11884 // All the non-leaf nodes must be OR.
11885 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
11886 return false;
11887
11888 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
11889 isOrXorChain(N->getOperand(1), Num, WorkList))
11890 return true;
11891 return false;
11892}
11893
11894 // Transform chains of ORs and XORs, which usually outlined by memcmp/bmp.
  // NOTE(review): the signature line (11895) and the WorkList declaration
  // (11900) are missing from this view; WorkList is the pair list filled by
  // isOrXorChain — confirm against the full source.
11896 SDValue LHS = N->getOperand(0);
11897 SDValue RHS = N->getOperand(1);
11898 SDLoc DL(N);
11899 EVT VT = N->getValueType(0);
11901
11902 // Only handle integer compares.
11903 if (N->getOpcode() != ISD::SETCC)
11904 return SDValue();
11905
11906 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11907 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
11908 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
11909 unsigned NumXors = 0;
11910 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
11911 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
11912 isOrXorChain(LHS, NumXors, WorkList)) {
  // Rebuild the chain as individual setccs combined with AND (for SETEQ)
  // or OR (for SETNE).
11913 SDValue XOR0, XOR1;
11914 std::tie(XOR0, XOR1) = WorkList[0];
11915 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11916 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11917 for (unsigned I = 1; I < WorkList.size(); I++) {
11918 std::tie(XOR0, XOR1) = WorkList[I];
11919 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11920 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11921 }
11922
11923 // Exit early by inverting the condition, which help reduce indentations.
11924 return Cmp;
11925 }
11926
11927 return SDValue();
11928 }
11929
  // Lower ISD::SETCC / STRICT_FSETCC / STRICT_FSETCCS to CSEL/CSINC-friendly
  // forms producing a 0/1 result.
11930 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11931
11932 if (Op.getValueType().isVector())
11933 return LowerVSETCC(Op, DAG);
11934
  // Strict FP setcc nodes carry a chain in operand 0; shift operand indices.
11935 bool IsStrict = Op->isStrictFPOpcode();
11936 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11937 unsigned OpNo = IsStrict ? 1 : 0;
11938 SDValue Chain;
11939 if (IsStrict)
11940 Chain = Op.getOperand(0);
11941 SDValue LHS = Op.getOperand(OpNo + 0);
11942 SDValue RHS = Op.getOperand(OpNo + 1);
11943 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11944 SDLoc DL(Op);
11945
11946 // We chose ZeroOrOneBooleanContents, so use zero and one.
11947 EVT VT = Op.getValueType();
11948 SDValue TVal = DAG.getConstant(1, DL, VT);
11949 SDValue FVal = DAG.getConstant(0, DL, VT);
11950
11951 // Handle f128 first, since one possible outcome is a normal integer
11952 // comparison which gets picked up by the next if statement.
11953 if (LHS.getValueType() == MVT::f128) {
11954 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS, Chain,
11955 IsSignaling);
11956
11957 // If softenSetCCOperands returned a scalar, use it.
11958 if (!RHS.getNode()) {
11959 assert(LHS.getValueType() == Op.getValueType() &&
11960 "Unexpected setcc expansion!");
11961 return IsStrict ? DAG.getMergeValues({LHS, Chain}, DL) : LHS;
11962 }
11963 }
11964
11965 if (LHS.getValueType().isInteger()) {
  // With CSSC, (x != 0) is simply umin(x, 1).
11966 if (Subtarget->hasCSSC() && CC == ISD::SETNE && isNullConstant(RHS)) {
11967 SDValue One = DAG.getConstant(1, DL, LHS.getValueType());
11968 SDValue UMin = DAG.getNode(ISD::UMIN, DL, LHS.getValueType(), LHS, One);
11969 SDValue Res = DAG.getZExtOrTrunc(UMin, DL, VT);
11970 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11971 }
11972 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11973
  // NOTE(review): the start of the Cmp initializer (line 11975, calling
  // getAArch64Cmp) is missing from this view.
11974 SDValue CCVal;
11976 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, DL);
11977
11978 // Note that we inverted the condition above, so we reverse the order of
11979 // the true and false operands here. This will allow the setcc to be
11980 // matched to a single CSINC instruction.
11981 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CCVal, Cmp);
11982 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11983 }
11984
11985 // Now we know we're dealing with FP values.
11986 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11987 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11988
11989 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11990 // and do the comparison.
11991 SDValue Cmp;
11992 if (IsStrict)
11993 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
11994 else
11995 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11996
11997 AArch64CC::CondCode CC1, CC2;
11998 changeFPCCToAArch64CC(CC, CC1, CC2);
11999 SDValue Res;
12000 if (CC2 == AArch64CC::AL) {
  // Single-condition case: invert and swap operands to favour CSINC.
12001 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
12002 CC2);
12003 SDValue CC1Val = getCondCode(DAG, CC1);
12004
12005 // Note that we inverted the condition above, so we reverse the order of
12006 // the true and false operands here. This will allow the setcc to be
12007 // matched to a single CSINC instruction.
12008 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CC1Val, Cmp);
12009 } else {
12010 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
12011 // totally clean. Some of them require two CSELs to implement. As is in
12012 // this case, we emit the first CSEL and then emit a second using the output
12013 // of the first as the RHS. We're effectively OR'ing the two CC's together.
12014
12015 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
12016 SDValue CC1Val = getCondCode(DAG, CC1);
12017 SDValue CS1 =
12018 DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
12019
12020 SDValue CC2Val = getCondCode(DAG, CC2);
12021 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
12022 }
12023 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res;
12024 }
12025
  // Lower ISD::SETCCCARRY: compare LHS and RHS together with a carry-in,
  // using SBCS and a CSEL on the resulting flags.
12026 SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
12027 SelectionDAG &DAG) const {
12028
12029 SDValue LHS = Op.getOperand(0);
12030 SDValue RHS = Op.getOperand(1);
12031 EVT VT = LHS.getValueType();
  // Only 32/64-bit GPR comparisons are handled here.
12032 if (VT != MVT::i32 && VT != MVT::i64)
12033 return SDValue();
12034
12035 SDLoc DL(Op);
12036 SDValue Carry = Op.getOperand(2);
12037 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
12038 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
12039 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
12040 LHS, RHS, InvCarry);
12041
12042 EVT OpVT = Op.getValueType();
12043 SDValue TVal = DAG.getConstant(1, DL, OpVT);
12044 SDValue FVal = DAG.getConstant(0, DL, OpVT);
12045
  // NOTE(review): line 12047 (computing the inverted condition CondInv from
  // Cond) is missing from this view — confirm against the full source.
12046 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
12048 SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv));
12049 // Inputs are swapped because the condition is inverted. This will allow
12050 // matching with a single CSINC instruction.
12051 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
12052 Cmp.getValue(1));
12053 }
12054
12055 /// Emit vector comparison for floating-point values, producing a mask.
  /// Returns SDValue() for condition codes that cannot be emitted directly
  /// (and for LE/LT when NaNs must be honoured).
  // NOTE(review): the first line of the signature (12056, declaring LHS/RHS)
  // is missing from this view.
12057 AArch64CC::CondCode CC, bool NoNans, EVT VT,
12058 const SDLoc &DL, SelectionDAG &DAG) {
12059 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
12060 "function only supposed to emit natural comparisons");
12061
12062 switch (CC) {
12063 default:
12064 return SDValue();
12065 case AArch64CC::NE: {
  // NE is emitted as NOT(FCMEQ).
12066 SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
12067 // Use vector semantics for the inversion to potentially save a copy between
12068 // SIMD and regular registers.
12069 if (!LHS.getValueType().isVector()) {
  // Scalar input: invert inside a 128-bit vector via insert/NOT/extract.
12070 EVT VecVT =
12071 EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
12072 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12073 SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
12074 DAG.getPOISON(VecVT), Fcmeq, Zero);
12075 SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
12076 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
12077 }
12078 return DAG.getNOT(DL, Fcmeq, VT);
12079 }
12080 case AArch64CC::EQ:
12081 return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
12082 case AArch64CC::GE:
12083 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
12084 case AArch64CC::GT:
12085 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
12086 case AArch64CC::LE:
12087 if (!NoNans)
12088 return SDValue();
12089 // If we ignore NaNs then we can use to the LS implementation.
12090 [[fallthrough]];
12091 case AArch64CC::LS:
  // a <= b  ==  b >= a: swap operands for FCMGE.
12092 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
12093 case AArch64CC::LT:
12094 if (!NoNans)
12095 return SDValue();
12096 // If we ignore NaNs then we can use to the MI implementation.
12097 [[fallthrough]];
12098 case AArch64CC::MI:
  // a < b  ==  b > a: swap operands for FCMGT.
12099 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
12100 }
12101 }
12102
12103 /// For SELECT_CC, when the true/false values are (-1, 0) and the compared
12104 /// values are scalars, try to emit a mask generating vector instruction.
  // NOTE(review): the first line of the signature (12105, declaring
  // LHS/RHS/TVal) is missing from this view.
12106 SDValue FVal, ISD::CondCode CC, bool NoNaNs,
12107 const SDLoc &DL, SelectionDAG &DAG) {
12108 assert(!LHS.getValueType().isVector());
12109 assert(!RHS.getValueType().isVector());
12110
  // Only applicable when the select arms are the constants (-1, 0) or (0, -1).
12111 auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
12112 auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
12113 if (!CTVal || !CFVal)
12114 return {};
12115 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
12116 !(CTVal->isZero() && CFVal->isAllOnes()))
12117 return {};
12118
  // (0, -1) is handled by inverting the condition so the arms become (-1, 0).
12119 if (CTVal->isZero())
12120 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12121
12122 EVT VT = TVal.getValueType();
12123 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
12124 return {};
12125
  // For (un)ordered checks where one side is known non-NaN, the check reduces
  // to a single NaN test on the other operand.
12126 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
12127 bool OneNaN = false;
12128 if (LHS == RHS) {
12129 OneNaN = true;
12130 } else if (DAG.isKnownNeverNaN(RHS)) {
12131 OneNaN = true;
12132 RHS = LHS;
12133 } else if (DAG.isKnownNeverNaN(LHS)) {
12134 OneNaN = true;
12135 LHS = RHS;
12136 }
12137 if (OneNaN)
12138 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
12139 }
12140
  // NOTE(review): lines 12141-12142 (declaring CC1 and CC2) are missing from
  // this view — confirm against the full source.
12143 bool ShouldInvert = false;
12144 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
12145 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
12146 SDValue Cmp2;
12147 if (CC2 != AArch64CC::AL) {
12148 Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
12149 if (!Cmp2)
12150 return {};
12151 }
  // One comparison and no inversion needed: the mask is ready.
12152 if (!Cmp2 && !ShouldInvert)
12153 return Cmp;
12154
  // Otherwise combine/invert in a 128-bit vector and extract lane 0.
12155 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
12156 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12157 SDValue Poison = DAG.getPOISON(VecVT);
12158 Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, Poison, Cmp, Zero);
12159 if (Cmp2) {
12160 Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, Poison, Cmp2, Zero);
12161 Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
12162 }
12163 if (ShouldInvert)
12164 Cmp = DAG.getNOT(DL, Cmp, VecVT);
12165 Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
12166 return Cmp;
12167 }
12168
// Lower a SELECT_CC-style (compare + select) operation to AArch64 conditional
// selects (CSEL/CSINV/CSINC/CSNEG) or cheaper bitwise sequences. Handles f128
// softening, fp16/bf16 promotion to f32, integer smax/smin-with-zero and
// sign-bit-test patterns, and FP conditions that need one or two CSELs.
SDValue AArch64TargetLowering::LowerSELECT_CC(
    const SDLoc &DL, SelectionDAG &DAG) const {
  // Handle f128 first, because it will result in a comparison of some RTLIB
  // call result against zero.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, DL, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Also handle f16, for which we need to do a f32 comparison.
  if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
      LHS.getValueType() == MVT::bf16) {
    LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
    RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
  }

  // Next, handle integers.
  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
    ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);

    // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
    // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
    // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
    // Both require less instructions than compare and conditional select.
    if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
        RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
        LHS.getValueType() == RHS.getValueType()) {
      EVT VT = LHS.getValueType();
      // SRA by (width - 1) broadcasts the sign bit across the register.
      SDValue Shift =
          DAG.getNode(ISD::SRA, DL, VT, LHS,
                      DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));

      if (CC == ISD::SETGT)
        Shift = DAG.getNOT(DL, Shift, VT);

      return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
    }

    // Check for sign bit test patterns that can use TST optimization.
    // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval)
    // -> TST %operand, sign_bit; CSEL
    // (SELECT_CC setlt, sign_extend, 0, tval, fval)
    // -> TST %operand, sign_bit; CSEL
    if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() &&
        (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG ||
         LHS.getOpcode() == ISD::SIGN_EXTEND)) {

      uint64_t SignBitPos;
      std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
      EVT TestVT = LHS.getValueType();
      SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT);
      // ANDS against the pre-extension sign bit; only the NZCV result is used.
      SDValue TST =
          DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32),
                      LHS, SignBitConst);

      SDValue Flags = TST.getValue(1);
      return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal,
                         DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags);
    }

    // Canonicalise absolute difference patterns:
    // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
    // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
    //
    // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
    // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
    // The second forms can be matched into subs+cneg.
    // NOTE: Drop poison generating flags from the negated operand to avoid
    // inadvertently propagating poison after the canonicalisation.
    if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
      if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
          FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) {
        FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
      } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
                 FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) {
        TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
      }
    }

    // Default to a plain CSEL; the cases below upgrade the opcode to
    // CSINV/CSINC/CSNEG when the operands allow it.
    unsigned Opcode = AArch64ISD::CSEL;

    // If both the TVal and the FVal are constants, see if we can swap them in
    // order to for a CSINV or CSINC out of them.
    if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    } else if (TVal.getOpcode() == ISD::XOR) {
      // If TVal is a NOT we want to swap TVal and FVal so that we can match
      // with a CSINV rather than a CSEL.
      if (isAllOnesConstant(TVal.getOperand(1))) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }
    } else if (TVal.getOpcode() == ISD::SUB) {
      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
      // that we can match with a CSNEG rather than a CSEL.
      if (isNullConstant(TVal.getOperand(0))) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }
    } else if (CTVal && CFVal) {
      const int64_t TrueVal = CTVal->getSExtValue();
      const int64_t FalseVal = CFVal->getSExtValue();
      bool Swap = false;

      // If both TVal and FVal are constants, see if FVal is the
      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
      // instead of a CSEL in that case.
      if (TrueVal == ~FalseVal) {
        Opcode = AArch64ISD::CSINV;
      } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
                 TrueVal == -FalseVal) {
        // Guard against negating INT64_MIN, which would overflow.
        Opcode = AArch64ISD::CSNEG;
      } else if (TVal.getValueType() == MVT::i32) {
        // If our operands are only 32-bit wide, make sure we use 32-bit
        // arithmetic for the check whether we can use CSINC. This ensures that
        // the addition in the check will wrap around properly in case there is
        // an overflow (which would not be the case if we do the check with
        // 64-bit arithmetic).
        const uint32_t TrueVal32 = CTVal->getZExtValue();
        const uint32_t FalseVal32 = CFVal->getZExtValue();

        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
          Opcode = AArch64ISD::CSINC;

          if (TrueVal32 > FalseVal32) {
            Swap = true;
          }
        }
      } else {
        // 64-bit check whether we can use CSINC.
        const uint64_t TrueVal64 = TrueVal;
        const uint64_t FalseVal64 = FalseVal;

        if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
          Opcode = AArch64ISD::CSINC;

          if (TrueVal > FalseVal) {
            Swap = true;
          }
        }
      }

      // Swap TVal and FVal if necessary.
      if (Swap) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      if (Opcode != AArch64ISD::CSEL) {
        // Drop FVal since we can get its value by simply inverting/negating
        // TVal.
        FVal = TVal;
      }
    }

    // Avoid materializing a constant when possible by reusing a known value in
    // a register. However, don't perform this optimization if the known value
    // is one, zero or negative one in the case of a CSEL. We can always
    // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
    // FVal, respectively.
    ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
    if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
        !RHSVal->isZero() && !RHSVal->isAllOnes()) {
      // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
      // "a != C ? x : a" to avoid materializing C.
      if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
        TVal = LHS;
      else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
        FVal = LHS;
    } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
      assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
      // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
      // avoid materializing C.
      if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
        Opcode = AArch64ISD::CSINV;
        TVal = LHS;
        FVal = DAG.getConstant(0, DL, FVal.getValueType());
      }
    }

    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
    EVT VT = TVal.getValueType();
    return DAG.getNode(Opcode, DL, VT, TVal, FVal, CCVal, Cmp);
  }

  // Now we know we're dealing with FP values.
  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
         LHS.getValueType() == MVT::f64);
  assert(LHS.getValueType() == RHS.getValueType());
  EVT VT = TVal.getValueType();

  // If the purpose of the comparison is to select between all ones
  // or all zeros, try to use a vector comparison because the operands are
  // already stored in SIMD registers.
  if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
        switch (U->getOpcode()) {
        default:
          return false;
        case AArch64ISD::DUP:
          return true;
        }
      })) {
    bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
    SDValue VectorCmp =
        emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
    // Fall through to the scalar CSEL path if the mask could not be formed.
    if (VectorCmp)
      return VectorCmp;
  }

  SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two CSELs to implement.
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);

  if (Flags.hasNoSignedZeros()) {
    // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
    // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
    ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
    if (RHSVal && RHSVal->isZero()) {
      ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
      ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);

      if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
          CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
        TVal = LHS;
      else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
               CFVal && CFVal->isZero() &&
               FVal.getValueType() == LHS.getValueType())
        FVal = LHS;
    }
  }

  // Emit first, and possibly only, CSEL.
  SDValue CC1Val = getCondCode(DAG, CC1);
  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);

  // If we need a second CSEL, emit it, using the output of the first as the
  // RHS. We're effectively OR'ing the two CC's together.
  if (CC2 != AArch64CC::AL) {
    SDValue CC2Val = getCondCode(DAG, CC2);
    return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
  }

  // Otherwise, return the output of the first CSEL.
  return CS1;
}
12446
// Lower VECTOR_SPLICE for scalable vectors. Constant splice indices that fit
// an SVE predicate pattern become a reversed-PTRUE + SPLICE; small left
// splices are left for EXT-based selection; everything else is expanded.
SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  EVT Ty = Op.getValueType();
  // Only constant splice indices are custom-lowered here.
  if (!isa<ConstantSDNode>(Op.getOperand(2)))
    return SDValue();
  auto Idx = Op.getConstantOperandAPInt(2);
  int64_t IdxVal = Idx.getSExtValue();
  assert(Ty.isScalableVector() &&
         "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");

  // We can use the splice instruction for certain index values where we are
  // able to efficiently generate the correct predicate. The index will be
  // inverted and used directly as the input to the ptrue instruction, i.e.
  // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
  // splice predicate. However, we can only do this if we can guarantee that
  // there are enough elements in the vector, hence we check the index <= min
  // number of elements.
  std::optional<unsigned> PredPattern;
  if (Ty.isScalableVector() && Op.getOpcode() == ISD::VECTOR_SPLICE_RIGHT &&
      (PredPattern = getSVEPredPatternFromNumElements(IdxVal)) !=
          std::nullopt) {
    SDLoc DL(Op);

    // Create a predicate where all but the last -IdxVal elements are false.
    EVT PredVT = Ty.changeVectorElementType(*DAG.getContext(), MVT::i1);
    SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
    Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);

    // Now splice the two inputs together using the predicate.
    return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
                       Op.getOperand(1));
  }

  // We can select to an EXT instruction when indexing the first 256 bytes.
  if (Op.getOpcode() == ISD::VECTOR_SPLICE_LEFT &&
      (IdxVal * BlockSize / 8) < 256)
    return Op;

  return SDValue();
}
12488
12489SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
12490 SelectionDAG &DAG) const {
12491 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
12492 SDValue LHS = Op.getOperand(0);
12493 SDValue RHS = Op.getOperand(1);
12494 SDValue TVal = Op.getOperand(2);
12495 SDValue FVal = Op.getOperand(3);
12496 SDNodeFlags Flags = Op->getFlags();
12497 SDLoc DL(Op);
12498 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
12499}
12500
// Lower ISD::SELECT. Special-cases svcount and scalable/fixed SVE vector
// selects, overflow-intrinsic conditions, and fp16/bf16 without full FP16;
// everything else is funneled into the common SELECT_CC lowering.
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue CCVal = Op->getOperand(0);
  SDValue TVal = Op->getOperand(1);
  SDValue FVal = Op->getOperand(2);
  SDLoc DL(Op);

  EVT Ty = Op.getValueType();
  // svcount selects are done as nxv16i1 predicate selects via bitcasts.
  if (Ty == MVT::aarch64svcount) {
    TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
    FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
    SDValue Sel =
        DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
    return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
  }

  // Scalable vectors: splat the scalar condition into a predicate and VSELECT.
  if (Ty.isScalableVector()) {
    MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
    SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
    return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
  }

  if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
    // FIXME: Ideally this would be the same as above using i1 types, however
    // for the moment we can't deal with fixed i1 vector types properly, so
    // instead extend the predicate to a result type sized integer vector.
    MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
    MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
    SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
    SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
    return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
  // instruction.
  if (ISD::isOverflowIntrOpRes(CCVal)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
      return SDValue();

    // Re-emit the overflow op so the CSEL can consume its flags directly.
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
    SDValue CCVal = getCondCode(DAG, OFCC);

    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
                       CCVal, Overflow);
  }

  // Lower it the same way as we would lower a SELECT_CC node.
  ISD::CondCode CC;
  SDValue LHS, RHS;
  if (CCVal.getOpcode() == ISD::SETCC) {
    LHS = CCVal.getOperand(0);
    RHS = CCVal.getOperand(1);
    CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
  } else {
    // Non-SETCC condition: treat any non-zero value as true.
    LHS = CCVal;
    RHS = DAG.getConstant(0, DL, CCVal.getValueType());
    CC = ISD::SETNE;
  }

  // If we are lowering a f16 and we do not have fullf16, convert to a f32 in
  // order to use FCSELSrrr
  if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
    SDValue Poison = DAG.getPOISON(MVT::f32);
    TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32, Poison, TVal);
    FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32, Poison, FVal);
  }

  SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
                               Op->getFlags(), DL, DAG);

  // Extract the original half-precision value back out of the f32 select.
  if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
    return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
  }

  return Res;
}
12580
// Lower a jump-table address, choosing the addressing sequence by code model.
SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
                                              SelectionDAG &DAG) const {
  // Jump table entries as PC relative offsets. No additional tweaking
  // is necessary here. Just get the address of the jump table.
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // Large code model (non-MachO) needs the full MOVZ/MOVK address sequence.
      !Subtarget->isTargetMachO())
    return getAddrLarge(JT, DAG);
  if (CM == CodeModel::Tiny)
    return getAddrTiny(JT, DAG);
  return getAddr(JT, DAG);
}
12595
// Lower BR_JT (branch through jump table). Emits either the hardened
// BR_JumpTable pseudo (expanded late to protect intermediate values) or a
// JumpTableDest32 + BRIND pair.
SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
                                          SelectionDAG &DAG) const {
  // Jump table entries as PC relative offsets. No additional tweaking
  // is necessary here. Just get the address of the jump table.
  SDLoc DL(Op);
  SDValue JT = Op.getOperand(1);
  SDValue Entry = Op.getOperand(2);
  int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();

  // Record that this jump table uses 4-byte PC-relative entries.
  auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
  AFI->setJumpTableEntryInfo(JTI, 4, nullptr);

  // With aarch64-jump-table-hardening, we only expand the jump table dispatch
  // sequence later, to guarantee the integrity of the intermediate values.
                                  "aarch64-jump-table-hardening")) {
    if (Subtarget->isTargetMachO()) {
      if (CM != CodeModel::Small && CM != CodeModel::Large)
        report_fatal_error("Unsupported code-model for hardened jump-table");
    } else {
      // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
      assert(Subtarget->isTargetELF() &&
             "jump table hardening only supported on MachO/ELF");
      if (CM != CodeModel::Small)
        report_fatal_error("Unsupported code-model for hardened jump-table");
    }

    // The hardened pseudo takes the entry index in X16.
    SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
                                       Entry, SDValue());
    SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
                                   DAG.getTargetJumpTable(JTI, MVT::i32),
                                   X16Copy.getValue(0), X16Copy.getValue(1));
    return SDValue(B, 0);
  }

  // Unhardened path: compute the destination address, then branch indirectly.
  SDNode *Dest =
      DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
                         Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
  SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
  return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
}
12638
// Lower BRIND. Only acts when pointer-auth block-address discrimination is
// enabled, in which case the indirect branch becomes an authenticating BRA;
// otherwise (and for jump-table BRINDs) default lowering is used.
SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Dest = Op.getOperand(1);

  // BR_JT is lowered to BRIND, but the later lowering is specific to indirectbr
  // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
  if (Dest->isMachineOpcode() &&
      Dest->getMachineOpcode() == AArch64::JumpTableDest32)
    return SDValue();

  const MachineFunction &MF = DAG.getMachineFunction();
  std::optional<uint16_t> BADisc =
      Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(MF.getFunction());
  if (!BADisc)
    return SDValue();

  SDLoc DL(Op);

  SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
  // No address diversity: the address discriminator operand is XZR.
  SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);

  SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
                                   {Dest, Key, Disc, AddrDisc, Chain});
  return SDValue(BrA, 0);
}
12665
// Lower a constant-pool address, choosing the addressing sequence by code
// model (GOT on large-code-model MachO, MOVZ/MOVK otherwise, ADR for tiny).
SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
                                                 SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  if (CM == CodeModel::Large) {
    // Use the GOT for the large code model on iOS.
    if (Subtarget->isTargetMachO()) {
      return getGOT(CP, DAG);
    }
    return getAddrLarge(CP, DAG);
  } else if (CM == CodeModel::Tiny) {
    return getAddrTiny(CP, DAG);
  }
  return getAddr(CP, DAG);
}
12682
// Lower a blockaddress. With pointer-auth discrimination enabled the address
// is materialized and signed via MOVaddrPAC (result in X16); otherwise the
// usual code-model-dependent addressing sequences are used.
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
  BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
  const BlockAddress *BA = BAN->getBlockAddress();

  if (std::optional<uint16_t> BADisc =
          Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
              *BA->getFunction())) {
    SDLoc DL(Op);

    // This isn't cheap, but BRIND is rare.
    SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));

    SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);

    // No address diversity: the address discriminator operand is XZR.
    SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);

    // MOVaddrPAC leaves the signed pointer in X16.
    SDNode *MOV =
        DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
                           {TargetBA, Key, AddrDisc, Disc});
    return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
                              SDValue(MOV, 1));
  }

  if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
    return getAddrLarge(BAN, DAG);
  } else if (CM == CodeModel::Tiny) {
    return getAddrTiny(BAN, DAG);
  }
  return getAddr(BAN, DAG);
}
12717
// Lower va_start for Darwin, whose va_list is a single pointer: store the
// address of the varargs stack area through the va_list pointer operand.
SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
                                                   SelectionDAG &DAG) const {
  AArch64FunctionInfo *FuncInfo =
      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

  SDLoc DL(Op);
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
  // Truncate/extend to the in-memory pointer width (ILP32 stores 32 bits).
  FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}
12731
// Lower va_start for Win64: the va_list is a single pointer to the varargs
// save area (GPR spill area if any, otherwise the caller stack area).
SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  SDLoc DL(Op);
  SDValue FR;
  if (Subtarget->isWindowsArm64EC()) {
    // With the Arm64EC ABI, we compute the address of the varargs save area
    // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
    // but calls from an entry thunk can pass in a different address.
    Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
    SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
    uint64_t StackOffset;
    // The GPR save area sits below x4; otherwise use the stack-args offset.
    if (FuncInfo->getVarArgsGPRSize() > 0)
      StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
    else
      StackOffset = FuncInfo->getVarArgsStackOffset();
    FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
                     DAG.getConstant(StackOffset, DL, MVT::i64));
  } else {
    FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
                               ? FuncInfo->getVarArgsGPRIndex()
                               : FuncInfo->getVarArgsStackIndex(),
  }
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}
12762
// Lower va_start for the AAPCS64 va_list: fill in the five-field structure
// (__stack, __gr_top, __vr_top, __gr_offs, __vr_offs) with independent stores
// joined by a TokenFactor.
SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // The layout of the va_list struct is specified in the AArch64 Procedure Call
  // Standard, section B.3.
  MachineFunction &MF = DAG.getMachineFunction();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
  auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  SDValue Chain = Op.getOperand(0);
  SDValue VAList = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

  // void *__stack at offset 0
  unsigned Offset = 0;
  SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
  Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
  MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
                                MachinePointerInfo(SV), Align(PtrSize)));

  // void *__gr_top at offset 8 (4 on ILP32)
  Offset += PtrSize;
  int GPRSize = FuncInfo->getVarArgsGPRSize();
  if (GPRSize > 0) {
    SDValue GRTop, GRTopAddr;

    GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                            DAG.getConstant(Offset, DL, PtrVT));

    // __gr_top points one past the end of the GPR save area.
    GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
    GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
                        DAG.getSignedConstant(GPRSize, DL, PtrVT));
    GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);

    MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
                                  MachinePointerInfo(SV, Offset),
                                  Align(PtrSize)));
  }

  // void *__vr_top at offset 16 (8 on ILP32)
  Offset += PtrSize;
  int FPRSize = FuncInfo->getVarArgsFPRSize();
  if (FPRSize > 0) {
    SDValue VRTop, VRTopAddr;
    VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                            DAG.getConstant(Offset, DL, PtrVT));

    // __vr_top points one past the end of the FPR/SIMD save area.
    VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
    VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
                        DAG.getSignedConstant(FPRSize, DL, PtrVT));
    VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);

    MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
                                  MachinePointerInfo(SV, Offset),
                                  Align(PtrSize)));
  }

  // int __gr_offs at offset 24 (12 on ILP32)
  Offset += PtrSize;
  SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                                   DAG.getConstant(Offset, DL, PtrVT));
  MemOps.push_back(
      DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
                   GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));

  // int __vr_offs at offset 28 (16 on ILP32)
  Offset += 4;
  SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                                   DAG.getConstant(Offset, DL, PtrVT));
  MemOps.push_back(
      DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
                   VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));

  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
12841
12842SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
12843 SelectionDAG &DAG) const {
12844 MachineFunction &MF = DAG.getMachineFunction();
12845 Function &F = MF.getFunction();
12846
12847 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
12848 return LowerWin64_VASTART(Op, DAG);
12849 else if (Subtarget->isTargetDarwin())
12850 return LowerDarwin_VASTART(Op, DAG);
12851 else
12852 return LowerAAPCS_VASTART(Op, DAG);
12853}
12854
12855SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
12856 SelectionDAG &DAG) const {
12857 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
12858 // pointer.
12859 SDLoc DL(Op);
12860 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12861 unsigned VaListSize =
12862 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
12863 ? PtrSize
12864 : Subtarget->isTargetILP32() ? 20 : 32;
12865 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
12866 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12867
12868 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
12869 DAG.getConstant(VaListSize, DL, MVT::i32),
12870 Align(PtrSize), false, false, /*CI=*/nullptr,
12871 std::nullopt, MachinePointerInfo(DestSV),
12872 MachinePointerInfo(SrcSV));
12873}
12874
// Lower va_arg for Darwin's single-pointer va_list: load the current pointer,
// align it if the argument demands it, bump it past the slot, store it back,
// then load the argument (with an f64->f32 round for narrow FP types).
SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "automatic va_arg instruction only works on Darwin");

  const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  MaybeAlign Align(Op.getConstantOperandVal(3));
  unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
  // Load the current va_list pointer.
  SDValue VAList =
      DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
  Chain = VAList.getValue(1);
  VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);

  if (VT.isScalableVector())
    report_fatal_error("Passing SVE types to variadic functions is "
                       "currently not supported");

  // Round the pointer up to the requested alignment when it exceeds the
  // minimum slot size: (p + align - 1) & -align.
  if (Align && *Align > MinSlotSize) {
    VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                         DAG.getConstant(Align->value() - 1, DL, PtrVT));
    VAList =
        DAG.getNode(ISD::AND, DL, PtrVT, VAList,
                    DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT));
  }

  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
  unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);

  // Scalar integer and FP values smaller than 64 bits are implicitly extended
  // up to 64 bits. At the very least, we have to increase the striding of the
  // vaargs list to match this, and for FP values we need to introduce
  // FP_ROUND nodes as well.
  if (VT.isInteger() && !VT.isVector())
    ArgSize = std::max(ArgSize, MinSlotSize);
  bool NeedFPTrunc = false;
  if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
    ArgSize = 8;
    NeedFPTrunc = true;
  }

  // Increment the pointer, VAList, to the next vaarg
  SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                               DAG.getConstant(ArgSize, DL, PtrVT));
  VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);

  // Store the incremented VAList to the legalized pointer
  SDValue APStore =
      DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));

  // Load the actual argument out of the pointer VAList
  if (NeedFPTrunc) {
    // Load the value as an f64.
    SDValue WideFP =
        DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
    // Round the value down to an f32.
    SDValue NarrowFP =
        DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
                    DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
    SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
    // Merge the rounded value with the chain output of the load.
    return DAG.getMergeValues(Ops, DL);
  }

  return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
}
12945
12946SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
12947 SelectionDAG &DAG) const {
12948 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12949 MFI.setFrameAddressIsTaken(true);
12950
12951 EVT VT = Op.getValueType();
12952 SDLoc DL(Op);
12953 unsigned Depth = Op.getConstantOperandVal(0);
12954 SDValue FrameAddr =
12955 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
12956 while (Depth--)
12957 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
12958 MachinePointerInfo());
12959
12960 if (Subtarget->isTargetILP32())
12961 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
12962 DAG.getValueType(VT));
12963
12964 return FrameAddr;
12965}
12966
12967SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
12968 SelectionDAG &DAG) const {
12969 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12970
12971 EVT VT = getPointerTy(DAG.getDataLayout());
12972 int FI = MFI.CreateFixedObject(4, 0, false);
12973 return DAG.getFrameIndex(FI, VT);
12974}
12975
12976#define GET_REGISTER_MATCHER
12977#include "AArch64GenAsmMatcher.inc"
12978
12979// FIXME? Maybe this could be a TableGen attribute on some registers and
12980// this table could be generated automatically from RegInfo.
// Resolve a register name used by inline asm / named-register intrinsics to a
// physical register, rejecting general-purpose registers that are not
// reserved. NOTE(review): the declaration/initialization of `Reg` (from the
// generated register matcher) sits on a line not visible in this excerpt.
Register AArch64TargetLowering::
getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
  // X1..X28 may only be named if they are reserved; otherwise an allocatable
  // register could be clobbered behind the user's back.
  if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
    const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
    unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
    // An invalid Register() tells the caller the name cannot be used.
    if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
        !MRI->isReservedReg(MF, Reg))
      Reg = Register();
  }
  return Reg;
}
12993
// Lower ISD::ADDROFRETURNADDR: compute the address of the slot holding the
// return address, as FP plus an offset. NOTE(review): the definition of
// `Offset` is on a line not visible in this excerpt — presumably the fixed
// displacement of the saved-LR slot from FP; confirm against upstream.
SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                     SelectionDAG &DAG) const {

  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  // Read the current frame pointer ...
  SDValue FrameAddr =
      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);

  // ... and add the slot offset to it.
  return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
}
13007
// Lower ISD::RETURNADDR. Depth 0 reads LR directly; deeper frames load the
// saved return address out of the corresponding caller frame. The result is
// stripped of any pointer-authentication signature before being returned.
SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
                                               SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned Depth = Op.getConstantOperandVal(0);
  SDValue ReturnAddress;
  if (Depth) {
    // Walk up Depth frames, then load the saved return address from that
    // frame. NOTE(review): `Offset` is defined on a line not visible in this
    // excerpt — presumably the saved-LR slot offset; confirm upstream.
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    ReturnAddress = DAG.getLoad(
        VT, DL, DAG.getEntryNode(),
        DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
  } else {
    // Return LR, which contains the return address. Mark it an implicit
    // live-in.
    Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
    ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
  }

  // The XPACLRI instruction assembles to a hint-space instruction before
  // Armv8.3-A therefore this instruction can be safely used for any pre
  // Armv8.3-A architectures. On Armv8.3-A and onwards XPACI is available so use
  // that instead.
  SDNode *St;
  if (Subtarget->hasPAuth()) {
    St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
  } else {
    // XPACLRI operates on LR therefore we must move the operand accordingly.
    SDValue Chain =
        DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
    St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
  }
  return SDValue(St, 0);
}
13046
13047/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which returns two
13048/// i32 values and take a 2 x i32 value to shift plus a shift amount.
13049SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
13050 SelectionDAG &DAG) const {
13051 SDValue Lo, Hi;
13052 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
13053 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
13054}
13055
                                                const GlobalAddressSDNode *GA) const {
  // Offsets are folded in the DAG combine rather than here so that we can
  // intelligently choose an offset based on the uses. Always returning false
  // keeps the generic folding path disabled for global addresses.
  return false;
}
13062
                                       bool OptForSize) const {
  bool IsLegal = false;
  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
  // 16-bit case when target has full fp16 support.
  // We encode bf16 bit patterns as if they were fp16. This results in very
  // strange looking assembly but should populate the register with appropriate
  // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
  // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
  // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
  // FIXME: We should be able to handle f128 as well with a clever lowering.
  const APInt ImmInt = Imm.bitcastToAPInt();
  if (VT == MVT::f64)
    IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
  else if (VT == MVT::f32)
    IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
  else if (VT == MVT::f16 || VT == MVT::bf16)
    IsLegal =
        (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
        Imm.isPosZero();

  // If we cannot materialize the value in an fmov immediate field, check if
  // the value can be encoded as the immediate operand of a logical
  // instruction. The immediate value will be created with either MOVZ, MOVN,
  // or ORR.
  // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
  // generate that fmov.
  if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
    // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
    // however the mov+fmov sequence is always better because of the reduced
    // cache pressure. The timings are still the same if you consider
    // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
    // movw+movk is fused). So we limit to at most 2 instructions.
    // NOTE(review): the SmallVector `Insn` is populated by an expandMOVImm
    // call on lines not visible in this excerpt.
    assert(Insn.size() <= 4 &&
           "Should be able to build any value with at most 4 moves");
    unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
    IsLegal = Insn.size() <= Limit;
  }

  LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
                    << " imm value: "; Imm.dump(););
  return IsLegal;
}
13107
13108//===----------------------------------------------------------------------===//
13109// AArch64 Optimization Hooks
13110//===----------------------------------------------------------------------===//
13111
// Build an initial estimate node of the given Opcode (e.g. FRSQRTE/FRECPE)
// for supported NEON/SVE floating-point types, and set ExtraSteps to the
// number of Newton refinement iterations required for full precision.
// Returns SDValue() when the type is not supported.
// NOTE(review): the guard choosing when to recompute ExtraSteps sits on a
// line not visible in this excerpt.
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
                           SDValue Operand, SelectionDAG &DAG,
                           int &ExtraSteps) {
  EVT VT = Operand.getValueType();
  if ((ST->hasNEON() &&
       (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
        VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
        VT == MVT::v4f32)) ||
      (ST->hasSVE() &&
       (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
      // For the reciprocal estimates, convergence is quadratic, so the number
      // of digits is doubled after each iteration. In ARMv8, the accuracy of
      // the initial estimate is 2^-8. Thus the number of extra steps to refine
      // the result for float (23 mantissa bits) is 2 and for double (52
      // mantissa bits) is 3.
      constexpr unsigned AccurateBits = 8;
      unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
      ExtraSteps = DesiredBits <= AccurateBits
                       ? 0
                       : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
    }

    return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
  }

  return SDValue();
}
13140
13141SDValue
13142AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
13143 const DenormalMode &Mode) const {
13144 SDLoc DL(Op);
13145 EVT VT = Op.getValueType();
13146 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
13147 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
13148 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
13149}
13150
13151SDValue
13152AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
13153 SelectionDAG &DAG) const {
13154 return Op;
13155}
13156
// Produce an FRSQRTE-based estimate of 1/sqrt(Operand) (or sqrt(Operand) when
// !Reciprocal), refined with ExtraSteps Newton iterations using FRSQRTS.
// Returns SDValue() when estimates are disabled or the type is unsupported.
// NOTE(review): the enabling condition's first clause and the SDNodeFlags
// initializer sit on lines not visible in this excerpt.
SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
                                               SelectionDAG &DAG, int Enabled,
                                               int &ExtraSteps,
                                               bool &UseOneConst,
                                               bool Reciprocal) const {
      (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
    if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
                                       DAG, ExtraSteps)) {
      SDLoc DL(Operand);
      EVT VT = Operand.getValueType();

      // Ensure nodes can be recognized by isAssociativeAndCommutative.
      SDNodeFlags Flags =

      // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
      // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
      for (int i = ExtraSteps; i > 0; --i) {
        SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
                                   Flags);
        Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
      }
      // sqrt(x) == x * (1/sqrt(x)); multiply by the operand when the caller
      // asked for the square root rather than its reciprocal.
      if (!Reciprocal)
        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);

      // All refinement has been emitted inline; no further steps are needed.
      ExtraSteps = 0;
      return Estimate;
    }

  return SDValue();
}
13190
// Produce an FRECPE-based estimate of 1/Operand, refined with ExtraSteps
// Newton iterations using FRECPS. Returns SDValue() when estimates are not
// enabled or the type is unsupported. NOTE(review): the enabling guard and
// the SDNodeFlags initializer sit on lines not visible in this excerpt.
SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
                                                SelectionDAG &DAG, int Enabled,
                                                int &ExtraSteps) const {
  if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
                                     DAG, ExtraSteps)) {
    SDLoc DL(Operand);
    EVT VT = Operand.getValueType();


    // Newton reciprocal iteration: E * (2 - X * E)
    // AArch64 reciprocal iteration instruction: (2 - M * N)
    for (int i = ExtraSteps; i > 0; --i) {
      SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
                                 Estimate, Flags);
      Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
    }

    // All refinement has been emitted inline.
    ExtraSteps = 0;
    return Estimate;
  }

  return SDValue();
}
13216
13217//===----------------------------------------------------------------------===//
13218// AArch64 Inline Assembly Support
13219//===----------------------------------------------------------------------===//
13220
13221// Table of Constraints
13222// TODO: This is the current set of constraints supported by ARM for the
13223// compiler, not all of them may make sense.
13224//
13225// r - A general register
13226// w - An FP/SIMD register of some size in the range v0-v31
13227// x - An FP/SIMD register of some size in the range v0-v15
13228// I - Constant that can be used with an ADD instruction
13229// J - Constant that can be used with a SUB instruction
13230// K - Constant that can be used with a 32-bit logical instruction
13231// L - Constant that can be used with a 64-bit logical instruction
13232// M - Constant that can be used as a 32-bit MOV immediate
13233// N - Constant that can be used as a 64-bit MOV immediate
13234// Q - A memory reference with base register and no offset
13235// S - A symbolic address
13236// Y - Floating point constant zero
13237// Z - Integer constant zero
13238//
13239// Note that general register operands will be output using their 64-bit x
13240// register name, whatever the size of the variable, unless the asm operand
13241// is prefixed by the %w modifier. Floating-point and SIMD register operands
13242// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
13243// %q modifier.
13244const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
13245 // At this point, we have to lower this constraint to something else, so we
13246 // lower it to an "r" or "w". However, by doing this we will force the result
13247 // to be in register, while the X constraint is much more permissive.
13248 //
13249 // Although we are correct (we are free to emit anything, without
13250 // constraints), we might break use cases that would expect us to be more
13251 // efficient and emit something else.
13252 if (!Subtarget->hasFPARMv8())
13253 return "r";
13254
13255 if (ConstraintVT.isFloatingPoint())
13256 return "w";
13257
13258 if (ConstraintVT.isVector() &&
13259 (ConstraintVT.getSizeInBits() == 64 ||
13260 ConstraintVT.getSizeInBits() == 128))
13261 return "w";
13262
13263 return "r";
13264}
13265
13267
13268// Returns a {Reg, RegisterClass} tuple if the constraint is
13269// a specific predicate register.
13270//
13271// For some constraint like "{pn3}" the default path in
13272// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
13273// suitable register class for this register is "PPRorPNR", after which it
13274// determines that nxv16i1 is an appropriate type for the constraint, which is
13275// not what we want. The code here pre-empts this by matching the register
13276// explicitly.
static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
  // Must look like "{pN}", "{pnN}" or "{zN}" with N in [0, 31].
  if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
      (Constraint[1] != 'p' && Constraint[1] != 'z'))
    return std::nullopt;

  // 'p' introduces a predicate register; 'z' an SVE vector register.
  bool IsPredicate = Constraint[1] == 'p';
  Constraint = Constraint.substr(2, Constraint.size() - 3);
  // "pn" selects a predicate-as-counter register.
  bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
  if (IsPredicateAsCount)
    Constraint = Constraint.drop_front(1);

  // The remaining text must be a decimal register index in [0, 31].
  unsigned V;
  if (Constraint.getAsInteger(10, V) || V > 31)
    return std::nullopt;

  if (IsPredicateAsCount)
    return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
  if (IsPredicate)
    return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
  return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
}
13299
static std::optional<PredicateConstraint>
    // Map the "Up*" predicate-constraint spellings onto PredicateConstraint
    // values; anything else yields std::nullopt.
    .Case("Uph", PredicateConstraint::Uph)
    .Default(std::nullopt);
}
13308
static const TargetRegisterClass *
  // Predicate constraints only apply to svcount or scalable i1 vector types.
  if (VT != MVT::aarch64svcount &&
      (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
    return nullptr;

  // svcount values live in PNR classes, i1 predicates in PPR classes; each
  // constraint narrows the allowed register range.
  switch (Constraint) {
    return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
                                     : &AArch64::PPR_p8to15RegClass;
    return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
                                     : &AArch64::PPR_3bRegClass;
    return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
                                     : &AArch64::PPRRegClass;
  }

  llvm_unreachable("Missing PredicateConstraint!");
}
13329
13331
static std::optional<ReducedGprConstraint>
    // Map the "Uc*" reduced-GPR constraint spellings onto
    // ReducedGprConstraint values; anything else yields std::nullopt.
    .Case("Uci", ReducedGprConstraint::Uci)
    .Default(std::nullopt);
}
13339
static const TargetRegisterClass *
  // Reduced-GPR constraints only apply to scalar integers of at most 64 bits.
  if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
    return nullptr;

  // Each constraint selects a narrow matrix-index GPR range (w8-w11 or
  // w12-w15).
  switch (Constraint) {
    return &AArch64::MatrixIndexGPR32_8_11RegClass;
    return &AArch64::MatrixIndexGPR32_12_15RegClass;
  }

  llvm_unreachable("Missing ReducedGprConstraint!");
}
13354
// The set of cc code supported is from
// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
// Note the aliases: both "{@cccs}" and "{@cchs}" map to HS, and both
// "{@cclo}" and "{@cccc}" map to LO.
    .Case("{@cchi}", AArch64CC::HI)
    .Case("{@cccs}", AArch64CC::HS)
    .Case("{@cclo}", AArch64CC::LO)
    .Case("{@ccls}", AArch64CC::LS)
    .Case("{@cccc}", AArch64CC::LO)
    .Case("{@cceq}", AArch64CC::EQ)
    .Case("{@ccgt}", AArch64CC::GT)
    .Case("{@ccge}", AArch64CC::GE)
    .Case("{@cclt}", AArch64CC::LT)
    .Case("{@ccle}", AArch64CC::LE)
    .Case("{@cchs}", AArch64CC::HS)
    .Case("{@ccne}", AArch64CC::NE)
    .Case("{@ccvc}", AArch64CC::VC)
    .Case("{@ccpl}", AArch64CC::PL)
    .Case("{@ccvs}", AArch64CC::VS)
    .Case("{@ccmi}", AArch64CC::MI)
  return Cond;
}
13378
13379/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
13380/// WZR, invert(<cond>)'.
                     SelectionDAG &DAG) {
  // CSINC Wd, WZR, WZR, invert(CC) produces 1 when CC holds (0 + 1) and 0
  // otherwise, i.e. the i32 boolean result of the condition.
  return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
                     DAG.getConstant(0, DL, MVT::i32),
                     DAG.getConstant(0, DL, MVT::i32),
                     getCondCode(DAG, getInvertedCondCode(CC)), NZCV);
}
13388
// Lower @cc flag output via getSETCC. Returns SDValue() when the constraint
// is not a recognized "{@cc..}" code so the generic path can handle it.
// NOTE(review): the declaration of `Result` sits on a line not visible in
// this excerpt.
SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
    SDValue &Chain, SDValue &Glue, const SDLoc &DL,
    const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
  AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
  if (Cond == AArch64CC::Invalid)
    return SDValue();
  // The output variable should be a scalar integer.
  if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
      OpInfo.ConstraintVT.getSizeInBits() < 8)
    report_fatal_error("Flag output operand is of invalid type");

  // Get NZCV register. Only update chain when copyfrom is glued.
  if (Glue.getNode()) {
    Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
    Chain = Glue.getValue(1);
  } else
    Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
  // Extract CC code.
  SDValue CC = getSETCC(Cond, Glue, DL, DAG);


  // Truncate or ZERO_EXTEND based on value types.
  if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
    Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
  else
    Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);

  return Result;
}
13420
13421/// getConstraintType - Given a constraint letter, return the type of
13422/// constraint it is for this target.
AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    // FP/SIMD register classes of various widths.
    case 'x':
    case 'w':
    case 'y':
      return C_RegisterClass;
    // An address with a single base register. Due to the way we
    // currently handle addresses it is the same as 'r'.
    case 'Q':
      return C_Memory;
    // Immediate constraints for ADD/SUB, logical and MOV immediates, plus FP
    // zero (Y) and integer zero (Z).
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'Y':
    case 'Z':
      return C_Immediate;
    case 'z':
    case 'S': // A symbol or label reference with a constant offset
      return C_Other;
    }
  } else if (parsePredicateConstraint(Constraint))
    return C_RegisterClass;
  else if (parseReducedGprConstraint(Constraint))
    return C_RegisterClass;
  else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
    return C_Other;
  // Anything else is handled by the generic implementation.
  return TargetLowering::getConstraintType(Constraint);
}
13458
13459/// Examine constraint type and operand type and determine a weight value.
13460/// This object must already have been set up with the operand type
13461/// and the current alternative constraint selected.
AArch64TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  // NOTE(review): the declaration of `weight` and the default-case fallback
  // to the base-class weighting sit on lines not visible in this excerpt.
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    break;
  // FP/SIMD register constraints match FP or vector operands strongly.
  case 'x':
  case 'w':
  case 'y':
    if (type->isFloatingPointTy() || type->isVectorTy())
      weight = CW_Register;
    break;
  case 'z':
    weight = CW_Constant;
    break;
  // "U.." two-letter constraints (predicate / reduced GPR) count as register
  // matches when they parse successfully.
  case 'U':
    if (parsePredicateConstraint(constraint) ||
        parseReducedGprConstraint(constraint))
      weight = CW_Register;
    break;
  }
  return weight;
}
13494
std::pair<unsigned, const TargetRegisterClass *>
AArch64TargetLowering::getRegForInlineAsmConstraint(
    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  // Map an inline-asm register constraint to a (register, register class)
  // pair. Single-letter constraints select a class by value type; longer
  // forms are parsed as SVE/predicate/reduced-GPR names or special registers.
  // NOTE(review): the second half of the "{cc}" condition and the call to the
  // base-class implementation that fills `Res` sit on lines not visible in
  // this excerpt.
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'r':
      if (VT.isScalableVector())
        return std::make_pair(0U, nullptr);
      if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
        return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
      if (VT.getFixedSizeInBits() == 64)
        return std::make_pair(0U, &AArch64::GPR64commonRegClass);
      return std::make_pair(0U, &AArch64::GPR32commonRegClass);
    case 'w': {
      if (!Subtarget->hasFPARMv8())
        break;
      if (VT.isScalableVector()) {
        if (VT.getVectorElementType() != MVT::i1)
          return std::make_pair(0U, &AArch64::ZPRRegClass);
        return std::make_pair(0U, nullptr);
      }
      if (VT == MVT::Other)
        break;
      // Pick the FP register class matching the operand width.
      uint64_t VTSize = VT.getFixedSizeInBits();
      if (VTSize == 16)
        return std::make_pair(0U, &AArch64::FPR16RegClass);
      if (VTSize == 32)
        return std::make_pair(0U, &AArch64::FPR32RegClass);
      if (VTSize == 64)
        return std::make_pair(0U, &AArch64::FPR64RegClass);
      if (VTSize == 128)
        return std::make_pair(0U, &AArch64::FPR128RegClass);
      break;
    }
    // The instructions that this constraint is designed for can
    // only take 128-bit registers so just use that regclass.
    case 'x':
      if (!Subtarget->hasFPARMv8())
        break;
      if (VT.isScalableVector())
        return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
      if (VT.getSizeInBits() == 128)
        return std::make_pair(0U, &AArch64::FPR128_loRegClass);
      break;
    case 'y':
      if (!Subtarget->hasFPARMv8())
        break;
      if (VT.isScalableVector())
        return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
      break;
    }
  } else {
    if (const auto P = parseSVERegAsConstraint(Constraint)) {
      // SME functions that are not in streaming mode, should
      // still observe clobbers of Z-registers by clobbering
      // the lower 128bits of those registers.
      if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
          !Subtarget->isSVEorStreamingSVEAvailable())
        return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
                              &AArch64::FPR128RegClass);
      return *P;
    }
    if (const auto PC = parsePredicateConstraint(Constraint))
      if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
        return std::make_pair(0U, RegClass);

    if (const auto RGC = parseReducedGprConstraint(Constraint))
      if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
        return std::make_pair(0U, RegClass);
  }
  if (StringRef("{cc}").equals_insensitive(Constraint) ||
    return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);

  // SME array storage.
  if (Constraint == "{za}") {
    return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
  }

  // SME2 lookup-table register.
  if (Constraint == "{zt0}") {
    return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass *> Res;

  // Not found as a standard register?
  if (!Res.second) {
    // Accept "{vN}" names, which alias qN/dN depending on the operand size.
    unsigned Size = Constraint.size();
    if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
      int RegNo;
      bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
      if (!Failed && RegNo >= 0 && RegNo <= 31) {
        // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
        // By default we'll emit v0-v31 for this unless there's a modifier where
        // we'll emit the correct register as well.
        if (VT != MVT::Other && VT.getSizeInBits() == 64) {
          Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
          Res.second = &AArch64::FPR64RegClass;
        } else {
          Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
          Res.second = &AArch64::FPR128RegClass;
        }
      }
    }
  }

  // Without FP, only plain GPR classes are usable.
  if (Res.second && !Subtarget->hasFPARMv8() &&
      !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
      !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
    return std::make_pair(0U, nullptr);

  return Res;
}
13611
                                                llvm::Type *Ty,
                                                bool AllowUnknown) const {
  // With LS64, a 512-bit integer inline-asm operand maps to the i64x8
  // register-tuple type; everything else uses the generic mapping.
  if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
    return EVT(MVT::i64x8);

  return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
}
13620
13621/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
13622/// vector. If it is invalid, don't add anything to Ops.
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
/// NOTE(review): the declaration of the ConstantSDNode `C` (before the
/// "if (!C)" check) sits on a line not visible in this excerpt.
void AArch64TargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  SDValue Result;

  // Currently only support length 1 constraints.
  if (Constraint.size() != 1)
    return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default:
    break;

  // This set of constraints deal with valid constants for various instructions.
  // Validate and return a target constant for them if we can.
  case 'z': {
    // 'z' maps to xzr or wzr so it needs an input of 0.
    if (!isNullConstant(Op))
      return;

    if (Op.getValueType() == MVT::i64)
      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
    else
      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
    break;
  }
  case 'S':
    // Use the generic code path for "s". In GCC's aarch64 port, "S" is
    // supported for PIC while "s" isn't, making "s" less useful. We implement
    // "S" but not "s".
    break;

  case 'I':
  case 'J':
  case 'K':
  case 'L':
  case 'M':
  case 'N':
    if (!C)
      return;

    // Grab the value and do some validation.
    uint64_t CVal = C->getZExtValue();
    switch (ConstraintLetter) {
    // The I constraint applies only to simple ADD or SUB immediate operands:
    // i.e. 0 to 4095 with optional shift by 12
    // The J constraint applies only to ADD or SUB immediates that would be
    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
    // instruction [or vice versa], in other words -1 to -4095 with optional
    // left shift by 12.
    case 'I':
      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
        break;
      return;
    case 'J': {
      uint64_t NVal = -C->getSExtValue();
      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
        CVal = C->getSExtValue();
        break;
      }
      return;
    }
    // The K and L constraints apply *only* to logical immediates, including
    // what used to be the MOVI alias for ORR (though the MOVI alias has now
    // been removed and MOV should be used). So these constraints have to
    // distinguish between bit patterns that are valid 32-bit or 64-bit
    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
    // versa.
    case 'K':
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      return;
    case 'L':
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      return;
    // The M and N constraints are a superset of K and L respectively, for use
    // with the MOV (immediate) alias. As well as the logical immediates they
    // also match 32 or 64-bit immediates that can be loaded either using a
    // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
    // (M) or 64-bit 0x1234000000000000 (N) etc.
    // As a note some of this code is liberally stolen from the asm parser.
    case 'M': {
      if (!isUInt<32>(CVal))
        return;
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      // A single MOVZ: the value occupies one 16-bit halfword.
      if ((CVal & 0xFFFF) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      // A single MOVN: the complement occupies one 16-bit halfword.
      uint64_t NCVal = ~(uint32_t)CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      return;
    }
    case 'N': {
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      // A single MOVZ: the value occupies one 16-bit halfword.
      if ((CVal & 0xFFFFULL) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF00000000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF000000000000ULL) == CVal)
        break;
      // A single MOVN: the complement occupies one 16-bit halfword.
      uint64_t NCVal = ~CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
        break;
      return;
    }
    default:
      return;
    }

    // All assembler immediates are 64-bit integers.
    Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
    break;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }

  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
13763
13764//===----------------------------------------------------------------------===//
13765// AArch64 Advanced SIMD Support
13766//===----------------------------------------------------------------------===//
13767
13768/// WidenVector - Given a value in the V64 register class, produce the
13769/// equivalent value in the V128 register class.
  EVT VT = V64Reg.getValueType();
  unsigned NarrowSize = VT.getVectorNumElements();
  MVT EltTy = VT.getVectorElementType().getSimpleVT();
  // Double the element count to get the 128-bit (V128) vector type.
  MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
  SDLoc DL(V64Reg);

  // Insert the 64-bit value into the low half of a poison 128-bit vector;
  // the upper half is unspecified.
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getPOISON(WideTy),
                     V64Reg, DAG.getConstant(0, DL, MVT::i64));
}
13780
13781/// getExtFactor - Determine the adjustment factor for the position when
13782/// generating an "extract from vector registers" instruction.
13783static unsigned getExtFactor(SDValue &V) {
13784 EVT EltType = V.getValueType().getVectorElementType();
13785 return EltType.getSizeInBits() / 8;
13786}
13787
13788// Check if a vector is built from one vector via extracted elements of
13789// another together with an AND mask, ensuring that all elements fit
13790// within range. This can be reconstructed using AND and NEON's TBL1.
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  assert(!VT.isScalableVector() &&
         "Scalable vectors cannot be used with ISD::BUILD_VECTOR");

  // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
  // directly to TBL1.
  if (VT != MVT::v16i8 && VT != MVT::v8i8)
    return SDValue();

  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 8 || NumElts == 16) &&
         "Need to have exactly 8 or 16 elements in vector.");

  // SourceVec: the single vector every element is extracted from.
  // MaskSourceVec: the single vector supplying the runtime indices.
  SDValue SourceVec;
  SDValue MaskSourceVec;
  SmallVector<SDValue, 16> AndMaskConstants;

  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // All extracts must come from the same source vector.
    SDValue OperandSourceVec = V.getOperand(0);
    if (!SourceVec)
      SourceVec = OperandSourceVec;
    else if (SourceVec != OperandSourceVec)
      return SDValue();

    // This only looks at shuffles with elements that are
    // a) truncated by a constant AND mask extracted from a mask vector, or
    // b) extracted directly from a mask vector.
    SDValue MaskSource = V.getOperand(1);
    if (MaskSource.getOpcode() == ISD::AND) {
      if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
        return SDValue();

      AndMaskConstants.push_back(MaskSource.getOperand(1));
      MaskSource = MaskSource->getOperand(0);
    } else if (!AndMaskConstants.empty()) {
      // Either all or no operands should have an AND mask.
      return SDValue();
    }

    // An ANY_EXTEND may be inserted between the AND and the source vector
    // extraction. We don't care about that, so we can just skip it.
    if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
      MaskSource = MaskSource.getOperand(0);

    if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Element i must use mask-vector lane i.
    SDValue MaskIdx = MaskSource.getOperand(1);
    if (!isa<ConstantSDNode>(MaskIdx) ||
        !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
      return SDValue();

    // We only apply this if all elements come from the same vector with the
    // same vector type.
    if (!MaskSourceVec) {
      MaskSourceVec = MaskSource->getOperand(0);
      if (MaskSourceVec.getValueType() != VT)
        return SDValue();
    } else if (MaskSourceVec != MaskSource->getOperand(0)) {
      return SDValue();
    }
  }

  // We need a v16i8 for TBL, so we extend the source with a placeholder vector
  // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
  // insert, we know that the index in the mask must be smaller than the number
  // of elements in the source, or we would have an out-of-bounds access.
  if (NumElts == 8)
    SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, SourceVec,
                            DAG.getPOISON(VT));

  // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
  if (!AndMaskConstants.empty())
    MaskSourceVec = DAG.getNode(ISD::AND, DL, VT, MaskSourceVec,
                                DAG.getBuildVector(VT, DL, AndMaskConstants));

  // Emit the NEON TBL1 intrinsic with the (possibly masked) runtime indices.
  return DAG.getNode(
      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
      SourceVec, MaskSourceVec);
}
13879
13880// Gather data to see if the operation can be modelled as a
13881// shuffle in combination with VEXTs.
// NOTE(review): the listing is missing the first line of the declaration here
// (presumably `SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,`)
// — confirm against the upstream file.
13883 SelectionDAG &DAG) const {
13884 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13885 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
13886 SDLoc DL(Op);
13887 EVT VT = Op.getValueType();
13888 assert(!VT.isScalableVector() &&
13889 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13890 unsigned NumElts = VT.getVectorNumElements();
13891
 // Per-source bookkeeping: one entry for each distinct vector that feeds an
 // EXTRACT_VECTOR_ELT operand of this BUILD_VECTOR.
13892 struct ShuffleSourceInfo {
13893 SDValue Vec;
13894 unsigned MinElt;
13895 unsigned MaxElt;
13896
13897 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
13898 // be compatible with the shuffle we intend to construct. As a result
13899 // ShuffleVec will be some sliding window into the original Vec.
13900 SDValue ShuffleVec;
13901
13902 // Code should guarantee that element i in Vec starts at element "WindowBase
13903 // + i * WindowScale in ShuffleVec".
13904 int WindowBase;
13905 int WindowScale;
13906
13907 ShuffleSourceInfo(SDValue Vec)
13908 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
13909 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
13910
 // Allows `find(Sources, SourceVec)` below to match on the underlying Vec.
13911 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
13912 };
13913
13914 // First gather all vectors used as an immediate source for this BUILD_VECTOR
13915 // node.
 // NOTE(review): the declaration of `Sources` (a SmallVector of
 // ShuffleSourceInfo) is missing from this extracted listing — confirm.
13917 for (unsigned i = 0; i < NumElts; ++i) {
13918 SDValue V = Op.getOperand(i);
13919 if (V.isUndef())
13920 continue;
13921 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13922 !isa<ConstantSDNode>(V.getOperand(1)) ||
13923 V.getOperand(0).getValueType().isScalableVector()) {
13924 LLVM_DEBUG(
13925 dbgs() << "Reshuffle failed: "
13926 "a shuffle can only come from building a vector from "
13927 "various elements of other fixed-width vectors, provided "
13928 "their indices are constant\n");
13929 return SDValue();
13930 }
13931
13932 // Add this element source to the list if it's not already there.
13933 SDValue SourceVec = V.getOperand(0);
13934 auto Source = find(Sources, SourceVec);
13935 if (Source == Sources.end())
13936 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
13937
13938 // Update the minimum and maximum lane number seen.
13939 unsigned EltNo = V.getConstantOperandVal(1);
13940 Source->MinElt = std::min(Source->MinElt, EltNo);
13941 Source->MaxElt = std::max(Source->MaxElt, EltNo);
13942 }
13943
13944 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
13945 // better than moving to/from gpr registers for larger vectors.
13946 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
13947 // Construct a mask for the tbl. We may need to adjust the index for types
13948 // larger than i8.
 // NOTE(review): the declaration of `Mask` (a SmallVector<int>) is missing
 // from this extracted listing — confirm.
13950 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
13951 for (unsigned I = 0; I < NumElts; ++I) {
13952 SDValue V = Op.getOperand(I);
13953 if (V.isUndef()) {
13954 for (unsigned OF = 0; OF < OutputFactor; OF++)
13955 Mask.push_back(-1);
13956 continue;
13957 }
13958 // Set the Mask lanes adjusted for the size of the input and output
13959 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
13960 // output element, adjusted in their positions per input and output types.
13961 unsigned Lane = V.getConstantOperandVal(1);
13962 for (unsigned S = 0; S < Sources.size(); S++) {
13963 if (V.getOperand(0) == Sources[S].Vec) {
 // Each TBL source register contributes 16 bytes, hence 16 * S.
13964 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
13965 unsigned InputBase = 16 * S + Lane * InputSize / 8;
13966 for (unsigned OF = 0; OF < OutputFactor; OF++)
13967 Mask.push_back(InputBase + OF);
13968 break;
13969 }
13970 }
13971 }
13972
13973 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
13974 // v16i8, and the TBLMask
13975 SmallVector<SDValue, 16> TBLOperands;
13976 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
13977 ? Intrinsic::aarch64_neon_tbl3
13978 : Intrinsic::aarch64_neon_tbl4,
13979 DL, MVT::i32));
13980 for (unsigned i = 0; i < Sources.size(); i++) {
13981 SDValue Src = Sources[i].Vec;
13982 EVT SrcVT = Src.getValueType();
13983 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
13984 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
13985 "Expected a legally typed vector");
 // 64-bit sources are widened to 128 bits with a poison upper half.
13986 if (SrcVT.is64BitVector())
13987 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Src,
13988 DAG.getPOISON(MVT::v8i8));
13989 TBLOperands.push_back(Src);
13990 }
13991
 // NOTE(review): the declaration of `TBLMask` (a SmallVector<SDValue>) is
 // missing from this extracted listing — confirm.
13993 for (unsigned i = 0; i < Mask.size(); i++)
13994 TBLMask.push_back(DAG.getConstant(Mask[i], DL, MVT::i32));
13995 assert((Mask.size() == 8 || Mask.size() == 16) &&
13996 "Expected a v8i8 or v16i8 Mask");
13997 TBLOperands.push_back(DAG.getBuildVector(
13998 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, TBLMask));
13999
14000 SDValue Shuffle =
 // NOTE(review): a line is missing here in the listing (presumably
 // `DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,`) — confirm upstream.
14002 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
14003 return DAG.getBitcast(VT, Shuffle);
14004 }
14005
14006 if (Sources.size() > 2) {
14007 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
14008 << "sensible when at most two source vectors are "
14009 << "involved\n");
14010 return SDValue();
14011 }
14012
14013 // Find out the smallest element size among result and two sources, and use
14014 // it as element size to build the shuffle_vector.
14015 EVT SmallestEltTy = VT.getVectorElementType();
14016 for (auto &Source : Sources) {
14017 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
14018 if (SrcEltTy.bitsLT(SmallestEltTy)) {
14019 SmallestEltTy = SrcEltTy;
14020 }
14021 }
14022 unsigned ResMultiplier =
14023 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
14024 uint64_t VTSize = VT.getFixedSizeInBits();
 // NumElts is re-purposed from here on: it now counts lanes of ShuffleVT,
 // not operands of the original BUILD_VECTOR.
14025 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
14026 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
14027
14028 // If the source vector is too wide or too narrow, we may nevertheless be able
14029 // to construct a compatible shuffle either by concatenating it with UNDEF or
14030 // extracting a suitable range of elements.
14031 for (auto &Src : Sources) {
14032 EVT SrcVT = Src.ShuffleVec.getValueType();
14033
14034 TypeSize SrcVTSize = SrcVT.getSizeInBits();
14035 if (SrcVTSize == TypeSize::getFixed(VTSize))
14036 continue;
14037
14038 // This stage of the search produces a source with the same element type as
14039 // the original, but with a total width matching the BUILD_VECTOR output.
14040 EVT EltVT = SrcVT.getVectorElementType();
14041 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
14042 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
14043
14044 if (SrcVTSize.getFixedValue() < VTSize) {
14045 assert(2 * SrcVTSize == VTSize);
14046 // We can pad out the smaller vector for free, so if it's part of a
14047 // shuffle...
14048 Src.ShuffleVec =
14049 DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Src.ShuffleVec,
14050 DAG.getPOISON(Src.ShuffleVec.getValueType()));
14051 continue;
14052 }
14053
14054 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
14055 LLVM_DEBUG(
14056 dbgs() << "Reshuffle failed: result vector too small to extract\n");
14057 return SDValue();
14058 }
14059
14060 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
14061 LLVM_DEBUG(
14062 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
14063 return SDValue();
14064 }
14065
14066 if (Src.MinElt >= NumSrcElts) {
14067 // The extraction can just take the second half
14068 Src.ShuffleVec =
14069 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14070 DAG.getConstant(NumSrcElts, DL, MVT::i64));
14071 Src.WindowBase = -NumSrcElts;
14072 } else if (Src.MaxElt < NumSrcElts) {
14073 // The extraction can just take the first half
14074 Src.ShuffleVec =
14075 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14076 DAG.getConstant(0, DL, MVT::i64));
14077 } else {
14078 // An actual VEXT is needed
14079 SDValue VEXTSrc1 =
14080 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14081 DAG.getConstant(0, DL, MVT::i64));
14082 SDValue VEXTSrc2 =
14083 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14084 DAG.getConstant(NumSrcElts, DL, MVT::i64));
14085 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
14086
14087 if (!SrcVT.is64BitVector()) {
14088 LLVM_DEBUG(
14089 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
14090 "for SVE vectors.");
14091 return SDValue();
14092 }
14093
14094 Src.ShuffleVec =
14095 DAG.getNode(AArch64ISD::EXT, DL, DestVT, VEXTSrc1, VEXTSrc2,
14096 DAG.getConstant(Imm, DL, MVT::i32));
14097 Src.WindowBase = -Src.MinElt;
14098 }
14099 }
14100
14101 // Another possible incompatibility occurs from the vector element types. We
14102 // can fix this by bitcasting the source vectors to the same type we intend
14103 // for the shuffle.
14104 for (auto &Src : Sources) {
14105 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
14106 if (SrcEltTy == SmallestEltTy)
14107 continue;
14108 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
14109 if (DAG.getDataLayout().isBigEndian()) {
14110 Src.ShuffleVec =
14111 DAG.getNode(AArch64ISD::NVCAST, DL, ShuffleVT, Src.ShuffleVec);
14112 } else {
14113 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Src.ShuffleVec);
14114 }
14115 Src.WindowScale =
14116 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
14117 Src.WindowBase *= Src.WindowScale;
14118 }
14119
14120 // Final check before we try to actually produce a shuffle.
14121 LLVM_DEBUG({
14122 for (auto Src : Sources)
14123 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
14124 });
14125
14126 // The stars all align, our next step is to produce the mask for the shuffle.
14127 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
14128 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
14129 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
14130 SDValue Entry = Op.getOperand(i);
14131 if (Entry.isUndef())
14132 continue;
14133
14134 auto Src = find(Sources, Entry.getOperand(0));
14135 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
14136
14137 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
14138 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
14139 // segment.
14140 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
14141 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
14142 VT.getScalarSizeInBits());
14143 int LanesDefined = BitsDefined / BitsPerShuffleLane;
14144
14145 // This source is expected to fill ResMultiplier lanes of the final shuffle,
14146 // starting at the appropriate offset.
14147 int *LaneMask = &Mask[i * ResMultiplier];
14148
14149 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
14150 ExtractBase += NumElts * (Src - Sources.begin());
14151 for (int j = 0; j < LanesDefined; ++j)
14152 LaneMask[j] = ExtractBase + j;
14153 }
14154
14155 // Final check before we try to produce nonsense...
14156 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
14157 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
14158 return SDValue();
14159 }
14160
14161 SDValue Poison = DAG.getPOISON(ShuffleVT);
 // NOTE(review): the declaration of `ShuffleOps` (presumably
 // `SDValue ShuffleOps[] = {Poison, Poison};`) is missing from this
 // extracted listing — confirm upstream.
14163 for (unsigned i = 0; i < Sources.size(); ++i)
14164 ShuffleOps[i] = Sources[i].ShuffleVec;
14165
14166 SDValue Shuffle =
14167 DAG.getVectorShuffle(ShuffleVT, DL, ShuffleOps[0], ShuffleOps[1], Mask);
14168 SDValue V;
14169 if (DAG.getDataLayout().isBigEndian()) {
14170 V = DAG.getNode(AArch64ISD::NVCAST, DL, VT, Shuffle);
14171 } else {
14172 V = DAG.getNode(ISD::BITCAST, DL, VT, Shuffle);
14173 }
14174
14175 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
14176 dbgs() << "Reshuffle, creating node: "; V.dump(););
14177
14178 return V;
14179}
14180
14181// check if an EXT instruction can handle the shuffle mask when the
14182// vector sources of the shuffle are the same.
14183static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
14184 unsigned NumElts = VT.getVectorNumElements();
14185
14186 // Assume that the first shuffle index is not UNDEF. Fail if it is.
14187 if (M[0] < 0)
14188 return false;
14189
14190 Imm = M[0];
14191
14192 // If this is a VEXT shuffle, the immediate value is the index of the first
14193 // element. The other shuffle indices must be the successive elements after
14194 // the first one.
14195 unsigned ExpectedElt = Imm;
14196 for (unsigned i = 1; i < NumElts; ++i) {
14197 // Increment the expected index. If it wraps around, just follow it
14198 // back to index zero and keep going.
14199 ++ExpectedElt;
14200 if (ExpectedElt == NumElts)
14201 ExpectedElt = 0;
14202
14203 if (M[i] < 0)
14204 continue; // ignore UNDEF indices
14205 if (ExpectedElt != static_cast<unsigned>(M[i]))
14206 return false;
14207 }
14208
14209 return true;
14210}
14211
14212// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
14213// v4i32s. This is really a truncate, which we can construct out of (legal)
14214// concats and truncate nodes.
// NOTE(review): the declaration line is missing from this listing (presumably
// `static SDValue ReconstructTruncateFromBuildVector(SDValue V,
// SelectionDAG &DAG) {`) — confirm against the upstream file.
14216 if (V.getValueType() != MVT::v16i8)
14217 return SDValue();
14218 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
14219
 // Validate the pattern: four groups of four lanes, each group extracting
 // lanes 0..3 of one v4i16/v4i32 source.
14220 for (unsigned X = 0; X < 4; X++) {
14221 // Check the first item in each group is an extract from lane 0 of a v4i32
14222 // or v4i16.
14223 SDValue BaseExt = V.getOperand(X * 4);
14224 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14225 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
14226 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
14227 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
14228 BaseExt.getConstantOperandVal(1) != 0)
14229 return SDValue();
14230 SDValue Base = BaseExt.getOperand(0);
14231 // And check the other items are extracts from the same vector.
14232 for (unsigned Y = 1; Y < 4; Y++) {
14233 SDValue Ext = V.getOperand(X * 4 + Y);
14234 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14235 Ext.getOperand(0) != Base ||
 // NOTE(review): one condition line is missing from this listing here
 // (presumably `!isa<ConstantSDNode>(Ext.getOperand(1)) ||`) — confirm.
14237 Ext.getConstantOperandVal(1) != Y)
14238 return SDValue();
14239 }
14240 }
14241
14242 // Turn the buildvector into a series of truncates and concates, which will
14243 // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
14244 // concat together to produce 2 v8i16. These are both truncated and concat
14245 // together.
14246 SDLoc DL(V);
14247 SDValue Trunc[4] = {
14248 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
14249 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
14250 for (SDValue &V : Trunc)
14251 if (V.getValueType() == MVT::v4i32)
14252 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
14253 SDValue Concat0 =
14254 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
14255 SDValue Concat1 =
14256 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
14257 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
14258 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
14259 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
14260}
14261
14262/// Check if a vector shuffle corresponds to a DUP instructions with a larger
14263/// element width than the vector lane type. If that is the case the function
14264/// returns true and writes the value of the DUP instruction lane operand into
14265/// DupLaneOp
14266static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
14267 unsigned &DupLaneOp) {
14268 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
14269 "Only possible block sizes for wide DUP are: 16, 32, 64");
14270
14271 if (BlockSize <= VT.getScalarSizeInBits())
14272 return false;
14273 if (BlockSize % VT.getScalarSizeInBits() != 0)
14274 return false;
14275 if (VT.getSizeInBits() % BlockSize != 0)
14276 return false;
14277
14278 size_t SingleVecNumElements = VT.getVectorNumElements();
14279 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
14280 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
14281
14282 // We are looking for masks like
14283 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
14284 // might be replaced by 'undefined'. BlockIndices will eventually contain
14285 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
14286 // for the above examples)
14287 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
14288 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
14289 for (size_t I = 0; I < NumEltsPerBlock; I++) {
14290 int Elt = M[BlockIndex * NumEltsPerBlock + I];
14291 if (Elt < 0)
14292 continue;
14293 // For now we don't support shuffles that use the second operand
14294 if ((unsigned)Elt >= SingleVecNumElements)
14295 return false;
14296 if (BlockElts[I] < 0)
14297 BlockElts[I] = Elt;
14298 else if (BlockElts[I] != Elt)
14299 return false;
14300 }
14301
14302 // We found a candidate block (possibly with some undefs). It must be a
14303 // sequence of consecutive integers starting with a value divisible by
14304 // NumEltsPerBlock with some values possibly replaced by undef-s.
14305
14306 // Find first non-undef element
14307 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
14308 assert(FirstRealEltIter != BlockElts.end() &&
14309 "Shuffle with all-undefs must have been caught by previous cases, "
14310 "e.g. isSplat()");
14311 if (FirstRealEltIter == BlockElts.end()) {
14312 DupLaneOp = 0;
14313 return true;
14314 }
14315
14316 // Index of FirstRealElt in BlockElts
14317 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
14318
14319 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
14320 return false;
14321 // BlockElts[0] must have the following value if it isn't undef:
14322 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
14323
14324 // Check the first element
14325 if (Elt0 % NumEltsPerBlock != 0)
14326 return false;
14327 // Check that the sequence indeed consists of consecutive integers (modulo
14328 // undefs)
14329 for (size_t I = 0; I < NumEltsPerBlock; I++)
14330 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
14331 return false;
14332
14333 DupLaneOp = Elt0 / NumEltsPerBlock;
14334 return true;
14335}
14336
14337// check if an EXT instruction can handle the shuffle mask when the
14338// vector sources of the shuffle are different.
14339static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
14340 unsigned &Imm) {
14341 // Look for the first non-undef element.
14342 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
14343
 // NOTE(review): FirstRealElt is dereferenced below without an M.end() check;
 // callers presumably never pass an all-undef mask — confirm at call sites.
14344 // Benefit from APInt to handle overflow when calculating expected element.
14345 unsigned NumElts = VT.getVectorNumElements();
14346 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
14347 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
14348 /*implicitTrunc=*/true);
14349 // The following shuffle indices must be the successive elements after the
14350 // first real element.
 // ExpectedElt wraps modulo 2*NumElts thanks to the narrow APInt width, so
 // masks that run off the end of the concatenated inputs wrap back to 0.
14351 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
14352 return Elt != ExpectedElt++ && Elt >= 0;
14353 });
14354 if (FoundWrongElt)
14355 return false;
14356
14357 // The index of an EXT is the first element if it is not UNDEF.
14358 // Watch out for the beginning UNDEFs. The EXT index should be the expected
14359 // value of the first element. E.g.
14360 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
14361 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
14362 // ExpectedElt is the last mask index plus 1.
14363 Imm = ExpectedElt.getZExtValue();
14364
14365 // There are two difference cases requiring to reverse input vectors.
14366 // For example, for vector <4 x i32> we have the following cases,
14367 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
14368 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
14369 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
14370 // to reverse two input vectors.
14371 if (Imm < NumElts)
14372 ReverseEXT = true;
14373 else
14374 Imm -= NumElts;
14375
14376 return true;
14377}
14378
14379/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
14380/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14381/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
14382static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14383 unsigned NumElts = VT.getVectorNumElements();
14384 if (NumElts % 2 != 0)
14385 return false;
14386 WhichResult = (M[0] == 0 ? 0 : 1);
14387 unsigned Idx = WhichResult * NumElts / 2;
14388 for (unsigned i = 0; i != NumElts; i += 2) {
14389 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
14390 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
14391 return false;
14392 Idx += 1;
14393 }
14394
14395 return true;
14396}
14397
14398/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
14399/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14400/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
14401static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14402 unsigned Half = VT.getVectorNumElements() / 2;
14403 WhichResult = (M[0] == 0 ? 0 : 1);
14404 for (unsigned j = 0; j != 2; ++j) {
14405 unsigned Idx = WhichResult;
14406 for (unsigned i = 0; i != Half; ++i) {
14407 int MIdx = M[i + j * Half];
14408 if (MIdx >= 0 && (unsigned)MIdx != Idx)
14409 return false;
14410 Idx += 2;
14411 }
14412 }
14413
14414 return true;
14415}
14416
14417/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
14418/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14419/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
14420static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14421 unsigned NumElts = VT.getVectorNumElements();
14422 if (NumElts % 2 != 0)
14423 return false;
14424 WhichResult = (M[0] == 0 ? 0 : 1);
14425 for (unsigned i = 0; i < NumElts; i += 2) {
14426 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
14427 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
14428 return false;
14429 }
14430 return true;
14431}
14432
14433static bool isINSMask(ArrayRef<int> M, int NumInputElements,
14434 bool &DstIsLeft, int &Anomaly) {
14435 if (M.size() != static_cast<size_t>(NumInputElements))
14436 return false;
14437
14438 int NumLHSMatch = 0, NumRHSMatch = 0;
14439 int LastLHSMismatch = -1, LastRHSMismatch = -1;
14440
14441 for (int i = 0; i < NumInputElements; ++i) {
14442 if (M[i] == -1) {
14443 ++NumLHSMatch;
14444 ++NumRHSMatch;
14445 continue;
14446 }
14447
14448 if (M[i] == i)
14449 ++NumLHSMatch;
14450 else
14451 LastLHSMismatch = i;
14452
14453 if (M[i] == i + NumInputElements)
14454 ++NumRHSMatch;
14455 else
14456 LastRHSMismatch = i;
14457 }
14458
14459 if (NumLHSMatch == NumInputElements - 1) {
14460 DstIsLeft = true;
14461 Anomaly = LastLHSMismatch;
14462 return true;
14463 } else if (NumRHSMatch == NumInputElements - 1) {
14464 DstIsLeft = false;
14465 Anomaly = LastRHSMismatch;
14466 return true;
14467 }
14468
14469 return false;
14470}
14471
14472static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
14473 if (VT.getSizeInBits() != 128)
14474 return false;
14475
14476 unsigned NumElts = VT.getVectorNumElements();
14477
14478 for (int I = 0, E = NumElts / 2; I != E; I++) {
14479 if (Mask[I] != I)
14480 return false;
14481 }
14482
14483 int Offset = NumElts / 2;
14484 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
14485 if (Mask[I] != I + SplitLHS * Offset)
14486 return false;
14487 }
14488
14489 return true;
14490}
14491
 // Lower a shuffle whose mask is a concat of the inputs' low halves (per
 // isConcatMask) to EXTRACT_SUBVECTOR + CONCAT_VECTORS.
 // NOTE(review): the declaration line is missing from this listing
 // (presumably `static SDValue tryFormConcatFromShuffle(SDValue Op,
 // SelectionDAG &DAG) {`) — confirm against the upstream file.
14493 SDLoc DL(Op);
14494 EVT VT = Op.getValueType();
14495 SDValue V0 = Op.getOperand(0);
14496 SDValue V1 = Op.getOperand(1);
14497 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14498
 // NOTE(review): the condition guarding this early return is missing from
 // this listing (two lines; presumably a check that both operands' element
 // types match the result's element type) — confirm upstream.
14501 return SDValue();
14502
14503 bool SplitV0 = V0.getValueSizeInBits() == 128;
14504
14505 if (!isConcatMask(Mask, VT, SplitV0))
14506 return SDValue();
14507
 // Take the low half of any 128-bit operand so both inputs are half-width
 // before being concatenated back into the full result type.
14508 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14509 if (SplitV0) {
14510 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
14511 DAG.getConstant(0, DL, MVT::i64));
14512 }
14513 if (V1.getValueSizeInBits() == 128) {
14514 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
14515 DAG.getConstant(0, DL, MVT::i64));
14516 }
14517 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
14518}
14519
14520/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
14521/// the specified operations to build the shuffle. ID is the perfect-shuffle
14522//ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
14523//table entry and LHS/RHS are the immediate inputs for this stage of the
14524//shuffle.
// NOTE(review): the first line of the declaration is missing from this
// listing (presumably `static SDValue GeneratePerfectShuffle(unsigned ID,
// SDValue V1, SDValue V2,`) — confirm against the upstream file.
14526 unsigned PFEntry, SDValue LHS,
14527 SDValue RHS, SelectionDAG &DAG,
14528 const SDLoc &DL) {
 // PFEntry bit layout (per the shifts below): [29:26] opcode,
 // [25:13] LHS sub-shuffle id, [12:0] RHS sub-shuffle id / lane.
14529 unsigned OpNum = (PFEntry >> 26) & 0x0F;
14530 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
14531 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
14532
14533 enum {
14534 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
14535 OP_VREV,
14536 OP_VDUP0,
14537 OP_VDUP1,
14538 OP_VDUP2,
14539 OP_VDUP3,
14540 OP_VEXT1,
14541 OP_VEXT2,
14542 OP_VEXT3,
14543 OP_VUZPL, // VUZP, left result
14544 OP_VUZPR, // VUZP, right result
14545 OP_VZIPL, // VZIP, left result
14546 OP_VZIPR, // VZIP, right result
14547 OP_VTRNL, // VTRN, left result
14548 OP_VTRNR, // VTRN, right result
14549 OP_MOVLANE // Move lane. RHSID is the lane to move into
14550 };
14551
 // Lane ids are packed base-9 (digits 0-8, 8 meaning undef), so the identity
 // masks below are the base-9 encodings of <0,1,2,3> and <4,5,6,7>.
14552 if (OpNum == OP_COPY) {
14553 if (LHSID == (1 * 9 + 2) * 9 + 3)
14554 return LHS;
14555 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
14556 return RHS;
14557 }
14558
14559 if (OpNum == OP_MOVLANE) {
14560 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
14561 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
14562 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
14563 Elt = 3 - Elt;
14564 while (Elt > 0) {
14565 ID /= 9;
14566 Elt--;
14567 }
14568 return (ID % 9 == 8) ? -1 : ID % 9;
14569 };
14570
14571 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
14572 // get the lane to move from the PFID, which is always from the
14573 // original vectors (V1 or V2).
 // NOTE(review): the first line of this statement is missing from the
 // listing (presumably `SDValue OpLHS = GeneratePerfectShuffle(`) — confirm.
14575 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
14576 EVT VT = OpLHS.getValueType();
14577 assert(RHSID < 8 && "Expected a lane index for RHSID!");
14578 unsigned ExtLane = 0;
14579 SDValue Input;
14580
14581 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
14582 // convert into a higher type.
14583 if (RHSID & 0x4) {
14584 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
14585 if (MaskElt == -1)
 // Fall back to the odd lane of the pair if the even lane is undef.
14586 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
14587 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14588 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
14589 Input = MaskElt < 2 ? V1 : V2;
14590 if (VT.getScalarSizeInBits() == 16) {
14591 Input = DAG.getBitcast(MVT::v2f32, Input);
14592 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
14593 } else {
14594 assert(VT.getScalarSizeInBits() == 32 &&
14595 "Expected 16 or 32 bit shuffle elements");
14596 Input = DAG.getBitcast(MVT::v2f64, Input);
14597 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
14598 }
14599 } else {
14600 int MaskElt = getPFIDLane(ID, RHSID);
14601 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14602 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
14603 Input = MaskElt < 4 ? V1 : V2;
14604 // Be careful about creating illegal types. Use f16 instead of i16.
14605 if (VT == MVT::v4i16) {
14606 Input = DAG.getBitcast(MVT::v4f16, Input);
14607 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
14608 }
14609 }
 // NOTE(review): the first line of this statement is missing from the
 // listing (presumably `SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
 // DL,`) — confirm.
14611 Input.getValueType().getVectorElementType(),
14612 Input, DAG.getVectorIdxConstant(ExtLane, DL));
14613 SDValue Ins =
14614 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Input.getValueType(), OpLHS,
14615 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, DL));
14616 return DAG.getBitcast(VT, Ins);
14617 }
14618
 // All remaining ops are binary: recursively materialize both sub-shuffles,
 // then combine them with the op selected below.
14619 SDValue OpLHS, OpRHS;
14620 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
14621 RHS, DAG, DL);
14622 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
14623 RHS, DAG, DL);
14624 EVT VT = OpLHS.getValueType();
14625
14626 switch (OpNum) {
14627 default:
14628 llvm_unreachable("Unknown shuffle opcode!");
14629 case OP_VREV:
14630 // VREV divides the vector in half and swaps within the half.
14631 if (VT.getVectorElementType() == MVT::i32 ||
14632 VT.getVectorElementType() == MVT::f32)
14633 return DAG.getNode(AArch64ISD::REV64, DL, VT, OpLHS);
14634 // vrev <4 x i16> -> REV32
14635 if (VT.getVectorElementType() == MVT::i16 ||
14636 VT.getVectorElementType() == MVT::f16 ||
14637 VT.getVectorElementType() == MVT::bf16)
14638 return DAG.getNode(AArch64ISD::REV32, DL, VT, OpLHS);
14639 // vrev <4 x i8> -> REV16
14640 assert(VT.getVectorElementType() == MVT::i8);
14641 return DAG.getNode(AArch64ISD::REV16, DL, VT, OpLHS);
14642 case OP_VDUP0:
14643 case OP_VDUP1:
14644 case OP_VDUP2:
14645 case OP_VDUP3: {
14646 EVT EltTy = VT.getVectorElementType();
14647 unsigned Opcode;
14648 if (EltTy == MVT::i8)
14649 Opcode = AArch64ISD::DUPLANE8;
14650 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
14651 Opcode = AArch64ISD::DUPLANE16;
14652 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
14653 Opcode = AArch64ISD::DUPLANE32;
14654 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
14655 Opcode = AArch64ISD::DUPLANE64;
14656 else
14657 llvm_unreachable("Invalid vector element type?");
14658
 // DUPLANE* expects a 128-bit source; widen 64-bit inputs first.
14659 if (VT.getSizeInBits() == 64)
14660 OpLHS = WidenVector(OpLHS, DAG);
14661 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, DL, MVT::i64);
14662 return DAG.getNode(Opcode, DL, VT, OpLHS, Lane);
14663 }
14664 case OP_VEXT1:
14665 case OP_VEXT2:
14666 case OP_VEXT3: {
14667 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
14668 return DAG.getNode(AArch64ISD::EXT, DL, VT, OpLHS, OpRHS,
14669 DAG.getConstant(Imm, DL, MVT::i32));
14670 }
14671 case OP_VUZPL:
14672 return DAG.getNode(AArch64ISD::UZP1, DL, VT, OpLHS, OpRHS);
14673 case OP_VUZPR:
14674 return DAG.getNode(AArch64ISD::UZP2, DL, VT, OpLHS, OpRHS);
14675 case OP_VZIPL:
14676 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, OpLHS, OpRHS);
14677 case OP_VZIPR:
14678 return DAG.getNode(AArch64ISD::ZIP2, DL, VT, OpLHS, OpRHS);
14679 case OP_VTRNL:
14680 return DAG.getNode(AArch64ISD::TRN1, DL, VT, OpLHS, OpRHS);
14681 case OP_VTRNR:
14682 return DAG.getNode(AArch64ISD::TRN2, DL, VT, OpLHS, OpRHS);
14683 }
14684}
14685
                                   SelectionDAG &DAG) {
  // Check to see if we can use the TBL instruction.
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc DL(Op);

  // TBL selects individual bytes, so every shuffle-mask element expands into
  // BytesPerElt consecutive byte indices below.
  EVT EltVT = Op.getValueType().getVectorElementType();
  unsigned BytesPerElt = EltVT.getSizeInBits() / 8;

  // Canonicalise an undef/all-zero first source into the second slot so the
  // interesting data sits in V1; Swap records that the byte indices must be
  // re-based when the mask is built.
  bool Swap = false;
  if (V1.isUndef() || isZerosVector(V1.getNode())) {
    std::swap(V1, V2);
    Swap = true;
  }

  // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
  // out of range values with 0s. We do need to make sure that any out-of-range
  // values are really out-of-range for a v16i8 vector.
  bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
  MVT IndexVT = MVT::v8i8;
  unsigned IndexLen = 8;
  if (Op.getValueSizeInBits() == 128) {
    IndexVT = MVT::v16i8;
    IndexLen = 16;
  }

  // Expand the element-granular shuffle mask into a byte-granular TBL mask.
  // 255 is guaranteed out-of-range for a v16i8 table, so those lanes read 0.
  for (int Val : ShuffleMask) {
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
      unsigned Offset = Byte + Val * BytesPerElt;
      // Re-base the index if the two sources were swapped above.
      if (Swap)
        Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
      if (IsUndefOrZero && Offset >= IndexLen)
        Offset = 255;
      TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
    }
  }

  SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
  SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);

  SDValue Shuffle;
  if (IsUndefOrZero) {
    // Single-source lookup; for the 64-bit case double V1 up so the table
    // register is a full 128 bits wide.
    if (IndexLen == 8)
      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
    Shuffle = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
        DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
        V1Cst,
        DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
  } else {
    if (IndexLen == 8) {
      // Two 64-bit sources fit into one 128-bit table register, so a single
      // TBL1 suffices.
      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
      Shuffle = DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
          DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
          V1Cst,
          DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
    } else {
      // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
      // cannot currently represent the register constraints on the input
      // table registers.
      //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
      //                        DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
      //                        IndexLen));
      Shuffle = DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
          DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32),
          V1Cst, V2Cst,
          DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
    }
  }
  // The lookup was performed on byte vectors; cast back to the shuffle's type.
  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
}
14761
14762static unsigned getDUPLANEOp(EVT EltType) {
14763 if (EltType == MVT::i8)
14764 return AArch64ISD::DUPLANE8;
14765 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
14766 return AArch64ISD::DUPLANE16;
14767 if (EltType == MVT::i32 || EltType == MVT::f32)
14768 return AArch64ISD::DUPLANE32;
14769 if (EltType == MVT::i64 || EltType == MVT::f64)
14770 return AArch64ISD::DUPLANE64;
14771
14772 llvm_unreachable("Invalid vector element type?");
14773}
14774
// Build a DUPLANE-style splat of lane \p Lane of \p V with node kind
// \p Opcode, first simplifying V so the DUP can read straight from a 128-bit
// source register (looking through bitcast-of-extract_subvector,
// extract_subvector and concat_vectors).
static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
                            unsigned Opcode, SelectionDAG &DAG) {
  // Try to eliminate a bitcasted extract subvector before a DUPLANE.
  auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
    // Match: dup (bitcast (extract_subv X, C)), LaneC
    if (BitCast.getOpcode() != ISD::BITCAST ||
      return false;

    // The extract index must align in the destination type. That may not
    // happen if the bitcast is from narrow to wide type.
    SDValue Extract = BitCast.getOperand(0);
    unsigned ExtIdx = Extract.getConstantOperandVal(1);
    unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
    unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
    unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
    if (ExtIdxInBits % CastedEltBitWidth != 0)
      return false;

    // Can't handle cases where vector size is not 128-bit
    if (!Extract.getOperand(0).getValueType().is128BitVector())
      return false;

    // Update the lane value by offsetting with the scaled extract index.
    LaneC += ExtIdxInBits / CastedEltBitWidth;

    // Determine the casted vector type of the wide vector input.
    // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
    // Examples:
    //  dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
    //  dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
    unsigned SrcVecNumElts =
        Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
                              SrcVecNumElts);
    return true;
  };
  MVT CastVT;
  if (getScaledOffsetDup(V, Lane, CastVT)) {
    V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
  } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
             V.getOperand(0).getValueType().is128BitVector()) {
    // The lane is incremented by the index of the extract.
    // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
    Lane += V.getConstantOperandVal(1);
    V = V.getOperand(0);
  } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
    // The lane is decremented if we are splatting from the 2nd operand.
    // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
    unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
    Lane -= Idx * VT.getVectorNumElements() / 2;
    V = WidenVector(V.getOperand(Idx), DAG);
  } else if (VT.getSizeInBits() == 64) {
    // Widen the operand to 128-bit register with undef.
    V = WidenVector(V, DAG);
  }
  return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
}
14833
// Try to widen element type to get a new mask value for a better permutation
// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
// UZP1/2, TRN1/2, REV, INS, etc.
// For example:
//  shufflevector <4 x i32> %a, <4 x i32> %b,
//                <4 x i32> <i32 6, i32 7, i32 2, i32 3>
// is equivalent to:
//  shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
// Finally, we can get:
//  mov v0.d[0], v1.d[1]
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  EVT ScalarVT = VT.getVectorElementType();
  unsigned ElementSize = ScalarVT.getFixedSizeInBits();
  SDValue V0 = Op.getOperand(0);
  SDValue V1 = Op.getOperand(1);
  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();

  // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
  // We need to make sure the wider element type is legal. Thus, ElementSize
  // should be not larger than 32 bits, and i1 type should also be excluded.
  if (ElementSize > 32 || ElementSize == 1)
    return SDValue();

  SmallVector<int, 8> NewMask;
  if (widenShuffleMaskElts(Mask, NewMask)) {
    // Pair adjacent lanes into a double-width element: floats widen to the
    // next float type, everything else to the next integer type.
    MVT NewEltVT = VT.isFloatingPoint()
                       ? MVT::getFloatingPointVT(ElementSize * 2)
                       : MVT::getIntegerVT(ElementSize * 2);
    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
    // Only profitable when the widened type is legal; re-lower the shuffle in
    // the wide type and bitcast the result back.
    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
      V0 = DAG.getBitcast(NewVT, V0);
      V1 = DAG.getBitcast(NewVT, V1);
      return DAG.getBitcast(VT,
                            DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
    }
  }

  return SDValue();
}
14875
// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
                                        ArrayRef<int> ShuffleMask,
                                        SelectionDAG &DAG) {
  SDValue Tbl1 = Op->getOperand(0);
  SDValue Tbl2 = Op->getOperand(1);
  SDLoc DL(Op);
  SDValue Tbl2ID =
      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i64);

  EVT VT = Op.getValueType();
  // Both shuffle operands must themselves be aarch64.neon.tbl2 intrinsic
  // calls.
  if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
      Tbl1.getOperand(0) != Tbl2ID ||
      Tbl2.getOperand(0) != Tbl2ID)
    return SDValue();

  if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
    return SDValue();

  // Operand 3 of each tbl2 call is its byte-index mask; both must be
  // build_vectors so a combined tbl4 mask can be computed below.
  SDValue Mask1 = Tbl1.getOperand(3);
  SDValue Mask2 = Tbl2.getOperand(3);
  if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
      Mask2.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // Merge the two masks: lanes selected from the first tbl2 keep that mask's
  // index; lanes from the second are rebased by +32 to skip past the first
  // tbl2's two table registers.
  SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
  for (unsigned I = 0; I < 16; I++) {
    if (ShuffleMask[I] < 16)
      TBLMaskParts[I] = Mask1.getOperand(ShuffleMask[I]);
    else {
      auto *C = dyn_cast<ConstantSDNode>(Mask2.getOperand(ShuffleMask[I] - 16));
      if (!C)
        return SDValue();
      TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, DL, MVT::i32);
    }
  }

  SDValue TBLMask = DAG.getBuildVector(VT, DL, TBLMaskParts);
  SDValue ID =
      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, DL, MVT::i64);

  // tbl4 takes all four table registers (two from each original tbl2) plus
  // the merged mask.
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
                     {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
                      Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
}
14922
14923SDValue
14924AArch64TargetLowering::LowerEXTEND_VECTOR_INREG(SDValue Op,
14925 SelectionDAG &DAG) const {
14926 SDLoc DL(Op);
14927 EVT VT = Op.getValueType();
14928 assert(VT.isScalableVector() && "Unexpected result type!");
14929
14930 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG;
14931 unsigned UnpackOpcode = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14932
14933 // Repeatedly unpack Val until the result is of the desired type.
14934 SDValue Val = Op.getOperand(0);
14935 switch (Val.getSimpleValueType().SimpleTy) {
14936 default:
14937 return SDValue();
14938 case MVT::nxv16i8:
14939 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv8i16, Val);
14940 if (VT == MVT::nxv8i16)
14941 break;
14942 [[fallthrough]];
14943 case MVT::nxv8i16:
14944 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv4i32, Val);
14945 if (VT == MVT::nxv4i32)
14946 break;
14947 [[fallthrough]];
14948 case MVT::nxv4i32:
14949 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv2i64, Val);
14950 assert(VT == MVT::nxv2i64 && "Unexpected result type!");
14951 break;
14952 }
14953
14954 return Val;
14955}
14956
14957// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
14958// but we don't have an appropriate instruction,
14959// so custom-lower it as ZIP1-with-zeros.
14960SDValue
14961AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
14962 SelectionDAG &DAG) const {
14963 SDLoc DL(Op);
14964 EVT VT = Op.getValueType();
14965
14966 if (VT.isScalableVector())
14967 return LowerEXTEND_VECTOR_INREG(Op, DAG);
14968
14969 SDValue SrcOp = Op.getOperand(0);
14970 EVT SrcVT = SrcOp.getValueType();
14971 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
14972 "Unexpected extension factor.");
14973 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
14974 // FIXME: support multi-step zipping?
14975 if (Scale != 2)
14976 return SDValue();
14977 SDValue Zeros = DAG.getConstant(0, DL, SrcVT);
14978 return DAG.getBitcast(VT,
14979 DAG.getNode(AArch64ISD::ZIP1, DL, SrcVT, SrcOp, Zeros));
14980}
14981
// Lower VECTOR_SHUFFLE for NEON: match the mask against the dedicated AArch64
// permute nodes (DUP/REV/EXT/ZIP/UZP/TRN/INS), then the perfect-shuffle table
// and BSL selects, finally falling back to a generic TBL lookup.
SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());

  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
    return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);

  // Convert shuffles that are directly supported on NEON to target-specific
  // DAG nodes, instead of keeping them as shuffles and matching them again
  // during code selection. This is more efficient and avoids the possibility
  // of inconsistencies between legalization and selection.
  ArrayRef<int> ShuffleMask = SVN->getMask();

  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);

  assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
  assert(ShuffleMask.size() == VT.getVectorNumElements() &&
         "Unexpected VECTOR_SHUFFLE mask size!");

  // A shuffle of two tbl2 results may fold into one tbl4.
  if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
    return Res;

  if (SVN->isSplat()) {
    int Lane = SVN->getSplatIndex();
    // If this is undef splat, generate it via "just" vdup, if possible.
    if (Lane == -1)
      Lane = 0;

    // Splatting lane 0 of a scalar_to_vector is just a DUP of the scalar.
    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
      return DAG.getNode(AArch64ISD::DUP, DL, V1.getValueType(),
                         V1.getOperand(0));
    // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
    // constant. If so, we can just reference the lane's definition directly.
    if (V1.getOpcode() == ISD::BUILD_VECTOR &&
      return DAG.getNode(AArch64ISD::DUP, DL, VT, V1.getOperand(Lane));

    // Otherwise, duplicate from the lane of the input vector.
    unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
    return constructDup(V1, Lane, DL, VT, Opcode, DAG);
  }

  // Check if the mask matches a DUP for a wider element
  for (unsigned LaneSize : {64U, 32U, 16U}) {
    unsigned Lane = 0;
    if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
      unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
                        : LaneSize == 32 ? AArch64ISD::DUPLANE32
                                         : AArch64ISD::DUPLANE16;
      // Cast V1 to an integer vector with required lane size
      MVT NewEltTy = MVT::getIntegerVT(LaneSize);
      unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
      MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
      V1 = DAG.getBitcast(NewVecTy, V1);
      // Construct the DUP instruction
      V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
      // Cast back to the original type
      return DAG.getBitcast(VT, V1);
    }
  }

  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSize = VT.getScalarSizeInBits();
  // Single-source element reversals within 64/32/16-bit chunks map to REVn.
  if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
    return DAG.getNode(AArch64ISD::REV64, DL, V1.getValueType(), V1);
  if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
    return DAG.getNode(AArch64ISD::REV32, DL, V1.getValueType(), V1);
  if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
    return DAG.getNode(AArch64ISD::REV16, DL, V1.getValueType(), V1);

  // Full 128-bit vector reversal: REV64 the halves then swap them with EXT #8.
  if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
      ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
    SDValue Rev = DAG.getNode(AArch64ISD::REV64, DL, VT, V1);
    return DAG.getNode(AArch64ISD::EXT, DL, VT, Rev, Rev,
                       DAG.getConstant(8, DL, MVT::i32));
  }

  // Sliding-window extracts become EXT; Imm is scaled to a byte offset.
  bool ReverseEXT = false;
  unsigned Imm;
  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
    if (ReverseEXT)
      std::swap(V1, V2);
    Imm *= getExtFactor(V1);
    return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V2,
                       DAG.getConstant(Imm, DL, MVT::i32));
  } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
    Imm *= getExtFactor(V1);
    return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V1,
                       DAG.getConstant(Imm, DL, MVT::i32));
  }

  // Interleave-style masks: WhichResult picks the 1 vs 2 variant, and
  // OperandOrder (where supported) whether the sources are swapped.
  unsigned WhichResult;
  unsigned OperandOrder;
  if (isZIPMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
    return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
                       OperandOrder == 0 ? V2 : V1);
  }
  if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
    return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
  }
  if (isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
                       OperandOrder == 0 ? V2 : V1);
  }

  // Same patterns but with a single (repeated) source operand.
  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
    return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
  }
  if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
    return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
  }
  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
  }

    return Concat;

  // A mask that is an identity except for one element becomes an INS
  // (insert lane); Anomaly is the index of the odd element out.
  bool DstIsLeft;
  int Anomaly;
  int NumInputElements = V1.getValueType().getVectorNumElements();
  if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
    SDValue DstVec = DstIsLeft ? V1 : V2;
    SDValue DstLaneV = DAG.getConstant(Anomaly, DL, MVT::i64);

    SDValue SrcVec = V1;
    int SrcLane = ShuffleMask[Anomaly];
    if (SrcLane >= NumInputElements) {
      SrcVec = V2;
      SrcLane -= NumElts;
    }
    SDValue SrcLaneV = DAG.getConstant(SrcLane, DL, MVT::i64);

    EVT ScalarVT = VT.getVectorElementType();

    // Sub-32-bit integer lanes are extracted/inserted through i32.
    if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
      ScalarVT = MVT::i32;

    return DAG.getNode(
        ISD::INSERT_VECTOR_ELT, DL, VT, DstVec,
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SrcVec, SrcLaneV),
        DstLaneV);
  }

  if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
    return NewSD;

  // If the shuffle is not directly supported and it has 4 elements, use
  // the PerfectShuffle-generated table to synthesize it from other shuffles.
  if (NumElts == 4) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (ShuffleMask[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = ShuffleMask[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
                            PFIndexes[2] * 9 + PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
                                  DL);
  }

  // Check for a "select shuffle", generating a BSL to pick between lanes in
  // V1/V2.
  if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
    assert(VT.getScalarSizeInBits() <= 32 &&
           "Expected larger vector element sizes to be handled already");
    SmallVector<SDValue> MaskElts;
    for (int M : ShuffleMask)
      MaskElts.push_back(DAG.getConstant(
          M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, MVT::i32));
    EVT IVT = VT.changeVectorElementTypeToInteger();
    SDValue MaskConst = DAG.getBuildVector(IVT, DL, MaskElts);
    return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, DL, IVT, MaskConst,
                                          DAG.getBitcast(IVT, V1),
                                          DAG.getBitcast(IVT, V2)));
  }

  // Fall back to generating a TBL
  return GenerateTBL(Op, ShuffleMask, DAG);
}
15177
15178SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
15179 SelectionDAG &DAG) const {
15180 EVT VT = Op.getValueType();
15181
15182 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15183 return LowerToScalableOp(Op, DAG);
15184
15185 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
15186 "Unexpected vector type!");
15187
15188 // We can handle the constant cases during isel.
15189 if (isa<ConstantSDNode>(Op.getOperand(0)))
15190 return Op;
15191
15192 // There isn't a natural way to handle the general i1 case, so we use some
15193 // trickery with whilelo.
15194 SDLoc DL(Op);
15195 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
15196 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
15197 DAG.getValueType(MVT::i1));
15198 SDValue ID =
15199 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
15200 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
15201 if (VT == MVT::nxv1i1)
15202 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
15203 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
15204 Zero, SplatVal),
15205 Zero);
15206 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
15207}
15208
// Lower the SVE dupq-lane operation: broadcast one 128-bit quadword of the
// source across the whole scalable vector. Uses DUPLANE128 when the index is
// a small constant, otherwise an equivalent TBL-based sequence.
SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc DL(Op);

  EVT VT = Op.getValueType();
  if (!isTypeLegal(VT) || !VT.isScalableVector())
    return SDValue();

  // Current lowering only supports the SVE-ACLE types.
    return SDValue();

  // The DUPQ operation is independent of element type so normalise to i64s.
  SDValue Idx128 = Op.getOperand(2);

  // DUPQ can be used when idx is in range.
  auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
  if (CIdx && (CIdx->getZExtValue() <= 3)) {
    SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
    return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
  }

  SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));

  // The ACLE says this must produce the same result as:
  //   svtbl(data, svadd_x(svptrue_b64(),
  //                       svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
  //                       index * 2))
  SDValue One = DAG.getConstant(1, DL, MVT::i64);
  SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);

  // create the vector 0,1,0,1,...
  SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
  SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);

  // create the vector idx64,idx64+1,idx64,idx64+1,...
  // (each 128-bit quadword spans two i64 lanes, hence the doubled index)
  SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
  SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
  SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);

  // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
  SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
  return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
}
15253
15254
15255static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
15256 APInt &UndefBits) {
15257 EVT VT = BVN->getValueType(0);
15258 APInt SplatBits, SplatUndef;
15259 unsigned SplatBitSize;
15260 bool HasAnyUndefs;
15261 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
15262 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
15263
15264 for (unsigned i = 0; i < NumSplats; ++i) {
15265 CnstBits <<= SplatBitSize;
15266 UndefBits <<= SplatBitSize;
15267 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
15268 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
15269 }
15270
15271 return true;
15272 }
15273
15274 return false;
15275}
15276
// Try 64-bit splatted SIMD immediate.
// Returns an NVCAST-wrapped move node when the splatted value fits the
// 64-bit modified-immediate encoding, otherwise an empty SDValue.
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits) {
  // Only values that repeat identically in every 64-bit chunk are encodable.
  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
    EVT VT = Op.getValueType();
    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;


      SDLoc DL(Op);
      SDValue Mov =
          DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
      // NVCAST reinterprets the move's result as the original vector type.
      return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
    }
  }

  return SDValue();
}
15297
// Try 32-bit splatted SIMD immediate.
// When LHS is non-null the emitted node is a two-operand op that combines
// *LHS with the immediate (presumably a vector ORR/BIC-by-immediate — the
// exact op is chosen by the caller via NewOp); otherwise it is a plain move.
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits,
                                  const SDValue *LHS = nullptr) {
  EVT VT = Op.getValueType();
  if (VT.isFixedLengthVector() &&
    return SDValue();

  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
    bool isAdvSIMDModImm = false;
    uint64_t Shift;

    // The matched modified-immediate type fixes the LSL amount (0/8/16/24)
    // applied to the encoded byte.
    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
      Shift = 0;
    }
    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
      Shift = 8;
    }
    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
      Shift = 16;
    }
    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
      Shift = 24;
    }

    if (isAdvSIMDModImm) {
      SDLoc DL(Op);
      SDValue Mov;

      if (LHS)
        Mov = DAG.getNode(NewOp, DL, MovTy,
                          DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
                          DAG.getConstant(Value, DL, MVT::i32),
                          DAG.getConstant(Shift, DL, MVT::i32));
      else
        Mov =
            DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
                        DAG.getConstant(Shift, DL, MVT::i32));

      return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
    }
  }

  return SDValue();
}
15350
// Try 16-bit splatted SIMD immediate.
// Mirrors tryAdvSIMDModImm32 but for 16-bit lanes: the two matched
// modified-immediate types select an LSL of 0 or 8.
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits,
                                  const SDValue *LHS = nullptr) {
  EVT VT = Op.getValueType();
  if (VT.isFixedLengthVector() &&
    return SDValue();

  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
    bool isAdvSIMDModImm = false;
    uint64_t Shift;

    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
      Shift = 0;
    }
    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
      Shift = 8;
    }

    if (isAdvSIMDModImm) {
      SDLoc DL(Op);
      SDValue Mov;

      // With LHS present, emit a two-operand op that modifies *LHS in MovTy.
      if (LHS)
        Mov = DAG.getNode(NewOp, DL, MovTy,
                          DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
                          DAG.getConstant(Value, DL, MVT::i32),
                          DAG.getConstant(Shift, DL, MVT::i32));
      else
        Mov =
            DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
                        DAG.getConstant(Shift, DL, MVT::i32));

      return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
    }
  }

  return SDValue();
}
15395
// Try 32-bit splatted SIMD immediate with shifted ones.
                                    SelectionDAG &DAG, const APInt &Bits) {
  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
    EVT VT = Op.getValueType();
    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
    bool isAdvSIMDModImm = false;
    uint64_t Shift;

    // NOTE(review): 264/272 are 256+8 and 256+16; the extra 256 bit
    // presumably marks the MSL ("shifted ones") form with shift #8/#16 —
    // confirm against AArch64AddressingModes.h.
    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
      Shift = 264;
    }
    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
      Shift = 272;
    }

    if (isAdvSIMDModImm) {
      SDLoc DL(Op);
      SDValue Mov =
          DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
                      DAG.getConstant(Shift, DL, MVT::i32));
      return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
    }
  }

  return SDValue();
}
15426
// Try 8-bit splatted SIMD immediate.
// Returns an NVCAST-wrapped byte-vector move when the value fits the 8-bit
// modified-immediate encoding, otherwise an empty SDValue.
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                 const APInt &Bits) {
  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
    EVT VT = Op.getValueType();
    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;


      SDLoc DL(Op);
      SDValue Mov =
          DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
      return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
    }
  }

  return SDValue();
}
15447
// Try FP splatted SIMD immediate.
// Type11/Type12 presumably correspond to the f32 and f64 FMOV-immediate
// encodings respectively; the f64 form only exists for 128-bit vectors.
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits) {
  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
    EVT VT = Op.getValueType();
    bool isWide = (VT.getSizeInBits() == 128);
    MVT MovTy;
    bool isAdvSIMDModImm = false;

    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
      MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
    }
    else if (isWide &&
             (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
      MovTy = MVT::v2f64;
    }

    if (isAdvSIMDModImm) {
      SDLoc DL(Op);
      SDValue Mov =
          DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
      return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
    }
  }

  return SDValue();
}
15478
// Specialized code to quickly find if PotentialBVec is a BuildVector that
// consists of only the same constant int value, returned in reference arg
// ConstVal
static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
                                     uint64_t &ConstVal) {
  BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
  if (!Bvec)
    return false;
  if (!FirstElt)
    return false;
  EVT VT = Bvec->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  // NOTE(review): this compares node pointers, which presumably relies on
  // constant SDNodes being uniqued so equal values share one node — confirm.
  for (unsigned i = 1; i < NumElts; ++i)
    if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
      return false;
  ConstVal = FirstElt->getZExtValue();
  return true;
}
15498
  // Look through cast.
  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
    N = N.getOperand(0);

  // The predicate is all-inactive iff the underlying value is a constant
  // all-zeros splat.
  return ISD::isConstantSplatVectorAllZeros(N.getNode());
}
15506
  unsigned NumElts = N.getValueType().getVectorMinNumElements();

  // Look through cast.
  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
    N = N.getOperand(0);
    // When reinterpreting from a type with fewer elements the "new" elements
    // are not active, so bail if they're likely to be used.
    if (N.getValueType().getVectorMinNumElements() < NumElts)
      return false;
  }

  // A constant all-ones splat is trivially all active.
  if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
    return true;

  // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
  // or smaller than the implicit element type represented by N.
  // NOTE: A larger element count implies a smaller element type.
  if (N.getOpcode() == AArch64ISD::PTRUE &&
      N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
    return N.getValueType().getVectorMinNumElements() >= NumElts;

  return false;
}
15531
15532// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
15533// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
15534// BUILD_VECTOR with constant element C1, C2 is a constant, and:
15535// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
15536// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
15537// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
// NOTE(review): the function signature line and two interior lines (the
// splat-constant extraction for the predicated shift amount, and the
// dyn_cast of the unpredicated shift amount) are elided in this excerpt.
15539 EVT VT = N->getValueType(0);
15540
15541 if (!VT.isVector())
15542 return SDValue();
15543
15544 SDLoc DL(N);
15545
15546 SDValue And;
15547 SDValue Shift;
15548
15549 SDValue FirstOp = N->getOperand(0);
15550 unsigned FirstOpc = FirstOp.getOpcode();
15551 SDValue SecondOp = N->getOperand(1);
15552 unsigned SecondOpc = SecondOp.getOpcode();
15553
15554 // Is one of the operands an AND or a BICi? The AND may have been optimised to
15555 // a BICi in order to use an immediate instead of a register.
15556 // Is the other operand an shl or lshr? This will have been turned into:
15557 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
15558 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
15559 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
15560 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
15561 SecondOpc == AArch64ISD::SHL_PRED ||
15562 SecondOpc == AArch64ISD::SRL_PRED)) {
15563 And = FirstOp;
15564 Shift = SecondOp;
15565
15566 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
15567 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
15568 FirstOpc == AArch64ISD::SHL_PRED ||
15569 FirstOpc == AArch64ISD::SRL_PRED)) {
15570 And = SecondOp;
15571 Shift = FirstOp;
15572 } else
15573 return SDValue();
15574
15575 bool IsAnd = And.getOpcode() == ISD::AND;
15576 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
15577 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15578 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
15579 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15580
15581 // Is the shift amount constant and are all lanes active?
15582 uint64_t C2;
15583 if (ShiftHasPredOp) {
// Predicated shifts only qualify when the governing predicate covers
// every lane; otherwise inactive lanes would not match the S[LR]I form.
15584 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
15585 return SDValue();
15586 APInt C;
15588 return SDValue();
15589 C2 = C.getZExtValue();
15590 } else if (ConstantSDNode *C2node =
15592 C2 = C2node->getZExtValue();
15593 else
15594 return SDValue();
15595
15596 APInt C1AsAPInt;
15597 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
15598 if (IsAnd) {
15599 // Is the and mask vector all constant?
15600 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
15601 return SDValue();
15602 } else {
15603 // Reconstruct the corresponding AND immediate from the two BICi immediates.
15604 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
15605 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
15606 assert(C1nodeImm && C1nodeShift);
15607 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
15608 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
15609 }
15610
15611 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
15612 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
15613 // how much one can shift elements of a particular size?
15614 if (C2 > ElemSizeInBits)
15615 return SDValue();
15616
// The AND mask must keep exactly the bits the shifted value cannot touch.
15617 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
15618 : APInt::getLowBitsSet(ElemSizeInBits, C2);
15619 if (C1AsAPInt != RequiredC1)
15620 return SDValue();
15621
15622 SDValue X = And.getOperand(0);
15623 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
15624 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
15625 : Shift.getOperand(1);
15626
15627 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
15628 return DAG.getNode(Inst, DL, VT, X, Y, Imm);
15629}
15630
// NOTE(review): the signature line and a few interior lines (the second
// splat check, the BuildVectorSDNode casts, and the per-element
// ConstantSDNode casts) are elided in this excerpt. The visible body tries
// to rewrite (or (and ...) (and ...)) into an AArch64ISD::BSP (bitwise
// select) when one AND mask is the complement of the other.
15632 EVT VT = N->getValueType(0);
15633 assert(VT.isVector() && "Expected vector type in tryLowerToBSL\n");
15634 SDLoc DL(N);
15635 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15636
// Scalable-vector BSL requires the SVE2 bitwise-select instructions.
15637 if (VT.isScalableVector() && !Subtarget.hasSVE2())
15638 return SDValue();
15639
15640 SDValue N0 = N->getOperand(0);
15641 if (N0.getOpcode() != ISD::AND)
15642 return SDValue();
15643
15644 SDValue N1 = N->getOperand(1);
15645 if (N1.getOpcode() != ISD::AND)
15646 return SDValue();
15647
15648 // InstCombine does (not (neg a)) => (add a -1).
15649 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
15650 // Loop over all combinations of AND operands.
15651 for (int i = 1; i >= 0; --i) {
15652 for (int j = 1; j >= 0; --j) {
15653 SDValue O0 = N0->getOperand(i);
15654 SDValue O1 = N1->getOperand(j);
15655 SDValue Sub, Add, SubSibling, AddSibling;
15656
15657 // Find a SUB and an ADD operand, one from each AND.
15658 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
15659 Sub = O0;
15660 Add = O1;
15661 SubSibling = N0->getOperand(1 - i);
15662 AddSibling = N1->getOperand(1 - j);
15663 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
15664 Add = O0;
15665 Sub = O1;
15666 AddSibling = N0->getOperand(1 - i);
15667 SubSibling = N1->getOperand(1 - j);
15668 } else
15669 continue;
15670
// The SUB must be a negation, i.e. (sub 0, a).
15671 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
15672 continue;
15673
15674 // Constant ones is always righthand operand of the Add.
15675 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
15676 continue;
15677
// Both patterns must be built from the same value 'a'.
15678 if (Sub.getOperand(1) != Add.getOperand(0))
15679 continue;
15680
15681 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
15682 }
15683 }
15684
15685 // (or (and a b) (and (not a) c)) => (bsl a b c)
15686 // We only have to look for constant vectors here since the general, variable
15687 // case can be handled in TableGen.
15688 unsigned Bits = VT.getScalarSizeInBits();
15689 for (int i = 1; i >= 0; --i)
15690 for (int j = 1; j >= 0; --j) {
15691 APInt Val1, Val2;
15692
// Splat masks: accept when one mask is the bitwise complement of the other.
15693 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
15695 ~Val1.trunc(Bits) == Val2.trunc(Bits)) {
15696 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15697 N0->getOperand(1 - i), N1->getOperand(1 - j));
15698 }
15701 if (!BVN0 || !BVN1)
15702 continue;
15703
// Non-splat build vectors: verify complement lane by lane.
15704 bool FoundMatch = true;
15705 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
15708 if (!CN0 || !CN1 ||
15709 CN0->getAPIntValue().trunc(Bits) !=
15710 ~CN1->getAsAPIntVal().trunc(Bits)) {
15711 FoundMatch = false;
15712 break;
15713 }
15714 }
15715 if (FoundMatch)
15716 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15717 N0->getOperand(1 - i), N1->getOperand(1 - j));
15718 }
15719
15720 return SDValue();
15721}
15722
15723SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
15724 SelectionDAG &DAG) const {
15725 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15726 !Subtarget->isNeonAvailable()))
15727 return LowerToScalableOp(Op, DAG);
15728
15729 if (SDValue Res = tryLowerToBSL(Op, DAG))
15730 return Res;
15731
15732 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
15733 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
15734 return Res;
15735
15736 EVT VT = Op.getValueType();
15737 if (VT.isScalableVector())
15738 return Op;
15739
15740 SDValue LHS = Op.getOperand(0);
15741 BuildVectorSDNode *BVN =
15742 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
15743 if (!BVN) {
15744 // OR commutes, so try swapping the operands.
15745 LHS = Op.getOperand(1);
15746 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
15747 }
15748 if (!BVN)
15749 return Op;
15750
15751 APInt DefBits(VT.getSizeInBits(), 0);
15752 APInt UndefBits(VT.getSizeInBits(), 0);
15753 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15754 SDValue NewOp;
15755
15756 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15757 DefBits, &LHS)) ||
15758 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15759 DefBits, &LHS)))
15760 return NewOp;
15761
15762 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15763 UndefBits, &LHS)) ||
15764 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15765 UndefBits, &LHS)))
15766 return NewOp;
15767 }
15768
15769 // We can always fall back to a non-immediate OR.
15770 return Op;
15771}
15772
15773// Normalize the operands of BUILD_VECTOR. The value of constant operands will
15774// be truncated to fit element width.
// NOTE(review): the first signature line and the declaration of the local
// operand vector "Ops" are elided in this excerpt.
15776 SelectionDAG &DAG) {
15777 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
15778 SDLoc DL(Op);
15779 EVT VT = Op.getValueType();
15780 EVT EltTy= VT.getVectorElementType();
15781
// Only sub-32-bit integer elements need normalization; wider or FP
// elements are returned unchanged.
15782 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
15783 return Op;
15784
15786 for (SDValue Lane : Op->ops()) {
15787 // For integer vectors, type legalization would have promoted the
15788 // operands already. Otherwise, if Op is a floating-point splat
15789 // (with operands cast to integers), then the only possibilities
15790 // are constants and UNDEFs.
15791 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
// Truncate the constant to the element width, then rebuild it as i32.
15792 Lane = DAG.getConstant(
15793 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
15794 DL, MVT::i32);
15795 } else if (Lane.getOpcode() == ISD::POISON) {
15796 Lane = DAG.getPOISON(MVT::i32);
15797 } else if (Lane.getOpcode() == ISD::UNDEF) {
15798 Lane = DAG.getUNDEF(MVT::i32);
15799 } else {
15800 assert(Lane.getValueType() == MVT::i32 &&
15801 "Unexpected BUILD_VECTOR operand type");
15802 }
15803 Ops.push_back(Lane);
15804 }
15805 return DAG.getBuildVector(VT, DL, Ops);
15806}
15807
// NOTE(review): the first signature line is elided in this excerpt. The body
// tries to materialise a 128-bit NEON constant whose two 64-bit halves are
// equal by splatting the 64-bit value with an SVE DUP/DUPM-style immediate.
15809 const AArch64Subtarget *ST, APInt &DefBits) {
15810 EVT VT = Op.getValueType();
15811 // TODO: We should be able to support 64-bit destinations too
15812 if (!ST->hasSVE() || !VT.is128BitVector() ||
15813 DefBits.getHiBits(64) != DefBits.getLoBits(64))
15814 return SDValue();
15815
15816 // See if we can make use of the SVE dup instruction.
15817 APInt Val64 = DefBits.trunc(64);
15818 int32_t ImmVal, ShiftVal;
15819 uint64_t Encoding;
// Bail unless the value is encodable as either a CPY/DUP immediate or an
// SVE logical (DUPM) immediate.
15820 if (!AArch64_AM::isSVECpyDupImm(64, Val64.getSExtValue(), ImmVal, ShiftVal) &&
15821 !AArch64_AM::isSVELogicalImm(64, Val64.getZExtValue(), Encoding))
15822 return SDValue();
15823
// Splat in an SVE register, then extract the fixed 128-bit part and cast
// back to the requested NEON type.
15824 SDLoc DL(Op);
15825 SDValue SplatVal = DAG.getSplatVector(MVT::nxv2i64, DL,
15826 DAG.getConstant(Val64, DL, MVT::i64));
15827 SDValue Res = convertFromScalableVector(DAG, MVT::v2i64, SplatVal);
15828 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Res);
15829}
15830
// NOTE(review): the first signature line and the cast of Op to
// BuildVectorSDNode are elided in this excerpt. The body attempts to
// materialise a constant NEON vector via MOVI/MVNI/FMOV immediates, an SVE
// splat, or an FNEG-of-MOVI trick, before falling back to SDValue().
15832 const AArch64Subtarget *ST) {
15833 EVT VT = Op.getValueType();
15834 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
15835 "Expected a legal NEON vector");
15836
15837 APInt DefBits(VT.getSizeInBits(), 0);
15838 APInt UndefBits(VT.getSizeInBits(), 0);
15840 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
// Try every MOVI/FMOV immediate encoding, widest first, then the MVNI
// (inverted) encodings.
15841 auto TryMOVIWithBits = [&](APInt DefBits) {
15842 SDValue NewOp;
15843 if ((NewOp =
15844 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
15845 (NewOp =
15846 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15847 (NewOp =
15848 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
15849 (NewOp =
15850 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15851 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
15852 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
15853 return NewOp;
15854
15855 APInt NotDefBits = ~DefBits;
15856 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
15857 NotDefBits)) ||
15858 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
15859 NotDefBits)) ||
15860 (NewOp =
15861 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
15862 return NewOp;
15863 return SDValue();
15864 };
15865 if (SDValue R = TryMOVIWithBits(DefBits))
15866 return R;
15867 if (SDValue R = TryMOVIWithBits(UndefBits))
15868 return R;
15869
15870 // Try to materialise the constant using SVE when available.
15871 if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits))
15872 return R;
15873
15874 // See if a fneg of the constant can be materialized with a MOVI, etc
15875 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
15876 // FNegate each sub-element of the constant
15877 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
// Build a mask with just the sign bit of every FVT-sized sub-element set.
15878 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
15879 .zext(VT.getSizeInBits());
15880 APInt NegBits(VT.getSizeInBits(), 0);
15881 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
15882 for (unsigned i = 0; i < NumElts; i++)
15883 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
15884 NegBits = DefBits ^ NegBits;
15885
15886 // Try to create the new constants with MOVI, and if so generate a fneg
15887 // for it.
15888 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
15889 SDLoc DL(Op);
15890 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
15891 return DAG.getNode(
15892 AArch64ISD::NVCAST, DL, VT,
15893 DAG.getNode(ISD::FNEG, DL, VFVT,
15894 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
15895 }
15896 return SDValue();
15897 };
15898 SDValue R;
15899 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
15900 (R = TryWithFNeg(DefBits, MVT::f64)) ||
15901 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
15902 return R;
15903 }
15904
15905 return SDValue();
15906}
15907
// Lower a fixed-length BUILD_VECTOR through SVE: constant sequences become
// step vectors; otherwise small power-of-two vectors are assembled with a
// tree of ZIP1 interleaves.
// NOTE(review): the declaration of the "Intermediates" vector (built from
// the mapped insert-element nodes) is elided in this excerpt.
15908SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
15909 SDValue Op, SelectionDAG &DAG) const {
15910 EVT VT = Op.getValueType();
15911 SDLoc DL(Op);
15912 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
15913 auto *BVN = cast<BuildVectorSDNode>(Op);
15914
// A constant arithmetic sequence lowers to start + INDEX (step vector).
15915 if (auto SeqInfo = BVN->isConstantSequence()) {
15916 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
15917 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
15918 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
15919 return convertFromScalableVector(DAG, VT, Seq);
15920 }
15921
15922 unsigned NumElems = VT.getVectorNumElements();
15923 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
15924 NumElems <= 1 || BVN->isConstant())
15925 return SDValue();
15926
15927 auto IsExtractElt = [](SDValue Op) {
15928 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
15929 };
15930
15931 // For integer types that are not already in vectors limit to at most four
15932 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
15933 if (VT.getScalarType().isInteger() &&
15934 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
15935 return SDValue();
15936
15937 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
15938 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
15940 Op->op_values(), [&, Poison = DAG.getPOISON(ContainerVT)](SDValue Op) {
15941 return Op.isUndef() ? Poison
15942 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
15943 ContainerVT, Poison, Op, ZeroI64);
15944 });
15945
// Pairwise ZIP1 the single-element vectors, halving the worklist (and the
// element count of the zip type) each round.
15946 ElementCount ZipEC = ContainerVT.getVectorElementCount();
15947 while (Intermediates.size() > 1) {
15948 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
15949
15950 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
15951 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
15952 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
15953 Intermediates[I / 2] =
15954 Op1.isUndef() ? Op0
15955 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
15956 }
15957
15958 Intermediates.resize(Intermediates.size() / 2);
15959 ZipEC = ZipEC.divideCoefficientBy(2);
15960 }
15961
15962 assert(Intermediates.size() == 1);
15963 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
15964 return convertFromScalableVector(DAG, VT, Vec);
15965}
15966
// Lower an ISD::BUILD_VECTOR for NEON by scanning the lanes for exploitable
// structure (constant splat, single value, extract-element patterns, two
// distinct values, ...) and selecting the cheapest materialisation strategy.
// NOTE(review): several interior lines are elided in this excerpt (marked by
// gaps in the embedded numbering), including a few declarations and the
// extract-element type check — keep that in mind when reading.
15967SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
15968 SelectionDAG &DAG) const {
15969 EVT VT = Op.getValueType();
15970
15971 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15972 cast<BuildVectorSDNode>(Op)->isConstantSequence();
15973 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
15974 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
15975
15976 // Try to build a simple constant vector.
15977 Op = NormalizeBuildVector(Op, DAG);
15978 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so,
15979 // abort.
15980 if (Op.getOpcode() != ISD::BUILD_VECTOR)
15981 return SDValue();
15982
15983 // Certain vector constants, used to express things like logical NOT and
15984 // arithmetic NEG, are passed through unmodified. This allows special
15985 // patterns for these operations to match, which will lower these constants
15986 // to whatever is proven necessary.
15987 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15988 if (BVN->isConstant()) {
15989 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
15990 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
15991 APInt Val(BitSize,
15992 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
15993 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
15994 return Op;
15995 }
15996 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
15997 if (Const->isZero() && !Const->isNegative())
15998 return Op;
15999 }
16000
16001 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
16002 return V;
16003
16004 // Scan through the operands to find some interesting properties we can
16005 // exploit:
16006 // 1) If only one value is used, we can use a DUP, or
16007 // 2) if only the low element is not undef, we can just insert that, or
16008 // 3) if only one constant value is used (w/ some non-constant lanes),
16009 // we can splat the constant value into the whole vector then fill
16010 // in the non-constant lanes.
16011 // 4) FIXME: If different constant values are used, but we can intelligently
16012 // select the values we'll be overwriting for the non-constant
16013 // lanes such that we can directly materialize the vector
16014 // some other way (MOVI, e.g.), we can be sneaky.
16015 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
16016 SDLoc DL(Op);
16017 unsigned NumElts = VT.getVectorNumElements();
16018 bool isOnlyLowElement = true;
16019 bool usesOnlyOneValue = true;
16020 bool usesOnlyOneConstantValue = true;
16021 bool isConstant = true;
16022 bool AllLanesExtractElt = true;
16023 unsigned NumConstantLanes = 0;
16024 unsigned NumDifferentLanes = 0;
16025 unsigned NumUndefLanes = 0;
16026 SDValue Value;
16027 SDValue ConstantValue;
16028 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
16029 unsigned ConsecutiveValCount = 0;
16030 SDValue PrevVal;
16031 for (unsigned i = 0; i < NumElts; ++i) {
16032 SDValue V = Op.getOperand(i);
16033 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16034 AllLanesExtractElt = false;
16035 if (V.isUndef()) {
16036 ++NumUndefLanes;
16037 continue;
16038 }
16039 if (i > 0)
16040 isOnlyLowElement = false;
16041 if (!isIntOrFPConstant(V))
16042 isConstant = false;
16043
16044 if (isIntOrFPConstant(V)) {
16045 ++NumConstantLanes;
16046 if (!ConstantValue.getNode())
16047 ConstantValue = V;
16048 else if (ConstantValue != V)
16049 usesOnlyOneConstantValue = false;
16050 }
16051
16052 if (!Value.getNode())
16053 Value = V;
16054 else if (V != Value) {
16055 usesOnlyOneValue = false;
16056 ++NumDifferentLanes;
16057 }
16058
16059 if (PrevVal != V) {
16060 ConsecutiveValCount = 0;
16061 PrevVal = V;
16062 }
16063
16064 // Keep different values and their last consecutive count. For example,
16065 //
16066 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
16067 // t24, t24, t24, t24, t24, t24, t24, t24
16068 // t23 = consecutive count 8
16069 // t24 = consecutive count 8
16070 // ------------------------------------------------------------------
16071 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
16072 // t24, t24, t24, t24, t24, t24, t24, t24
16073 // t23 = consecutive count 5
16074 // t24 = consecutive count 9
16075 DifferentValueMap[V] = ++ConsecutiveValCount;
16076 }
16077
16078 if (!Value.getNode()) {
16079 LLVM_DEBUG(
16080 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
16081 return DAG.getUNDEF(VT);
16082 }
16083
16084 // Convert BUILD_VECTOR where all elements but the lowest are undef into
16085 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
16086 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
16087 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
16088 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
16089 "SCALAR_TO_VECTOR node\n");
16090 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
16091 }
16092
// All lanes extracted from one source vector: try to match UZP1/UZP2.
16093 if (AllLanesExtractElt) {
16094 SDNode *Vector = nullptr;
16095 bool Even = false;
16096 bool Odd = false;
16097 // Check whether the extract elements match the Even pattern <0,2,4,...> or
16098 // the Odd pattern <1,3,5,...>.
16099 for (unsigned i = 0; i < NumElts; ++i) {
16100 SDValue V = Op.getOperand(i);
16101 const SDNode *N = V.getNode();
16102 if (!isa<ConstantSDNode>(N->getOperand(1))) {
16103 Even = false;
16104 Odd = false;
16105 break;
16106 }
16107 SDValue N0 = N->getOperand(0);
16108
16109 // All elements are extracted from the same vector.
16110 if (!Vector) {
16111 Vector = N0.getNode();
16112 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
16113 // BUILD_VECTOR.
16114 if (VT.getVectorElementType() !=
16116 break;
16117 } else if (Vector != N0.getNode()) {
16118 Odd = false;
16119 Even = false;
16120 break;
16121 }
16122
16123 // Extracted values are either at Even indices <0,2,4,...> or at Odd
16124 // indices <1,3,5,...>.
16125 uint64_t Val = N->getConstantOperandVal(1);
16126 if (Val == 2 * i) {
16127 Even = true;
16128 continue;
16129 }
16130 if (Val - 1 == 2 * i) {
16131 Odd = true;
16132 continue;
16133 }
16134
16135 // Something does not match: abort.
16136 Odd = false;
16137 Even = false;
16138 break;
16139 }
16140 if (Even || Odd) {
16141 SDValue LHS =
16143 DAG.getConstant(0, DL, MVT::i64));
16144 SDValue RHS =
16146 DAG.getConstant(NumElts, DL, MVT::i64));
16147
16148 if (Even && !Odd)
16149 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LHS, RHS);
16150 if (Odd && !Even)
16151 return DAG.getNode(AArch64ISD::UZP2, DL, VT, LHS, RHS);
16152 }
16153 }
16154
16155 // Use DUP for non-constant splats. For f32 constant splats, reduce to
16156 // i32 and try again.
16157 if (usesOnlyOneValue) {
16158 if (!isConstant) {
16159 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16160 Value.getValueType() != VT) {
16161 LLVM_DEBUG(
16162 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
16163 return DAG.getNode(AArch64ISD::DUP, DL, VT, Value);
16164 }
16165
16166 // This is actually a DUPLANExx operation, which keeps everything vectory.
16167
16168 SDValue Lane = Value.getOperand(1);
16169 Value = Value.getOperand(0);
16170 if (Value.getValueSizeInBits() == 64) {
16171 LLVM_DEBUG(
16172 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
16173 "widening it\n");
16174 Value = WidenVector(Value, DAG);
16175 }
16176
16177 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
16178 return DAG.getNode(Opcode, DL, VT, Value, Lane);
16179 }
16180
16183 EVT EltTy = VT.getVectorElementType();
16184 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
16185 EltTy == MVT::f64) && "Unsupported floating-point vector type");
16186 LLVM_DEBUG(
16187 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
16188 "BITCASTS, and try again\n");
16189 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
16190 for (unsigned i = 0; i < NumElts; ++i)
16191 Ops.push_back(DAG.getNode(ISD::BITCAST, DL, NewType, Op.getOperand(i)));
16192 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
16193 SDValue Val = DAG.getBuildVector(VecVT, DL, Ops);
16194 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
16195 Val.dump(););
16196 Val = LowerBUILD_VECTOR(Val, DAG);
16197 if (Val.getNode())
16198 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
16199 }
16200 }
16201
16202 // If we need to insert a small number of different non-constant elements and
16203 // the vector width is sufficiently large, prefer using DUP with the common
16204 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
16205 // skip the constant lane handling below.
16206 bool PreferDUPAndInsert =
16207 !isConstant && NumDifferentLanes >= 1 &&
16208 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
16209 NumDifferentLanes >= NumConstantLanes;
16210
16211 // If there was only one constant value used and for more than one lane,
16212 // start by splatting that value, then replace the non-constant lanes. This
16213 // is better than the default, which will perform a separate initialization
16214 // for each lane.
16215 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
16216 // Firstly, try to materialize the splat constant.
16217 SDValue Val = DAG.getSplatBuildVector(VT, DL, ConstantValue);
16218 unsigned BitSize = VT.getScalarSizeInBits();
16219 APInt ConstantValueAPInt(1, 0);
16220 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
16221 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
// Zero and all-ones splats are left as BUILD_VECTOR; patterns elsewhere
// match them directly.
16222 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
16223 !ConstantValueAPInt.isAllOnes()) {
16224 Val = ConstantBuildVector(Val, DAG, Subtarget);
16225 if (!Val)
16226 // Otherwise, materialize the constant and splat it.
16227 Val = DAG.getNode(AArch64ISD::DUP, DL, VT, ConstantValue);
16228 }
16229
16230 // Now insert the non-constant lanes.
16231 for (unsigned i = 0; i < NumElts; ++i) {
16232 SDValue V = Op.getOperand(i);
16233 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
16234 if (!isIntOrFPConstant(V) && !V.isUndef())
16235 // Note that type legalization likely mucked about with the VT of the
16236 // source operand, so we may have to convert it here before inserting.
16237 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Val, V, LaneIdx);
16238 }
16239 return Val;
16240 }
16241
16242 // This will generate a load from the constant pool.
16243 if (isConstant) {
16244 LLVM_DEBUG(
16245 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
16246 "expansion\n");
16247 return SDValue();
16248 }
16249
16250 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
16251 // v4i32s. This is really a truncate, which we can construct out of (legal)
16252 // concats and truncate nodes.
16254 return M;
16255
16256 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
16257 if (NumElts >= 4) {
16258 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
16259 return Shuffle;
16260
16261 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
16262 return Shuffle;
16263 }
16264
16265 if (PreferDUPAndInsert) {
16266 // First, build a constant vector with the common element.
16268 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
16269 // Next, insert the elements that do not match the common value.
16270 for (unsigned I = 0; I < NumElts; ++I)
16271 if (Op.getOperand(I) != Value)
16272 NewVector =
16273 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
16274 Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
16275
16276 return NewVector;
16277 }
16278
16279 // If vector consists of two different values, try to generate two DUPs and
16280 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
16281 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
16283 // Check the consecutive count of the value is the half number of vector
16284 // elements. In this case, we can use CONCAT_VECTORS. For example,
16285 //
16286 // canUseVECTOR_CONCAT = true;
16287 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
16288 // t24, t24, t24, t24, t24, t24, t24, t24
16289 //
16290 // canUseVECTOR_CONCAT = false;
16291 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
16292 // t24, t24, t24, t24, t24, t24, t24, t24
16293 bool canUseVECTOR_CONCAT = true;
16294 for (auto Pair : DifferentValueMap) {
16295 // Check different values have same length which is NumElts / 2.
16296 if (Pair.second != NumElts / 2)
16297 canUseVECTOR_CONCAT = false;
16298 Vals.push_back(Pair.first);
16299 }
16300
16301 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
16302 // CONCAT_VECTORs. For example,
16303 //
16304 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
16305 // t24, t24, t24, t24, t24, t24, t24, t24
16306 // ==>
16307 // t26: v8i8 = AArch64ISD::DUP t23
16308 // t28: v8i8 = AArch64ISD::DUP t24
16309 // t29: v16i8 = concat_vectors t26, t28
16310 if (canUseVECTOR_CONCAT) {
16311 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16312 if (isTypeLegal(SubVT) && SubVT.isVector() &&
16313 SubVT.getVectorNumElements() >= 2) {
16314 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
16315 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
16316 SDValue DUP1 =
16317 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops1), DAG);
16318 SDValue DUP2 =
16319 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops2), DAG);
16321 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, DUP1, DUP2);
16322 return CONCAT_VECTORS;
16323 }
16324 }
16325
16326 // Let's try to generate VECTOR_SHUFFLE. For example,
16327 //
16328 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
16329 // ==>
16330 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
16331 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
16332 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
16333 if (NumElts >= 8) {
16334 SmallVector<int, 16> MaskVec;
16335 // Build mask for VECTOR_SHUFFLE.
16336 SDValue FirstLaneVal = Op.getOperand(0);
16337 for (unsigned i = 0; i < NumElts; ++i) {
16338 SDValue Val = Op.getOperand(i);
16339 if (FirstLaneVal == Val)
16340 MaskVec.push_back(i);
16341 else
16342 MaskVec.push_back(i + NumElts);
16343 }
16344
16345 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
16346 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
16347 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops1);
16348 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops2);
16350 DAG.getVectorShuffle(VT, DL, VEC1, VEC2, MaskVec);
16351 return VECTOR_SHUFFLE;
16352 }
16353 }
16354
16355 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
16356 // know the default expansion would otherwise fall back on something even
16357 // worse. For a vector with one or two non-undef values, that's
16358 // scalar_to_vector for the elements followed by a shuffle (provided the
16359 // shuffle is valid for the target) and materialization element by element
16360 // on the stack followed by a load for everything else.
16361 if (!isConstant && !usesOnlyOneValue) {
16362 LLVM_DEBUG(
16363 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
16364 "of INSERT_VECTOR_ELT\n");
16365
16366 SDValue Vec = DAG.getPOISON(VT);
16367 SDValue Op0 = Op.getOperand(0);
16368 unsigned i = 0;
16369
16370 // Use SCALAR_TO_VECTOR for lane zero to
16371 // a) Avoid a RMW dependency on the full vector register, and
16372 // b) Allow the register coalescer to fold away the copy if the
16373 // value is already in an S or D register, and we're forced to emit an
16374 // INSERT_SUBREG that we can't fold anywhere.
16375 //
16376 // We also allow types like i8 and i16 which are illegal scalar but legal
16377 // vector element types. After type-legalization the inserted value is
16378 // extended (i32) and it is safe to cast them to the vector type by ignoring
16379 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
16380 if (!Op0.isUndef()) {
16381 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
16382 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Op0);
16383 ++i;
16384 }
16385 LLVM_DEBUG({
16386 if (i < NumElts)
16387 dbgs() << "Creating nodes for the other vector elements:\n";
16388 });
16389 for (; i < NumElts; ++i) {
16390 SDValue V = Op.getOperand(i);
16391 if (V.isUndef())
16392 continue;
16393 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
16394 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
16395 }
16396 return Vec;
16397 }
16398
16399 LLVM_DEBUG(
16400 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
16401 "better alternative\n");
16402 return SDValue();
16403}
16404
/// Lower ISD::CONCAT_VECTORS.
///
/// Fixed-length results that must be handled with SVE are delegated to
/// LowerFixedLengthConcatVectorsToSVE; otherwise the result is expected to be
/// a legal scalable vector. When the operand type is legal, operands are
/// combined pairwise (each step doubling the element count) until one value
/// of the full result type remains.
SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                   SelectionDAG &DAG) const {
  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
                                   !Subtarget->isNeonAvailable()))
    return LowerFixedLengthConcatVectorsToSVE(Op, DAG);

  assert(Op.getValueType().isScalableVector() &&
         isTypeLegal(Op.getValueType()) &&
         "Expected legal scalable vector type!");

  if (isTypeLegal(Op.getOperand(0).getValueType())) {
    unsigned NumOperands = Op->getNumOperands();
    assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
           "Unexpected number of operands in CONCAT_VECTORS");

    // A two-operand concat of legal types is directly selectable.
    if (NumOperands == 2)
      return Op;

    // Concat each pair of subvectors and pack into the lower half of the array.
    SmallVector<SDValue> ConcatOps(Op->ops());
    while (ConcatOps.size() > 1) {
      for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
        SDValue V1 = ConcatOps[I];
        SDValue V2 = ConcatOps[I + 1];
        EVT SubVT = V1.getValueType();
        EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
        ConcatOps[I / 2] =
            DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
      }
      ConcatOps.resize(ConcatOps.size() / 2);
    }
    return ConcatOps[0];
  }

  return SDValue();
}
16441
/// Lower ISD::INSERT_VECTOR_ELT.
///
/// i1 vectors (SVE predicates) are promoted to an integer vector, the insert
/// is performed there, and the result is truncated back to the predicate
/// type. Otherwise the node is kept as-is when the lane index is a constant
/// in range, since that form is directly selectable; non-constant or
/// out-of-range lanes fall back to default expansion.
SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                      SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");

  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
                                   !Subtarget->isNeonAvailable()))
    return LowerFixedLengthInsertVectorElt(Op, DAG);

  EVT VT = Op.getOperand(0).getValueType();

  if (VT.getScalarType() == MVT::i1) {
    EVT VectorVT = getPromotedVTForPredicate(VT);
    SDLoc DL(Op);
    SDValue ExtendedVector =
        DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
    // Scalars narrower than i32 are inserted as i32; sub-i32 element types
    // are not legal scalar types here.
    SDValue ExtendedValue =
        DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
                             VectorVT.getScalarType().getSizeInBits() < 32
                                 ? MVT::i32
                                 : VectorVT.getScalarType());
    ExtendedVector =
        DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
                    ExtendedValue, Op.getOperand(2));
    return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
  }

  // Check for non-constant or out of range lane.
  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
    return SDValue();

  return Op;
}
16475
/// Lower ISD::EXTRACT_VECTOR_ELT.
///
/// Predicate (i1-element) vectors are extended to an integer vector first and
/// the extract is done there. For NEON types, V128 extracts are legal as-is;
/// V64 extracts are widened to the corresponding V128 type. Non-constant or
/// out-of-range lane indices fall back to default expansion.
SDValue
AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
  EVT VT = Op.getOperand(0).getValueType();

  if (VT.getScalarType() == MVT::i1) {
    SDLoc DL(Op);
    // There are no operations to extend a nxv1i1 predicate to a nxv1i128 vector
    // An easy lowering is widening the input predicate to nxv2i1.
    if (VT == MVT::nxv1i1) {
      SDValue WidenedPred = DAG.getInsertSubvector(
          DL, DAG.getPOISON(MVT::nxv2i1), Op->getOperand(0), 0);
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
                         WidenedPred, Op.getOperand(1));
    }
    // We can't directly extract from an SVE predicate; extend it first.
    // (This isn't the only possible lowering, but it's straightforward.)
    EVT VectorVT = getPromotedVTForPredicate(VT);
    SDValue Extend =
        DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
    MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
                                  Extend, Op.getOperand(1));
    return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
  }

  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
    return LowerFixedLengthExtractVectorElt(Op, DAG);

  // Check for non-constant or out of range lane.
  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
    return SDValue();

  // Insertion/extraction are legal for V128 types.
  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
      VT == MVT::v8f16 || VT == MVT::v8bf16)
    return Op;

  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
      VT != MVT::v4bf16)
    return SDValue();

  // For V64 types, we perform extraction by expanding the value
  // to a V128 type and perform the extraction on that.
  SDLoc DL(Op);
  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
  EVT WideTy = WideVec.getValueType();

  // i8/i16 elements come back as i32; the upper bits are ignored by callers.
  EVT ExtrTy = WideTy.getVectorElementType();
  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
    ExtrTy = MVT::i32;

  // For extractions, we just return the result directly.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
                     Op.getOperand(1));
}
16536
16537SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
16538 SelectionDAG &DAG) const {
16539 EVT VT = Op.getValueType();
16541 "Only cases that extract a fixed length vector are supported!");
16542 EVT InVT = Op.getOperand(0).getValueType();
16543
16544 // If we don't have legal types yet, do nothing
16545 if (!isTypeLegal(InVT))
16546 return SDValue();
16547
16548 if (InVT.is128BitVector()) {
16549 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
16550 unsigned Idx = Op.getConstantOperandVal(1);
16551
16552 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
16553 if (Idx == 0)
16554 return Op;
16555
16556 // If this is extracting the upper 64-bits of a 128-bit vector, we match
16557 // that directly.
16558 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
16559 return Op;
16560 }
16561
16562 if (InVT.isScalableVector() ||
16563 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
16564 SDLoc DL(Op);
16565 SDValue Vec = Op.getOperand(0);
16566 SDValue Idx = Op.getOperand(1);
16567
16568 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
16569 if (PackedVT != InVT) {
16570 // Pack input into the bottom part of an SVE register and try again.
16571 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
16572 DAG.getPOISON(PackedVT), Vec,
16573 DAG.getVectorIdxConstant(0, DL));
16574 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
16575 }
16576
16577 // This will get matched by custom code during ISelDAGToDAG.
16578 if (isNullConstant(Idx))
16579 return Op;
16580
16581 assert(InVT.isScalableVector() && "Unexpected vector type!");
16582 // Move requested subvector to the start of the vector and try again.
16583 SDValue Splice =
16584 DAG.getNode(ISD::VECTOR_SPLICE_LEFT, DL, InVT, Vec, Vec, Idx);
16585 return convertFromScalableVector(DAG, VT, Splice);
16586 }
16587
16588 return SDValue();
16589}
16590
16591SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
16592 SelectionDAG &DAG) const {
16593 assert(Op.getValueType().isScalableVector() &&
16594 "Only expect to lower inserts into scalable vectors!");
16595
16596 EVT InVT = Op.getOperand(1).getValueType();
16597 unsigned Idx = Op.getConstantOperandVal(2);
16598
16599 SDValue Vec0 = Op.getOperand(0);
16600 SDValue Vec1 = Op.getOperand(1);
16601 SDLoc DL(Op);
16602 EVT VT = Op.getValueType();
16603
16604 if (InVT.isScalableVector()) {
16605 if (!isTypeLegal(VT))
16606 return SDValue();
16607
16608 // Break down insert_subvector into simpler parts.
16609 if (VT.getVectorElementType() == MVT::i1) {
16610 unsigned NumElts = VT.getVectorMinNumElements();
16611 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16612
16613 SDValue Lo, Hi;
16614 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16615 DAG.getVectorIdxConstant(0, DL));
16616 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16617 DAG.getVectorIdxConstant(NumElts / 2, DL));
16618 if (Idx < (NumElts / 2))
16619 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
16620 DAG.getVectorIdxConstant(Idx, DL));
16621 else
16622 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
16623 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
16624
16625 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16626 }
16627
16628 // We can select these directly.
16629 if (isTypeLegal(InVT) && Vec0.isUndef())
16630 return Op;
16631
16632 // Ensure the subvector is half the size of the main vector.
16633 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
16634 return SDValue();
16635
16636 // Here narrow and wide refers to the vector element types. After "casting"
16637 // both vectors must have the same bit length and so because the subvector
16638 // has fewer elements, those elements need to be bigger.
16639 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
16640 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
16641
16642 // NOP cast operands to the largest legal vector of the same element count.
16643 if (VT.isFloatingPoint()) {
16644 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
16645 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
16646 } else {
16647 // Legal integer vectors are already their largest so Vec0 is fine as is.
16648 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
16649 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
16650 }
16651
16652 // To replace the top/bottom half of vector V with vector SubV we widen the
16653 // preserved half of V, concatenate this to SubV (the order depending on the
16654 // half being replaced) and then narrow the result.
16655 SDValue Narrow;
16656 if (Idx == 0) {
16657 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
16658 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
16659 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
16660 } else {
16661 assert(Idx == InVT.getVectorMinNumElements() &&
16662 "Invalid subvector index!");
16663 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
16664 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
16665 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
16666 }
16667
16668 return getSVESafeBitCast(VT, Narrow, DAG);
16669 }
16670
16671 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
16672 // This will be matched by custom code during ISelDAGToDAG.
16673 if (Vec0.isUndef())
16674 return Op;
16675
16676 std::optional<unsigned> PredPattern =
16678 auto PredTy = VT.changeVectorElementType(*DAG.getContext(), MVT::i1);
16679 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
16680 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
16681 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
16682 }
16683
16684 return SDValue();
16685}
16686
16687static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
16688 if (Op.getOpcode() != AArch64ISD::DUP &&
16689 Op.getOpcode() != ISD::SPLAT_VECTOR &&
16690 Op.getOpcode() != ISD::BUILD_VECTOR)
16691 return false;
16692
16693 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
16694 !isAllConstantBuildVector(Op, SplatVal))
16695 return false;
16696
16697 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
16698 !isa<ConstantSDNode>(Op->getOperand(0)))
16699 return false;
16700
16701 SplatVal = Op->getConstantOperandVal(0);
16702 if (Op.getValueType().getVectorElementType() != MVT::i64)
16703 SplatVal = (int32_t)SplatVal;
16704
16705 Negated = false;
16706 if (isPowerOf2_64(SplatVal))
16707 return true;
16708
16709 Negated = true;
16710 if (isPowerOf2_64(-SplatVal)) {
16711 SplatVal = -SplatVal;
16712 return true;
16713 }
16714
16715 return false;
16716}
16717
16718SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
16719 EVT VT = Op.getValueType();
16720 SDLoc DL(Op);
16721
16722 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
16723 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
16724
16725 assert(VT.isScalableVector() && "Expected a scalable vector.");
16726
16727 bool Signed = Op.getOpcode() == ISD::SDIV;
16728 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
16729
16730 bool Negated;
16731 uint64_t SplatVal;
16732 // NOTE: SRAD cannot be used to represent sdiv-by-one.
16733 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) &&
16734 SplatVal > 1) {
16736 SDValue Res =
16737 DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
16738 DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
16739 if (Negated)
16740 Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
16741
16742 return Res;
16743 }
16744
16745 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
16746 return LowerToPredicatedOp(Op, DAG, PredOpcode);
16747
16748 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
16749 // operations, and truncate the result.
16750 EVT WidenedVT;
16751 if (VT == MVT::nxv16i8)
16752 WidenedVT = MVT::nxv8i16;
16753 else if (VT == MVT::nxv8i16)
16754 WidenedVT = MVT::nxv4i32;
16755 else
16756 llvm_unreachable("Unexpected Custom DIV operation");
16757
16758 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
16759 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
16760 SDValue Op0Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(0));
16761 SDValue Op1Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(1));
16762 SDValue Op0Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(0));
16763 SDValue Op1Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(1));
16764 SDValue ResultLo = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Lo, Op1Lo);
16765 SDValue ResultHi = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Hi, Op1Hi);
16766 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultLo);
16767 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultHi);
16768 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ResultLoCast, ResultHiCast);
16769}
16770
16771bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
16772 EVT VT, unsigned DefinedValues) const {
16773 if (!Subtarget->isNeonAvailable())
16774 return false;
16776}
16777
16779 // Currently no fixed length shuffles that require SVE are legal.
16780 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16781 return false;
16782
16783 if (VT.getVectorNumElements() == 4 &&
16784 (VT.is128BitVector() || VT.is64BitVector())) {
16785 unsigned Cost = getPerfectShuffleCost(M);
16786 if (Cost <= 1)
16787 return true;
16788 }
16789
16790 bool DummyBool;
16791 int DummyInt;
16792 unsigned DummyUnsigned;
16793
16794 unsigned EltSize = VT.getScalarSizeInBits();
16795 unsigned NumElts = VT.getVectorNumElements();
16797 isREVMask(M, EltSize, NumElts, 64) ||
16798 isREVMask(M, EltSize, NumElts, 32) ||
16799 isREVMask(M, EltSize, NumElts, 16) ||
16800 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
16801 isSingletonEXTMask(M, VT, DummyUnsigned) ||
16802 isTRNMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
16803 isUZPMask(M, NumElts, DummyUnsigned) ||
16804 isZIPMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
16805 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
16806 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
16807 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
16808 isINSMask(M, NumElts, DummyBool, DummyInt) ||
16809 isConcatMask(M, VT, VT.getSizeInBits() == 128));
16810}
16811
16813 EVT VT) const {
16814 // Just delegate to the generic legality, clear masks aren't special.
16815 return isShuffleMaskLegal(M, VT);
16816}
16817
16818/// getVShiftImm - Check if this is a valid build_vector for the immediate
16819/// operand of a vector shift operation, where all the elements of the
16820/// build_vector must have the same constant integer value.
16821static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
16822 // Ignore bit_converts.
16823 while (Op.getOpcode() == ISD::BITCAST)
16824 Op = Op.getOperand(0);
16826 APInt SplatBits, SplatUndef;
16827 unsigned SplatBitSize;
16828 bool HasAnyUndefs;
16829 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
16830 HasAnyUndefs, ElementBits) ||
16831 SplatBitSize > ElementBits)
16832 return false;
16833 Cnt = SplatBits.getSExtValue();
16834 return true;
16835}
16836
16837/// isVShiftLImm - Check if this is a valid build_vector for the immediate
16838/// operand of a vector shift left operation. That value must be in the range:
16839/// 0 <= Value < ElementBits for a left shift; or
16840/// 0 <= Value <= ElementBits for a long left shift.
16841static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
16842 assert(VT.isVector() && "vector shift count is not a vector type");
16843 int64_t ElementBits = VT.getScalarSizeInBits();
16844 if (!getVShiftImm(Op, ElementBits, Cnt))
16845 return false;
16846 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
16847}
16848
16849/// isVShiftRImm - Check if this is a valid build_vector for the immediate
16850/// operand of a vector shift right operation. The value must be in the range:
16851/// 1 <= Value <= ElementBits for a right shift; or
16852static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
16853 assert(VT.isVector() && "vector shift count is not a vector type");
16854 int64_t ElementBits = VT.getScalarSizeInBits();
16855 if (!getVShiftImm(Op, ElementBits, Cnt))
16856 return false;
16857 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
16858}
16859
16860SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
16861 SelectionDAG &DAG) const {
16862 EVT VT = Op.getValueType();
16863
16864 if (VT.getScalarType() == MVT::i1) {
16865 // Lower i1 truncate to `(x & 1) != 0`.
16866 SDLoc DL(Op);
16867 EVT OpVT = Op.getOperand(0).getValueType();
16868 SDValue Zero = DAG.getConstant(0, DL, OpVT);
16869 SDValue One = DAG.getConstant(1, DL, OpVT);
16870 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Op.getOperand(0), One);
16871 return DAG.getSetCC(DL, VT, And, Zero, ISD::SETNE);
16872 }
16873
16874 if (!VT.isVector() || VT.isScalableVector())
16875 return SDValue();
16876
16877 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16878 !Subtarget->isNeonAvailable()))
16879 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
16880
16881 return SDValue();
16882}
16883
16884// Check if we can we lower this SRL to a rounding shift instruction. ResVT is
16885// possibly a truncated type, it tells how many bits of the value are to be
16886// used.
16888 SelectionDAG &DAG,
16889 unsigned &ShiftValue,
16890 SDValue &RShOperand) {
16891 if (Shift->getOpcode() != ISD::SRL)
16892 return false;
16893
16894 EVT VT = Shift.getValueType();
16895 assert(VT.isScalableVT());
16896
16897 auto ShiftOp1 =
16899 if (!ShiftOp1)
16900 return false;
16901
16902 ShiftValue = ShiftOp1->getZExtValue();
16903 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
16904 return false;
16905
16906 SDValue Add = Shift->getOperand(0);
16907 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
16908 return false;
16909
16911 "ResVT must be truncated or same type as the shift.");
16912 // Check if an overflow can lead to incorrect results.
16913 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
16914 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
16915 return false;
16916
16917 auto AddOp1 =
16919 if (!AddOp1)
16920 return false;
16921 uint64_t AddValue = AddOp1->getZExtValue();
16922 if (AddValue != 1ULL << (ShiftValue - 1))
16923 return false;
16924
16925 RShOperand = Add->getOperand(0);
16926 return true;
16927}
16928
16929SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
16930 SelectionDAG &DAG) const {
16931 EVT VT = Op.getValueType();
16932 SDLoc DL(Op);
16933 int64_t Cnt;
16934
16935 if (!Op.getOperand(1).getValueType().isVector())
16936 return Op;
16937 unsigned EltSize = VT.getScalarSizeInBits();
16938
16939 switch (Op.getOpcode()) {
16940 case ISD::SHL:
16941 if (VT.isScalableVector() ||
16942 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16943 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
16944
16945 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
16946 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
16947 DAG.getTargetConstant(Cnt, DL, MVT::i32));
16948 return DAG.getNode(
16950 DAG.getTargetConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32),
16951 Op.getOperand(0), Op.getOperand(1));
16952 case ISD::SRA:
16953 case ISD::SRL:
16954 if (VT.isScalableVector() &&
16955 (Subtarget->hasSVE2() ||
16956 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
16957 SDValue RShOperand;
16958 unsigned ShiftValue;
16959 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
16960 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
16961 getPredicateForVector(DAG, DL, VT), RShOperand,
16962 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
16963 }
16964
16965 if (VT.isScalableVector() ||
16966 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
16967 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
16968 : AArch64ISD::SRL_PRED;
16969 return LowerToPredicatedOp(Op, DAG, Opc);
16970 }
16971
16972 // Right shift immediate
16973 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
16974 unsigned Opc =
16975 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
16976 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
16977 DAG.getTargetConstant(Cnt, DL, MVT::i32),
16978 Op->getFlags());
16979 }
16980
16981 // Right shift register. Note, there is not a shift right register
16982 // instruction, but the shift left register instruction takes a signed
16983 // value, where negative numbers specify a right shift.
16984 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
16985 : Intrinsic::aarch64_neon_ushl;
16986 // negate the shift amount
16987 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
16988 Op.getOperand(1));
16989 SDValue NegShiftLeft =
16991 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
16992 NegShift);
16993 return NegShiftLeft;
16994 }
16995
16996 llvm_unreachable("unexpected shift opcode");
16997}
16998
/// Lower vector ISD::SETCC.
///
/// Scalable vectors use the predicated SETCC_MERGE_ZERO node; fixed-length
/// SVE cases are delegated. Integer compares are legal as-is. FP compares are
/// mapped to one or two AArch64 condition codes (some LLVM FP predicates need
/// an OR of two compares), with an optional final NOT when the AArch64
/// condition set is the inverse of the requested one.
SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
                                           SelectionDAG &DAG) const {
  if (Op.getValueType().isScalableVector())
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);

  if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
                                   !Subtarget->isNeonAvailable()))
    return LowerFixedLengthVectorSetccToSVE(Op, DAG);

  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
  SDLoc DL(Op);

  if (LHS.getValueType().getVectorElementType().isInteger())
    return Op;

  // NOTE(review): the ||-chained disjuncts make this assert vacuously true
  // (e.g. "!= bf16 || != f128" always holds); presumably "&&" was intended —
  // confirm before relying on it.
  assert(((!Subtarget->hasFullFP16() &&
           LHS.getValueType().getVectorElementType() != MVT::f16) ||
          LHS.getValueType().getVectorElementType() != MVT::bf16 ||
          LHS.getValueType().getVectorElementType() != MVT::f128) &&
         "Unexpected type!");

  // Lower isnan(x) | isnan(never-nan) to x != x.
  // Lower !isnan(x) & !isnan(never-nan) to x == x.
  if (CC == ISD::SETUO || CC == ISD::SETO) {
    bool OneNaN = false;
    if (LHS == RHS) {
      OneNaN = true;
    } else if (DAG.isKnownNeverNaN(RHS)) {
      OneNaN = true;
      RHS = LHS;
    } else if (DAG.isKnownNeverNaN(LHS)) {
      OneNaN = true;
      LHS = RHS;
    }
    if (OneNaN) {
      CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
    }
  }

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two branches to implement.
  AArch64CC::CondCode CC1, CC2;
  bool ShouldInvert;
  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);

  bool NoNaNs =
      getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
  SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, DL, DAG);
  if (!Cmp.getNode())
    return SDValue();

  if (CC2 != AArch64CC::AL) {
    // Second compare ORed in when the predicate needs two AArch64 conditions.
    SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, DL, DAG);
    if (!Cmp2.getNode())
      return SDValue();

    Cmp = DAG.getNode(ISD::OR, DL, CmpVT, Cmp, Cmp2);
  }

  Cmp = DAG.getSExtOrTrunc(Cmp, DL, Op.getValueType());

  if (ShouldInvert)
    Cmp = DAG.getNOT(DL, Cmp, Cmp.getValueType());

  return Cmp;
}
17068
17069static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
17070 SelectionDAG &DAG) {
17071 SDValue VecOp = ScalarOp.getOperand(0);
17072 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
17073 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
17074 DAG.getConstant(0, DL, MVT::i64));
17075}
17076
/// Lower a bitwise VECREDUCE (AND/OR/XOR) of fixed-length vector \p Vec to a
/// scalar of type \p VT.
///
/// i1 vectors are sign/any-extended and reduced via UMINV (AND), UMAXV (OR)
/// or ADDV (XOR parity). Wider element types are split down to 64 bits with
/// vector ops, then bitcast to a scalar and folded with shifts, since scalar
/// shift+op pairs combine well and integer units have good throughput.
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
                                      SDLoc DL, SelectionDAG &DAG) {
  unsigned ScalarOpcode;
  switch (Opcode) {
  case ISD::VECREDUCE_AND:
    ScalarOpcode = ISD::AND;
    break;
  case ISD::VECREDUCE_OR:
    ScalarOpcode = ISD::OR;
    break;
  case ISD::VECREDUCE_XOR:
    ScalarOpcode = ISD::XOR;
    break;
  default:
    llvm_unreachable("Expected bitwise vector reduction");
    return SDValue();
  }

  EVT VecVT = Vec.getValueType();
  assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
         "Expected power-of-2 length vector");

  EVT ElemVT = VecVT.getVectorElementType();

  SDValue Result;
  unsigned NumElems = VecVT.getVectorNumElements();

  // Special case for boolean reductions
  if (ElemVT == MVT::i1) {
    // Split large vectors into smaller ones
    if (NumElems > 16) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
      EVT HalfVT = Lo.getValueType();
      SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
      return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
    }

    // Results of setcc operations get widened to 128 bits if their input
    // operands are 128 bits wide, otherwise vectors that are less than 64 bits
    // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
    // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
    // size leads to the best codegen, since e.g. setcc results might need to be
    // truncated otherwise.
    unsigned ExtendedWidth = 64;
    if (Vec.getOpcode() == ISD::SETCC &&
        Vec.getOperand(0).getValueSizeInBits() >= 128) {
      ExtendedWidth = 128;
    }
    EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));

    // any_ext doesn't work with umin/umax, so only use it for uadd.
    unsigned ExtendOp =
        ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
    SDValue Extended = DAG.getNode(
        ExtendOp, DL,
        VecVT.changeVectorElementType(*DAG.getContext(), ExtendedVT), Vec);
    // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
    // in that case we bitcast the sign extended values from v2i64 to v4i32
    // before reduction for optimal code generation.
    if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
        NumElems == 2 && ExtendedWidth == 128) {
      Extended = DAG.getBitcast(MVT::v4i32, Extended);
      ExtendedVT = MVT::i32;
    }
    switch (ScalarOpcode) {
    case ISD::AND:
      // All-true iff the minimum lane is non-zero (lanes are 0 or all-ones).
      Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
      break;
    case ISD::OR:
      // Any-true iff the maximum lane is non-zero.
      Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
      break;
    case ISD::XOR:
      // XOR of i1 lanes is the parity of the lane sum; bit 0 of ADDV.
      Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
      break;
    default:
      llvm_unreachable("Unexpected Opcode");
    }

    Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
  } else {
    // Iteratively split the vector in half and combine using the bitwise
    // operation until it fits in a 64 bit register.
    while (VecVT.getSizeInBits() > 64) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
      VecVT = Lo.getValueType();
      NumElems = VecVT.getVectorNumElements();
      Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
    }

    EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());

    // Do the remaining work on a scalar since it allows the code generator to
    // combine the shift and bitwise operation into one instruction and since
    // integer instructions can have higher throughput than vector instructions.
    SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);

    // Iteratively combine the lower and upper halves of the scalar using the
    // bitwise operation, halving the relevant region of the scalar in each
    // iteration, until the relevant region is just one element of the original
    // vector.
    for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
      SDValue ShiftAmount =
          DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
      SDValue Shifted =
          DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
      Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
    }

    Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
  }

  return DAG.getAnyExtOrTrunc(Result, DL, VT);
}
17192
17193SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
17194 SelectionDAG &DAG) const {
17195 SDValue Src = Op.getOperand(0);
17196 EVT SrcVT = Src.getValueType();
17197
17198 // Scalarize v2f16 to turn it into a faddp. This will be more efficient than
17199 // widening by inserting zeroes.
17200 if (Subtarget->hasFullFP16() && Op.getOpcode() == ISD::VECREDUCE_FADD &&
17201 SrcVT == MVT::v2f16) {
17202 SDLoc DL(Op);
17203 return DAG.getNode(ISD::FADD, DL, MVT::f16,
17204 DAG.getExtractVectorElt(DL, MVT::f16, Src, 0),
17205 DAG.getExtractVectorElt(DL, MVT::f16, Src, 1));
17206 }
17207
17208 // Try to lower fixed length reductions to SVE.
17209 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
17210 Op.getOpcode() == ISD::VECREDUCE_AND ||
17211 Op.getOpcode() == ISD::VECREDUCE_OR ||
17212 Op.getOpcode() == ISD::VECREDUCE_XOR ||
17213 Op.getOpcode() == ISD::VECREDUCE_FADD ||
17214 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
17215 SrcVT.getVectorElementType() == MVT::i64);
17216 if (SrcVT.isScalableVector() ||
17218 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
17219
17220 if (SrcVT.getVectorElementType() == MVT::i1)
17221 return LowerPredReductionToSVE(Op, DAG);
17222
17223 switch (Op.getOpcode()) {
17224 case ISD::VECREDUCE_ADD:
17225 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
17226 case ISD::VECREDUCE_AND:
17227 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
17228 case ISD::VECREDUCE_OR:
17229 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
17231 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
17233 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
17235 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
17237 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
17238 case ISD::VECREDUCE_XOR:
17239 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
17241 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
17243 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
17245 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
17247 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
17249 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
17250 default:
17251 llvm_unreachable("Unhandled fixed length reduction");
17252 }
17253 }
17254
17255 // Lower NEON reductions.
17256 SDLoc DL(Op);
17257 switch (Op.getOpcode()) {
17258 case ISD::VECREDUCE_AND:
17259 case ISD::VECREDUCE_OR:
17260 case ISD::VECREDUCE_XOR:
17261 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
17262 Op.getValueType(), DL, DAG);
17263 case ISD::VECREDUCE_ADD:
17264 return getReductionSDNode(AArch64ISD::UADDV, DL, Op, DAG);
17266 return getReductionSDNode(AArch64ISD::SMAXV, DL, Op, DAG);
17268 return getReductionSDNode(AArch64ISD::SMINV, DL, Op, DAG);
17270 return getReductionSDNode(AArch64ISD::UMAXV, DL, Op, DAG);
17272 return getReductionSDNode(AArch64ISD::UMINV, DL, Op, DAG);
17273 default:
17274 llvm_unreachable("Unhandled reduction");
17275 }
17276}
17277
17278SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op,
17279 SelectionDAG &DAG) const {
17280 SDLoc DL(Op);
17281 SDValue Src = Op.getOperand(0);
17282 EVT SrcVT = Src.getValueType();
17283 assert(SrcVT.isScalableVector() && "Unexpected operand type!");
17284
17285 SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT);
17286 unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
17287 SDValue Identity = DAG.getNeutralElement(BaseOpc, DL, SrcVT, Op->getFlags());
17288
17289 // Whilst we don't know the size of the vector we do know the maximum size so
17290 // can perform a tree reduction with an identity vector, which means once we
17291 // arrive at the result the remaining stages (when the vector is smaller than
17292 // the maximum) have no affect.
17293
17295 unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());
17296
17297 for (unsigned I = 0; I < Stages; ++I) {
17298 Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity);
17299 Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1));
17300 }
17301
17302 return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0);
17303}
17304
17305SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
17306 SelectionDAG &DAG) const {
17307 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17308 // No point replacing if we don't have the relevant instruction/libcall anyway
17309 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
17310 return SDValue();
17311
17312 // LSE has an atomic load-clear instruction, but not a load-and.
17313 SDLoc DL(Op);
17314 MVT VT = Op.getSimpleValueType();
17315 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
17316 SDValue RHS = Op.getOperand(2);
17317 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
17318 RHS = DAG.getNode(ISD::XOR, DL, VT, DAG.getAllOnesConstant(DL, VT), RHS);
17319 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, DL, AN->getMemoryVT(),
17320 Op.getOperand(0), Op.getOperand(1), RHS,
17321 AN->getMemOperand());
17322}
17323
17324SDValue
17325AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
17326 SelectionDAG &DAG) const {
17327
17328 SDLoc DL(Op);
17329 // Get the inputs.
17330 SDNode *Node = Op.getNode();
17331 SDValue Chain = Op.getOperand(0);
17332 SDValue Size = Op.getOperand(1);
17333 MaybeAlign Align =
17334 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
17335 EVT VT = Node->getValueType(0);
17336
17338 "no-stack-arg-probe")) {
17339 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17340 Chain = SP.getValue(1);
17341 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17342 if (Align)
17343 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17344 DAG.getSignedConstant(-Align->value(), DL, VT));
17345 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
17346 SDValue Ops[2] = {SP, Chain};
17347 return DAG.getMergeValues(Ops, DL);
17348 }
17349
17350 RTLIB::LibcallImpl ChkStkImpl = getLibcallImpl(RTLIB::STACK_PROBE);
17351 if (ChkStkImpl == RTLIB::Unsupported)
17352 return SDValue();
17353
17354 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
17355
17356 EVT PtrVT = getPointerTy(DAG.getDataLayout());
17358 getLibcallImplName(ChkStkImpl).data(), PtrVT, 0);
17359
17360 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
17361 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
17362 if (Subtarget->hasCustomCallingConv())
17363 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
17364
17365 Size = DAG.getNode(ISD::SRL, DL, MVT::i64, Size,
17366 DAG.getConstant(4, DL, MVT::i64));
17367 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X15, Size, SDValue());
17368 Chain =
17369 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
17370 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
17371 DAG.getRegisterMask(Mask), Chain.getValue(1));
17372 // To match the actual intent better, we should read the output from X15 here
17373 // again (instead of potentially spilling it to the stack), but rereading Size
17374 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
17375 // here.
17376
17377 Size = DAG.getNode(ISD::SHL, DL, MVT::i64, Size,
17378 DAG.getConstant(4, DL, MVT::i64));
17379
17380 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17381 Chain = SP.getValue(1);
17382 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17383 if (Align)
17384 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17385 DAG.getSignedConstant(-Align->value(), DL, VT));
17386 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
17387
17388 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
17389
17390 SDValue Ops[2] = {SP, Chain};
17391 return DAG.getMergeValues(Ops, DL);
17392}
17393
17394SDValue
17395AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
17396 SelectionDAG &DAG) const {
17397 // Get the inputs.
17398 SDNode *Node = Op.getNode();
17399 SDValue Chain = Op.getOperand(0);
17400 SDValue Size = Op.getOperand(1);
17401
17402 MaybeAlign Align =
17403 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
17404 SDLoc DL(Op);
17405 EVT VT = Node->getValueType(0);
17406
17407 // Construct the new SP value in a GPR.
17408 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17409 Chain = SP.getValue(1);
17410 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17411 if (Align)
17412 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17413 DAG.getSignedConstant(-Align->value(), DL, VT));
17414
17415 // Set the real SP to the new value with a probing loop.
17416 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP);
17417 SDValue Ops[2] = {SP, Chain};
17418 return DAG.getMergeValues(Ops, DL);
17419}
17420
17421SDValue
17422AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
17423 SelectionDAG &DAG) const {
17424 MachineFunction &MF = DAG.getMachineFunction();
17425
17426 if (Subtarget->isTargetWindows())
17427 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
17428 else if (hasInlineStackProbe(MF))
17429 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
17430 else
17431 return SDValue();
17432}
17433
17434SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
17435 unsigned NewOp) const {
17436 if (Subtarget->hasSVE2())
17437 return LowerToPredicatedOp(Op, DAG, NewOp);
17438
17439 // Default to expand.
17440 return SDValue();
17441}
17442
17443SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
17444 SelectionDAG &DAG) const {
17445 EVT VT = Op.getValueType();
17446 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
17447
17448 SDLoc DL(Op);
17449 APInt MulImm = Op.getConstantOperandAPInt(0);
17450 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
17451 VT);
17452}
17453
17454/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
17455template <unsigned NumVecs>
17456static bool
17459 Info.opc = ISD::INTRINSIC_VOID;
17460 // Retrieve EC from first vector argument.
17461 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
17463#ifndef NDEBUG
17464 // Check the assumption that all input vectors are the same type.
17465 for (unsigned I = 0; I < NumVecs; ++I)
17466 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
17467 "Invalid type.");
17468#endif
17469 // memVT is `NumVecs * VT`.
17470 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
17471 EC * NumVecs);
17472 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
17473 Info.offset = 0;
17474 Info.align.reset();
17475 Info.flags = MachineMemOperand::MOStore;
17476 return true;
17477}
17478
17479/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
17480/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
17481/// specified in the intrinsic calls.
17483 const CallBase &I,
17484 MachineFunction &MF,
17485 unsigned Intrinsic) const {
17486 auto &DL = I.getDataLayout();
17487 switch (Intrinsic) {
17488 case Intrinsic::aarch64_sve_st2:
17489 return setInfoSVEStN<2>(*this, DL, Info, I);
17490 case Intrinsic::aarch64_sve_st3:
17491 return setInfoSVEStN<3>(*this, DL, Info, I);
17492 case Intrinsic::aarch64_sve_st4:
17493 return setInfoSVEStN<4>(*this, DL, Info, I);
17494 case Intrinsic::aarch64_neon_ld2:
17495 case Intrinsic::aarch64_neon_ld3:
17496 case Intrinsic::aarch64_neon_ld4:
17497 case Intrinsic::aarch64_neon_ld1x2:
17498 case Intrinsic::aarch64_neon_ld1x3:
17499 case Intrinsic::aarch64_neon_ld1x4: {
17500 Info.opc = ISD::INTRINSIC_W_CHAIN;
17501 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
17502 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17503 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17504 Info.offset = 0;
17505 Info.align.reset();
17506 // volatile loads with NEON intrinsics not supported
17507 Info.flags = MachineMemOperand::MOLoad;
17508 return true;
17509 }
17510 case Intrinsic::aarch64_neon_ld2lane:
17511 case Intrinsic::aarch64_neon_ld3lane:
17512 case Intrinsic::aarch64_neon_ld4lane:
17513 case Intrinsic::aarch64_neon_ld2r:
17514 case Intrinsic::aarch64_neon_ld3r:
17515 case Intrinsic::aarch64_neon_ld4r: {
17516 Info.opc = ISD::INTRINSIC_W_CHAIN;
17517 // ldx return struct with the same vec type
17518 Type *RetTy = I.getType();
17519 auto *StructTy = cast<StructType>(RetTy);
17520 unsigned NumElts = StructTy->getNumElements();
17521 Type *VecTy = StructTy->getElementType(0);
17522 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17523 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17524 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17525 Info.offset = 0;
17526 Info.align.reset();
17527 // volatile loads with NEON intrinsics not supported
17528 Info.flags = MachineMemOperand::MOLoad;
17529 return true;
17530 }
17531 case Intrinsic::aarch64_neon_st2:
17532 case Intrinsic::aarch64_neon_st3:
17533 case Intrinsic::aarch64_neon_st4:
17534 case Intrinsic::aarch64_neon_st1x2:
17535 case Intrinsic::aarch64_neon_st1x3:
17536 case Intrinsic::aarch64_neon_st1x4: {
17537 Info.opc = ISD::INTRINSIC_VOID;
17538 unsigned NumElts = 0;
17539 for (const Value *Arg : I.args()) {
17540 Type *ArgTy = Arg->getType();
17541 if (!ArgTy->isVectorTy())
17542 break;
17543 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
17544 }
17545 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17546 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17547 Info.offset = 0;
17548 Info.align.reset();
17549 // volatile stores with NEON intrinsics not supported
17550 Info.flags = MachineMemOperand::MOStore;
17551 return true;
17552 }
17553 case Intrinsic::aarch64_neon_st2lane:
17554 case Intrinsic::aarch64_neon_st3lane:
17555 case Intrinsic::aarch64_neon_st4lane: {
17556 Info.opc = ISD::INTRINSIC_VOID;
17557 unsigned NumElts = 0;
17558 // all the vector type is same
17559 Type *VecTy = I.getArgOperand(0)->getType();
17560 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17561
17562 for (const Value *Arg : I.args()) {
17563 Type *ArgTy = Arg->getType();
17564 if (!ArgTy->isVectorTy())
17565 break;
17566 NumElts += 1;
17567 }
17568
17569 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17570 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17571 Info.offset = 0;
17572 Info.align.reset();
17573 // volatile stores with NEON intrinsics not supported
17574 Info.flags = MachineMemOperand::MOStore;
17575 return true;
17576 }
17577 case Intrinsic::aarch64_ldaxr:
17578 case Intrinsic::aarch64_ldxr: {
17579 Type *ValTy = I.getParamElementType(0);
17580 Info.opc = ISD::INTRINSIC_W_CHAIN;
17581 Info.memVT = MVT::getVT(ValTy);
17582 Info.ptrVal = I.getArgOperand(0);
17583 Info.offset = 0;
17584 Info.align = DL.getABITypeAlign(ValTy);
17586 return true;
17587 }
17588 case Intrinsic::aarch64_stlxr:
17589 case Intrinsic::aarch64_stxr: {
17590 Type *ValTy = I.getParamElementType(1);
17591 Info.opc = ISD::INTRINSIC_W_CHAIN;
17592 Info.memVT = MVT::getVT(ValTy);
17593 Info.ptrVal = I.getArgOperand(1);
17594 Info.offset = 0;
17595 Info.align = DL.getABITypeAlign(ValTy);
17597 return true;
17598 }
17599 case Intrinsic::aarch64_ldaxp:
17600 case Intrinsic::aarch64_ldxp:
17601 Info.opc = ISD::INTRINSIC_W_CHAIN;
17602 Info.memVT = MVT::i128;
17603 Info.ptrVal = I.getArgOperand(0);
17604 Info.offset = 0;
17605 Info.align = Align(16);
17607 return true;
17608 case Intrinsic::aarch64_stlxp:
17609 case Intrinsic::aarch64_stxp:
17610 Info.opc = ISD::INTRINSIC_W_CHAIN;
17611 Info.memVT = MVT::i128;
17612 Info.ptrVal = I.getArgOperand(2);
17613 Info.offset = 0;
17614 Info.align = Align(16);
17616 return true;
17617 case Intrinsic::aarch64_sve_ldnt1: {
17618 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
17619 Info.opc = ISD::INTRINSIC_W_CHAIN;
17620 Info.memVT = MVT::getVT(I.getType());
17621 Info.ptrVal = I.getArgOperand(1);
17622 Info.offset = 0;
17623 Info.align = DL.getABITypeAlign(ElTy);
17625 return true;
17626 }
17627 case Intrinsic::aarch64_sve_stnt1: {
17628 Type *ElTy =
17629 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
17630 Info.opc = ISD::INTRINSIC_W_CHAIN;
17631 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
17632 Info.ptrVal = I.getArgOperand(2);
17633 Info.offset = 0;
17634 Info.align = DL.getABITypeAlign(ElTy);
17636 return true;
17637 }
17638 case Intrinsic::aarch64_mops_memset_tag: {
17639 Value *Dst = I.getArgOperand(0);
17640 Value *Val = I.getArgOperand(1);
17641 Info.opc = ISD::INTRINSIC_W_CHAIN;
17642 Info.memVT = MVT::getVT(Val->getType());
17643 Info.ptrVal = Dst;
17644 Info.offset = 0;
17645 Info.align = I.getParamAlign(0).valueOrOne();
17646 Info.flags = MachineMemOperand::MOStore;
17647 // The size of the memory being operated on is unknown at this point
17648 Info.size = MemoryLocation::UnknownSize;
17649 return true;
17650 }
17651 default:
17652 break;
17653 }
17654
17655 return false;
17656}
17657
17659 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
17660 std::optional<unsigned> ByteOffset) const {
17661 // TODO: This may be worth removing. Check regression tests for diffs.
17662 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
17663 ByteOffset))
17664 return false;
17665
17666 // If we're reducing the load width in order to avoid having to use an extra
17667 // instruction to do extension then it's probably a good idea.
17668 if (ExtTy != ISD::NON_EXTLOAD)
17669 return true;
17670 // Don't reduce load width if it would prevent us from combining a shift into
17671 // the offset.
17672 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
17673 assert(Mem);
17674 const SDValue &Base = Mem->getBasePtr();
17675 if (Base.getOpcode() == ISD::ADD &&
17676 Base.getOperand(1).getOpcode() == ISD::SHL &&
17677 Base.getOperand(1).hasOneUse() &&
17678 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
17679 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
17680 if (Mem->getMemoryVT().isScalableVector())
17681 return false;
17682 // The shift can be combined if it matches the size of the value being
17683 // loaded (and so reducing the width would make it not match).
17684 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
17685 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
17686 if (ShiftAmount == Log2_32(LoadBytes))
17687 return false;
17688 }
17689 // We have no reason to disallow reducing the load width, so allow it.
17690 return true;
17691}
17692
17693// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
17695 EVT VT = Extend.getValueType();
17696 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
17697 SDValue Extract = Extend.getOperand(0);
17698 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
17699 Extract = Extract.getOperand(0);
17700 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
17701 EVT VecVT = Extract.getOperand(0).getValueType();
17702 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
17703 return false;
17704 }
17705 }
17706 return true;
17707}
17708
17709// Truncations from 64-bit GPR to 32-bit GPR is free.
17711 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17712 return false;
17713 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
17714 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
17715 return NumBits1 > NumBits2;
17716}
17718 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17719 return false;
17720 uint64_t NumBits1 = VT1.getFixedSizeInBits();
17721 uint64_t NumBits2 = VT2.getFixedSizeInBits();
17722 return NumBits1 > NumBits2;
17723}
17724
17725/// Check if it is profitable to hoist instruction in then/else to if.
17726/// Not profitable if I and it's user can form a FMA instruction
17727/// because we prefer FMSUB/FMADD.
17729 if (I->getOpcode() != Instruction::FMul)
17730 return true;
17731
17732 if (!I->hasOneUse())
17733 return true;
17734
17735 Instruction *User = I->user_back();
17736
17737 if (!(User->getOpcode() == Instruction::FSub ||
17738 User->getOpcode() == Instruction::FAdd))
17739 return true;
17740
17742 const Function *F = I->getFunction();
17743 const DataLayout &DL = F->getDataLayout();
17744 Type *Ty = User->getOperand(0)->getType();
17745
17746 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
17748 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17749 I->getFastMathFlags().allowContract()));
17750}
17751
17752// All 32-bit GPR operations implicitly zero the high-half of the corresponding
17753// 64-bit GPR.
17755 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17756 return false;
17757 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
17758 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
17759 return NumBits1 == 32 && NumBits2 == 64;
17760}
17762 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17763 return false;
17764 unsigned NumBits1 = VT1.getSizeInBits();
17765 unsigned NumBits2 = VT2.getSizeInBits();
17766 return NumBits1 == 32 && NumBits2 == 64;
17767}
17768
17770 EVT VT1 = Val.getValueType();
17771 if (isZExtFree(VT1, VT2)) {
17772 return true;
17773 }
17774
17775 if (Val.getOpcode() != ISD::LOAD)
17776 return false;
17777
17778 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
17779 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
17780 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
17781 VT1.getSizeInBits() <= 32);
17782}
17783
17784bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
17785 if (isa<FPExtInst>(Ext))
17786 return false;
17787
17788 // Vector types are not free.
17789 if (Ext->getType()->isVectorTy())
17790 return false;
17791
17792 for (const Use &U : Ext->uses()) {
17793 // The extension is free if we can fold it with a left shift in an
17794 // addressing mode or an arithmetic operation: add, sub, and cmp.
17795
17796 // Is there a shift?
17797 const Instruction *Instr = cast<Instruction>(U.getUser());
17798
17799 // Is this a constant shift?
17800 switch (Instr->getOpcode()) {
17801 case Instruction::Shl:
17802 if (!isa<ConstantInt>(Instr->getOperand(1)))
17803 return false;
17804 break;
17805 case Instruction::GetElementPtr: {
17806 gep_type_iterator GTI = gep_type_begin(Instr);
17807 auto &DL = Ext->getDataLayout();
17808 std::advance(GTI, U.getOperandNo()-1);
17809 Type *IdxTy = GTI.getIndexedType();
17810 // This extension will end up with a shift because of the scaling factor.
17811 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
17812 // Get the shift amount based on the scaling factor:
17813 // log2(sizeof(IdxTy)) - log2(8).
17814 if (IdxTy->isScalableTy())
17815 return false;
17816 uint64_t ShiftAmt =
17817 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
17818 3;
17819 // Is the constant foldable in the shift of the addressing mode?
17820 // I.e., shift amount is between 1 and 4 inclusive.
17821 if (ShiftAmt == 0 || ShiftAmt > 4)
17822 return false;
17823 break;
17824 }
17825 case Instruction::Trunc:
17826 // Check if this is a noop.
17827 // trunc(sext ty1 to ty2) to ty1.
17828 if (Instr->getType() == Ext->getOperand(0)->getType())
17829 continue;
17830 [[fallthrough]];
17831 default:
17832 return false;
17833 }
17834
17835 // At this point we can use the bfm family, so this extension is free
17836 // for that use.
17837 }
17838 return true;
17839}
17840
17841static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
17842 unsigned NumElts, bool IsLittleEndian,
17843 SmallVectorImpl<int> &Mask) {
17844 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
17845 return false;
17846
17847 assert(DstWidth % SrcWidth == 0 &&
17848 "TBL lowering is not supported for a conversion instruction with this "
17849 "source and destination element type.");
17850
17851 unsigned Factor = DstWidth / SrcWidth;
17852 unsigned MaskLen = NumElts * Factor;
17853
17854 Mask.clear();
17855 Mask.resize(MaskLen, NumElts);
17856
17857 unsigned SrcIndex = 0;
17858 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
17859 Mask[I] = SrcIndex++;
17860
17861 return true;
17862}
17863
17865 FixedVectorType *ZExtTy,
17866 FixedVectorType *DstTy,
17867 bool IsLittleEndian) {
17868 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17869 unsigned NumElts = SrcTy->getNumElements();
17870 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17871 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17872
17873 SmallVector<int> Mask;
17874 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
17875 return nullptr;
17876
17877 auto *FirstEltZero = Builder.CreateInsertElement(
17878 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17879 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17880 Result = Builder.CreateBitCast(Result, DstTy);
17881 if (DstTy != ZExtTy)
17882 Result = Builder.CreateZExt(Result, ZExtTy);
17883 return Result;
17884}
17885
17887 FixedVectorType *DstTy,
17888 bool IsLittleEndian) {
17889 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17890 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17891 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17892
17893 SmallVector<int> Mask;
17894 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
17895 !IsLittleEndian, Mask))
17896 return nullptr;
17897
17898 auto *FirstEltZero = Builder.CreateInsertElement(
17899 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17900
17901 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17902}
17903
17904static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
17905 IRBuilder<> Builder(TI);
17907 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
17908 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
17909 auto *DstTy = cast<FixedVectorType>(TI->getType());
17910 assert(SrcTy->getElementType()->isIntegerTy() &&
17911 "Non-integer type source vector element is not supported");
17912 assert(DstTy->getElementType()->isIntegerTy(8) &&
17913 "Unsupported destination vector element type");
17914 unsigned SrcElemTySz =
17915 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17916 unsigned DstElemTySz =
17917 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17918 assert((SrcElemTySz % DstElemTySz == 0) &&
17919 "Cannot lower truncate to tbl instructions for a source element size "
17920 "that is not divisible by the destination element size");
17921 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
17922 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
17923 "Unsupported source vector element type size");
17924 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
17925
17926 // Create a mask to choose every nth byte from the source vector table of
17927 // bytes to create the truncated destination vector, where 'n' is the truncate
17928 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
17929 // 0,8,16,..Y*8th bytes for the little-endian format
17931 for (int Itr = 0; Itr < 16; Itr++) {
17932 if (Itr < NumElements)
17933 MaskConst.push_back(Builder.getInt8(
17934 IsLittleEndian ? Itr * TruncFactor
17935 : Itr * TruncFactor + (TruncFactor - 1)));
17936 else
17937 MaskConst.push_back(Builder.getInt8(255));
17938 }
17939
17940 int MaxTblSz = 128 * 4;
17941 int MaxSrcSz = SrcElemTySz * NumElements;
17942 int ElemsPerTbl =
17943 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
17944 assert(ElemsPerTbl <= 16 &&
17945 "Maximum elements selected using TBL instruction cannot exceed 16!");
17946
17947 int ShuffleCount = 128 / SrcElemTySz;
17948 SmallVector<int> ShuffleLanes;
17949 for (int i = 0; i < ShuffleCount; ++i)
17950 ShuffleLanes.push_back(i);
17951
17952 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
17953 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
17954 // call TBL & save the result in a vector of TBL results for combining later.
17956 while (ShuffleLanes.back() < NumElements) {
17957 Parts.push_back(Builder.CreateBitCast(
17958 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
17959
17960 if (Parts.size() == 4) {
17961 Parts.push_back(ConstantVector::get(MaskConst));
17962 Results.push_back(
17963 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
17964 Parts.clear();
17965 }
17966
17967 for (int i = 0; i < ShuffleCount; ++i)
17968 ShuffleLanes[i] += ShuffleCount;
17969 }
17970
17971 assert((Parts.empty() || Results.empty()) &&
17972 "Lowering trunc for vectors requiring different TBL instructions is "
17973 "not supported!");
17974 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
17975 // registers
17976 if (!Parts.empty()) {
17977 Intrinsic::ID TblID;
17978 switch (Parts.size()) {
17979 case 1:
17980 TblID = Intrinsic::aarch64_neon_tbl1;
17981 break;
17982 case 2:
17983 TblID = Intrinsic::aarch64_neon_tbl2;
17984 break;
17985 case 3:
17986 TblID = Intrinsic::aarch64_neon_tbl3;
17987 break;
17988 }
17989
17990 Parts.push_back(ConstantVector::get(MaskConst));
17991 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
17992 }
17993
17994 // Extract the destination vector from TBL result(s) after combining them
17995 // where applicable. Currently, at most two TBLs are supported.
17996 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
17997 "more than 2 tbl instructions!");
17998 Value *FinalResult = Results[0];
17999 if (Results.size() == 1) {
18000 if (ElemsPerTbl < 16) {
18001 SmallVector<int> FinalMask(ElemsPerTbl);
18002 std::iota(FinalMask.begin(), FinalMask.end(), 0);
18003 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
18004 }
18005 } else {
18006 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
18007 if (ElemsPerTbl < 16) {
18008 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
18009 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
18010 } else {
18011 std::iota(FinalMask.begin(), FinalMask.end(), 0);
18012 }
18013 FinalResult =
18014 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
18015 }
18016
18017 TI->replaceAllUsesWith(FinalResult);
18018 TI->eraseFromParent();
18019}
18020
18022 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
18023 // shuffle_vector instructions are serialized when targeting SVE,
18024 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
18025 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
18026 return false;
18027
18028 // Try to optimize conversions using tbl. This requires materializing constant
18029 // index vectors, which can increase code size and add loads. Skip the
18030 // transform unless the conversion is in a loop block guaranteed to execute
18031 // and we are not optimizing for size.
18032 Function *F = I->getParent()->getParent();
18033 if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
18034 return false;
18035
18036 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
18037 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
18038 if (!SrcTy || !DstTy)
18039 return false;
18040
18041 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
18042 // lowered to tbl instructions to insert the original i8 elements
18043 // into i8x lanes. This is enabled for cases where it is beneficial.
18044 auto *ZExt = dyn_cast<ZExtInst>(I);
18045 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
18046 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
18047 if (DstWidth % 8 != 0)
18048 return false;
18049
18050 auto *TruncDstType =
18052 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
18053 // the remaining ZExt folded into the user, don't use tbl lowering.
18054 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
18055 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
18058 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
18059 return false;
18060
18061 DstTy = TruncDstType;
18062 }
18063
18064 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
18065 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
18066 // most one extra extend step is needed and using tbl is not profitable.
18067 // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
18068 // udot instruction.
18069 if (SrcWidth * 4 <= DstWidth) {
18070 if (all_of(I->users(), [&](auto *U) {
18071 using namespace llvm::PatternMatch;
18072 auto *SingleUser = cast<Instruction>(&*U);
18073 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
18074 return true;
18075 if (match(SingleUser,
18076 m_Intrinsic<Intrinsic::vector_partial_reduce_add>(
18077 m_Value(), m_Specific(I))))
18078 return true;
18079 return false;
18080 }))
18081 return false;
18082 }
18083
18084 if (DstTy->getScalarSizeInBits() >= 64)
18085 return false;
18086
18087 IRBuilder<> Builder(ZExt);
18089 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
18090 DstTy, Subtarget->isLittleEndian());
18091 if (!Result)
18092 return false;
18093 ZExt->replaceAllUsesWith(Result);
18094 ZExt->eraseFromParent();
18095 return true;
18096 }
18097
18098 auto *UIToFP = dyn_cast<UIToFPInst>(I);
18099 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
18100 DstTy->getElementType()->isFloatTy()) ||
18101 (SrcTy->getElementType()->isIntegerTy(16) &&
18102 DstTy->getElementType()->isDoubleTy()))) {
18103 IRBuilder<> Builder(I);
18105 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
18106 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
18107 assert(ZExt && "Cannot fail for the i8 to float conversion");
18108 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
18109 I->replaceAllUsesWith(UI);
18110 I->eraseFromParent();
18111 return true;
18112 }
18113
18114 auto *SIToFP = dyn_cast<SIToFPInst>(I);
18115 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
18116 DstTy->getElementType()->isFloatTy()) {
18117 IRBuilder<> Builder(I);
18118 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
18120 Subtarget->isLittleEndian());
18121 assert(Shuffle && "Cannot fail for the i8 to float conversion");
18122 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
18123 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
18124 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
18125 I->replaceAllUsesWith(SI);
18126 I->eraseFromParent();
18127 return true;
18128 }
18129
18130 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
18131 // followed by a truncate lowered to using tbl.4.
18132 auto *FPToUI = dyn_cast<FPToUIInst>(I);
18133 if (FPToUI &&
18134 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
18135 SrcTy->getElementType()->isFloatTy() &&
18136 DstTy->getElementType()->isIntegerTy(8)) {
18137 IRBuilder<> Builder(I);
18138 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
18139 VectorType::getInteger(SrcTy));
18140 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
18141 I->replaceAllUsesWith(TruncI);
18142 I->eraseFromParent();
18143 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
18144 return true;
18145 }
18146
18147 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
18148 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
18149 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
18150 // registers
18151 auto *TI = dyn_cast<TruncInst>(I);
18152 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
18153 ((SrcTy->getElementType()->isIntegerTy(32) ||
18154 SrcTy->getElementType()->isIntegerTy(64)) &&
18155 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
18156 createTblForTrunc(TI, Subtarget->isLittleEndian());
18157 return true;
18158 }
18159
18160 return false;
18161}
18162
// NOTE(review): the signature's first line (presumably
// `bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,` — confirm
// against the full source) is elided in this extraction.
// Returns true if two loads of LoadedType can be paired; reports the
// alignment required for pairing via RequiredAlignment.
18164 Align &RequiredAlignment) const {
18165 if (!LoadedType.isSimple() ||
18166 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
18167 return false;
18168 // Cyclone supports unaligned accesses.
18169 RequiredAlignment = Align(1);
18170 unsigned NumBits = LoadedType.getSizeInBits();
// Only 32- and 64-bit scalar loads are candidates for pairing (ldp forms).
18171 return NumBits == 32 || NumBits == 64;
18172 }
18173
18174 /// A helper function for determining the number of interleaved accesses we
18175 /// will generate when lowering accesses of the given type.
// NOTE(review): the signature line itself is elided in this extraction.
18177 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
// Default container width is a 128-bit NEON/SVE-minimum register.
18178 unsigned VecSize = 128;
18179 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
18180 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
// When a fixed-length vector is lowered with SVE, the container is the
// configured minimum SVE register size (never below 128 bits).
18181 if (UseScalable && isa<FixedVectorType>(VecTy))
18182 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
// ceil(total-bits / VecSize), clamped to at least one access.
18183 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
18184 }
18185
// Tags memory operations that the Falkor strided-access metadata marks, so
// later passes can identify them via the target-specific MMO flag.
// NOTE(review): the signature and the fall-through return line are elided in
// this extraction — confirm against the full source.
18188 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
18189 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
18190 return MOStridedAccess;
18192 }
18193
// Decides whether VecTy can be lowered as an interleaved (ldN/stN) access,
// and whether SVE (scalable) instructions should be used for it; the choice
// is reported through the UseScalable out-parameter.
// NOTE(review): the signature line and one condition line (18204) are elided
// in this extraction.
18195 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
18196 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
18197 auto EC = VecTy->getElementCount();
18198 unsigned MinElts = EC.getKnownMinValue();
18199
18200 UseScalable = false;
18201
// Fixed-length vectors need NEON, or SVE configured for fixed-length use.
18202 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
18203 (!Subtarget->useSVEForFixedLengthVectors() ||
18205 return false;
18206
18207 if (isa<ScalableVectorType>(VecTy) &&
18208 !Subtarget->isSVEorStreamingSVEAvailable())
18209 return false;
18210
18211 // Ensure the number of vector elements is greater than 1.
18212 if (MinElts < 2)
18213 return false;
18214
18215 // Ensure the element type is legal.
18216 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
18217 return false;
18218
// Scalable vectors must fill whole 128-bit granules with a power-of-2
// element count.
18219 if (EC.isScalable()) {
18220 UseScalable = true;
18221 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
18222 }
18223
18224 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
18225 if (Subtarget->useSVEForFixedLengthVectors()) {
18226 unsigned MinSVEVectorSize =
18227 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
// Use SVE when the fixed vector tiles the SVE register exactly, or when it
// is smaller but NEON cannot (or should not) handle it.
18228 if (VecSize % MinSVEVectorSize == 0 ||
18229 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
18230 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
18231 UseScalable = true;
18232 return true;
18233 }
18234 }
18235
18236 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
18237 // 128 will be split into multiple interleaved accesses.
18238 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
18239 }
18240
// Maps a fixed-length vector's element type to the full 128-bit SVE container
// type holding that element: 2 x 64-bit, 4 x 32-bit, 8 x 16-bit, or
// 16 x 8-bit lanes.
// NOTE(review): the function's signature line is elided in this extraction
// (presumably `static ScalableVectorType *getSVEContainerIRType(...)`).
18242 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
18243 return ScalableVectorType::get(VTy->getElementType(), 2);
18244
18245 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
18246 return ScalableVectorType::get(VTy->getElementType(), 4);
18247
18248 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
18249 return ScalableVectorType::get(VTy->getElementType(), 8);
18250
18251 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
18252 return ScalableVectorType::get(VTy->getElementType(), 8);
18253
18254 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
18255 return ScalableVectorType::get(VTy->getElementType(), 2);
18256
18257 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
18258 return ScalableVectorType::get(VTy->getElementType(), 4);
18259
18260 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
18261 return ScalableVectorType::get(VTy->getElementType(), 8);
18262
18263 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
18264 return ScalableVectorType::get(VTy->getElementType(), 16);
18265
// All element types supported for interleaved accesses are handled above.
18266 llvm_unreachable("Cannot handle input vector type");
18267 }
18268
18269static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
18270 bool Scalable, Type *LDVTy,
18271 Type *PtrTy) {
18272 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
18273 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
18274 Intrinsic::aarch64_sve_ld3_sret,
18275 Intrinsic::aarch64_sve_ld4_sret};
18276 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
18277 Intrinsic::aarch64_neon_ld3,
18278 Intrinsic::aarch64_neon_ld4};
18279 if (Scalable)
18280 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
18281
18282 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
18283 {LDVTy, PtrTy});
18284}
18285
18286static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
18287 bool Scalable, Type *STVTy,
18288 Type *PtrTy) {
18289 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
18290 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
18291 Intrinsic::aarch64_sve_st3,
18292 Intrinsic::aarch64_sve_st4};
18293 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
18294 Intrinsic::aarch64_neon_st3,
18295 Intrinsic::aarch64_neon_st4};
18296 if (Scalable)
18297 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
18298
18299 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
18300 {STVTy, PtrTy});
18301}
18302
18303 /// Lower an interleaved load into a ldN intrinsic.
18304 ///
18305 /// E.g. Lower an interleaved load (Factor = 2):
18306 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
18307 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
18308 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
18309 ///
18310 /// Into:
18311 /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
18312 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
18313 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
// NOTE(review): the signature line (`bool
// AArch64TargetLowering::lowerInterleavedLoad(` — confirm) and a few interior
// declaration lines are elided in this extraction.
18315 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
18316 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
18317 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18318 "Invalid interleave factor");
18319 assert(!Shuffles.empty() && "Empty shufflevector input");
18320 assert(Shuffles.size() == Indices.size() &&
18321 "Unmatched number of shufflevectors and indices");
18322
// Only plain (unmasked) loads are handled here.
18323 auto *LI = dyn_cast<LoadInst>(Load);
18324 if (!LI)
18325 return false;
18326 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
18327
18328 const DataLayout &DL = LI->getDataLayout();
18329
18330 VectorType *VTy = Shuffles[0]->getType();
18331
18332 // Skip if we do not have NEON and skip illegal vector types. We can
18333 // "legalize" wide vector types into multiple interleaved accesses as long as
18334 // the vector types are divisible by 128.
18335 bool UseScalable;
18336 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18337 return false;
18338
18339 // Check if the interleave is a zext(shuffle), that can be better optimized
18340 // into shift / and masks. For the moment we do this just for uitofp (not
18341 // zext) to avoid issues with widening instructions.
18342 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
18343 using namespace llvm::PatternMatch;
18344 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
18345 SI->getType()->getScalarSizeInBits() * 4 ==
18346 SI->user_back()->getType()->getScalarSizeInBits();
18347 }))
18348 return false;
18349
18350 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18351
18352 auto *FVTy = cast<FixedVectorType>(VTy);
18353
18354 // A pointer vector can not be the return type of the ldN intrinsics. Need to
18355 // load integer vectors first and then convert to pointer vectors.
18356 Type *EltTy = FVTy->getElementType();
18357 if (EltTy->isPointerTy())
18358 FVTy =
18359 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
18360
18361 // If we're going to generate more than one load, reset the sub-vector type
18362 // to something legal.
18363 FVTy = FixedVectorType::get(FVTy->getElementType(),
18364 FVTy->getNumElements() / NumLoads);
18365
18366 auto *LDVTy =
18367 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
18368
18369 IRBuilder<> Builder(LI);
18370
18371 // The base address of the load.
18372 Value *BaseAddr = LI->getPointerOperand();
18373
18374 Type *PtrTy = LI->getPointerOperandType();
18375 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
18376 LDVTy->getElementCount());
18377
18378 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18379 UseScalable, LDVTy, PtrTy);
18380
18381 // Holds sub-vectors extracted from the load intrinsic return values. The
18382 // sub-vectors are associated with the shufflevector instructions they will
18383 // replace.
18385
// For SVE, build the governing predicate: either a pattern matching the
// element count, or ptrue-all when the register size is known exactly.
18386 Value *PTrue = nullptr;
18387 if (UseScalable) {
18388 std::optional<unsigned> PgPattern =
18389 getSVEPredPatternFromNumElements(FVTy->getNumElements());
18390 if (Subtarget->getMinSVEVectorSizeInBits() ==
18391 Subtarget->getMaxSVEVectorSizeInBits() &&
18392 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
18393 PgPattern = AArch64SVEPredPattern::all;
18394
18395 auto *PTruePat =
18396 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
18397 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18398 {PTruePat});
18399 }
18400
18401 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
18402
18403 // If we're generating more than one load, compute the base address of
18404 // subsequent loads as an offset from the previous.
18405 if (LoadCount > 0)
18406 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
18407 FVTy->getNumElements() * Factor);
18408
18409 CallInst *LdN;
18410 if (UseScalable)
18411 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
18412 else
18413 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18414
18415 // Extract and store the sub-vectors returned by the load intrinsic.
18416 for (unsigned i = 0; i < Shuffles.size(); i++) {
18417 ShuffleVectorInst *SVI = Shuffles[i];
18418 unsigned Index = Indices[i];
18419
18420 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
18421
// SVE ldN returns scalable vectors; extract the fixed-width prefix.
18422 if (UseScalable)
18423 SubVec = Builder.CreateExtractVector(FVTy, SubVec, uint64_t(0));
18424
18425 // Convert the integer vector to pointer vector if the element is pointer.
18426 if (EltTy->isPointerTy())
18427 SubVec = Builder.CreateIntToPtr(
18429 FVTy->getNumElements()));
18430
18431 SubVecs[SVI].push_back(SubVec);
18432 }
18433 }
18434
18435 // Replace uses of the shufflevector instructions with the sub-vectors
18436 // returned by the load intrinsic. If a shufflevector instruction is
18437 // associated with more than one sub-vector, those sub-vectors will be
18438 // concatenated into a single wide vector.
18439 for (ShuffleVectorInst *SVI : Shuffles) {
18440 auto &SubVec = SubVecs[SVI];
18441 auto *WideVec =
18442 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
18443 SVI->replaceAllUsesWith(WideVec);
18444 }
18445
18446 return true;
18447 }
18448
// Scans up to ~20 instructions in the direction given by the iterator pair
// for a store whose address is exactly 16 bytes from Ptr (after stripping
// constant in-bounds offsets) — i.e. a store that could pair (stp) with one
// at Ptr. Debug/pseudo instructions are skipped and do not count toward the
// lookup distance.
// NOTE(review): the line accumulating Ptr's own offset (18455) is elided in
// this extraction.
18449 template <typename Iter>
18450 bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
18451 int MaxLookupDist = 20;
18452 unsigned IdxWidth = DL.getIndexSizeInBits(0);
18453 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
18454 const Value *PtrA1 =
18456
18457 while (++It != End) {
18458 if (It->isDebugOrPseudoInst())
18459 continue;
18460 if (MaxLookupDist-- == 0)
18461 break;
18462 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
18463 const Value *PtrB1 =
18464 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
18465 DL, OffsetB);
// Same underlying base and exactly one 16-byte pair-slot apart.
18466 if (PtrA1 == PtrB1 &&
18467 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
18468 .abs() == 16)
18469 return true;
18470 }
18471 }
18472
18473 return false;
18474 }
18475
18476 /// Lower an interleaved store into a stN intrinsic.
18477 ///
18478 /// E.g. Lower an interleaved store (Factor = 3):
18479 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
18480 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
18481 /// store <12 x i32> %i.vec, <12 x i32>* %ptr
18482 ///
18483 /// Into:
18484 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
18485 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
18486 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
18487 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
18488 ///
18489 /// Note that the new shufflevectors will be removed and we'll only generate one
18490 /// st3 instruction in CodeGen.
18491 ///
18492 /// Example for a more general valid mask (Factor 3). Lower:
18493 /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
18494 /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
18495 /// store <12 x i32> %i.vec, <12 x i32>* %ptr
18496 ///
18497 /// Into:
18498 /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
18499 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
18500 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
18501 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
// NOTE(review): the signature's first line and the declaration of the
// per-iteration operand list `Ops` (18609) are elided in this extraction.
18503 Value *LaneMask,
18504 ShuffleVectorInst *SVI,
18505 unsigned Factor,
18506 const APInt &GapMask) const {
18507
18508 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18509 "Invalid interleave factor");
// Only plain (unmasked) stores are handled here.
18510 auto *SI = dyn_cast<StoreInst>(Store);
18511 if (!SI)
18512 return false;
18513 assert(!LaneMask && GapMask.popcount() == Factor &&
18514 "Unexpected mask on store");
18515
18516 auto *VecTy = cast<FixedVectorType>(SVI->getType());
18517 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
18518
18519 unsigned LaneLen = VecTy->getNumElements() / Factor;
18520 Type *EltTy = VecTy->getElementType();
18521 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
18522
18523 const DataLayout &DL = SI->getDataLayout();
18524 bool UseScalable;
18525
18526 // Skip if we do not have NEON and skip illegal vector types. We can
18527 // "legalize" wide vector types into multiple interleaved accesses as long as
18528 // the vector types are divisible by 128.
18529 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
18530 return false;
18531
18532 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
18533
18534 Value *Op0 = SVI->getOperand(0);
18535 Value *Op1 = SVI->getOperand(1);
18536 IRBuilder<> Builder(SI);
18537
18538 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
18539 // vectors to integer vectors.
18540 if (EltTy->isPointerTy()) {
18541 Type *IntTy = DL.getIntPtrType(EltTy);
18542 unsigned NumOpElts =
18543 cast<FixedVectorType>(Op0->getType())->getNumElements();
18544
18545 // Convert to the corresponding integer vector.
18546 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
18547 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
18548 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
18549
18550 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
18551 }
18552
18553 // If we're going to generate more than one store, reset the lane length
18554 // and sub-vector type to something legal.
18555 LaneLen /= NumStores;
18556 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
18557
18558 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
18559 : SubVecTy;
18560
18561 // The base address of the store.
18562 Value *BaseAddr = SI->getPointerOperand();
18563
18564 auto Mask = SVI->getShuffleMask();
18565
18566 // Sanity check if all the indices are NOT in range.
18567 // If mask is `poison`, `Mask` may be a vector of -1s.
18568 // If all of them are `poison`, OOB read will happen later.
18569 if (llvm::all_of(Mask, equal_to(PoisonMaskElem))) {
18570 return false;
18571 }
18572 // A 64bit st2 which does not start at element 0 will involved adding extra
18573 // ext elements making the st2 unprofitable, and if there is a nearby store
18574 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
18575 // zip;ldp pair which has higher throughput.
18576 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
18577 (Mask[0] != 0 ||
18578 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
18579 DL) ||
18580 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
18581 BaseAddr, DL)))
18582 return false;
18583
18584 Type *PtrTy = SI->getPointerOperandType();
18585 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
18586 STVTy->getElementCount());
18587
18588 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18589 UseScalable, STVTy, PtrTy);
18590
// For SVE, build the governing predicate: a pattern matching the element
// count, or ptrue-all when the register size is known exactly.
18591 Value *PTrue = nullptr;
18592 if (UseScalable) {
18593 std::optional<unsigned> PgPattern =
18594 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
18595 if (Subtarget->getMinSVEVectorSizeInBits() ==
18596 Subtarget->getMaxSVEVectorSizeInBits() &&
18597 Subtarget->getMinSVEVectorSizeInBits() ==
18598 DL.getTypeSizeInBits(SubVecTy))
18599 PgPattern = AArch64SVEPredPattern::all;
18600
18601 auto *PTruePat =
18602 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
18603 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18604 {PTruePat});
18605 }
18606
18607 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
18608
18610
18611 // Split the shufflevector operands into sub vectors for the new stN call.
18612 for (unsigned i = 0; i < Factor; i++) {
18613 Value *Shuffle;
18614 unsigned IdxI = StoreCount * LaneLen * Factor + i;
18615 if (Mask[IdxI] >= 0) {
18616 Shuffle = Builder.CreateShuffleVector(
18617 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
18618 } else {
// First lane is undef: derive the start index from the first defined lane
// of this factor, so the sequential sub-mask still lines up.
18619 unsigned StartMask = 0;
18620 for (unsigned j = 1; j < LaneLen; j++) {
18621 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
18622 if (Mask[IdxJ] >= 0) {
18623 StartMask = Mask[IdxJ] - j;
18624 break;
18625 }
18626 }
18627 // Note: Filling undef gaps with random elements is ok, since
18628 // those elements were being written anyway (with undefs).
18629 // In the case of all undefs we're defaulting to using elems from 0
18630 // Note: StartMask cannot be negative, it's checked in
18631 // isReInterleaveMask
18632 Shuffle = Builder.CreateShuffleVector(
18633 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
18634 }
18635
// SVE stN takes scalable operands; widen the fixed vector into a poison
// container.
18636 if (UseScalable)
18637 Shuffle = Builder.CreateInsertVector(STVTy, PoisonValue::get(STVTy),
18638 Shuffle, uint64_t(0));
18639
18640 Ops.push_back(Shuffle);
18641 }
18642
18643 if (UseScalable)
18644 Ops.push_back(PTrue);
18645
18646 // If we generating more than one store, we compute the base address of
18647 // subsequent stores as an offset from the previous.
18648 if (StoreCount > 0)
18649 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
18650 BaseAddr, LaneLen * Factor);
18651
18652 Ops.push_back(BaseAddr);
18653 Builder.CreateCall(StNFunc, Ops);
18654 }
18655 return true;
18656 }
18657
// Lowers a vector.deinterleaveN intrinsic fed by a plain load into ldN
// intrinsic calls (NEON or SVE), splitting into multiple legal ldN ops when
// the type is wider than one register.
// NOTE(review): the signature's first line and the declarations at 18670 and
// 18684 are elided in this extraction.
18659 Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
18660 const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
18661 if (Factor != 2 && Factor != 3 && Factor != 4) {
18662 LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
18663 return false;
18664 }
18665 auto *LI = dyn_cast<LoadInst>(Load);
18666 if (!LI)
18667 return false;
18668 assert(!Mask && "Unexpected mask on a load\n");
18669
18671
18672 const DataLayout &DL = LI->getModule()->getDataLayout();
18673 bool UseScalable;
18674 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18675 return false;
18676
18677 // TODO: Add support for using SVE instructions with fixed types later, using
18678 // the code from lowerInterleavedLoad to obtain the correct container type.
18679 if (UseScalable && !VTy->isScalableTy())
18680 return false;
18681
18682 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18683 VectorType *LdTy =
18685 VTy->getElementCount().divideCoefficientBy(NumLoads));
18686
18687 Type *PtrTy = LI->getPointerOperandType();
18688 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18689 UseScalable, LdTy, PtrTy);
18690
18691 IRBuilder<> Builder(LI);
// SVE ldN needs a governing predicate; an all-true splat covers every lane.
18692 Value *Pred = nullptr;
18693 if (UseScalable)
18694 Pred =
18695 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
18696
18697 Value *BaseAddr = LI->getPointerOperand();
18698 Value *Result = nullptr;
18699 if (NumLoads > 1) {
18700 // Create multiple legal small ldN.
18701 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
18702 for (unsigned I = 0; I < NumLoads; ++I) {
18703 Value *Offset = Builder.getInt64(I * Factor);
18704
18705 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
18706 Value *LdN = nullptr;
18707 if (UseScalable)
18708 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
18709 else
18710 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
// Stitch each partial result into the per-factor accumulator at the slice
// this iteration covers.
18711 Value *Idx =
18712 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
18713 for (unsigned J = 0; J < Factor; ++J) {
18714 ExtractedLdValues[J] = Builder.CreateInsertVector(
18715 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
18716 }
18717 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
18718 }
18719
18720 // Merge the values from different factors.
18721 Result = PoisonValue::get(DI->getType());
18722 for (unsigned J = 0; J < Factor; ++J)
18723 Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
18724 } else {
18725 if (UseScalable)
18726 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
18727 else
18728 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18729 }
18730
18731 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
18732 DI->replaceAllUsesWith(Result);
18733 return true;
18734 }
18735
// Lowers a vector.interleaveN intrinsic feeding a plain store into stN
// intrinsic calls (NEON or SVE), splitting into multiple legal stN ops when
// the type is wider than one register.
// NOTE(review): the signature's first line and the lines at 18744 and 18764
// are elided in this extraction.
18737 Instruction *Store, Value *Mask,
18738 ArrayRef<Value *> InterleavedValues) const {
18739 unsigned Factor = InterleavedValues.size();
18740 if (Factor != 2 && Factor != 3 && Factor != 4) {
18741 LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n");
18742 return false;
18743 }
18745 if (!SI)
18746 return false;
18747 assert(!Mask && "Unexpected mask on plain store");
18748
18749 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
18750 const DataLayout &DL = SI->getModule()->getDataLayout();
18751
18752 bool UseScalable;
18753 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18754 return false;
18755
18756 // TODO: Add support for using SVE instructions with fixed types later, using
18757 // the code from lowerInterleavedStore to obtain the correct container type.
18758 if (UseScalable && !VTy->isScalableTy())
18759 return false;
18760
18761 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
18762
18763 VectorType *StTy =
18765 VTy->getElementCount().divideCoefficientBy(NumStores));
18766
18767 Type *PtrTy = SI->getPointerOperandType();
18768 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18769 UseScalable, StTy, PtrTy);
18770
18771 IRBuilder<> Builder(SI);
18772
18773 Value *BaseAddr = SI->getPointerOperand();
// SVE stN needs a governing predicate; an all-true splat covers every lane.
18774 Value *Pred = nullptr;
18775
18776 if (UseScalable)
18777 Pred =
18778 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
18779
// Operand layout for stN: Factor data vectors, then (for SVE) the
// predicate, then the address — matching the intrinsic's signature.
18780 auto ExtractedValues = InterleavedValues;
18781 SmallVector<Value *, 4> StoreOperands(InterleavedValues);
18782 if (UseScalable)
18783 StoreOperands.push_back(Pred);
18784 StoreOperands.push_back(BaseAddr);
18785 for (unsigned I = 0; I < NumStores; ++I) {
18786 Value *Address = BaseAddr;
18787 if (NumStores > 1) {
18788 Value *Offset = Builder.getInt64(I * Factor);
18789 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
18790 Value *Idx =
18791 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
18792 for (unsigned J = 0; J < Factor; J++) {
18793 StoreOperands[J] =
18794 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
18795 }
18796 // update the address
18797 StoreOperands[StoreOperands.size() - 1] = Address;
18798 }
18799 Builder.CreateCall(StNFunc, StoreOperands);
18800 }
18801 return true;
18802 }
18803
// Picks the widest profitable value type for expanding a memcpy/memset
// inline: v16i8 (NEON dup) for non-tiny non-zero memsets, f128 where FP is
// allowed, then i64/i32 scalars; MVT::Other defers to the generic lowering.
// NOTE(review): the signature's first line and one line inside the
// alignment lambda (18820) are elided in this extraction.
18805 LLVMContext &Context, const MemOp &Op,
18806 const AttributeList &FuncAttributes) const {
// Vector/FP expansion is only legal when the function allows implicit FP use.
18807 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18808 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18809 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18810 // For zero memset, only use AdvSIMD for 32-byte and above. It would have
18811 // taken one instruction to materialize the v2i64 zero and one store (with
18812 // restrictive addressing mode). Just do i64 stores.
18813 // For non-zero memset, use NEON even for smaller sizes as dup is efficient.
18814 bool IsSmallZeroMemset = Op.isMemset() && Op.size() < 32 && Op.isZeroMemset();
// A type is acceptable if the op is suitably aligned, or the target reports
// fast misaligned accesses for it.
18815 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18816 if (Op.isAligned(AlignCheck))
18817 return true;
18818 unsigned Fast;
18819 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18821 Fast;
18822 };
18823
18824 // For non-zero memset, use NEON even for smaller sizes as dup + scalar store
18825 // is efficient
18826 if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset &&
18827 AlignmentIsAcceptable(MVT::v16i8, Align(1)))
18828 return MVT::v16i8;
18829 if (CanUseFP && !IsSmallZeroMemset &&
18830 AlignmentIsAcceptable(MVT::f128, Align(16)))
18831 return MVT::f128;
18832 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18833 return MVT::i64;
18834 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18835 return MVT::i32;
18836 return MVT::Other;
18837 }
18838
// Custom memop lowering: for a small (<16 byte) non-zero memset where
// getOptimalMemOpType picked v16i8, decompose the size greedily into
// i64/i32/i16/i8 stores that can be extracted from a single v16i8 splat,
// instead of letting the generic code downgrade the splat. Falls back to the
// base-class implementation otherwise.
// NOTE(review): the signature's first line and the tail call's first line
// (18892, `return TargetLowering::findOptimalMemOpLowering(`) are elided in
// this extraction.
18840 LLVMContext &Context, std::vector<EVT> &MemOps, unsigned Limit,
18841 const MemOp &Op, unsigned DstAS, unsigned SrcAS,
18842 const AttributeList &FuncAttributes, EVT *LargestVT) const {
18843 // For non-zero memset with v16i8, don't downgrade. We can extract smaller
18844 // stores (i64, i32, i16, i8) from the v16i8 splat efficiently.
18845 EVT VT = getOptimalMemOpType(Context, Op, FuncAttributes);
18846 if (VT == MVT::v16i8 && Op.isMemset() && !Op.isZeroMemset() &&
18847 Op.size() < 16) {
18848 unsigned Size = Op.size();
18849 unsigned RemainingSize = Size;
18850
18851 // Break down the size into stores that we can extract from v16i8.
18852 // We support: i64 (8 bytes), i32 (4 bytes), i16 (2 bytes), i8 (1 byte)
18853 // Use the largest possible stores first to minimize the number of
18854 // operations.
18855 while (RemainingSize > 0) {
18856 EVT TargetVT;
18857
18858 // Try largest stores first
18859 if (RemainingSize >= 8) {
18860 TargetVT = MVT::i64;
18861 RemainingSize -= 8;
18862 } else if (RemainingSize >= 4) {
18863 TargetVT = MVT::i32;
18864 RemainingSize -= 4;
18865 } else if (RemainingSize >= 2) {
18866 TargetVT = MVT::i16;
18867 RemainingSize -= 2;
18868 } else if (RemainingSize >= 1) {
18869 TargetVT = MVT::i8;
18870 RemainingSize -= 1;
18871 } else {
18872 // Should not reach here, but fall back to default implementation
18873 break;
18874 }
18875
18876 MemOps.push_back(TargetVT);
18877 }
18878
18879 // If we successfully decomposed the entire size, set LargestVT to v16i8
18880 // to ensure getMemsetValue generates the efficient vector splat (DUP).
18881 // We don't add v16i8 to MemOps since we only need it for value generation.
18882 if (RemainingSize == 0 && !MemOps.empty()) {
18883 if (LargestVT)
18884 *LargestVT = VT; // v16i8 for vector splat generation
18885 return true;
18886 }
18887
18888 // Clear MemOps if we didn't successfully handle everything
18889 MemOps.clear();
18890 }
18891 // Otherwise, use the default implementation
18893 Context, MemOps, Limit, Op, DstAS, SrcAS, FuncAttributes, LargestVT);
18894 }
18895
// GlobalISel counterpart of getOptimalMemOpType: same policy expressed as
// LLTs (v2s64 vector, s128, s64, s32; empty LLT defers to generic lowering).
// NOTE(review): the signature's first line and one line inside the
// alignment lambda (18911) are elided in this extraction.
18897 const MemOp &Op, const AttributeList &FuncAttributes) const {
// Vector/FP expansion is only legal when the function allows implicit FP use.
18898 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18899 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18900 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18901 // For zero memset, only use AdvSIMD for 32-byte and above. It would have
18902 // taken one instruction to materialize the v2i64 zero and one store (with
18903 // restrictive addressing mode). Just do i64 stores.
18904 // For non-zero memset, use NEON even for smaller sizes as dup is efficient.
18905 bool IsSmallZeroMemset = Op.isMemset() && Op.size() < 32 && Op.isZeroMemset();
18906 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18907 if (Op.isAligned(AlignCheck))
18908 return true;
18909 unsigned Fast;
18910 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18912 Fast;
18913 };
18914
18915 // For non-zero memset, use NEON for all sizes where it's beneficial.
18916 // NEON dup + scalar store works for any alignment and is efficient.
18917 if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset &&
18918 AlignmentIsAcceptable(MVT::v16i8, Align(1)))
18919 return LLT::fixed_vector(2, 64);
18920 if (CanUseFP && !IsSmallZeroMemset &&
18921 AlignmentIsAcceptable(MVT::f128, Align(16)))
18922 return LLT::scalar(128);
18923 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18924 return LLT::scalar(64);
18925 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18926 return LLT::scalar(32);
18927 return LLT();
18928 }
18929
18930 // 12-bit optionally shifted immediates are legal for adds.
// NOTE(review): the signature line is elided in this extraction (presumably
// `bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {`).
// Guard INT64_MIN explicitly: std::abs on it below would overflow (UB).
18932 if (Immed == std::numeric_limits<int64_t>::min()) {
18933 return false;
18934 }
18935 // Same encoding for add/sub, just flip the sign.
18936 return isLegalArithImmed((uint64_t)std::abs(Immed));
18937 }
18938
18940 // We will only emit addvl/inc* instructions for SVE2
18941 if (!Subtarget->hasSVE2())
18942 return false;
18943
18944 // addvl's immediates are in terms of the number of bytes in a register.
18945 // Since there are 16 in the base supported size (128bits), we need to
18946 // divide the immediate by that much to give us a useful immediate to
18947 // multiply by vscale. We can't have a remainder as a result of this.
18948 if (Imm % 16 == 0)
18949 return isInt<6>(Imm / 16);
18950
18951 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
18952 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
18953 // of addvl as a result, so only take h|w|d into account.
18954 // Dec[h|w|d] will cover subtractions.
18955 // Immediates are in the range [1,16], so we can't do a 2's complement check.
18956 // FIXME: Can we make use of other patterns to cover other immediates?
18957
18958 // inch|dech
18959 if (Imm % 8 == 0)
18960 return std::abs(Imm / 8) <= 16;
18961 // incw|decw
18962 if (Imm % 4 == 0)
18963 return std::abs(Imm / 4) <= 16;
18964 // incd|decd
18965 if (Imm % 2 == 0)
18966 return std::abs(Imm / 2) <= 16;
18967
18968 return false;
18969}
18970
18971// Return false to prevent folding
18972// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
18973// if the folding leads to worse code.
18975 SDValue AddNode, SDValue ConstNode) const {
18976 // Let the DAGCombiner decide for vector types and large types.
18977 const EVT VT = AddNode.getValueType();
18978 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
18979 return true;
18980
18981 // It is worse if c1 is legal add immediate, while c1*c2 is not
18982 // and has to be composed by at least two instructions.
18983 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
18984 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
18985 const int64_t C1 = C1Node->getSExtValue();
18986 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
18988 return true;
18990 // Adapt to the width of a register.
18991 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
18992 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
18993 if (Insn.size() > 1)
18994 return false;
18995
18996 // Default to true and let the DAGCombiner decide.
18997 return true;
18998}
18999
19000// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
19001// immediates is the same as for an add or a sub.
19003 return isLegalAddImmediate(Immed);
19004}
19005
19006/// isLegalAddressingMode - Return true if the addressing mode represented
19007/// by AM is legal for this target, for a load/store of the specified type.
19009 const AddrMode &AMode, Type *Ty,
19010 unsigned AS, Instruction *I) const {
19011 // AArch64 has five basic addressing modes:
19012 // reg
19013 // reg + 9-bit signed offset
19014 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
19015 // reg1 + reg2
19016 // reg + SIZE_IN_BYTES * reg
19017
19018 // No global is ever allowed as a base.
19019 if (AMode.BaseGV)
19020 return false;
19021
19022 // No reg+reg+imm addressing.
19023 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
19024 return false;
19025
19026 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
19027 // `2*ScaledReg` into `BaseReg + ScaledReg`
19028 AddrMode AM = AMode;
19029 if (AM.Scale && !AM.HasBaseReg) {
19030 if (AM.Scale == 1) {
19031 AM.HasBaseReg = true;
19032 AM.Scale = 0;
19033 } else if (AM.Scale == 2) {
19034 AM.HasBaseReg = true;
19035 AM.Scale = 1;
19036 } else {
19037 return false;
19038 }
19039 }
19040
19041 // A base register is required in all addressing modes.
19042 if (!AM.HasBaseReg)
19043 return false;
19044
19045 if (Ty->isScalableTy()) {
19046 if (isa<ScalableVectorType>(Ty)) {
19047 // See if we have a foldable vscale-based offset, for vector types which
19048 // are either legal or smaller than the minimum; more work will be
19049 // required if we need to consider addressing for types which need
19050 // legalization by splitting.
19051 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
19052 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
19053 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
19054 isPowerOf2_64(VecNumBytes))
19055 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
19056
19057 uint64_t VecElemNumBytes =
19058 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
19059 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
19060 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
19061 }
19062
19063 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
19064 }
19065
19066 // No scalable offsets allowed for non-scalable types.
19067 if (AM.ScalableOffset)
19068 return false;
19069
19070 // check reg + imm case:
19071 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
19072 uint64_t NumBytes = 0;
19073 if (Ty->isSized()) {
19074 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
19075 NumBytes = NumBits / 8;
19076 if (!isPowerOf2_64(NumBits))
19077 NumBytes = 0;
19078 }
19079
19080 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
19081 AM.Scale);
19082}
19083
19084// Check whether the 2 offsets belong to the same imm24 range, and their high
19085// 12bits are same, then their high part can be decoded with the offset of add.
19086int64_t
19088 int64_t MaxOffset) const {
19089 int64_t HighPart = MinOffset & ~0xfffULL;
19090 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
19091 // Rebase the value to an integer multiple of imm12.
19092 return HighPart;
19093 }
19094
19095 return 0;
19096}
19097
19099 // Consider splitting large offset of struct or array.
19100 return true;
19101}
19102
19104 const MachineFunction &MF, EVT VT) const {
19105 EVT ScalarVT = VT.getScalarType();
19106
19107 if (!ScalarVT.isSimple())
19108 return false;
19109
19110 switch (ScalarVT.getSimpleVT().SimpleTy) {
19111 case MVT::f16:
19112 return Subtarget->hasFullFP16();
19113 case MVT::f32:
19114 case MVT::f64:
19115 return true;
19116 case MVT::bf16:
19117 return VT.isScalableVector() && Subtarget->hasBF16() &&
19118 Subtarget->isNonStreamingSVEorSME2Available();
19119 default:
19120 break;
19121 }
19122
19123 return false;
19124}
19125
19127 Type *Ty) const {
19128 switch (Ty->getScalarType()->getTypeID()) {
19129 case Type::FloatTyID:
19130 case Type::DoubleTyID:
19131 return true;
19132 default:
19133 return false;
19134 }
19135}
19136
19138 EVT VT, CodeGenOptLevel OptLevel) const {
19139 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
19141}
19142
19143const MCPhysReg *
19145 // LR is a callee-save register, but we must treat it as clobbered by any call
19146 // site. Hence we include LR in the scratch registers, which are in turn added
19147 // as implicit-defs for stackmaps and patchpoints.
19148 static const MCPhysReg ScratchRegs[] = {
19149 AArch64::X16, AArch64::X17, AArch64::LR, 0
19150 };
19151 return ScratchRegs;
19152}
19153
19155 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
19156 return RCRegs;
19157}
19158
19159bool
19161 CombineLevel Level) const {
19162 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
19163 N->getOpcode() == ISD::SRL) &&
19164 "Expected shift op");
19165
19166 SDValue ShiftLHS = N->getOperand(0);
19167 EVT VT = N->getValueType(0);
19168
19169 if (!ShiftLHS->hasOneUse())
19170 return false;
19171
19172 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
19173 !ShiftLHS.getOperand(0)->hasOneUse())
19174 return false;
19175
19176 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
19177 // combine it with shift 'N' to let it be lowered to UBFX except:
19178 // ((x >> C) & mask) << C.
19179 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
19180 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
19181 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
19182 if (isMask_64(TruncMask)) {
19183 SDValue AndLHS = ShiftLHS.getOperand(0);
19184 if (AndLHS.getOpcode() == ISD::SRL) {
19185 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
19186 if (N->getOpcode() == ISD::SHL)
19187 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
19188 return SRLC->getZExtValue() == SHLC->getZExtValue();
19189 return false;
19190 }
19191 }
19192 }
19193 }
19194 return true;
19195}
19196
19198 const SDNode *N) const {
19199 assert(N->getOpcode() == ISD::XOR &&
19200 (N->getOperand(0).getOpcode() == ISD::SHL ||
19201 N->getOperand(0).getOpcode() == ISD::SRL) &&
19202 "Expected XOR(SHIFT) pattern");
19203
19204 // Only commute if the entire NOT mask is a hidden shifted mask.
19205 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
19206 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
19207 if (XorC && ShiftC) {
19208 unsigned MaskIdx, MaskLen;
19209 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
19210 unsigned ShiftAmt = ShiftC->getZExtValue();
19211 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
19212 if (N->getOperand(0).getOpcode() == ISD::SHL)
19213 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
19214 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
19215 }
19216 }
19217
19218 return false;
19219}
19220
19222 const SDNode *N) const {
19223 assert(((N->getOpcode() == ISD::SHL &&
19224 N->getOperand(0).getOpcode() == ISD::SRL) ||
19225 (N->getOpcode() == ISD::SRL &&
19226 N->getOperand(0).getOpcode() == ISD::SHL)) &&
19227 "Expected shift-shift mask");
19228 // Don't allow multiuse shift folding with the same shift amount.
19229 if (!N->getOperand(0)->hasOneUse())
19230 return false;
19231
19232 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
19233 EVT VT = N->getValueType(0);
19234 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
19235 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
19236 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
19237 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
19238 }
19239
19240 // We do not need to fold when this shifting used in specific load case:
19241 // (ldr x, (add x, (shl (srl x, c1) 2)))
19242 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
19243 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
19244 unsigned ShlAmt = C2->getZExtValue();
19245 if (auto ShouldADD = *N->user_begin();
19246 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
19247 if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
19248 EVT MemVT = Load->getMemoryVT();
19249
19250 if (Load->getValueType(0).isScalableVector())
19251 return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
19252
19253 if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
19254 return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
19255 }
19256 }
19257 }
19258 }
19259
19260 return true;
19261}
19262
19264 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
19265 SDValue Y) const {
19266 return VT.isScalableVector() && isTypeLegal(VT) &&
19267 SelectOpcode == ISD::VSELECT;
19268}
19269
19271 Type *Ty) const {
19272 assert(Ty->isIntegerTy());
19273
19274 unsigned BitSize = Ty->getPrimitiveSizeInBits();
19275 if (BitSize == 0)
19276 return false;
19277
19278 int64_t Val = Imm.getSExtValue();
19279 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
19280 return true;
19281
19282 if (Val < 0)
19283 Val = ~Val;
19284 if (BitSize == 32)
19285 Val &= (1LL << 32) - 1;
19286
19287 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
19288 // MOVZ is free so return true for one or fewer MOVK.
19289 return Shift < 3;
19290}
19291
19293 unsigned Index) const {
19295 return false;
19296
19297 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
19298}
19299
// NOTE(review): the first line of this member function (return type and
// function name) was lost in extraction; only the trailing parameters are
// visible — recover the name from upstream before relying on this block.
    LLVMContext &Context, EVT VT) const {
  // Only applies when the legalizer would expand VT into two integer halves.
  if (getTypeAction(Context, VT) != TypeExpandInteger)
    return false;

  // True only if the half-width integer type is itself legal
  // (e.g. a 128-bit VT whose i64 halves are legal on AArch64).
  EVT LegalTy = EVT::getIntegerVT(Context, VT.getSizeInBits() / 2);
  return getTypeAction(Context, LegalTy) == TargetLowering::TypeLegal;
}
19308
19309/// Turn vector tests of the signbit in the form of:
19310/// xor (sra X, elt_size(X)-1), -1
19311/// into:
19312/// cmge X, X, #0
19314 const AArch64Subtarget *Subtarget) {
19315 EVT VT = N->getValueType(0);
19316 if (!Subtarget->hasNEON() || !VT.isVector())
19317 return SDValue();
19318
19319 // There must be a shift right algebraic before the xor, and the xor must be a
19320 // 'not' operation.
19321 SDValue Shift = N->getOperand(0);
19322 SDValue Ones = N->getOperand(1);
19323 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
19325 return SDValue();
19326
19327 // The shift should be smearing the sign bit across each vector element.
19328 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
19329 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
19330 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
19331 return SDValue();
19332
19333 SDLoc DL(N);
19334 SDValue Zero = DAG.getConstant(0, DL, Shift.getValueType());
19335 return DAG.getSetCC(DL, VT, Shift.getOperand(0), Zero, ISD::SETGE);
19336}
19337
19338// Given a vecreduce_add node, detect the below pattern and convert it to the
19339// node sequence with UABDL, [S|U]ADB and UADDLP.
19340//
19341// i32 vecreduce_add(
19342// v16i32 abs(
19343// v16i32 sub(
19344// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
19345//
19346// or
19347//
19348// i32 vecreduce_add(
19349// v16i32 zext(
19350// v16i16 abs(
19351// v16i16 sub(
19352// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
19353//
19354// =================>
19355// i32 vecreduce_add(
19356// v4i32 UADDLP(
19357// v8i16 add(
19358// v8i16 zext(
19359// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
19360// v8i16 zext(
19361// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
19363 SelectionDAG &DAG) {
19364 // Assumed i32 vecreduce_add
19365 if (N->getValueType(0) != MVT::i32)
19366 return SDValue();
19367
19368 SDValue VecReduceOp0 = N->getOperand(0);
19369 bool SawTrailingZext = false;
19370 // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
19371 if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
19372 VecReduceOp0->getValueType(0) == MVT::v16i32 &&
19373 VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
19374 VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
19375 SawTrailingZext = true;
19376 VecReduceOp0 = VecReduceOp0.getOperand(0);
19377 }
19378
19379 // Peel off an optional post-ABS extend (v16i16 -> v16i32).
19380 MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
19381 // Assumed v16i16 or v16i32 abs input
19382 unsigned Opcode = VecReduceOp0.getOpcode();
19383 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
19384 return SDValue();
19385
19386 SDValue ABS = VecReduceOp0;
19387 // Assumed v16i16 or v16i32 sub
19388 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
19389 ABS->getOperand(0)->getValueType(0) != AbsInputVT)
19390 return SDValue();
19391
19392 SDValue SUB = ABS->getOperand(0);
19393 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
19394 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
19395 // Assumed v16i16 or v16i32 type
19396 if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
19397 SUB->getOperand(1)->getValueType(0) != AbsInputVT)
19398 return SDValue();
19399
19400 // Assumed zext or sext
19401 bool IsZExt = false;
19402 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
19403 IsZExt = true;
19404 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
19405 IsZExt = false;
19406 } else
19407 return SDValue();
19408
19409 SDValue EXT0 = SUB->getOperand(0);
19410 SDValue EXT1 = SUB->getOperand(1);
19411 // Assumed zext's operand has v16i8 type
19412 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
19413 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
19414 return SDValue();
19415
19416 // Pattern is detected. Let's convert it to sequence of nodes.
19417 SDLoc DL(N);
19418
19419 // First, create the node pattern of UABD/SABD.
19420 SDValue UABDHigh8Op0 =
19421 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
19422 DAG.getConstant(8, DL, MVT::i64));
19423 SDValue UABDHigh8Op1 =
19424 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
19425 DAG.getConstant(8, DL, MVT::i64));
19426 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
19427 UABDHigh8Op0, UABDHigh8Op1);
19428 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
19429
19430 // Second, create the node pattern of UABAL.
19431 SDValue UABDLo8Op0 =
19432 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
19433 DAG.getConstant(0, DL, MVT::i64));
19434 SDValue UABDLo8Op1 =
19435 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
19436 DAG.getConstant(0, DL, MVT::i64));
19437 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
19438 UABDLo8Op0, UABDLo8Op1);
19439 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
19440 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
19441
19442 // Third, create the node of UADDLP.
19443 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
19444
19445 // Fourth, create the node of VECREDUCE_ADD.
19446 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
19447}
19448
19449static SDValue
19451 const AArch64Subtarget *ST) {
19452 if (DCI.isBeforeLegalize())
19453 return SDValue();
19454
19455 if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false,
19456 /*IsEqual=*/false))
19457 return While;
19458
19459 if (!N->getValueType(0).isScalableVector() ||
19460 !ST->isSVEorStreamingSVEAvailable() ||
19461 !(ST->hasSVE2p1() || ST->hasSME2()))
19462 return SDValue();
19463
19464 // Count the number of users which are extract_vectors.
19465 unsigned NumExts = count_if(N->users(), [](SDNode *Use) {
19466 return Use->getOpcode() == ISD::EXTRACT_SUBVECTOR;
19467 });
19468
19469 auto MaskEC = N->getValueType(0).getVectorElementCount();
19470 if (!MaskEC.isKnownMultipleOf(NumExts))
19471 return SDValue();
19472
19473 ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumExts);
19474 if (ExtMinEC.getKnownMinValue() < 2)
19475 return SDValue();
19476
19477 SmallVector<SDNode *> Extracts(NumExts, nullptr);
19478 for (SDNode *Use : N->users()) {
19479 if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR)
19480 continue;
19481
19482 // Ensure the extract type is correct (e.g. if NumExts is 4 and
19483 // the mask return type is nxv8i1, each extract should be nxv2i1.
19484 if (Use->getValueType(0).getVectorElementCount() != ExtMinEC)
19485 return SDValue();
19486
19487 // There should be exactly one extract for each part of the mask.
19488 unsigned Offset = Use->getConstantOperandVal(1);
19489 unsigned Part = Offset / ExtMinEC.getKnownMinValue();
19490 if (Extracts[Part] != nullptr)
19491 return SDValue();
19492
19493 Extracts[Part] = Use;
19494 }
19495
19496 SelectionDAG &DAG = DCI.DAG;
19497 SDLoc DL(N);
19498 SDValue ID =
19499 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
19500
19501 SDValue Idx = N->getOperand(0);
19502 SDValue TC = N->getOperand(1);
19503 if (Idx.getValueType() != MVT::i64) {
19504 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
19505 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
19506 }
19507
19508 // Create the whilelo_x2 intrinsics from each pair of extracts
19509 EVT ExtVT = Extracts[0]->getValueType(0);
19510 EVT DoubleExtVT = ExtVT.getDoubleNumVectorElementsVT(*DAG.getContext());
19511 auto R =
19512 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
19513 DCI.CombineTo(Extracts[0], R.getValue(0));
19514 DCI.CombineTo(Extracts[1], R.getValue(1));
19515 SmallVector<SDValue> Concats = {DAG.getNode(
19516 ISD::CONCAT_VECTORS, DL, DoubleExtVT, R.getValue(0), R.getValue(1))};
19517
19518 if (NumExts == 2) {
19519 assert(N->getValueType(0) == DoubleExtVT);
19520 return Concats[0];
19521 }
19522
19523 auto Elts =
19524 DAG.getElementCount(DL, MVT::i64, ExtVT.getVectorElementCount() * 2);
19525 for (unsigned I = 2; I < NumExts; I += 2) {
19526 // After the first whilelo_x2, we need to increment the starting value.
19527 Idx = DAG.getNode(ISD::UADDSAT, DL, MVT::i64, Idx, Elts);
19528 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
19529 DCI.CombineTo(Extracts[I], R.getValue(0));
19530 DCI.CombineTo(Extracts[I + 1], R.getValue(1));
19531 Concats.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, DoubleExtVT,
19532 R.getValue(0), R.getValue(1)));
19533 }
19534
19535 return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), Concats);
19536}
19537
19538// Turn vecreduce.add(ZExt(predicate)) into cntp(predicate).
19540 const AArch64Subtarget *ST) {
19541 SDValue Op = N->getOperand(0);
19542 if (Op->getOpcode() != ISD::ZERO_EXTEND)
19543 return SDValue();
19544
19545 SDValue ZExtOp = Op->getOperand(0);
19546 EVT VT = ZExtOp.getValueType();
19547 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
19549 return SDValue();
19550
19551 SDLoc DL(N);
19552 SDValue Cntp = DAG.getNode(
19553 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
19554 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), ZExtOp,
19555 ZExtOp);
19556 return DAG.getZExtOrTrunc(Cntp, DL, N->getValueType(0));
19557}
19558
19560 const AArch64Subtarget *ST) {
19561 if (SDValue Result = performVecReduceAddCntpCombine(N, DAG, ST))
19562 return Result;
19563
19564 if (!ST->isNeonAvailable())
19565 return SDValue();
19566
19567 if (!ST->hasDotProd())
19569
19570 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
19571 // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
19572 // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
19573 // If we have vectors larger than v16i8 we extract v16i8 vectors,
19574 // Follow the same steps above to get DOT instructions concatenate them
19575 // and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
19576
19577 SDValue Op0 = N->getOperand(0);
19578 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
19579 Op0.getValueType().getVectorElementType() != MVT::i32)
19580 return SDValue();
19581
19582 unsigned ExtOpcode = Op0.getOpcode();
19583 SDValue A = Op0;
19584 SDValue B;
19585 unsigned DotOpcode;
19586 if (ExtOpcode == ISD::MUL) {
19587 A = Op0.getOperand(0);
19588 B = Op0.getOperand(1);
19589 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
19590 return SDValue();
19591 auto OpCodeA = A.getOpcode();
19592 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
19593 return SDValue();
19594
19595 auto OpCodeB = B.getOpcode();
19596 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
19597 return SDValue();
19598
19599 if (OpCodeA == OpCodeB) {
19600 DotOpcode =
19601 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
19602 } else {
19603 // Check USDOT support support
19604 if (!ST->hasMatMulInt8())
19605 return SDValue();
19606 DotOpcode = AArch64ISD::USDOT;
19607 if (OpCodeA == ISD::SIGN_EXTEND)
19608 std::swap(A, B);
19609 }
19610 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
19611 DotOpcode = AArch64ISD::UDOT;
19612 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
19613 DotOpcode = AArch64ISD::SDOT;
19614 } else {
19615 return SDValue();
19616 }
19617
19618 EVT Op0VT = A.getOperand(0).getValueType();
19619 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
19620 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
19621 if (!IsValidElementCount || !IsValidSize)
19622 return SDValue();
19623
19624 SDLoc DL(Op0);
19625 // For non-mla reductions B can be set to 1. For MLA we take the operand of
19626 // the extend B.
19627 if (!B)
19628 B = DAG.getConstant(1, DL, Op0VT);
19629 else
19630 B = B.getOperand(0);
19631
19632 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
19633 unsigned NumOfVecReduce;
19634 EVT TargetType;
19635 if (IsMultipleOf16) {
19636 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
19637 TargetType = MVT::v4i32;
19638 } else {
19639 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
19640 TargetType = MVT::v2i32;
19641 }
19642 // Handle the case where we need to generate only one Dot operation.
19643 if (NumOfVecReduce == 1) {
19644 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
19645 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
19646 A.getOperand(0), B);
19647 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19648 }
19649 // Generate Dot instructions that are multiple of 16.
19650 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
19651 SmallVector<SDValue, 4> SDotVec16;
19652 unsigned I = 0;
19653 for (; I < VecReduce16Num; I += 1) {
19654 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
19655 SDValue Op0 =
19656 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
19657 DAG.getConstant(I * 16, DL, MVT::i64));
19658 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
19659 DAG.getConstant(I * 16, DL, MVT::i64));
19660 SDValue Dot =
19661 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
19662 SDotVec16.push_back(Dot);
19663 }
19664 // Concatenate dot operations.
19665 EVT SDot16EVT =
19666 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
19667 SDValue ConcatSDot16 =
19668 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
19669 SDValue VecReduceAdd16 =
19670 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
19671 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
19672 if (VecReduce8Num == 0)
19673 return VecReduceAdd16;
19674
19675 // Generate the remainder Dot operation that is multiple of 8.
19676 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
19677 SDValue Vec8Op0 =
19678 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
19679 DAG.getConstant(I * 16, DL, MVT::i64));
19680 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
19681 DAG.getConstant(I * 16, DL, MVT::i64));
19682 SDValue Dot =
19683 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
19684 SDValue VecReduceAdd8 =
19685 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19686 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
19687 VecReduceAdd8);
19688}
19689
// Given an (integer) vecreduce, we know the order of the inputs does not
// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
// NOTE(review): this helper's signature line and several condition lines were
// lost in extraction; the surviving code is kept verbatim with the gaps
// marked — recover the missing lines from upstream before editing.
  auto DetectAddExtract = [&](SDValue A) {
    // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
    // UADDLP(x) if found.
    assert(A.getOpcode() == ISD::ADD);
    EVT VT = A.getValueType();
    SDValue Op0 = A.getOperand(0);
    SDValue Op1 = A.getOperand(1);
    // Both operands must be the same flavour of extend (zext or sext).
    if (Op0.getOpcode() != Op1.getOpcode() ||
        (Op0.getOpcode() != ISD::ZERO_EXTEND &&
         Op0.getOpcode() != ISD::SIGN_EXTEND))
      return SDValue();
    SDValue Ext0 = Op0.getOperand(0);
    SDValue Ext1 = Op1.getOperand(0);
    if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
        // NOTE(review): one condition line missing here — presumably the
        // matching EXTRACT_SUBVECTOR check on Ext1; confirm upstream.
        Ext0.getOperand(0) != Ext1.getOperand(0) ||
        // NOTE(review): one condition line missing here; confirm upstream.
      return SDValue();
    // Check that the type is twice the add types, and the extract are from
    // upper/lower parts of the same source.
    // NOTE(review): the opening line of this comparison is missing — it
    // appears to compare the source vector's element count against
                         VT.getVectorNumElements() * 2)
      return SDValue();
    if ((Ext0.getConstantOperandVal(1) != 0 ||
         // NOTE(review): missing line — presumably checks Ext1's extract
         // index against the element count; confirm upstream.
        (Ext1.getConstantOperandVal(1) != 0 ||
         // NOTE(review): missing line — presumably the symmetric check on
         // Ext0's extract index; confirm upstream.
      return SDValue();
    // zext pairs become UADDLP, sext pairs become SADDLP.
    unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
                                                          : AArch64ISD::SADDLP;
    return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
  };

  if (SDValue R = DetectAddExtract(A))
    return R;

  // Look through one extra add on either side, reassociating the result.
  if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
    if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
      return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
                         A.getOperand(1));
  if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
    if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
      return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
                         A.getOperand(0));
  return SDValue();
}
19741
19742// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
19743// UADDLV(concat), where the concat represents the 64-bit zext sources.
19745 // Look for add(zext(64-bit source), zext(64-bit source)), returning
19746 // UADDLV(concat(zext, zext)) if found.
19747 assert(A.getOpcode() == ISD::ADD);
19748 EVT VT = A.getValueType();
19749 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19750 return SDValue();
19751 SDValue Op0 = A.getOperand(0);
19752 SDValue Op1 = A.getOperand(1);
19753 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
19754 return SDValue();
19755 SDValue Ext0 = Op0.getOperand(0);
19756 SDValue Ext1 = Op1.getOperand(0);
19757 EVT ExtVT0 = Ext0.getValueType();
19758 EVT ExtVT1 = Ext1.getValueType();
19759 // Check zext VTs are the same and 64-bit length.
19760 if (ExtVT0 != ExtVT1 ||
19761 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
19762 return SDValue();
19763 // Get VT for concat of zext sources.
19764 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
19765 SDValue Concat =
19766 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
19767
19768 switch (VT.getSimpleVT().SimpleTy) {
19769 case MVT::v2i64:
19770 case MVT::v4i32:
19771 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
19772 case MVT::v8i16: {
19773 SDValue Uaddlv =
19774 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
19775 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
19776 }
19777 default:
19778 llvm_unreachable("Unhandled vector type");
19779 }
19780}
19781
19783 SDValue A = N->getOperand(0);
19784 if (A.getOpcode() == ISD::ADD) {
19785 if (SDValue R = performUADDVAddCombine(A, DAG))
19786 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
19787 else if (SDValue R = performUADDVZextCombine(A, DAG))
19788 return R;
19789 }
19790
19791 // uaddv(A) --> A if all lanes of A are known to be zeros except the 0th lane.
19792 MVT OpVT = A.getSimpleValueType();
19793 assert(N->getSimpleValueType(0) == OpVT &&
19794 "The operand type should be consistent with the result type of UADDV");
19796 Mask.clearBit(0);
19797 KnownBits KnownLeadingLanes = DAG.computeKnownBits(A, Mask);
19798 if (KnownLeadingLanes.isZero())
19799 return A;
19800
19801 return SDValue();
19802}
19803
// NOTE(review): the signature and opening lines of this DAG-combine helper
// were lost in extraction. From the surviving body it demands all bits of
// every lane and asks the target lowering to simplify the node (the call
// shape matches a SimplifyDemandedBits/SimplifyDemandedVectorElts-style
// overload) — recover the missing lines from upstream before editing.
      APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
  APInt DemandedElts =
      APInt::getAllOnes(N->getValueType(0).getVectorNumElements());

  // NOTE(review): the line opening this call (and naming the callee) is
  // missing; it passes the node itself plus the full demanded bits/lanes.
      SDValue(N, 0), DemandedBits, DemandedElts, DCI))
    return SDValue(N, 0);
  return SDValue();
}
19816
19819 const AArch64Subtarget *Subtarget) {
19820 if (DCI.isBeforeLegalizeOps())
19821 return SDValue();
19822
19823 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
19824}
19825
19826SDValue
19827AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
19828 SelectionDAG &DAG,
19829 SmallVectorImpl<SDNode *> &Created) const {
19830 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19831 if (isIntDivCheap(N->getValueType(0), Attr))
19832 return SDValue(N, 0); // Lower SDIV as SDIV
19833
19834 EVT VT = N->getValueType(0);
19835
19836 // If SVE is available, we can generate
19837 // sdiv(x,y) -> ptrue + asrd , where 'y' is positive pow-2 divisor.
19838 // sdiv(x,y) -> ptrue + asrd + subr , where 'y' is negative pow-2 divisor.
19839 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
19840 return SDValue(N, 0);
19841
19842 // fold (sdiv X, pow2)
19843 if ((VT != MVT::i32 && VT != MVT::i64) ||
19844 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19845 return SDValue();
19846
19847 // If the divisor is 2 or -2, the default expansion is better. It will add
19848 // (N->getValueType(0) >> (BitWidth - 1)) to it before shifting right.
19849 if (Divisor == 2 ||
19850 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
19851 return SDValue();
19852
19853 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
19854}
19855
19856SDValue
19857AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
19858 SelectionDAG &DAG,
19859 SmallVectorImpl<SDNode *> &Created) const {
19860 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19861 if (isIntDivCheap(N->getValueType(0), Attr))
19862 return SDValue(N, 0); // Lower SREM as SREM
19863
19864 EVT VT = N->getValueType(0);
19865
19866 // For scalable and fixed types, mark them as cheap so we can handle it much
19867 // later. This allows us to handle larger than legal types.
19868 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
19869 return SDValue(N, 0);
19870
19871 // fold (srem X, pow2)
19872 if ((VT != MVT::i32 && VT != MVT::i64) ||
19873 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19874 return SDValue();
19875
19876 unsigned Lg2 = Divisor.countr_zero();
19877 if (Lg2 == 0)
19878 return SDValue();
19879
19880 SDLoc DL(N);
19881 SDValue N0 = N->getOperand(0);
19882 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
19883 SDValue Zero = DAG.getConstant(0, DL, VT);
19884 SDValue CCVal, CSNeg;
19885 if (Lg2 == 1) {
19886 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
19887 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19888 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
19889
19890 Created.push_back(Cmp.getNode());
19891 Created.push_back(And.getNode());
19892 } else {
19893 SDValue CCVal = getCondCode(DAG, AArch64CC::MI);
19894 SDVTList VTs = DAG.getVTList(VT, FlagsVT);
19895
19896 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
19897 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19898 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
19899 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
19900 Negs.getValue(1));
19901
19902 Created.push_back(Negs.getNode());
19903 Created.push_back(AndPos.getNode());
19904 Created.push_back(AndNeg.getNode());
19905 }
19906
19907 return CSNeg;
19908}
19909
19911 switch(getIntrinsicID(S.getNode())) {
19912 default:
19913 break;
19914 case Intrinsic::aarch64_sve_cntb:
19915 case Intrinsic::aarch64_sve_cnth:
19916 case Intrinsic::aarch64_sve_cntw:
19917 case Intrinsic::aarch64_sve_cntd:
19918 return true;
19919 }
19920 return false;
19921}
19922
19923// Returns the maximum (scalable) value that can be returned by an SVE count
19924// intrinsic. Returns std::nullopt if \p Op is not aarch64_sve_cnt*.
19925static std::optional<ElementCount> getMaxValueForSVECntIntrinsic(SDValue Op) {
19926 Intrinsic::ID IID = getIntrinsicID(Op.getNode());
19927 if (IID == Intrinsic::aarch64_sve_cntp)
19928 return Op.getOperand(1).getValueType().getVectorElementCount();
19929 switch (IID) {
19930 case Intrinsic::aarch64_sve_cntd:
19931 return ElementCount::getScalable(2);
19932 case Intrinsic::aarch64_sve_cntw:
19933 return ElementCount::getScalable(4);
19934 case Intrinsic::aarch64_sve_cnth:
19935 return ElementCount::getScalable(8);
19936 case Intrinsic::aarch64_sve_cntb:
19937 return ElementCount::getScalable(16);
19938 default:
19939 return std::nullopt;
19940 }
19941}
19942
19943/// Calculates what the pre-extend type is, based on the extension
19944/// operation node provided by \p Extend.
19945///
19946/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
19947/// pre-extend type is pulled directly from the operand, while other extend
19948/// operations need a bit more inspection to get this information.
19949///
19950/// \param Extend The SDNode from the DAG that represents the extend operation
19951///
19952/// \returns The type representing the \p Extend source type, or \p MVT::Other
19953/// if no valid type can be determined
19955 switch (Extend.getOpcode()) {
19956 case ISD::SIGN_EXTEND:
19957 case ISD::ZERO_EXTEND:
19958 case ISD::ANY_EXTEND:
19959 return Extend.getOperand(0).getValueType();
19960 case ISD::AssertSext:
19961 case ISD::AssertZext:
19963 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
19964 if (!TypeNode)
19965 return MVT::Other;
19966 return TypeNode->getVT();
19967 }
19968 case ISD::AND: {
19971 if (!Constant)
19972 return MVT::Other;
19973
19974 uint32_t Mask = Constant->getZExtValue();
19975
19976 if (Mask == UCHAR_MAX)
19977 return MVT::i8;
19978 else if (Mask == USHRT_MAX)
19979 return MVT::i16;
19980 else if (Mask == UINT_MAX)
19981 return MVT::i32;
19982
19983 return MVT::Other;
19984 }
19985 default:
19986 return MVT::Other;
19987 }
19988}
19989
19990/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
19991/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
19992/// SExt/ZExt rather than the scalar SExt/ZExt
19994 EVT VT = BV.getValueType();
19995 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
19997 return SDValue();
19998
19999 // Use the first item in the buildvector/shuffle to get the size of the
20000 // extend, and make sure it looks valid.
20001 SDValue Extend = BV->getOperand(0);
20002 unsigned ExtendOpcode = Extend.getOpcode();
20003 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
20004 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
20005 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
20006 ExtendOpcode == ISD::AssertSext;
20007 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
20008 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
20009 return SDValue();
20010 // Shuffle inputs are vector, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
20011 // ensure calculatePreExtendType will work without issue.
20012 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
20013 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
20014 return SDValue();
20015
20016 // Restrict valid pre-extend data type
20017 EVT PreExtendType = calculatePreExtendType(Extend);
20018 if (PreExtendType == MVT::Other ||
20019 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
20020 return SDValue();
20021
20022 // Make sure all other operands are equally extended.
20023 bool SeenZExtOrSExt = !IsAnyExt;
20024 for (SDValue Op : drop_begin(BV->ops())) {
20025 if (Op.isUndef())
20026 continue;
20027
20028 if (calculatePreExtendType(Op) != PreExtendType)
20029 return SDValue();
20030
20031 unsigned Opc = Op.getOpcode();
20032 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
20034 return SDValue();
20035
20036 if (Opc == ISD::ANY_EXTEND)
20037 continue;
20038
20039 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
20041
20042 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
20043 return SDValue();
20044
20045 IsSExt = OpcIsSExt;
20046 SeenZExtOrSExt = true;
20047 }
20048
20049 SDValue NBV;
20050 SDLoc DL(BV);
20051 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
20052 EVT PreExtendVT =
20053 VT.changeVectorElementType(*DAG.getContext(), PreExtendType);
20054 EVT PreExtendLegalType =
20055 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
20057 for (SDValue Op : BV->ops())
20058 NewOps.push_back(Op.isUndef() ? DAG.getPOISON(PreExtendLegalType)
20059 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
20060 PreExtendLegalType));
20061 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
20062 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
20063 EVT PreExtendVT = VT.changeVectorElementType(*DAG.getContext(),
20064 PreExtendType.getScalarType());
20065 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
20066 BV.getOperand(1).isUndef()
20067 ? DAG.getPOISON(PreExtendVT)
20068 : BV.getOperand(1).getOperand(0),
20069 cast<ShuffleVectorSDNode>(BV)->getMask());
20070 }
20071 unsigned ExtOpc = !SeenZExtOrSExt
20073 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
20074 return DAG.getNode(ExtOpc, DL, VT, NBV);
20075}
20076
20077/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
20078/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
20080 // If the value type isn't a vector, none of the operands are going to be dups
20081 EVT VT = Mul->getValueType(0);
20082 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
20083 return SDValue();
20084
20085 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
20086 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
20087
20088 // Neither operands have been changed, don't make any further changes
20089 if (!Op0 && !Op1)
20090 return SDValue();
20091
20092 SDLoc DL(Mul);
20093 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
20094 Op1 ? Op1 : Mul->getOperand(1));
20095}
20096
20097// Multiplying an RDSVL value by a constant can sometimes be done cheaper by
20098// folding a power-of-two factor of the constant into the RDSVL immediate and
20099// compensating with an extra shift.
20100//
20101// We rewrite:
20102// (mul (srl (rdsvl 1), w), x)
20103// to one of:
20104// (shl (rdsvl y), z) if z > 0
20105// (srl (rdsvl y), abs(z)) if z < 0
20106// where integers y, z satisfy x = y * 2^(w + z) and y ∈ [-32, 31].
20108 SDLoc DL(Mul);
20109 EVT VT = Mul->getValueType(0);
20110 SDValue MulOp0 = Mul->getOperand(0);
20111 int ConstMultiplier =
20112 cast<ConstantSDNode>(Mul->getOperand(1))->getSExtValue();
20113 if ((MulOp0->getOpcode() != ISD::SRL) ||
20114 (MulOp0->getOperand(0).getOpcode() != AArch64ISD::RDSVL))
20115 return SDValue();
20116
20117 unsigned AbsConstValue = abs(ConstMultiplier);
20118 unsigned OperandShift =
20119 cast<ConstantSDNode>(MulOp0->getOperand(1))->getZExtValue();
20120
20121 // z ≤ ctz(|x|) - w (largest extra shift we can take while keeping y
20122 // integral)
20123 int UpperBound = llvm::countr_zero(AbsConstValue) - OperandShift;
20124
20125 // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need:
20126 // 2^(w + z) ≥ ceil(x / B) ⇒ z ≥ ceil_log2(ceil(x / B)) - w (LowerBound).
20127 unsigned B = ConstMultiplier < 0 ? 32 : 31;
20128 unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B)
20129 int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - OperandShift;
20130
20131 // No valid solution found.
20132 if (LowerBound > UpperBound)
20133 return SDValue();
20134
20135 // Any value of z in [LowerBound, UpperBound] is valid. Prefer no extra
20136 // shift if possible.
20137 int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound);
20138
20139 // y = x / 2^(w + z)
20140 int32_t RdsvlMul = (AbsConstValue >> (OperandShift + Shift)) *
20141 (ConstMultiplier < 0 ? -1 : 1);
20142 auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
20143 DAG.getSignedConstant(RdsvlMul, DL, MVT::i32));
20144
20145 if (Shift == 0)
20146 return Rdsvl;
20147 return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl,
20148 DAG.getConstant(abs(Shift), DL, MVT::i32),
20150}
20151
20152// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
20153// Same for other types with equivalent constants.
20155 EVT VT = N->getValueType(0);
20156 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
20157 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
20158 return SDValue();
20159 if (N->getOperand(0).getOpcode() != ISD::AND ||
20160 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
20161 return SDValue();
20162
20163 SDValue And = N->getOperand(0);
20164 SDValue Srl = And.getOperand(0);
20165
20166 APInt V1, V2, V3;
20167 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
20168 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
20170 return SDValue();
20171
20172 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
20173 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
20174 V3 != (HalfSize - 1))
20175 return SDValue();
20176
20177 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
20178 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
20179 VT.getVectorElementCount() * 2);
20180
20181 SDLoc DL(N);
20182 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
20183 SDValue Zero = DAG.getConstant(0, DL, In.getValueType());
20184 SDValue CM = DAG.getSetCC(DL, HalfVT, Zero, In, ISD::SETGT);
20185 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
20186}
20187
20188// Transform vector add(zext i8 to i32, zext i8 to i32)
20189// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
20190// This allows extra uses of saddl/uaddl at the lower vector widths, and less
20191// extends.
20193 EVT VT = N->getValueType(0);
20194 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
20195 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
20196 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
20197 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
20198 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
20199 N->getOperand(0).getOperand(0).getValueType() !=
20200 N->getOperand(1).getOperand(0).getValueType())
20201 return SDValue();
20202
20203 if (N->getOpcode() == ISD::MUL &&
20204 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
20205 return SDValue();
20206
20207 SDValue N0 = N->getOperand(0).getOperand(0);
20208 SDValue N1 = N->getOperand(1).getOperand(0);
20209 EVT InVT = N0.getValueType();
20210
20211 EVT S1 = InVT.getScalarType();
20212 EVT S2 = VT.getScalarType();
20213 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
20214 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
20215 SDLoc DL(N);
20216 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
20219 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
20220 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
20221 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
20222 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
20223 : (unsigned)ISD::SIGN_EXTEND,
20224 DL, VT, NewOp);
20225 }
20226 return SDValue();
20227}
20228
20231 const AArch64Subtarget *Subtarget) {
20232
20233 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
20234 return Ext;
20236 return Ext;
20237 if (SDValue Ext = performVectorExtCombine(N, DAG))
20238 return Ext;
20239
20240 if (DCI.isBeforeLegalizeOps())
20241 return SDValue();
20242
20243 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
20244 // and in MachineCombiner pass, add+mul will be combined into madd.
20245 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
20246 SDLoc DL(N);
20247 EVT VT = N->getValueType(0);
20248 SDValue N0 = N->getOperand(0);
20249 SDValue N1 = N->getOperand(1);
20250 SDValue MulOper;
20251 unsigned AddSubOpc;
20252
20253 auto IsAddSubWith1 = [&](SDValue V) -> bool {
20254 AddSubOpc = V->getOpcode();
20255 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
20256 SDValue Opnd = V->getOperand(1);
20257 MulOper = V->getOperand(0);
20258 if (AddSubOpc == ISD::SUB)
20259 std::swap(Opnd, MulOper);
20260 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
20261 return C->isOne();
20262 }
20263 return false;
20264 };
20265
20266 if (IsAddSubWith1(N0)) {
20267 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
20268 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
20269 }
20270
20271 if (IsAddSubWith1(N1)) {
20272 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
20273 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
20274 }
20275
20276 // The below optimizations require a constant RHS.
20277 if (!isa<ConstantSDNode>(N1))
20278 return SDValue();
20279
20280 if (SDValue Ext = performMulRdsvlCombine(N, DAG))
20281 return Ext;
20282
20284 const APInt &ConstValue = C->getAPIntValue();
20285
20286 // Allow the scaling to be folded into the `cnt` instruction by preventing
20287 // the scaling to be obscured here. This makes it easier to pattern match.
20288 if (IsSVECntIntrinsic(N0) ||
20289 (N0->getOpcode() == ISD::TRUNCATE &&
20290 (IsSVECntIntrinsic(N0->getOperand(0)))))
20291 if (ConstValue.sge(1) && ConstValue.sle(16))
20292 return SDValue();
20293
20294 // Multiplication of a power of two plus/minus one can be done more
20295 // cheaply as shift+add/sub. For now, this is true unilaterally. If
20296 // future CPUs have a cheaper MADD instruction, this may need to be
20297 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
20298 // 64-bit is 5 cycles, so this is always a win.
20299 // More aggressively, some multiplications N0 * C can be lowered to
20300 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
20301 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
20302 // TODO: lower more cases.
20303
20304 // TrailingZeroes is used to test if the mul can be lowered to
20305 // shift+add+shift.
20306 unsigned TrailingZeroes = ConstValue.countr_zero();
20307 if (TrailingZeroes) {
20308 // Conservatively do not lower to shift+add+shift if the mul might be
20309 // folded into smul or umul.
20310 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
20311 isZeroExtended(N0, DAG)))
20312 return SDValue();
20313 // Conservatively do not lower to shift+add+shift if the mul might be
20314 // folded into madd or msub.
20315 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
20316 N->user_begin()->getOpcode() == ISD::SUB))
20317 return SDValue();
20318 }
20319 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
20320 // and shift+add+shift.
20321 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
20322 unsigned ShiftAmt;
20323
20324 auto Shl = [&](SDValue N0, unsigned N1) {
20325 if (!N0.getNode())
20326 return SDValue();
20327 // If shift causes overflow, ignore this combine.
20328 if (N1 >= N0.getValueSizeInBits())
20329 return SDValue();
20330 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
20331 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
20332 };
20333 auto Add = [&](SDValue N0, SDValue N1) {
20334 if (!N0.getNode() || !N1.getNode())
20335 return SDValue();
20336 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
20337 };
20338 auto Sub = [&](SDValue N0, SDValue N1) {
20339 if (!N0.getNode() || !N1.getNode())
20340 return SDValue();
20341 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
20342 };
20343 auto Negate = [&](SDValue N) {
20344 if (!N0.getNode())
20345 return SDValue();
20346 SDValue Zero = DAG.getConstant(0, DL, VT);
20347 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
20348 };
20349
20350 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
20351 // C = 45 is equal to (1+4)*(1+8), we don't decompose it into (1+2)*(16-1) as
20352 // the (2^N - 1) can't be execused via a single instruction.
20353 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
20354 unsigned BitWidth = C.getBitWidth();
20355 for (unsigned i = 1; i < BitWidth / 2; i++) {
20356 APInt Rem;
20357 APInt X(BitWidth, (1 << i) + 1);
20358 APInt::sdivrem(C, X, N, Rem);
20359 APInt NVMinus1 = N - 1;
20360 if (Rem == 0 && NVMinus1.isPowerOf2()) {
20361 M = X;
20362 return true;
20363 }
20364 }
20365 return false;
20366 };
20367
20368 // Can the const C be decomposed into (2^M + 1) * 2^N + 1), eg:
20369 // C = 11 is equal to (1+4)*2+1, we don't decompose it into (1+2)*4-1 as
20370 // the (2^N - 1) can't be execused via a single instruction.
20371 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
20372 APInt CVMinus1 = C - 1;
20373 if (CVMinus1.isNegative())
20374 return false;
20375 unsigned TrailingZeroes = CVMinus1.countr_zero();
20376 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
20377 if (SCVMinus1.isPowerOf2()) {
20378 unsigned BitWidth = SCVMinus1.getBitWidth();
20379 M = APInt(BitWidth, SCVMinus1.logBase2());
20380 N = APInt(BitWidth, TrailingZeroes);
20381 return true;
20382 }
20383 return false;
20384 };
20385
20386 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
20387 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
20388 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
20389 APInt CVMinus1 = C - 1;
20390 if (CVMinus1.isNegative())
20391 return false;
20392 unsigned TrailingZeroes = CVMinus1.countr_zero();
20393 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
20394 if (CVPlus1.isPowerOf2()) {
20395 unsigned BitWidth = CVPlus1.getBitWidth();
20396 M = APInt(BitWidth, CVPlus1.logBase2());
20397 N = APInt(BitWidth, TrailingZeroes);
20398 return true;
20399 }
20400 return false;
20401 };
20402
20403 if (ConstValue.isNonNegative()) {
20404 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
20405 // (mul x, 2^N - 1) => (sub (shl x, N), x)
20406 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
20407 // (mul x, (2^M + 1) * (2^N + 1))
20408 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
20409 // (mul x, (2^M + 1) * 2^N + 1))
20410 // => MV = add (shl x, M), x); add (shl MV, N), x)
20411 // (mul x, 1 - (1 - 2^M) * 2^N))
20412 // => MV = sub (x - (shl x, M)); sub (x - (shl MV, N))
20413 APInt SCVMinus1 = ShiftedConstValue - 1;
20414 APInt SCVPlus1 = ShiftedConstValue + 1;
20415 APInt CVPlus1 = ConstValue + 1;
20416 APInt CVM, CVN;
20417 if (SCVMinus1.isPowerOf2()) {
20418 ShiftAmt = SCVMinus1.logBase2();
20419 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
20420 } else if (CVPlus1.isPowerOf2()) {
20421 ShiftAmt = CVPlus1.logBase2();
20422 return Sub(Shl(N0, ShiftAmt), N0);
20423 } else if (SCVPlus1.isPowerOf2()) {
20424 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
20425 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
20426 }
20427 if (Subtarget->hasALULSLFast() &&
20428 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
20429 APInt CVMMinus1 = CVM - 1;
20430 APInt CVNMinus1 = CVN - 1;
20431 unsigned ShiftM1 = CVMMinus1.logBase2();
20432 unsigned ShiftN1 = CVNMinus1.logBase2();
20433 // ALULSLFast implicate that Shifts <= 4 places are fast
20434 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
20435 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
20436 return Add(Shl(MVal, ShiftN1), MVal);
20437 }
20438 }
20439 if (Subtarget->hasALULSLFast() &&
20440 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
20441 unsigned ShiftM = CVM.getZExtValue();
20442 unsigned ShiftN = CVN.getZExtValue();
20443 // ALULSLFast implicate that Shifts <= 4 places are fast
20444 if (ShiftM <= 4 && ShiftN <= 4) {
20445 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
20446 return Add(Shl(MVal, CVN.getZExtValue()), N0);
20447 }
20448 }
20449
20450 if (Subtarget->hasALULSLFast() &&
20451 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
20452 unsigned ShiftM = CVM.getZExtValue();
20453 unsigned ShiftN = CVN.getZExtValue();
20454 // ALULSLFast implicate that Shifts <= 4 places are fast
20455 if (ShiftM <= 4 && ShiftN <= 4) {
20456 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
20457 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
20458 }
20459 }
20460 } else {
20461 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
20462 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
20463 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
20464 APInt SCVPlus1 = -ShiftedConstValue + 1;
20465 APInt CVNegPlus1 = -ConstValue + 1;
20466 APInt CVNegMinus1 = -ConstValue - 1;
20467 if (CVNegPlus1.isPowerOf2()) {
20468 ShiftAmt = CVNegPlus1.logBase2();
20469 return Sub(N0, Shl(N0, ShiftAmt));
20470 } else if (CVNegMinus1.isPowerOf2()) {
20471 ShiftAmt = CVNegMinus1.logBase2();
20472 return Negate(Add(Shl(N0, ShiftAmt), N0));
20473 } else if (SCVPlus1.isPowerOf2()) {
20474 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
20475 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
20476 }
20477 }
20478
20479 return SDValue();
20480}
20481
20483 SelectionDAG &DAG) {
20484 // Take advantage of vector comparisons producing 0 or -1 in each lane to
20485 // optimize away operation when it's from a constant.
20486 //
20487 // The general transformation is:
20488 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
20489 // AND(VECTOR_CMP(x,y), constant2)
20490 // constant2 = UNARYOP(constant)
20491
20492 // Early exit if this isn't a vector operation, the operand of the
20493 // unary operation isn't a bitwise AND, or if the sizes of the operations
20494 // aren't the same.
20495 EVT VT = N->getValueType(0);
20496 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
20497 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
20498 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
20499 return SDValue();
20500
20501 // Now check that the other operand of the AND is a constant. We could
20502 // make the transformation for non-constant splats as well, but it's unclear
20503 // that would be a benefit as it would not eliminate any operations, just
20504 // perform one more step in scalar code before moving to the vector unit.
20505 if (BuildVectorSDNode *BV =
20506 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
20507 // Bail out if the vector isn't a constant.
20508 if (!BV->isConstant())
20509 return SDValue();
20510
20511 // Everything checks out. Build up the new and improved node.
20512 SDLoc DL(N);
20513 EVT IntVT = BV->getValueType(0);
20514 // Create a new constant of the appropriate type for the transformed
20515 // DAG.
20516 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
20517 // The AND node needs bitcasts to/from an integer vector type around it.
20518 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
20519 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
20520 N->getOperand(0)->getOperand(0), MaskConst);
20521 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
20522 return Res;
20523 }
20524
20525 return SDValue();
20526}
20527
20528/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
20529/// functions, this can help to reduce the number of fmovs to/from GPRs.
20530static SDValue
20533 const AArch64Subtarget *Subtarget) {
20534 if (N->isStrictFPOpcode())
20535 return SDValue();
20536
20537 if (DCI.isBeforeLegalizeOps())
20538 return SDValue();
20539
20540 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
20541 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
20542 return SDValue();
20543
20544 auto isSupportedType = [](EVT VT) {
20545 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
20546 };
20547
20548 SDValue SrcVal = N->getOperand(0);
20549 EVT SrcTy = SrcVal.getValueType();
20550 EVT DestTy = N->getValueType(0);
20551
20552 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
20553 return SDValue();
20554
20555 EVT SrcVecTy;
20556 EVT DestVecTy;
20557 if (DestTy.bitsGT(SrcTy)) {
20558 DestVecTy = getPackedSVEVectorVT(DestTy);
20559 SrcVecTy = DestVecTy.changeVectorElementType(*DAG.getContext(), SrcTy);
20560 } else {
20561 SrcVecTy = getPackedSVEVectorVT(SrcTy);
20562 DestVecTy = SrcVecTy.changeVectorElementType(*DAG.getContext(), DestTy);
20563 }
20564
20565 // Ensure the resulting src/dest vector type is legal.
20566 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
20567 return SDValue();
20568
20569 SDLoc DL(N);
20570 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
20571 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
20572 DAG.getPOISON(SrcVecTy), SrcVal, ZeroIdx);
20573 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
20574 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
20575}
20576
20579 const AArch64Subtarget *Subtarget) {
20580 // First try to optimize away the conversion when it's conditionally from
20581 // a constant. Vectors only.
20583 return Res;
20584
20585 if (SDValue Res =
20586 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
20587 return Res;
20588
20589 EVT VT = N->getValueType(0);
20590 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64)
20591 return SDValue();
20592 if (VT == MVT::f16 && !Subtarget->hasFullFP16())
20593 return SDValue();
20594
20595 // Only optimize when the source and destination types have the same width.
20596 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
20597 return SDValue();
20598
20599 // If the result of an integer load is only used by an integer-to-float
20600 // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
20601 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
20602 SDValue N0 = N->getOperand(0);
20603 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
20604 N0.hasOneUse() &&
20605 // Do not change the width of a volatile load.
20606 !cast<LoadSDNode>(N0)->isVolatile()) {
20607 LoadSDNode *LN0 = cast<L