1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
20#include "AArch64Subtarget.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/ArrayRef.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringRef.h"
33#include "llvm/ADT/Twine.h"
61#include "llvm/IR/Attributes.h"
62#include "llvm/IR/Constants.h"
63#include "llvm/IR/DataLayout.h"
64#include "llvm/IR/DebugLoc.h"
66#include "llvm/IR/Function.h"
68#include "llvm/IR/GlobalValue.h"
69#include "llvm/IR/IRBuilder.h"
70#include "llvm/IR/Instruction.h"
73#include "llvm/IR/Intrinsics.h"
74#include "llvm/IR/IntrinsicsAArch64.h"
75#include "llvm/IR/Module.h"
77#include "llvm/IR/Type.h"
78#include "llvm/IR/Use.h"
79#include "llvm/IR/Value.h"
84#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <bitset>
96#include <cassert>
97#include <cctype>
98#include <cstdint>
99#include <cstdlib>
100#include <iterator>
101#include <limits>
102#include <optional>
103#include <tuple>
104#include <utility>
105#include <vector>
106
107using namespace llvm;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future once both implementations are based off MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP operations use ALU ports, and the data dependency
143// will become the bottleneck after this transform on high-end CPUs. So this max
144// leaf node limitation is a guard to ensure cmp+ccmp remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// By turning this on, we will not fall back to DAG ISel when encountering
149// scalable vector types for any instruction, even if SVE is not yet supported
150// for some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157// TODO: This option should be removed once we switch to always using PTRADD in
158// the SelectionDAG.
160 "aarch64-use-featcpa-codegen", cl::Hidden,
161 cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
162 "SelectionDAG for FEAT_CPA"),
163 cl::init(false));
164
165/// Value type used for condition codes.
166constexpr MVT CondCodeVT = MVT::i32;
167
168/// Value type used for NZCV flags.
169constexpr MVT FlagsVT = MVT::i32;
170
171static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
172 AArch64::X3, AArch64::X4, AArch64::X5,
173 AArch64::X6, AArch64::X7};
174static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
175 AArch64::Q3, AArch64::Q4, AArch64::Q5,
176 AArch64::Q6, AArch64::Q7};
177
179
181
182static inline EVT getPackedSVEVectorVT(EVT VT) {
183 switch (VT.getSimpleVT().SimpleTy) {
184 default:
185 llvm_unreachable("unexpected element type for vector");
186 case MVT::i8:
187 return MVT::nxv16i8;
188 case MVT::i16:
189 return MVT::nxv8i16;
190 case MVT::i32:
191 return MVT::nxv4i32;
192 case MVT::i64:
193 return MVT::nxv2i64;
194 case MVT::f16:
195 return MVT::nxv8f16;
196 case MVT::f32:
197 return MVT::nxv4f32;
198 case MVT::f64:
199 return MVT::nxv2f64;
200 case MVT::bf16:
201 return MVT::nxv8bf16;
202 }
203}
204
205// NOTE: Currently there's only a need to return integer vector types. If this
206// changes then just add an extra "type" parameter.
208 switch (EC.getKnownMinValue()) {
209 default:
210 llvm_unreachable("unexpected element count for vector");
211 case 16:
212 return MVT::nxv16i8;
213 case 8:
214 return MVT::nxv8i16;
215 case 4:
216 return MVT::nxv4i32;
217 case 2:
218 return MVT::nxv2i64;
219 }
220}
221
223 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
224 "Expected scalable predicate vector type!");
225 switch (VT.getVectorMinNumElements()) {
226 default:
227 llvm_unreachable("unexpected element count for vector");
228 case 2:
229 return MVT::nxv2i64;
230 case 4:
231 return MVT::nxv4i32;
232 case 8:
233 return MVT::nxv8i16;
234 case 16:
235 return MVT::nxv16i8;
236 }
237}
238
239/// Returns true if VT's elements occupy the lowest bit positions of its
240/// associated register class without any intervening space.
241///
242/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
243/// same register class, but only nxv8f16 can be treated as a packed vector.
244static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
246 "Expected legal vector type!");
247 return VT.isFixedLengthVector() ||
249}
250
251static inline bool isPackedPredicateType(EVT VT, SelectionDAG &DAG) {
253 "Expected legal type!");
254 return VT == MVT::nxv16i1;
255}
256
257/// Returns true if the conceptual representation for \p VT does not map
258/// directly to its physical register representation, meaning there are gaps
259/// between elements in the register. In practice, the vector elements will be
260/// strided by a power of two and placed starting from lane 0. For example,
261/// nxv8i1 or nxv2f32 are unpacked types.
262///
263///\pre VT is a legal type.
264static inline bool isUnpackedType(EVT VT, SelectionDAG &DAG) {
265 bool Res = !isPackedVectorType(VT, DAG) && !isPackedPredicateType(VT, DAG);
266 assert((!Res || VT.isScalableVector()) &&
267 "Unexpected fixed-size unpacked type.");
268 return Res;
269}
270
271// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
272// predicate and end with a passthru value matching the result type.
273static bool isMergePassthruOpcode(unsigned Opc) {
274 switch (Opc) {
275 default:
276 return false;
277 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
278 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
279 case AArch64ISD::REVH_MERGE_PASSTHRU:
280 case AArch64ISD::REVW_MERGE_PASSTHRU:
281 case AArch64ISD::REVD_MERGE_PASSTHRU:
282 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
283 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
284 case AArch64ISD::DUP_MERGE_PASSTHRU:
285 case AArch64ISD::ABS_MERGE_PASSTHRU:
286 case AArch64ISD::NEG_MERGE_PASSTHRU:
287 case AArch64ISD::FNEG_MERGE_PASSTHRU:
288 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
289 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
290 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
291 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
292 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
293 case AArch64ISD::FRINT_MERGE_PASSTHRU:
294 case AArch64ISD::FRINT32_MERGE_PASSTHRU:
295 case AArch64ISD::FRINT64_MERGE_PASSTHRU:
296 case AArch64ISD::FROUND_MERGE_PASSTHRU:
297 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
298 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
299 case AArch64ISD::FTRUNC32_MERGE_PASSTHRU:
300 case AArch64ISD::FTRUNC64_MERGE_PASSTHRU:
301 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
302 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
303 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
304 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
305 case AArch64ISD::FCVTX_MERGE_PASSTHRU:
306 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
307 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
308 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
309 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
310 case AArch64ISD::FABS_MERGE_PASSTHRU:
311 return true;
312 }
313}
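// Illustrative example (not part of the original source): a node such as
//   FNEG_MERGE_PASSTHRU(Pg, Src, Passthru)
// negates the lanes of Src that are active in the predicate Pg and passes
// through the corresponding lanes of Passthru where Pg is inactive, matching
// the predicate-first, passthru-last operand layout described in the comment
// above.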
314
315// Returns true if inactive lanes are known to be zeroed by construction.
317 switch (Op.getOpcode()) {
318 default:
319 return false;
320 // We guarantee i1 splat_vectors to zero the other lanes
323 case AArch64ISD::PTRUE:
324 case AArch64ISD::SETCC_MERGE_ZERO:
325 return true;
327 switch (Op.getConstantOperandVal(0)) {
328 default:
329 return false;
330 case Intrinsic::aarch64_sve_ptrue:
331 case Intrinsic::aarch64_sve_pnext:
332 case Intrinsic::aarch64_sve_cmpeq:
333 case Intrinsic::aarch64_sve_cmpne:
334 case Intrinsic::aarch64_sve_cmpge:
335 case Intrinsic::aarch64_sve_cmpgt:
336 case Intrinsic::aarch64_sve_cmphs:
337 case Intrinsic::aarch64_sve_cmphi:
338 case Intrinsic::aarch64_sve_cmpeq_wide:
339 case Intrinsic::aarch64_sve_cmpne_wide:
340 case Intrinsic::aarch64_sve_cmpge_wide:
341 case Intrinsic::aarch64_sve_cmpgt_wide:
342 case Intrinsic::aarch64_sve_cmplt_wide:
343 case Intrinsic::aarch64_sve_cmple_wide:
344 case Intrinsic::aarch64_sve_cmphs_wide:
345 case Intrinsic::aarch64_sve_cmphi_wide:
346 case Intrinsic::aarch64_sve_cmplo_wide:
347 case Intrinsic::aarch64_sve_cmpls_wide:
348 case Intrinsic::aarch64_sve_fcmpeq:
349 case Intrinsic::aarch64_sve_fcmpne:
350 case Intrinsic::aarch64_sve_fcmpge:
351 case Intrinsic::aarch64_sve_fcmpgt:
352 case Intrinsic::aarch64_sve_fcmpuo:
353 case Intrinsic::aarch64_sve_facgt:
354 case Intrinsic::aarch64_sve_facge:
355 case Intrinsic::aarch64_sve_whilege:
356 case Intrinsic::aarch64_sve_whilegt:
357 case Intrinsic::aarch64_sve_whilehi:
358 case Intrinsic::aarch64_sve_whilehs:
359 case Intrinsic::aarch64_sve_whilele:
360 case Intrinsic::aarch64_sve_whilelo:
361 case Intrinsic::aarch64_sve_whilels:
362 case Intrinsic::aarch64_sve_whilelt:
363 case Intrinsic::aarch64_sve_match:
364 case Intrinsic::aarch64_sve_nmatch:
365 case Intrinsic::aarch64_sve_whilege_x2:
366 case Intrinsic::aarch64_sve_whilegt_x2:
367 case Intrinsic::aarch64_sve_whilehi_x2:
368 case Intrinsic::aarch64_sve_whilehs_x2:
369 case Intrinsic::aarch64_sve_whilele_x2:
370 case Intrinsic::aarch64_sve_whilelo_x2:
371 case Intrinsic::aarch64_sve_whilels_x2:
372 case Intrinsic::aarch64_sve_whilelt_x2:
373 return true;
374 }
375 }
376}
377
378static std::tuple<SDValue, SDValue>
380 SDLoc DL(Disc);
381 SDValue AddrDisc;
382 SDValue ConstDisc;
383
384 // If this is a blend, remember the constant and address discriminators.
385 // Otherwise, it's either a constant discriminator, or a non-blended
386 // address discriminator.
387 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
388 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
389 AddrDisc = Disc->getOperand(1);
390 ConstDisc = Disc->getOperand(2);
391 } else {
392 ConstDisc = Disc;
393 }
394
395 // If the constant discriminator (either the blend RHS, or the entire
396 // discriminator value) isn't a 16-bit constant, bail out, and let the
397 // discriminator be computed separately.
398 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
399 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
400 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
401
402 // If there's no address discriminator, use NoRegister, which we'll later
403 // replace with XZR, or directly use a Z variant of the inst. when available.
404 if (!AddrDisc)
405 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
406
407 return std::make_tuple(
408 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
409 AddrDisc);
410}
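// A sketch of the possible results (illustrative; the values are hypothetical):
//  * Disc = @llvm.ptrauth.blend(%addr, 1234) -> (TargetConstant 1234, %addr)
//  * Disc = constant 1234                    -> (TargetConstant 1234, NoRegister)
//  * otherwise (e.g. a constant wider than 16 bits, or a non-constant,
//    non-blended discriminator)              -> (TargetConstant 0, Disc),
//    leaving the discriminator to be computed separately.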
411
413 const AArch64Subtarget &STI)
414 : TargetLowering(TM, STI), Subtarget(&STI) {
415 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
416 // we have to make something up. Arbitrarily, choose ZeroOrOne.
418 // When comparing vectors the result sets the different elements in the
419 // vector to all-one or all-zero.
421
422 // Set up the register classes.
423 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
424 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
425
426 if (Subtarget->hasLS64()) {
427 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
428 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
430 }
431
432 if (Subtarget->hasFPARMv8()) {
433 addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
434 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
435 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
436 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
437 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
438 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
439 }
440
441 if (Subtarget->hasNEON()) {
442 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
443 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
444
445 addDRType(MVT::v2f32);
446 addDRType(MVT::v8i8);
447 addDRType(MVT::v4i16);
448 addDRType(MVT::v2i32);
449 addDRType(MVT::v1i64);
450 addDRType(MVT::v1f64);
451 addDRType(MVT::v4f16);
452 addDRType(MVT::v4bf16);
453
454 addQRType(MVT::v4f32);
455 addQRType(MVT::v2f64);
456 addQRType(MVT::v16i8);
457 addQRType(MVT::v8i16);
458 addQRType(MVT::v4i32);
459 addQRType(MVT::v2i64);
460 addQRType(MVT::v8f16);
461 addQRType(MVT::v8bf16);
462 }
463
464 if (Subtarget->isSVEorStreamingSVEAvailable()) {
465 // Add legal sve predicate types
466 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
467 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
468 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
469 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
470 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
471
472 // Add sve predicate as counter type
473 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
474
475 // Add legal sve data types
476 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
477 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
478 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
479 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
480
481 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
482 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
483 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
484 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
485 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
486 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
487
488 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
489 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
490 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
491
492 if (Subtarget->useSVEForFixedLengthVectors()) {
495 addRegisterClass(VT, &AArch64::ZPRRegClass);
496
499 addRegisterClass(VT, &AArch64::ZPRRegClass);
500 }
501 }
502
503 // Compute derived properties from the register classes
504 computeRegisterProperties(Subtarget->getRegisterInfo());
505
506 // Provide all sorts of operation actions
534 if (Subtarget->hasFPARMv8()) {
537 }
550
552
556
559
561
562 // Custom lowering hooks are needed for XOR
563 // to fold it into CSINC/CSINV.
566
569
570 // Virtually no operation on f128 is legal, but LLVM can't expand them when
571 // there's a valid register class, so we need custom operations in most cases.
596 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
597 // aren't handled.
598
599 // Lowering for many of the conversions is actually specified by the non-f128
600 // type. The LowerXXX function will be trivial when f128 isn't involved.
625 if (Subtarget->hasFPARMv8()) {
628 }
631 if (Subtarget->hasFPARMv8()) {
634 }
637
642
643 // Variable arguments.
648
649 // Variable-sized objects.
652
653 // Lowering Funnel Shifts to EXTR
658
660
661 // Constant pool entries
663
664 // BlockAddress
666
667 // AArch64 lacks both left-rotate and popcount instructions.
673 }
674
675 // AArch64 doesn't have i32 MULH{S|U}.
678
679 // AArch64 doesn't have {U|S}MUL_LOHI.
684
685 if (Subtarget->hasCSSC()) {
689
691
695
698
703
708 } else {
712
715
718 }
719
725 }
732
733 // Custom lower Add/Sub/Mul with overflow.
746
755
764 if (Subtarget->hasFullFP16()) {
767 } else {
770 }
771
772 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
786 setOperationAction(Op, MVT::f16, Promote);
787 setOperationAction(Op, MVT::v4f16, Expand);
788 setOperationAction(Op, MVT::v8f16, Expand);
789 setOperationAction(Op, MVT::bf16, Promote);
790 setOperationAction(Op, MVT::v4bf16, Expand);
791 setOperationAction(Op, MVT::v8bf16, Expand);
792 }
793
794 // Legalize fcanonicalize to circumvent default expansion
795 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
796 if (Subtarget->hasFullFP16()) {
798 }
799
800 // fpextend from f16 or bf16 to f32 is legal
805 // fpextend from bf16 to f64 needs to be split into two fpextends
808
809 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
810 for (auto Op : {
814 ISD::FADD,
815 ISD::FSUB,
816 ISD::FMUL,
817 ISD::FDIV,
818 ISD::FMA,
851 })
852 setOperationAction(Op, ScalarVT, Promote);
853
854 for (auto Op : {ISD::FNEG, ISD::FABS})
855 setOperationAction(Op, ScalarVT, Legal);
856
857 // Round-to-integer operations need custom lowering for fp16, as Promote
858 // doesn't work because the result type is integer.
862 setOperationAction(Op, ScalarVT, Custom);
863
864 // promote v4f16 to v4f32 when that is known to be safe.
865 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
866 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
867 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
868 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
869 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
870 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
871 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
872 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
873 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
874 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
875 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
876 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
877 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
878 setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
879
888
889 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
890 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
891 setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
892
913 };
914
915 if (!Subtarget->hasFullFP16()) {
916 LegalizeNarrowFP(MVT::f16);
917 }
918 LegalizeNarrowFP(MVT::bf16);
921
922 // AArch64 has implementations of a lot of rounding-like FP operations.
923 // clang-format off
924 for (auto Op :
936 for (MVT Ty : {MVT::f32, MVT::f64})
938 if (Subtarget->hasFullFP16())
939 setOperationAction(Op, MVT::f16, Legal);
940 }
941 // clang-format on
942
943 // Basic strict FP operations are legal
946 for (MVT Ty : {MVT::f32, MVT::f64})
948 if (Subtarget->hasFullFP16())
949 setOperationAction(Op, MVT::f16, Legal);
950 }
951
953
959
961 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
964 } else {
967 }
970
971 // Generate outline atomics library calls only if LSE was not specified for
972 // subtarget
973 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
999 }
1000
1001 if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
1006
1011
1016
1021
1026 }
1027
1028 if (Subtarget->hasLSE128()) {
1029 // Custom lowering because i128 is not legal. Must be replaced by 2x64
1030 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
1034 }
1035
1036 // 128-bit loads and stores can be done without expanding
1037 setOperationAction(ISD::LOAD, MVT::i128, Custom);
1039
1040 // Aligned 128-bit loads and stores are single-copy atomic according to the
1041 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
1042 if (Subtarget->hasLSE2()) {
1045 }
1046
1047 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
1048 // custom lowering, as there are no un-paired non-temporal stores and
1049 // legalization will break up 256 bit inputs.
1050 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
1051 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
1052 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
1053 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1054 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1055 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1056 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1057 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1058
1059 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1060 // custom lowering, as there are no un-paired non-temporal loads and
1061 // legalization will break up 256 bit inputs.
1062 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1063 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1064 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1065 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1066 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1067 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1068 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1069 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1070
1071 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1073
1074 // Issue __sincos_stret if available.
1077
1078 // Make floating-point constants legal for the large code model, so they don't
1079 // become loads from the constant pool.
1080 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1083 }
1084
1085 // AArch64 does not have floating-point extending loads, i1 sign-extending
1086 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1087 for (MVT VT : MVT::fp_valuetypes()) {
1088 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1089 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1090 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1091 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1092 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1093 }
1094 for (MVT VT : MVT::integer_valuetypes())
1095 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1096
1097 for (MVT WideVT : MVT::fp_valuetypes()) {
1098 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1099 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1100 setTruncStoreAction(WideVT, NarrowVT, Expand);
1101 }
1102 }
1103 }
1104
1105 if (Subtarget->hasFPARMv8()) {
1109 }
1110
1111 // Indexed loads and stores are supported.
1112 for (unsigned im = (unsigned)ISD::PRE_INC;
1114 setIndexedLoadAction(im, MVT::i8, Legal);
1115 setIndexedLoadAction(im, MVT::i16, Legal);
1116 setIndexedLoadAction(im, MVT::i32, Legal);
1117 setIndexedLoadAction(im, MVT::i64, Legal);
1118 setIndexedLoadAction(im, MVT::f64, Legal);
1119 setIndexedLoadAction(im, MVT::f32, Legal);
1120 setIndexedLoadAction(im, MVT::f16, Legal);
1121 setIndexedLoadAction(im, MVT::bf16, Legal);
1122 setIndexedStoreAction(im, MVT::i8, Legal);
1123 setIndexedStoreAction(im, MVT::i16, Legal);
1124 setIndexedStoreAction(im, MVT::i32, Legal);
1125 setIndexedStoreAction(im, MVT::i64, Legal);
1126 setIndexedStoreAction(im, MVT::f64, Legal);
1127 setIndexedStoreAction(im, MVT::f32, Legal);
1128 setIndexedStoreAction(im, MVT::f16, Legal);
1129 setIndexedStoreAction(im, MVT::bf16, Legal);
1130 }
1131
1132 // Trap.
1133 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1136
1137 // We combine OR nodes for ccmp operations.
1139 // Try to create BICs for vector ANDs.
1141
1142 // llvm.init.trampoline and llvm.adjust.trampoline
1145
1146 // Vector add and sub nodes may conceal a high-half opportunity.
1147 // Also, try to fold ADD into CSINC/CSINV..
1150
1153
1154 // Try and combine setcc with csel
1156
1158
1166
1168
1170
1172
1176
1179
1181
1183
1185
1187
1191
1193
1197
1198 // In case of strict alignment, avoid an excessive number of byte wide stores.
1201 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1202
1206 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1207
1210 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1211
1214 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1215
1217
1219
1220 EnableExtLdPromotion = true;
1221
1222 // Set required alignment.
1224 // Set preferred alignments.
1225
1226 // Don't align loops on Windows. The SEH unwind info generation needs to
1227 // know the exact length of functions before the alignments have been
1228 // expanded.
1229 if (!Subtarget->isTargetWindows())
1233
1234 // Only change the limit for entries in a jump table if specified by
1235 // the subtarget, but not at the command line.
1236 unsigned MaxJT = STI.getMaximumJumpTableSize();
1237 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1239
1241
1243
1245 if (Subtarget->hasSME())
1247
1248 if (Subtarget->isNeonAvailable()) {
1249 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1250 // silliness like this:
1251 // clang-format off
1252 for (auto Op :
1273 setOperationAction(Op, MVT::v1f64, Expand);
1274 // clang-format on
1275
1276 for (auto Op :
1281 setOperationAction(Op, MVT::v1i64, Expand);
1282
1283 // AArch64 doesn't have direct vector -> f32 conversion instructions for
1284 // elements smaller than i32, so promote the input to i32 first.
1285 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1286 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1287
1288 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1289 // Or a direct i32 -> f16 vector conversion. Set it to Custom, so the
1290 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1293 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1295
1296 if (Subtarget->hasFullFP16()) {
1299
1308 } else {
1309 // when AArch64 doesn't have fullfp16 support, promote the input
1310 // to i32 first.
1311 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1312 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1313 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1314 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1315 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1316 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1317 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1318 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1319 }
1320
1321 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1322 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1329 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1334 }
1335
1336 // Custom handling for some quad-vector types to detect MULL.
1337 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1338 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1339 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1340 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1341 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1342 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1343
1344 // Saturates
1345 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1346 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1351 }
1352
1353 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1354 MVT::v4i32}) {
1361 }
1362
1363 // Vector reductions
1364 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1365 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1366 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1371
1373 }
1374 }
1375 if (Subtarget->hasFullFP16())
1377
1378 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1379 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1388 }
1393
1395 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1396 // Likewise, narrowing and extending vector loads/stores aren't handled
1397 // directly.
1400
1401 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1404 } else {
1407 }
1410
1413
1414 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1415 setTruncStoreAction(VT, InnerVT, Expand);
1416 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1417 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1418 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1419 }
1420 }
1421
1422 for (auto Op :
1428 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1430 if (Subtarget->hasFullFP16())
1431 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1433 }
1434
1435 // LRINT and LLRINT.
1436 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1437 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1439 if (Subtarget->hasFullFP16())
1440 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1442 }
1443
1444 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1445
1450
1454
1455 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1456 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1457 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1458 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1459 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1460 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1461 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1462 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1463 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1464 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1465 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1466 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1467 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1468 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1469 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1470 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1471 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1472 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1473
1474 // ADDP custom lowering
1475 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1477 // FADDP custom lowering
1478 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1480
1481 if (Subtarget->hasDotProd()) {
1482 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1484
1485 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
1486 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
1487 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v16i8, Custom);
1488 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1489
1490 if (Subtarget->hasMatMulInt8()) {
1492 MVT::v16i8, Legal);
1494 MVT::v16i8, Custom);
1495
1497 MVT::v8i8, Legal);
1498 }
1499 }
1500
1501 } else /* !isNeonAvailable */ {
1503 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1505
1506 if (VT.is128BitVector() || VT.is64BitVector()) {
1510 Subtarget->isLittleEndian() ? Legal : Expand);
1511 }
1512 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1513 setTruncStoreAction(VT, InnerVT, Expand);
1514 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1515 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1516 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1517 }
1518 }
1519 }
1520
1521 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1525 }
1526
1527 if (Subtarget->hasSME()) {
1529 }
1530
1531 // FIXME: Move lowering for more nodes here if those are common between
1532 // SVE and SME.
1533 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1534 for (auto VT :
1535 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1540 }
1541 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1544 }
1545
1546 if (Subtarget->hasSVE2p1() ||
1547 (Subtarget->hasSME2() && Subtarget->isStreaming()))
1549
1550 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1552
1553 for (auto VT : {MVT::v8f16, MVT::v4f32, MVT::v2f64})
1555 }
1556
1557 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1558 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1602
1608
1617
1622
1626
1627 if (!Subtarget->isLittleEndian())
1629
1630 if (Subtarget->hasSVE2() ||
1631 (Subtarget->hasSME() && Subtarget->isStreaming()))
1632 // For SLI/SRI.
1634 }
1635
1636 // Illegal unpacked integer vector types.
1637 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1640 }
1641
1642 // Type legalize unpacked bitcasts.
1643 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1645
1646 for (auto VT :
1647 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1648 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1650
1651 // Promote predicate as counter load/stores to standard predicates.
1652 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
1653 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
1654
1655 // Predicate as counter legalization actions.
1656 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
1657 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
1658
1659 for (auto VT :
1660 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1668
1672
1673 // There are no legal MVT::nxv16f## based types.
1674 if (VT != MVT::nxv16i1) {
1679 }
1680 }
1681
1682 // NEON doesn't support masked loads/stores, but SME and SVE do.
1683 for (auto VT :
1684 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1685 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1686 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1689 }
1690
1691 // Firstly, exclude all scalable vector extending loads/truncating stores,
1692 // including both integer and floating-point scalable vectors.
1694 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1695 setTruncStoreAction(VT, InnerVT, Expand);
1696 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1697 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1698 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1699 }
1700 }
1701
1702 // Then, selectively enable those which we directly support.
1703 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1704 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1705 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1706 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1707 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1708 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1709 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1710 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1711 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1712 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1713 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1714 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1715 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1716 }
1717
1718 // SVE supports truncating stores of 64 and 128-bit vectors
1719 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1720 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1721 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1722 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1723 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1724
1725 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1726 MVT::nxv4f32, MVT::nxv2f64}) {
1768
1790
1802 }
1803
1804 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1821 }
1822
1823 if (Subtarget->hasSVEB16B16() &&
1824 Subtarget->isNonStreamingSVEorSME2Available()) {
1825 // Note: Use SVE for bfloat16 operations when +sve-b16b16 is available.
1826 for (auto VT : {MVT::v4bf16, MVT::v8bf16, MVT::nxv2bf16, MVT::nxv4bf16,
1827 MVT::nxv8bf16}) {
1836 }
1837 }
1838
1839 for (auto Opcode :
1844 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1845 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1846 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1847 }
1848
1849 if (!Subtarget->hasSVEB16B16() ||
1850 !Subtarget->isNonStreamingSVEorSME2Available()) {
1851 for (MVT VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1852 MVT PromotedVT = VT.changeVectorElementType(MVT::f32);
1853 setOperationPromotedToType(ISD::FADD, VT, PromotedVT);
1854 setOperationPromotedToType(ISD::FMA, VT, PromotedVT);
1859 setOperationPromotedToType(ISD::FSUB, VT, PromotedVT);
1860
1861 if (VT != MVT::nxv2bf16 && Subtarget->hasBF16())
1863 else
1864 setOperationPromotedToType(ISD::FMUL, VT, PromotedVT);
1865 }
1866
1867 if (Subtarget->hasBF16() && Subtarget->isNeonAvailable())
1868 setOperationAction(ISD::FMUL, MVT::v8bf16, Custom);
1869 }
1870
1873
1874 // A number of operations like MULH and integer divides are not supported by
1875 // NEON but are available in SVE.
1876 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1877 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1882 }
1883
1884 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1885 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1886 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1887
1888 // NOTE: Currently this has to happen after computeRegisterProperties rather
1889 // than the preferred option of combining it with the addRegisterClass call.
1890 if (Subtarget->useSVEForFixedLengthVectors()) {
1893 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1894 addTypeForFixedLengthSVE(VT);
1895 }
1898 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1899 addTypeForFixedLengthSVE(VT);
1900 }
1901
1902 // 64-bit results can mean a bigger-than-NEON input.
1903 for (auto VT : {MVT::v8i8, MVT::v4i16})
1906
1907 // 128-bit results imply a bigger-than-NEON input.
1908 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1910 for (auto VT : {MVT::v8f16, MVT::v4f32, MVT::v8bf16})
1912
1913 // These operations are not supported on NEON but SVE can do them.
1915 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1916 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1917 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1918 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1919 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1920 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1921 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1922 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1923 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1924 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1925 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1930
1931 // Int operations with no NEON support.
1932 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1933 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1939 }
1940
1941 // Use SVE for vectors with more than 2 elements.
1942 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1944 }
1945
1947 MVT::nxv2i64);
1949 MVT::nxv2i64);
1951 MVT::nxv4i32);
1953 MVT::nxv4i32);
1955 MVT::nxv8i16);
1957 MVT::nxv8i16);
1959 MVT::nxv16i8);
1961 MVT::nxv16i8);
1962
1964
1965 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1967 }
1968
1969 // Handle partial reduction operations
1970 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1971 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1972 // Other pairs will default to 'Expand'.
1973 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1975 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
1976 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
1977
1978 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
1979
1980 if (Subtarget->hasMatMulInt8()) {
1982 MVT::nxv16i8, Legal);
1984 MVT::nxv16i8, Custom);
1985 }
1986
1987 // Wide add types
1988 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
1989 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv4i32, Legal);
1990 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
1991 setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
1992 }
1993
1994 // Handle floating-point partial reduction
1995 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
1997 MVT::nxv8f16, Legal);
1998 // We can use SVE2p1 fdot to emulate the fixed-length variant.
2000 MVT::v8f16, Custom);
2001 }
2002 }
2003
2004 // Handle non-aliasing elements mask
2005 if (Subtarget->hasSVE2() ||
2006 (Subtarget->hasSME() && Subtarget->isStreaming())) {
2007 // FIXME: Support wider fixed-length types when msve-vector-bits is used.
2008 for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
2011 }
2012 for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
2015 }
2016 }
2017
2018 // Handle operations that are only available in non-streaming SVE mode.
2019 if (Subtarget->isSVEAvailable()) {
2020 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
2021 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
2022 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
2023 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
2024 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
2025 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
2026 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
2029 }
2030
2031 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
2032 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
2033 MVT::v2f32, MVT::v4f32, MVT::v2f64})
2035
2036 // We can lower types that have <vscale x {2|4}> elements to compact.
2037 for (auto VT :
2038 {MVT::nxv4i32, MVT::nxv2i64, MVT::nxv2f32, MVT::nxv4f32, MVT::nxv2f64})
2040
2041 // If we have SVE, we can use SVE logic for legal NEON vectors in the lowest
2042 // bits of the SVE register.
2043 for (auto VT : {MVT::v2i32, MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32,
2044 MVT::v2f64})
2046
2047 // Promote v4i16/f16 to v4i32/f32 as the SVE container for v4i16 is nxv8,
2048 // which is not supported for compact (with only +sve).
2049 setOperationPromotedToType(ISD::VECTOR_COMPRESS, MVT::v4bf16, MVT::v4i16);
2050 setOperationPromotedToType(ISD::VECTOR_COMPRESS, MVT::v4f16, MVT::v4i16);
2051 setOperationPromotedToType(ISD::VECTOR_COMPRESS, MVT::v4i16, MVT::v4i32);
2052
2053 for (auto VT : {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64,
2054 MVT::nxv2f32, MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16,
2055 MVT::nxv4i32, MVT::nxv4f32}) {
2056 // Use a custom lowering for masked stores that could be a supported
2057 // compressing store. Note: These types still use the normal (Legal)
2058 // lowering for non-compressing masked stores.
2060 }
2061
2062 // Histcnt is SVE2 only
2063 if (Subtarget->hasSVE2()) {
2065 Custom);
2067 Custom);
2068
2069 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2071 // Must be lowered to SVE instructions.
2072 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
2073 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
2074 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
2075 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
2076 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
2077 setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
2078 }
2079 }
2080
2081 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
2082 // Only required for llvm.aarch64.mops.memset.tag
2084 }
2085
2087
2088 if (Subtarget->hasSVE()) {
2093 }
2094
2095 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
2096
2097 IsStrictFPEnabled = true;
2099
2100 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2101 // it, but it's just a wrapper around ldexp.
2102 if (Subtarget->isTargetWindows()) {
2104 if (isOperationExpand(Op, MVT::f32))
2105 setOperationAction(Op, MVT::f32, Promote);
2106 }
2107
2108 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
2109 // isn't legal.
2111 if (isOperationExpand(Op, MVT::f16))
2112 setOperationAction(Op, MVT::f16, Promote);
2113}
2114
2116 return static_cast<const AArch64TargetMachine &>(getTargetMachine());
2117}
2118
2119void AArch64TargetLowering::addTypeForNEON(MVT VT) {
2120 assert(VT.isVector() && "VT should be a vector type");
2121
2122 if (VT.isFloatingPoint()) {
2124 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
2125 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
2126 }
2127
2128 // Mark vector float intrinsics as expand.
2129 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
2147 }
2148
2149 // But we do support custom-lowering for FCOPYSIGN.
2150 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2151 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2152 VT == MVT::v8f16) &&
2153 Subtarget->hasFullFP16()))
2155
2170
2174 for (MVT InnerVT : MVT::all_valuetypes())
2175 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
2176
2177 // CNT supports only B element sizes, then use UADDLP to widen.
2178 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2180
2186
2187 for (unsigned Opcode :
2190 setOperationAction(Opcode, VT, Custom);
2191
2192 if (!VT.isFloatingPoint())
2194
2195 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2196 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2197 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2198 setOperationAction(Opcode, VT, Legal);
2199
2200 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2201 // NEON types.
2202 if (VT.isFloatingPoint() &&
2203 VT.getVectorElementType() != MVT::bf16 &&
2204 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2205 for (unsigned Opcode :
2211 setOperationAction(Opcode, VT, Legal);
2212
2213 // Strict fp extend and trunc are legal
2214 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2216 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2218
2219 // FIXME: We could potentially make use of the vector comparison instructions
2220 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
2221 // complications:
2222 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2223 // so we would need to expand when the condition code doesn't match the
2224 // kind of comparison.
2225 // * Some kinds of comparison require more than one FCMXY instruction so
2226 // would need to be expanded instead.
2227 // * The lowering of the non-strict versions involves target-specific ISD
2228 // nodes so we would likely need to add strict versions of all of them and
2229 // handle them appropriately.
2232
2233 // When little-endian we can use ordinary d and q register loads/stores for
2234 // vector types, but when big-endian we need to use structure load/store which
2235 // only allow post-index addressing.
2236 if (Subtarget->isLittleEndian()) {
2237 for (unsigned im = (unsigned)ISD::PRE_INC;
2238 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
2241 }
2242 } else {
2245 }
2246
2247 if (Subtarget->hasD128()) {
2250 }
2251
2252 if (VT.isInteger()) {
2253 // Let common code emit inverted variants of compares we do support.
2259 }
2260}
2261
2263 EVT OpVT) const {
2264 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2265 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2266 ResVT.getVectorElementType() != MVT::i1)
2267 return true;
2268
2269 // Only support illegal types if the result is scalable and min elements > 1.
2270 if (ResVT.getVectorMinNumElements() == 1 ||
2271 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2272 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2273 return true;
2274
2275 // 32 & 64 bit operands are supported. We can promote anything < 64 bits,
2276 // but anything larger should be expanded.
2277 if (OpVT.getFixedSizeInBits() > 64)
2278 return true;
2279
2280 return false;
2281}
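// Illustrative consequences of the checks above, assuming SVE or streaming
// SVE is available: a get.active.lane.mask producing nxv4i1 from i32 or i64
// operands is left for selection (whilelo), whereas an nxv1i1 result, a
// fixed-length result wider than 16 elements, or an operand wider than 64
// bits is expanded instead.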
2282
2284 if (!Subtarget->isSVEorStreamingSVEAvailable())
2285 return true;
2286
2287 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2288 // also support fixed-width predicates.
2289 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2290 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2291 VT != MVT::v4i1 && VT != MVT::v2i1;
2292}
2293
2295 unsigned SearchSize) const {
2296 // MATCH is SVE2 and only available in non-streaming mode.
2297 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2298 return true;
2299 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2300 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2301 return SearchSize != 8;
2302 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2303 return SearchSize != 8 && SearchSize != 16;
2304 return true;
2305}
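// For example (illustrative): with +sve2 outside streaming mode, matching 16
// needles against nxv16i8 or v16i8 data returns false (i.e. the MATCH
// instruction can be used), while 16-bit element types only support a search
// size of 8.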
2306
2307void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2308 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2309
2310 // By default everything must be expanded.
2311 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2313
2314 if (VT.isFloatingPoint()) {
2324 }
2325
2327 VT == MVT::v1f64 ? Expand : Custom;
2328
2329 // Mark integer truncating stores/extending loads as having custom lowering
2330 if (VT.isInteger()) {
2331 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2332 while (InnerVT != VT) {
2333 setTruncStoreAction(VT, InnerVT, Default);
2334 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2335 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2336 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2337 InnerVT = InnerVT.changeVectorElementType(
2338 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2339 }
2340 }
2341
2342 // Mark floating-point truncating stores/extending loads as having custom
2343 // lowering
2344 if (VT.isFloatingPoint()) {
2345 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2346 while (InnerVT != VT) {
2347 setTruncStoreAction(VT, InnerVT, Custom);
2348 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2349 InnerVT = InnerVT.changeVectorElementType(
2351 }
2352 }
2353
2354 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2355 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2356
2357 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2359 unsigned NumElts = VT.getVectorNumElements();
2360 if (VT.getVectorElementType() == MVT::i64) {
2361 setPartialReduceMLAAction(MLAOps, VT,
2362 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2363 setPartialReduceMLAAction(MLAOps, VT,
2364 MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
2365 setPartialReduceMLAAction(MLAOps, VT,
2366 MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
2367 } else if (VT.getVectorElementType() == MVT::i32) {
2368 setPartialReduceMLAAction(MLAOps, VT,
2369 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2370 setPartialReduceMLAAction(MLAOps, VT,
2371 MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
2372 } else if (VT.getVectorElementType() == MVT::i16) {
2373 setPartialReduceMLAAction(MLAOps, VT,
2374 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
2375 }
2376 if (Subtarget->hasMatMulInt8()) {
2377 if (VT.getVectorElementType() == MVT::i32)
2379 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2380 else if (VT.getVectorElementType() == MVT::i64)
2382 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2383 }
2384
2385 if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) {
2387 MVT::getVectorVT(MVT::f16, NumElts * 2), Custom);
2388 }
2389
2390 // Lower fixed length vector operations to scalable equivalents.
2397 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2435 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2436 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2438 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2457 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2484}
2485
2486void AArch64TargetLowering::addDRType(MVT VT) {
2487 addRegisterClass(VT, &AArch64::FPR64RegClass);
2488 if (Subtarget->isNeonAvailable())
2489 addTypeForNEON(VT);
2490}
2491
2492void AArch64TargetLowering::addQRType(MVT VT) {
2493 addRegisterClass(VT, &AArch64::FPR128RegClass);
2494 if (Subtarget->isNeonAvailable())
2495 addTypeForNEON(VT);
2496}
2497
2499 LLVMContext &C, EVT VT) const {
2500 if (!VT.isVector())
2501 return MVT::i32;
2502 if (VT.isScalableVector())
2503 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2505}
2506
2507// isIntImmediate - This method tests to see if the node is a constant
2508// operand. If so, Imm will receive the value.
2509static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2511 Imm = C->getZExtValue();
2512 return true;
2513 }
2514 return false;
2515}
2516
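// isVectorizedBinOp - Returns true for target-specific opcodes treated as
// element-wise vector binary operations (currently only AArch64ISD::SQDMULH).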
2517bool isVectorizedBinOp(unsigned Opcode) {
2518 switch (Opcode) {
2519 case AArch64ISD::SQDMULH:
2520 return true;
2521 default:
2522 return false;
2523 }
2524}
2525
2526// isOpcWithIntImmediate - This method tests to see if the node is a specific
2527// opcode and that it has an immediate integer right operand.
2528// If so, Imm will receive the value.
2529static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2530 uint64_t &Imm) {
2531 return N->getOpcode() == Opc &&
2532 isIntImmediate(N->getOperand(1).getNode(), Imm);
2533}
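// Typical use (illustrative sketch, not from the original source): match a
// right-shift by a constant amount.
//   uint64_t ShiftAmt;
//   if (isOpcWithIntImmediate(N, ISD::SRL, ShiftAmt)) {
//     // N is (srl X, ShiftAmt) where ShiftAmt is a constant.
//   }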
2534
2535static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2536 const APInt &Demanded,
2538 unsigned NewOpc) {
2539 uint64_t OldImm = Imm, NewImm, Enc;
2540 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2541
2542 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2543 // bimm64.
2544 if (Imm == 0 || Imm == Mask ||
2546 return false;
2547
2548 unsigned EltSize = Size;
2549 uint64_t DemandedBits = Demanded.getZExtValue();
2550
2551 // Clear bits that are not demanded.
2552 Imm &= DemandedBits;
2553
2554 while (true) {
2555 // The goal here is to set the non-demanded bits in a way that minimizes
2556 // the number of switches between 0 and 1. In order to achieve this goal,
2557 // we set the non-demanded bits to the value of the preceding demanded bits.
2558 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2559 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2560 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2561 // The final result is 0b11000011.
2562 uint64_t NonDemandedBits = ~DemandedBits;
2563 uint64_t InvertedImm = ~Imm & DemandedBits;
2564 uint64_t RotatedImm =
2565 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2566 NonDemandedBits;
2567 uint64_t Sum = RotatedImm + NonDemandedBits;
2568 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2569 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2570 NewImm = (Imm | Ones) & Mask;
2571
2572 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2573 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2574 // we halve the element size and continue the search.
2575 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2576 break;
2577
2578 // We cannot shrink the element size any further if it is 2-bits.
2579 if (EltSize == 2)
2580 return false;
2581
2582 EltSize /= 2;
2583 Mask >>= EltSize;
2584 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2585
2586 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2587 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2588 return false;
2589
2590 // Merge the upper and lower halves of Imm and DemandedBits.
2591 Imm |= Hi;
2592 DemandedBits |= DemandedBitsHi;
2593 }
2594
2595 ++NumOptimizedImms;
2596
2597 // Replicate the element across the register width.
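// For example (hypothetical values): with Size == 64, EltSize == 8 and
// NewImm == 0x0f, the loop below produces 0x0f0f0f0f0f0f0f0f, which is
// encodable as a logical immediate.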
2598 while (EltSize < Size) {
2599 NewImm |= NewImm << EltSize;
2600 EltSize *= 2;
2601 }
2602
2603 (void)OldImm;
2604 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2605 "demanded bits should never be altered");
2606 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2607
2608 // Create the new constant immediate node.
2609 EVT VT = Op.getValueType();
2610 SDLoc DL(Op);
2611 SDValue New;
2612
2613 // If the new constant immediate is all-zeros or all-ones, let the target
2614 // independent DAG combine optimize this node.
2615 if (NewImm == 0 || NewImm == OrigMask) {
2616 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2617 TLO.DAG.getConstant(NewImm, DL, VT));
2618 // Otherwise, create a machine node so that target independent DAG combine
2619 // doesn't undo this optimization.
2620 } else {
2621     Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
2622     SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2623 New = SDValue(
2624 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2625 }
2626
2627 return TLO.CombineTo(Op, New);
2628}
2629
2630 bool AArch64TargetLowering::targetShrinkDemandedConstant(
2631     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2632     TargetLoweringOpt &TLO) const {
2633 // Delay this optimization to as late as possible.
2634 if (!TLO.LegalOps)
2635 return false;
2636
2637   if (!EnableOptimizeLogicalImm)
2638     return false;
2639
2640 EVT VT = Op.getValueType();
2641 if (VT.isVector())
2642 return false;
2643
2644 unsigned Size = VT.getSizeInBits();
2645
2646 if (Size != 32 && Size != 64)
2647 return false;
2648
2649 // Exit early if we demand all bits.
2650 if (DemandedBits.isAllOnes())
2651 return false;
2652
2653 unsigned NewOpc;
2654 switch (Op.getOpcode()) {
2655 default:
2656 return false;
2657 case ISD::AND:
2658 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2659 break;
2660 case ISD::OR:
2661 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2662 break;
2663 case ISD::XOR:
2664 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2665 break;
2666 }
2667 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2668 if (!C)
2669 return false;
2670 uint64_t Imm = C->getZExtValue();
2671 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2672}
2673
2674 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
2675 /// Mask are known to be either zero or one and return them in Known.
2676 void AArch64TargetLowering::computeKnownBitsForTargetNode(
2677     const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2678     const SelectionDAG &DAG, unsigned Depth) const {
2679 switch (Op.getOpcode()) {
2680 default:
2681 break;
2682 case AArch64ISD::DUP: {
2683 SDValue SrcOp = Op.getOperand(0);
2684 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2685 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2686 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2687 "Expected DUP implicit truncation");
2688 Known = Known.trunc(Op.getScalarValueSizeInBits());
2689 }
2690 break;
2691 }
2692 case AArch64ISD::CSEL: {
2693 KnownBits Known2;
2694 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2695 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2696 Known = Known.intersectWith(Known2);
2697 break;
2698 }
2699 case AArch64ISD::CSNEG:
2700 case AArch64ISD::CSINC:
2701 case AArch64ISD::CSINV: {
2702 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2703 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2704
2705 // The result is either:
2706 // CSINC: KnownOp0 or KnownOp1 + 1
2707 // CSINV: KnownOp0 or ~KnownOp1
2708 // CSNEG: KnownOp0 or KnownOp1 * -1
2709 if (Op.getOpcode() == AArch64ISD::CSINC)
2710 KnownOp1 = KnownBits::add(
2711 KnownOp1,
2712 KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
2713 else if (Op.getOpcode() == AArch64ISD::CSINV)
2714 std::swap(KnownOp1.Zero, KnownOp1.One);
2715 else if (Op.getOpcode() == AArch64ISD::CSNEG)
2716 KnownOp1 =
2718 Op.getScalarValueSizeInBits())));
2719
2720 Known = KnownOp0.intersectWith(KnownOp1);
2721 break;
2722 }
2723 case AArch64ISD::BICi: {
2724 // Compute the bit cleared value.
2725 APInt Mask =
2726 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2727 .trunc(Known.getBitWidth());
2728 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2729 Known &= KnownBits::makeConstant(Mask);
2730 break;
2731 }
2732 case AArch64ISD::VLSHR: {
2733 KnownBits Known2;
2734 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2735 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2736 Known = KnownBits::lshr(Known, Known2);
2737 break;
2738 }
2739 case AArch64ISD::VASHR: {
2740 KnownBits Known2;
2741 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2742 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2743 Known = KnownBits::ashr(Known, Known2);
2744 break;
2745 }
2746 case AArch64ISD::VSHL: {
2747 KnownBits Known2;
2748 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2749 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2750 Known = KnownBits::shl(Known, Known2);
2751 break;
2752 }
2753 case AArch64ISD::MOVI: {
2755 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2756 break;
2757 }
2758 case AArch64ISD::MOVIshift: {
2760 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)
2761 << Op->getConstantOperandVal(1)));
2762 break;
2763 }
2764 case AArch64ISD::MOVImsl: {
2765 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2767 Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
2768 break;
2769 }
2770 case AArch64ISD::MOVIedit: {
2772 Known.getBitWidth(),
2773 AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
2774 break;
2775 }
2776 case AArch64ISD::MVNIshift: {
2778 APInt(Known.getBitWidth(),
2779 ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
2780 /*isSigned*/ false, /*implicitTrunc*/ true));
2781 break;
2782 }
2783 case AArch64ISD::MVNImsl: {
2784 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2786 APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
2787 /*isSigned*/ false, /*implicitTrunc*/ true));
2788 break;
2789 }
2790 case AArch64ISD::LOADgot:
2791 case AArch64ISD::ADDlow: {
2792 if (!Subtarget->isTargetILP32())
2793 break;
2794 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2795 Known.Zero = APInt::getHighBitsSet(64, 32);
2796 break;
2797 }
2798 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2799 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2800 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2801 break;
2802 }
2804 Intrinsic::ID IntID =
2805 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2806 switch (IntID) {
2807 default: return;
2808 case Intrinsic::aarch64_ldaxr:
2809 case Intrinsic::aarch64_ldxr: {
2810 unsigned BitWidth = Known.getBitWidth();
2811 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2812 unsigned MemBits = VT.getScalarSizeInBits();
2813 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2814 return;
2815 }
2816 }
2817 break;
2818 }
2820 case ISD::INTRINSIC_VOID: {
2821 unsigned IntNo = Op.getConstantOperandVal(0);
2822 switch (IntNo) {
2823 default:
2824 break;
2825 case Intrinsic::aarch64_neon_uaddlv: {
2826 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2827 unsigned BitWidth = Known.getBitWidth();
2828 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2829 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2830 assert(BitWidth >= Bound && "Unexpected width!");
2832 Known.Zero |= Mask;
2833 }
2834 break;
2835 }
2836 case Intrinsic::aarch64_neon_umaxv:
2837 case Intrinsic::aarch64_neon_uminv: {
2838 // Figure out the datatype of the vector operand. The UMINV instruction
2839 // will zero extend the result, so we can mark as known zero all the
2840 // bits larger than the element datatype. 32-bit or larger doesn't need
2841 // this as those are legal types and will be handled by isel directly.
2842 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2843 unsigned BitWidth = Known.getBitWidth();
2844 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2845 assert(BitWidth >= 8 && "Unexpected width!");
2847 Known.Zero |= Mask;
2848 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2849 assert(BitWidth >= 16 && "Unexpected width!");
2851 Known.Zero |= Mask;
2852 }
2853 break;
2854 } break;
2855 }
2856 }
2857 }
2858}
2859
2861 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2862 unsigned Depth) const {
2863 EVT VT = Op.getValueType();
2864 unsigned VTBits = VT.getScalarSizeInBits();
2865 unsigned Opcode = Op.getOpcode();
2866 switch (Opcode) {
2867 case AArch64ISD::FCMEQ:
2868 case AArch64ISD::FCMGE:
2869 case AArch64ISD::FCMGT:
2870 // Compares return either 0 or all-ones
2871 return VTBits;
2872 case AArch64ISD::VASHR: {
2873 unsigned Tmp =
2874 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2875 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2876 }
2877 }
2878
2879 return 1;
2880}
2881
2883 EVT) const {
2884 return MVT::i64;
2885}
2886
2888 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2889 unsigned *Fast) const {
2890
2891 // Allow SVE loads/stores where the alignment >= the size of the element type,
2892 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2893 // for stores that come from IR, only require element-size alignment (even if
2894 // unaligned accesses are disabled). Without this, these will be forced to
2895 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2896 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2897 if (VT.isScalableVector()) {
2898 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2899 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2900 return true;
2901 }
2902
2903 if (Subtarget->requiresStrictAlign())
2904 return false;
2905
2906 if (Fast) {
2907 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2908 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2909 // See comments in performSTORECombine() for more details about
2910 // these conditions.
2911
2912 // Code that uses clang vector extensions can mark that it
2913 // wants unaligned accesses to be treated as fast by
2914 // underspecifying alignment to be 1 or 2.
2915 Alignment <= 2 ||
2916
2917 // Disregard v2i64. Memcpy lowering produces those and splitting
2918 // them regresses performance on micro-benchmarks and olden/bh.
2919 VT == MVT::v2i64;
2920 }
2921 return true;
2922}
2923
2924// Same as above but handling LLTs instead.
2926 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2927 unsigned *Fast) const {
2928 if (Subtarget->requiresStrictAlign())
2929 return false;
2930
2931 if (Fast) {
2932 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2933 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2934 Ty.getSizeInBytes() != 16 ||
2935 // See comments in performSTORECombine() for more details about
2936 // these conditions.
2937
2938 // Code that uses clang vector extensions can mark that it
2939 // wants unaligned accesses to be treated as fast by
2940 // underspecifying alignment to be 1 or 2.
2941 Alignment <= 2 ||
2942
2943 // Disregard v2i64. Memcpy lowering produces those and splitting
2944 // them regresses performance on micro-benchmarks and olden/bh.
2945 Ty == LLT::fixed_vector(2, 64);
2946 }
2947 return true;
2948}
2949
2951 FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo,
2952 const LibcallLoweringInfo *libcallLowering) const {
2953 return AArch64::createFastISel(funcInfo, libInfo, libcallLowering);
2954}
2955
2958 MachineBasicBlock *MBB) const {
2959 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2960 // phi node:
2961
2962 // OrigBB:
2963 // [... previous instrs leading to comparison ...]
2964 // b.ne TrueBB
2965 // b EndBB
2966 // TrueBB:
2967 // ; Fallthrough
2968 // EndBB:
2969 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2970
2971 MachineFunction *MF = MBB->getParent();
2972 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2973 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2974 DebugLoc DL = MI.getDebugLoc();
2975 MachineFunction::iterator It = ++MBB->getIterator();
2976
2977 Register DestReg = MI.getOperand(0).getReg();
2978 Register IfTrueReg = MI.getOperand(1).getReg();
2979 Register IfFalseReg = MI.getOperand(2).getReg();
2980 unsigned CondCode = MI.getOperand(3).getImm();
2981 bool NZCVKilled = MI.getOperand(4).isKill();
2982
2983 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2984 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2985 MF->insert(It, TrueBB);
2986 MF->insert(It, EndBB);
2987
2988 // Transfer rest of current basic-block to EndBB
2989 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2990 MBB->end());
2992
2993 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2994 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2995 MBB->addSuccessor(TrueBB);
2996 MBB->addSuccessor(EndBB);
2997
2998 // TrueBB falls through to the end.
2999 TrueBB->addSuccessor(EndBB);
3000
3001 if (!NZCVKilled) {
3002 TrueBB->addLiveIn(AArch64::NZCV);
3003 EndBB->addLiveIn(AArch64::NZCV);
3004 }
3005
3006 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
3007 .addReg(IfTrueReg)
3008 .addMBB(TrueBB)
3009 .addReg(IfFalseReg)
3010 .addMBB(MBB);
3011
3012 MI.eraseFromParent();
3013 return EndBB;
3014}
3015
3023
3026 MachineBasicBlock *MBB) const {
3027 MachineFunction &MF = *MBB->getParent();
3028 MachineBasicBlock::iterator MBBI = MI.getIterator();
3029 const AArch64InstrInfo &TII =
3030 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
3031 Register TargetReg = MI.getOperand(0).getReg();
3033 TII.probedStackAlloc(MBBI, TargetReg, false);
3034
3035 MI.eraseFromParent();
3036 return NextInst->getParent();
3037}
3038
3041 MachineBasicBlock *MBB) const {
3042 MachineFunction *MF = MBB->getParent();
3044
3045 const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass;
3046 const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass;
3047
3048 Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR);
3049 Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src
3050 Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR);
3051 Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst
3052
3053 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3054 DebugLoc DL = MI.getDebugLoc();
3055
3056 // RDVL requires GPR64, ADDSVL requires GPR64sp
3057 // We need to insert COPY instructions, these will later be removed by the
3058 // RegisterCoalescer
3059 BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1);
3060 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp)
3061 .addReg(RegVL_GPR);
3062
3063 BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp)
3064 .addReg(RegVL_GPRsp)
3065 .addImm(-1);
3066 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR)
3067 .addReg(RegSVL_GPRsp);
3068
3069 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
3070 MachineFunction::iterator It = ++MBB->getIterator();
3071 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
3072 MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
3073 MF->insert(It, TrapBB);
3074 MF->insert(It, PassBB);
3075
3076 // Continue if vector lengths match
3077 BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
3078 .addReg(RegSVL_GPR)
3079 .addMBB(PassBB);
3080
3081 // Transfer rest of current BB to PassBB
3082 PassBB->splice(PassBB->begin(), MBB,
3083 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
3085
3086 // Trap if vector lengths mismatch
3087 BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
3088
3089 MBB->addSuccessor(TrapBB);
3090 MBB->addSuccessor(PassBB);
3091
3092 MI.eraseFromParent();
3093 return PassBB;
3094}
3095
3097AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
3099 MachineBasicBlock *BB) const {
3100 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3101 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3102
3103 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3104 MIB.add(MI.getOperand(1)); // slice index register
3105 MIB.add(MI.getOperand(2)); // slice index offset
3106 MIB.add(MI.getOperand(3)); // pg
3107 MIB.add(MI.getOperand(4)); // base
3108 MIB.add(MI.getOperand(5)); // offset
3109
3110 MI.eraseFromParent(); // The pseudo is gone now.
3111 return BB;
3112}
3113
3116 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3118 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3119
3120 MIB.addReg(AArch64::ZA, RegState::Define);
3121 MIB.add(MI.getOperand(0)); // Vector select register
3122 MIB.add(MI.getOperand(1)); // Vector select offset
3123 MIB.add(MI.getOperand(2)); // Base
3124 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3125
3126 MI.eraseFromParent(); // The pseudo is gone now.
3127 return BB;
3128}
3129
3132 unsigned Opcode,
3133 bool Op0IsDef) const {
3134 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3136
3137 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3138 .addReg(MI.getOperand(0).getReg(), getDefRegState(Op0IsDef));
3139 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3140 MIB.add(MI.getOperand(I));
3141
3142 MI.eraseFromParent(); // The pseudo is gone now.
3143 return BB;
3144}
3145
3147AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3149 MachineBasicBlock *BB) const {
3150 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3151 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3152 unsigned StartIdx = 0;
3153
3154 bool HasTile = BaseReg != AArch64::ZA;
3155 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3156 if (HasZPROut) {
3157 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3158 ++StartIdx;
3159 }
3160 if (HasTile) {
3161 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3162 RegState::Define); // Output ZA Tile
3163 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3164 StartIdx++;
3165 } else {
3166     // Avoid instructions whose mnemonic uses a za.<sz>[Reg, Imm] slice operand.
3167 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3168 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3169 ++StartIdx;
3170 }
3171 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3172 }
3173 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3174 MIB.add(MI.getOperand(I));
3175
3176 MI.eraseFromParent(); // The pseudo is gone now.
3177 return BB;
3178}
3179
3182 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3184 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3185 MIB.add(MI.getOperand(0)); // Mask
3186
3187 unsigned Mask = MI.getOperand(0).getImm();
3188 for (unsigned I = 0; I < 8; I++) {
3189 if (Mask & (1 << I))
3190 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3191 }
3192
3193 MI.eraseFromParent(); // The pseudo is gone now.
3194 return BB;
3195}
3196
3199 MachineBasicBlock *BB) const {
3200 MachineFunction *MF = BB->getParent();
3201 MachineFrameInfo &MFI = MF->getFrameInfo();
3203 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3204 if (TPIDR2.Uses > 0) {
3205 // Note: This case just needs to do `SVL << 48`. It is not implemented as we
3206 // generally don't support big-endian SVE/SME.
3207 if (!Subtarget->isLittleEndian())
3209 "TPIDR2 block initialization is not supported on big-endian targets");
3210
3211 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3212 // Store buffer pointer and num_za_save_slices.
3213 // Bytes 10-15 are implicitly zeroed.
3214 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
3215 .addReg(MI.getOperand(0).getReg())
3216 .addReg(MI.getOperand(1).getReg())
3217 .addFrameIndex(TPIDR2.FrameIndex)
3218 .addImm(0);
3219 } else
3220 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3221
3222 BB->remove_instr(&MI);
3223 return BB;
3224}
3225
3228 MachineBasicBlock *BB) const {
3229 MachineFunction *MF = BB->getParent();
3230 MachineFrameInfo &MFI = MF->getFrameInfo();
3232 // TODO This function grows the stack with a subtraction, which doesn't work
3233 // on Windows. Some refactoring to share the functionality in
3234 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3235 // supports SME
3237 "Lazy ZA save is not yet supported on Windows");
3238
3239 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3240
3241 if (TPIDR2.Uses > 0) {
3242 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3244
3245     // The MSUBXrrr below cannot use SP directly, so copy SP into a plain
3246     // GPR64 virtual register first.
3247 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3248 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3249 .addReg(AArch64::SP);
3250
3251 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3252 auto Size = MI.getOperand(1).getReg();
3253 auto Dest = MI.getOperand(0).getReg();
3254 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3255 .addReg(Size)
3256 .addReg(Size)
3257 .addReg(SP);
3258 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3259 AArch64::SP)
3260 .addReg(Dest);
3261
3262 // We have just allocated a variable sized object, tell this to PEI.
3263 MFI.CreateVariableSizedObject(Align(16), nullptr);
3264 }
3265
3266 BB->remove_instr(&MI);
3267 return BB;
3268}
3269
3270// TODO: Find a way to merge this with EmitAllocateZABuffer.
3273 MachineBasicBlock *BB) const {
3274 MachineFunction *MF = BB->getParent();
3275 MachineFrameInfo &MFI = MF->getFrameInfo();
3278 "Lazy ZA save is not yet supported on Windows");
3279
3280 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3281 if (FuncInfo->isSMESaveBufferUsed()) {
3282 // Allocate a buffer object of the size given by MI.getOperand(1).
3283 auto Size = MI.getOperand(1).getReg();
3284 auto Dest = MI.getOperand(0).getReg();
3285 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3286 .addReg(AArch64::SP)
3287 .addReg(Size)
3289 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3290 .addReg(AArch64::SP);
3291
3292 // We have just allocated a variable sized object, tell this to PEI.
3293 MFI.CreateVariableSizedObject(Align(16), nullptr);
3294 } else
3295 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3296 MI.getOperand(0).getReg());
3297
3298 BB->remove_instr(&MI);
3299 return BB;
3300}
3301
3304 MachineBasicBlock *BB) const {
3305 // If the buffer is used, emit a call to __arm_sme_state_size()
3306 MachineFunction *MF = BB->getParent();
3308 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3309 if (FuncInfo->isSMESaveBufferUsed()) {
3310 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
3311 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3312 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3314 .addReg(AArch64::X0, RegState::ImplicitDefine)
3315 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3316 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3317 MI.getOperand(0).getReg())
3318 .addReg(AArch64::X0);
3319 } else
3320 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3321 MI.getOperand(0).getReg())
3322 .addReg(AArch64::XZR);
3323 BB->remove_instr(&MI);
3324 return BB;
3325}
3326
3329 MachineBasicBlock *BB) const {
3330 MachineFunction *MF = BB->getParent();
3331 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3332 const DebugLoc &DL = MI.getDebugLoc();
3333 Register ResultReg = MI.getOperand(0).getReg();
3334 if (MF->getRegInfo().use_empty(ResultReg)) {
3335 // Nothing to do. Pseudo erased below.
3336 } else if (Subtarget->hasSME()) {
3337 BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg)
3338 .addImm(AArch64SysReg::SVCR)
3339 .addReg(AArch64::VG, RegState::Implicit);
3340 } else {
3341 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
3342 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3343 BuildMI(*BB, MI, DL, TII->get(AArch64::BL))
3345 .addReg(AArch64::X0, RegState::ImplicitDefine)
3346 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3347 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg)
3348 .addReg(AArch64::X0);
3349 }
3350 MI.eraseFromParent();
3351 return BB;
3352}
3353
3354// Helper function to find the instruction that defined a virtual register.
3355// If unable to find such instruction, returns nullptr.
3357 Register Reg) {
3358 while (Reg.isVirtual()) {
3359 MachineInstr *DefMI = MRI.getVRegDef(Reg);
3360 assert(DefMI && "Virtual register definition not found");
3361 unsigned Opcode = DefMI->getOpcode();
3362
3363 if (Opcode == AArch64::COPY) {
3364 Reg = DefMI->getOperand(1).getReg();
3365 // Vreg is defined by copying from physreg.
3366 if (Reg.isPhysical())
3367 return DefMI;
3368 continue;
3369 }
3370 if (Opcode == AArch64::SUBREG_TO_REG) {
3371 Reg = DefMI->getOperand(2).getReg();
3372 continue;
3373 }
3374
3375 return DefMI;
3376 }
3377 return nullptr;
3378}
3379
3382 MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
3383 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3384 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3385 const DebugLoc &DL = MI.getDebugLoc();
3386
3387 Register AddrDisc = AddrDiscOp.getReg();
3388 int64_t IntDisc = IntDiscOp.getImm();
3389 assert(IntDisc == 0 && "Blend components are already expanded");
3390
3391 const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
3392 if (DiscMI) {
3393 switch (DiscMI->getOpcode()) {
3394 case AArch64::MOVKXi:
3395 // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
3396 // #imm should be an immediate and not a global symbol, for example.
3397 if (DiscMI->getOperand(2).isImm() &&
3398 DiscMI->getOperand(3).getImm() == 48) {
3399 AddrDisc = DiscMI->getOperand(1).getReg();
3400 IntDisc = DiscMI->getOperand(2).getImm();
3401 }
3402 break;
3403 case AArch64::MOVi32imm:
3404 case AArch64::MOVi64imm:
3405 // Small immediate integer constant passed via VReg.
3406 if (DiscMI->getOperand(1).isImm() &&
3407 isUInt<16>(DiscMI->getOperand(1).getImm())) {
3408 AddrDisc = AArch64::NoRegister;
3409 IntDisc = DiscMI->getOperand(1).getImm();
3410 }
3411 break;
3412 }
3413 }
3414
3415 // For uniformity, always use NoRegister, as XZR is not necessarily contained
3416 // in the requested register class.
3417 if (AddrDisc == AArch64::XZR)
3418 AddrDisc = AArch64::NoRegister;
3419
3420 // Make sure AddrDisc operand respects the register class imposed by MI.
3421 if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
3422 Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
3423 BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
3424 AddrDisc = TmpReg;
3425 }
3426
3427 AddrDiscOp.setReg(AddrDisc);
3428 IntDiscOp.setImm(IntDisc);
3429}
3430
3432 MachineInstr &MI, MachineBasicBlock *BB) const {
3433
3434 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3435 if (SMEOrigInstr != -1) {
3436 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3437 uint64_t SMEMatrixType =
3438 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3439 switch (SMEMatrixType) {
3441 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3443 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3445 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3447 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3449 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3451 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3452 }
3453 }
3454
3455 switch (MI.getOpcode()) {
3456 default:
3457#ifndef NDEBUG
3458 MI.dump();
3459#endif
3460 llvm_unreachable("Unexpected instruction for custom inserter!");
3461 case AArch64::InitTPIDR2Obj:
3462 return EmitInitTPIDR2Object(MI, BB);
3463 case AArch64::AllocateZABuffer:
3464 return EmitAllocateZABuffer(MI, BB);
3465 case AArch64::AllocateSMESaveBuffer:
3466 return EmitAllocateSMESaveBuffer(MI, BB);
3467 case AArch64::GetSMESaveSize:
3468 return EmitGetSMESaveSize(MI, BB);
3469 case AArch64::EntryPStateSM:
3470 return EmitEntryPStateSM(MI, BB);
3471 case AArch64::F128CSEL:
3472 return EmitF128CSEL(MI, BB);
3473 case TargetOpcode::STATEPOINT:
3474 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3475     // while the BL call instruction (to which the statepoint is lowered in the
3476     // end) has an implicit def. This def is early-clobber as it is set at
3477     // the moment of the call, before any use is read.
3478 // Add this implicit dead def here as a workaround.
3479 MI.addOperand(*MI.getMF(),
3481 AArch64::LR, /*isDef*/ true,
3482 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3483 /*isUndef*/ false, /*isEarlyClobber*/ true));
3484 [[fallthrough]];
3485 case TargetOpcode::STACKMAP:
3486 case TargetOpcode::PATCHPOINT:
3487 return emitPatchPoint(MI, BB);
3488
3489 case TargetOpcode::PATCHABLE_EVENT_CALL:
3490 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3491 return BB;
3492
3493 case AArch64::CATCHRET:
3494 return EmitLoweredCatchRet(MI, BB);
3495
3496 case AArch64::PROBED_STACKALLOC_DYN:
3497 return EmitDynamicProbedAlloc(MI, BB);
3498
3499 case AArch64::CHECK_MATCHING_VL_PSEUDO:
3500 return EmitCheckMatchingVL(MI, BB);
3501
3502 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3503 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3504 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3505 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3506 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3507 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3508 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3509 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3510 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3511 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3512 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3513 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3514 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3515 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3516 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3517 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3518 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3519 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3520 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3521 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3522 case AArch64::LDR_ZA_PSEUDO:
3523 return EmitFill(MI, BB);
3524 case AArch64::LDR_TX_PSEUDO:
3525 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3526 case AArch64::STR_TX_PSEUDO:
3527 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3528 case AArch64::ZERO_M_PSEUDO:
3529 return EmitZero(MI, BB);
3530 case AArch64::ZERO_T_PSEUDO:
3531 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3532 case AArch64::MOVT_TIZ_PSEUDO:
3533 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3534
3535 case AArch64::PAC:
3536 fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
3537 &AArch64::GPR64noipRegClass);
3538 return BB;
3539 }
3540}
3541
3542//===----------------------------------------------------------------------===//
3543// AArch64 Lowering private implementation.
3544//===----------------------------------------------------------------------===//
3545
3546//===----------------------------------------------------------------------===//
3547// Lowering Code
3548//===----------------------------------------------------------------------===//
3549
3550// Forward declarations of SVE fixed length lowering helpers
3555 SelectionDAG &DAG);
3558 EVT VT);
3559
3560/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3561static bool isZerosVector(const SDNode *N) {
3562 // Look through a bit convert.
3563 while (N->getOpcode() == ISD::BITCAST)
3564 N = N->getOperand(0).getNode();
3565
3567 return true;
3568
3569 if (N->getOpcode() != AArch64ISD::DUP)
3570 return false;
3571
3572 auto Opnd0 = N->getOperand(0);
3573 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3574}
3575
3576/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3577/// CC
3579 SDValue RHS = {}) {
3580 switch (CC) {
3581 default:
3582 llvm_unreachable("Unknown condition code!");
3583 case ISD::SETNE:
3584 return AArch64CC::NE;
3585 case ISD::SETEQ:
3586 return AArch64CC::EQ;
3587 case ISD::SETGT:
3588 return AArch64CC::GT;
3589 case ISD::SETGE:
3591 case ISD::SETLT:
3593 case ISD::SETLE:
3594 return AArch64CC::LE;
3595 case ISD::SETUGT:
3596 return AArch64CC::HI;
3597 case ISD::SETUGE:
3598 return AArch64CC::HS;
3599 case ISD::SETULT:
3600 return AArch64CC::LO;
3601 case ISD::SETULE:
3602 return AArch64CC::LS;
3603 }
3604}
3605
3606/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3608 AArch64CC::CondCode &CondCode,
3609 AArch64CC::CondCode &CondCode2) {
3610 CondCode2 = AArch64CC::AL;
3611 switch (CC) {
3612 default:
3613 llvm_unreachable("Unknown FP condition!");
3614 case ISD::SETEQ:
3615 case ISD::SETOEQ:
3616 CondCode = AArch64CC::EQ;
3617 break;
3618 case ISD::SETGT:
3619 case ISD::SETOGT:
3620 CondCode = AArch64CC::GT;
3621 break;
3622 case ISD::SETGE:
3623 case ISD::SETOGE:
3624 CondCode = AArch64CC::GE;
3625 break;
3626 case ISD::SETOLT:
3627 CondCode = AArch64CC::MI;
3628 break;
3629 case ISD::SETOLE:
3630 CondCode = AArch64CC::LS;
3631 break;
3632 case ISD::SETONE:
3633 CondCode = AArch64CC::MI;
3634 CondCode2 = AArch64CC::GT;
3635 break;
3636 case ISD::SETO:
3637 CondCode = AArch64CC::VC;
3638 break;
3639 case ISD::SETUO:
3640 CondCode = AArch64CC::VS;
3641 break;
3642 case ISD::SETUEQ:
3643 CondCode = AArch64CC::EQ;
3644 CondCode2 = AArch64CC::VS;
3645 break;
3646 case ISD::SETUGT:
3647 CondCode = AArch64CC::HI;
3648 break;
3649 case ISD::SETUGE:
3650 CondCode = AArch64CC::PL;
3651 break;
3652 case ISD::SETLT:
3653 case ISD::SETULT:
3654 CondCode = AArch64CC::LT;
3655 break;
3656 case ISD::SETLE:
3657 case ISD::SETULE:
3658 CondCode = AArch64CC::LE;
3659 break;
3660 case ISD::SETNE:
3661 case ISD::SETUNE:
3662 CondCode = AArch64CC::NE;
3663 break;
3664 }
3665}
3666
3667/// Convert a DAG fp condition code to an AArch64 CC.
3668/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3669/// should be AND'ed instead of OR'ed.
3671 AArch64CC::CondCode &CondCode,
3672 AArch64CC::CondCode &CondCode2) {
3673 CondCode2 = AArch64CC::AL;
3674 switch (CC) {
3675 default:
3676 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3677 assert(CondCode2 == AArch64CC::AL);
3678 break;
3679 case ISD::SETONE:
3680 // (a one b)
3681 // == ((a olt b) || (a ogt b))
3682 // == ((a ord b) && (a une b))
3683 CondCode = AArch64CC::VC;
3684 CondCode2 = AArch64CC::NE;
3685 break;
3686 case ISD::SETUEQ:
3687 // (a ueq b)
3688 // == ((a uno b) || (a oeq b))
3689 // == ((a ule b) && (a uge b))
3690 CondCode = AArch64CC::PL;
3691 CondCode2 = AArch64CC::LE;
3692 break;
3693 }
3694}
3695
3696/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3697/// CC usable with the vector instructions. Fewer operations are available
3698/// without a real NZCV register, so we have to use less efficient combinations
3699/// to get the same effect.
3701 AArch64CC::CondCode &CondCode,
3702 AArch64CC::CondCode &CondCode2,
3703 bool &Invert) {
3704 Invert = false;
3705 switch (CC) {
3706 default:
3707 // Mostly the scalar mappings work fine.
3708 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3709 break;
3710 case ISD::SETUO:
3711 Invert = true;
3712 [[fallthrough]];
3713 case ISD::SETO:
3714 CondCode = AArch64CC::MI;
3715 CondCode2 = AArch64CC::GE;
3716 break;
3717 case ISD::SETUEQ:
3718 case ISD::SETULT:
3719 case ISD::SETULE:
3720 case ISD::SETUGT:
3721 case ISD::SETUGE:
3722 // All of the compare-mask comparisons are ordered, but we can switch
3723 // between the two by a double inversion. E.g. ULE == !OGT.
3724 Invert = true;
3725 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3726 CondCode, CondCode2);
3727 break;
3728 }
3729}
3730
3731/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
3733 // TODO: Should be TargetConstant (need to s/imm/timm in patterns).
3734 return DAG.getConstant(CC, SDLoc(), CondCodeVT);
3735}
3736
3738 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3739 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3740 LLVM_DEBUG(dbgs() << "Is imm " << C
3741 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3742 return IsLegal;
3743}
3744
3746 // Works for negative immediates too, as it can be written as an ADDS
3747 // instruction with a negated immediate.
3748 return isLegalArithImmed(C.abs().getZExtValue());
3749}
3750
3752 uint64_t Imm = C.getZExtValue();
3754 AArch64_IMM::expandMOVImm(Imm, 32, Insn);
3755 return Insn.size();
3756}
3757
3759 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3760 if (Op->getFlags().hasNoSignedWrap())
3761 return true;
3762
3763 // We can still figure out if the second operand is safe to use
3764 // in a CMN instruction by checking if it is known to be not the minimum
3765 // signed value. If it is not, then we can safely use CMN.
3766 // Note: We can eventually remove this check and simply rely on
3767 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3768 // consistently sets them appropriately when making said nodes.
3769
3770 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
3771 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3772}
3773
3774 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3775// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3776// can be set differently by this operation. It comes down to whether
3777// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3778// everything is fine. If not then the optimization is wrong. Thus general
3779// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3780//
3781// So, finally, the only LLVM-native comparisons that don't mention C or V
3782// are the ones that aren't unsigned comparisons. They're the only ones we can
3783// safely use CMN for in the absence of information about op2.
3785 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3786 (isIntEqualitySetCC(CC) ||
3787 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3788 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
3789}
3790
3792 SelectionDAG &DAG, SDValue Chain,
3793 bool IsSignaling) {
3794 EVT VT = LHS.getValueType();
3795 assert(VT != MVT::f128);
3796
3797 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3798
3799 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3800 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3801 {Chain, LHS});
3802 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3803 {LHS.getValue(1), RHS});
3804 Chain = RHS.getValue(1);
3805 }
3806 unsigned Opcode =
3807 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3808 return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
3809}
3810
3812 const SDLoc &DL, SelectionDAG &DAG) {
3813 EVT VT = LHS.getValueType();
3814 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3815
3816 if (VT.isFloatingPoint()) {
3817 assert(VT != MVT::f128);
3818 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3819 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3820 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3821 }
3822 return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
3823 }
3824
3825 // The CMP instruction is just an alias for SUBS, and representing it as
3826 // SUBS means that it's possible to get CSE with subtract operations.
3827 // A later phase can perform the optimization of setting the destination
3828 // register to WZR/XZR if it ends up being unused.
3829 unsigned Opcode = AArch64ISD::SUBS;
3830
3831 if (isCMN(RHS, CC, DAG)) {
3832     // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3833 Opcode = AArch64ISD::ADDS;
3834 RHS = RHS.getOperand(1);
3835 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3836 isIntEqualitySetCC(CC)) {
3837     // As we are looking for EQ/NE compares, the operands can be commuted; can
3838     // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3839 Opcode = AArch64ISD::ADDS;
3840 LHS = LHS.getOperand(1);
3841 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3842 if (LHS.getOpcode() == ISD::AND) {
3843 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3844 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3845 // of the signed comparisons.
3846 const SDValue ANDSNode =
3847 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
3848 LHS.getOperand(0), LHS.getOperand(1));
3849 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3850 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3851 return ANDSNode.getValue(1);
3852 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3853 // Use result of ANDS
3854 return LHS.getValue(1);
3855 }
3856 }
3857
3858 return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
3859 .getValue(1);
3860}
3861
3862/// \defgroup AArch64CCMP CMP;CCMP matching
3863///
3864/// These functions deal with the formation of CMP;CCMP;... sequences.
3865/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3866/// a comparison. They set the NZCV flags to a predefined value if their
3867/// predicate is false. This allows to express arbitrary conjunctions, for
3868/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3869/// expressed as:
3870/// cmp A
3871/// ccmp B, inv(CB), CA
3872/// check for CB flags
3873///
3874/// This naturally lets us implement chains of AND operations with SETCC
3875/// operands. And we can even implement some other situations by transforming
3876/// them:
3877/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3878/// negating the flags used in a CCMP/FCCMP operations.
3879/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3880/// by negating the flags we test for afterwards. i.e.
3881/// NEG (CMP CCMP CCCMP ...) can be implemented.
3882/// - Note that we can only ever negate all previously processed results.
3883/// What we can not implement by flipping the flags to test is a negation
3884/// of two sub-trees (because the negation affects all sub-trees emitted so
3885/// far, so the 2nd sub-tree we emit would also affect the first).
3886/// With those tools we can implement some OR operations:
3887/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3888/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3889/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3890/// elimination rules from earlier to implement the whole thing as a
3891/// CCMP/FCCMP chain.
3892///
3893/// As complete example:
3894 /// or (or (setCA (cmp A)) (setCB (cmp B)))
3895 ///    (and (setCC (cmp C)) (setCD (cmp D)))
3896 /// can be reassociated to:
3897 /// or (and (setCC (cmp C)) (setCD (cmp D)))
3898 ///    (or (setCA (cmp A)) (setCB (cmp B)))
3899 /// can be transformed to:
3900 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3901 ///          (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3902/// which can be implemented as:
3903/// cmp C
3904/// ccmp D, inv(CD), CC
3905/// ccmp A, CA, inv(CD)
3906/// ccmp B, CB, inv(CA)
3907/// check for CB flags
3908///
3909/// A counterexample is "or (and A B) (and C D)" which translates to
3910/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3911/// can only implement 1 of the inner (not) operations, but not both!
3912/// @{
3913
3914/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3916 ISD::CondCode CC, SDValue CCOp,
3918 AArch64CC::CondCode OutCC,
3919 const SDLoc &DL, SelectionDAG &DAG) {
3920 unsigned Opcode = 0;
3921 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3922
3923 if (LHS.getValueType().isFloatingPoint()) {
3924 assert(LHS.getValueType() != MVT::f128);
3925 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3926 LHS.getValueType() == MVT::bf16) {
3927 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3928 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3929 }
3930 Opcode = AArch64ISD::FCCMP;
3931 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3932 APInt Imm = Const->getAPIntValue();
3933 if (Imm.isNegative() && Imm.sgt(-32)) {
3934 Opcode = AArch64ISD::CCMN;
3935 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3936 }
3937 } else if (isCMN(RHS, CC, DAG)) {
3938 Opcode = AArch64ISD::CCMN;
3939 RHS = RHS.getOperand(1);
3940 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3941 isIntEqualitySetCC(CC)) {
3942     // As we are looking for EQ/NE compares, the operands can be commuted; can
3943     // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction?
3944 Opcode = AArch64ISD::CCMN;
3945 LHS = LHS.getOperand(1);
3946 }
3947 if (Opcode == 0)
3948 Opcode = AArch64ISD::CCMP;
3949
3950 SDValue Condition = getCondCode(DAG, Predicate);
3952 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3953 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3954 return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
3955}
3956
3957/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3958/// expressed as a conjunction. See \ref AArch64CCMP.
3959/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3960/// changing the conditions on the SETCC tests.
3961/// (this means we can call emitConjunctionRec() with
3962/// Negate==true on this sub-tree)
3963/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3964/// cannot do the negation naturally. We are required to
3965/// emit the subtree first in this case.
3966/// \param PreferFirst Set to true if processing this subtree first may
3967/// result in more efficient code.
3968 /// \param WillNegate Is true if we are called when the result of this
3969/// subexpression must be negated. This happens when the
3970/// outer expression is an OR. We can use this fact to know
3971/// that we have a double negation (or (or ...) ...) that
3972/// can be implemented for free.
3973static bool canEmitConjunction(SelectionDAG &DAG, const SDValue Val,
3974 bool &CanNegate, bool &MustBeFirst,
3975 bool &PreferFirst, bool WillNegate,
3976 unsigned Depth = 0) {
3977 if (!Val.hasOneUse())
3978 return false;
3979 unsigned Opcode = Val->getOpcode();
3980 if (Opcode == ISD::SETCC) {
3981 EVT VT = Val->getOperand(0).getValueType();
3982 if (VT == MVT::f128)
3983 return false;
3984 CanNegate = true;
3985 MustBeFirst = false;
3986 // Designate this operation as a preferred first operation if the result
3987 // of a SUB operation can be reused.
3988 PreferFirst = DAG.doesNodeExist(ISD::SUB, DAG.getVTList(VT),
3989 {Val->getOperand(0), Val->getOperand(1)});
3990 return true;
3991 }
3992 // Protect against exponential runtime and stack overflow.
3993 if (Depth > 6)
3994 return false;
3995 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3996 bool IsOR = Opcode == ISD::OR;
3997 SDValue O0 = Val->getOperand(0);
3998 SDValue O1 = Val->getOperand(1);
3999 bool CanNegateL;
4000 bool MustBeFirstL;
4001 bool PreferFirstL;
4002 if (!canEmitConjunction(DAG, O0, CanNegateL, MustBeFirstL, PreferFirstL,
4003 IsOR, Depth + 1))
4004 return false;
4005 bool CanNegateR;
4006 bool MustBeFirstR;
4007 bool PreferFirstR;
4008 if (!canEmitConjunction(DAG, O1, CanNegateR, MustBeFirstR, PreferFirstR,
4009 IsOR, Depth + 1))
4010 return false;
4011
4012 if (MustBeFirstL && MustBeFirstR)
4013 return false;
4014
4015 if (IsOR) {
4016 // For an OR expression we need to be able to naturally negate at least
4017 // one side or we cannot do the transformation at all.
4018 if (!CanNegateL && !CanNegateR)
4019 return false;
4020       // If the result of the OR will be negated and we can naturally negate
4021       // the leaves, then this sub-tree as a whole negates naturally.
4022 CanNegate = WillNegate && CanNegateL && CanNegateR;
4023 // If we cannot naturally negate the whole sub-tree, then this must be
4024 // emitted first.
4025 MustBeFirst = !CanNegate;
4026 } else {
4027 assert(Opcode == ISD::AND && "Must be OR or AND");
4028 // We cannot naturally negate an AND operation.
4029 CanNegate = false;
4030 MustBeFirst = MustBeFirstL || MustBeFirstR;
4031 }
4032 PreferFirst = PreferFirstL || PreferFirstR;
4033 return true;
4034 }
4035 return false;
4036}
4037
4038/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
4039/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
4040 /// Tries to transform the given i1 producing node @p Val to a series of
4041 /// compare and conditional compare operations. @returns an NZCV flags
4042 /// producing node and sets @p OutCC to the flags that should be tested or
4043 /// returns SDValue() if the transformation was not possible.
4044/// \p Negate is true if we want this sub-tree being negated just by changing
4045/// SETCC conditions.
4047 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
4049 // We're at a tree leaf, produce a conditional comparison operation.
4050 unsigned Opcode = Val->getOpcode();
4051 if (Opcode == ISD::SETCC) {
4052 SDValue LHS = Val->getOperand(0);
4053 SDValue RHS = Val->getOperand(1);
4054 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
4055 bool isInteger = LHS.getValueType().isInteger();
4056 if (Negate)
4057 CC = getSetCCInverse(CC, LHS.getValueType());
4058 SDLoc DL(Val);
4059 // Determine OutCC and handle FP special case.
4060 if (isInteger) {
4061 OutCC = changeIntCCToAArch64CC(CC, RHS);
4062 } else {
4063 assert(LHS.getValueType().isFloatingPoint());
4064 AArch64CC::CondCode ExtraCC;
4065 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
4066 // Some floating point conditions can't be tested with a single condition
4067 // code. Construct an additional comparison in this case.
4068 if (ExtraCC != AArch64CC::AL) {
4069 SDValue ExtraCmp;
4070 if (!CCOp.getNode())
4071 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
4072 else
4073 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
4074 ExtraCC, DL, DAG);
4075 CCOp = ExtraCmp;
4076 Predicate = ExtraCC;
4077 }
4078 }
4079
4080 // Produce a normal comparison if we are first in the chain
4081 if (!CCOp)
4082 return emitComparison(LHS, RHS, CC, DL, DAG);
4083 // Otherwise produce a ccmp.
4084 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
4085 DAG);
4086 }
4087 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
4088
4089 bool IsOR = Opcode == ISD::OR;
4090
4091 SDValue LHS = Val->getOperand(0);
4092 bool CanNegateL;
4093 bool MustBeFirstL;
4094 bool PreferFirstL;
4095 bool ValidL = canEmitConjunction(DAG, LHS, CanNegateL, MustBeFirstL,
4096 PreferFirstL, IsOR);
4097 assert(ValidL && "Valid conjunction/disjunction tree");
4098 (void)ValidL;
4099
4100 SDValue RHS = Val->getOperand(1);
4101 bool CanNegateR;
4102 bool MustBeFirstR;
4103 bool PreferFirstR;
4104 bool ValidR = canEmitConjunction(DAG, RHS, CanNegateR, MustBeFirstR,
4105 PreferFirstR, IsOR);
4106 assert(ValidR && "Valid conjunction/disjunction tree");
4107 (void)ValidR;
4108
4109 bool ShouldFirstL = PreferFirstL && !PreferFirstR && !MustBeFirstR;
4110
4111 // Swap sub-tree that must or should come first to the right side.
4112 if (MustBeFirstL || ShouldFirstL) {
4113 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4114 std::swap(LHS, RHS);
4115 std::swap(CanNegateL, CanNegateR);
4116 std::swap(MustBeFirstL, MustBeFirstR);
4117 }
4118
4119 bool NegateR;
4120 bool NegateAfterR;
4121 bool NegateL;
4122 bool NegateAfterAll;
4123 if (Opcode == ISD::OR) {
4124 // Swap the sub-tree that we can negate naturally to the left.
4125 if (!CanNegateL) {
4126 assert(CanNegateR && "at least one side must be negatable");
4127 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4128 assert(!Negate);
4129 std::swap(LHS, RHS);
4130 NegateR = false;
4131 NegateAfterR = true;
4132 } else {
4133 // Negate the left sub-tree if possible, otherwise negate the result.
4134 NegateR = CanNegateR;
4135 NegateAfterR = !CanNegateR;
4136 }
4137 NegateL = true;
4138 NegateAfterAll = !Negate;
4139 } else {
4140 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
4141 assert(!Negate && "Valid conjunction/disjunction tree");
4142
4143 NegateL = false;
4144 NegateR = false;
4145 NegateAfterR = false;
4146 NegateAfterAll = false;
4147 }
4148
4149 // Emit sub-trees.
4150 AArch64CC::CondCode RHSCC;
4151 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
4152 if (NegateAfterR)
4153 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
4154 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
4155 if (NegateAfterAll)
4156 OutCC = AArch64CC::getInvertedCondCode(OutCC);
4157 return CmpL;
4158}
4159
4160/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
4161/// In some cases this is even possible with OR operations in the expression.
4162/// See \ref AArch64CCMP.
4163/// \see emitConjunctionRec().
4165 AArch64CC::CondCode &OutCC) {
4166 bool DummyCanNegate;
4167 bool DummyMustBeFirst;
4168 bool DummyPreferFirst;
4169 if (!canEmitConjunction(DAG, Val, DummyCanNegate, DummyMustBeFirst,
4170 DummyPreferFirst, false))
4171 return SDValue();
4172
4173 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
4174}
4175
4176/// @}
4177
4178/// Returns how profitable it is to fold a comparison's operand's shift and/or
4179/// extension operations.
4181 auto isSupportedExtend = [&](SDValue V) {
4182 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
4183 return true;
4184
4185 if (V.getOpcode() == ISD::AND)
4186 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
4187 uint64_t Mask = MaskCst->getZExtValue();
4188 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
4189 }
4190
4191 return false;
4192 };
4193
4194 if (!Op.hasOneUse())
4195 return 0;
4196
4197 if (isSupportedExtend(Op))
4198 return 1;
4199
4200 unsigned Opc = Op.getOpcode();
4201 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4202 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4203 uint64_t Shift = ShiftCst->getZExtValue();
4204 if (isSupportedExtend(Op.getOperand(0)))
4205 return (Shift <= 4) ? 2 : 1;
4206 EVT VT = Op.getValueType();
4207 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4208 return 1;
4209 }
4210
4211 return 0;
4212}
4213
4214 // shouldBeAdjustedToZero() converts a comparison with one or negative one to
4215 // a comparison with zero, so emitComparison() can fold the AND into ANDS/TST.
4216 // Note that this only works for signed comparisons because of how ANDS works.
4217 static bool shouldBeAdjustedToZero(SDValue LHS, APInt &C, ISD::CondCode &CC) {
4218   // Only works for ANDS and AND.
4219 if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
4220 return false;
4221
4222 if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
4223 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4224 return true;
4225 }
4226
4227 if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
4228 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4229 return true;
4230 }
4231
4232 return false;
4233}
4234
4236 SDValue &AArch64cc, SelectionDAG &DAG,
4237 const SDLoc &DL) {
4238 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4239 EVT VT = RHS.getValueType();
4240 APInt C = RHSC->getAPIntValue();
4241 // shouldBeAdjustedToZero is a special case to better fold with
4242 // emitComparison().
4243 if (shouldBeAdjustedToZero(LHS, C, CC)) {
4244 // Adjust the constant to zero.
4245 // CC has already been adjusted.
4246 RHS = DAG.getConstant(0, DL, VT);
4247 } else if (!isLegalCmpImmed(C)) {
4248 unsigned NumImmForC = numberOfInstrToLoadImm(C);
4249 // Constant does not fit, try adjusting it by one?
4250 switch (CC) {
4251 default:
4252 break;
4253 case ISD::SETLT:
4254 case ISD::SETGE:
4255 if (!C.isMinSignedValue()) {
4256 APInt CMinusOne = C - 1;
4257 if (isLegalCmpImmed(CMinusOne) ||
4258 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4259 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4260 RHS = DAG.getConstant(CMinusOne, DL, VT);
4261 }
4262 }
4263 break;
4264 case ISD::SETULT:
4265 case ISD::SETUGE: {
4266       // C cannot be 0 here: 0 is a legal compare immediate, and we only reach
4267       // this point when C is not one.
4267 assert(!C.isZero() && "C should not be zero here");
4268 APInt CMinusOne = C - 1;
4269 if (isLegalCmpImmed(CMinusOne) ||
4270 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4271 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4272 RHS = DAG.getConstant(CMinusOne, DL, VT);
4273 }
4274 break;
4275 }
4276 case ISD::SETLE:
4277 case ISD::SETGT:
4278 if (!C.isMaxSignedValue()) {
4279 APInt CPlusOne = C + 1;
4280 if (isLegalCmpImmed(CPlusOne) ||
4281 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4282 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4283 RHS = DAG.getConstant(CPlusOne, DL, VT);
4284 }
4285 }
4286 break;
4287 case ISD::SETULE:
4288 case ISD::SETUGT: {
4289 if (!C.isAllOnes()) {
4290 APInt CPlusOne = C + 1;
4291 if (isLegalCmpImmed(CPlusOne) ||
4292 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4293 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4294 RHS = DAG.getConstant(CPlusOne, DL, VT);
4295 }
4296 }
4297 break;
4298 }
4299 }
4300 }
4301 }
4302
4303 // Comparisons are canonicalized so that the RHS operand is simpler than the
4304 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4305 // can fold some shift+extend operations on the RHS operand, so swap the
4306 // operands if that can be done.
4307 //
4308 // For example:
4309 // lsl w13, w11, #1
4310 // cmp w13, w12
4311 // can be turned into:
4312 // cmp w12, w11, lsl #1
4313 if (!isa<ConstantSDNode>(RHS) || !isLegalCmpImmed(RHS->getAsAPIntVal())) {
4314 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4315 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4316 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4317 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4318
4319 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4320 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4321 std::swap(LHS, RHS);
4322 CC = ISD::getSetCCSwappedOperands(CC);
4323 }
4324 }
4325
4326 SDValue Cmp;
4327 AArch64CC::CondCode AArch64CC;
4328 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4329 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
4330
4331 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4332 // For the i8 operand, the largest immediate is 255, so this can be easily
4333 // encoded in the compare instruction. For the i16 operand, however, the
4334 // largest immediate cannot be encoded in the compare.
4335 // Therefore, use a sign extending load and cmn to avoid materializing the
4336 // -1 constant. For example,
4337 // movz w1, #65535
4338 // ldrh w0, [x0, #0]
4339 // cmp w0, w1
4340 // >
4341 // ldrsh w0, [x0, #0]
4342 // cmn w0, #1
4343 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4344 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4345 // ensure both the LHS and RHS are truly zero extended and to make sure the
4346 // transformation is profitable.
4347 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4348 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4349 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4350 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4351 int16_t ValueofRHS = RHS->getAsZExtVal();
4352 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4353 SDValue SExt =
4354 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, LHS.getValueType(), LHS,
4355 DAG.getValueType(MVT::i16));
4356 Cmp = emitComparison(
4357 SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
4358 DL, DAG);
4360 }
4361 }
4362
4363 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4364 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4365 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4367 }
4368 }
4369 }
4370
4371 if (!Cmp) {
4372 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
4374 }
4375 AArch64cc = getCondCode(DAG, AArch64CC);
4376 return Cmp;
4377}
4378
4379static std::pair<SDValue, SDValue>
4380getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
4381 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4382 "Unsupported value type");
4383 SDValue Value, Overflow;
4384 SDLoc DL(Op);
4385 SDValue LHS = Op.getOperand(0);
4386 SDValue RHS = Op.getOperand(1);
4387 unsigned Opc = 0;
4388 switch (Op.getOpcode()) {
4389 default:
4390 llvm_unreachable("Unknown overflow instruction!");
4391 case ISD::SADDO:
4392 Opc = AArch64ISD::ADDS;
4393 CC = AArch64CC::VS;
4394 break;
4395 case ISD::UADDO:
4396 Opc = AArch64ISD::ADDS;
4397 CC = AArch64CC::HS;
4398 break;
4399 case ISD::SSUBO:
4400 Opc = AArch64ISD::SUBS;
4401 CC = AArch64CC::VS;
4402 break;
4403 case ISD::USUBO:
4404 Opc = AArch64ISD::SUBS;
4405 CC = AArch64CC::LO;
4406 break;
4407 // Multiply needs a little bit of extra work.
4408 case ISD::SMULO:
4409 case ISD::UMULO: {
4410 CC = AArch64CC::NE;
4411 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4412 if (Op.getValueType() == MVT::i32) {
4413 // Extend to 64-bits, then perform a 64-bit multiply.
4414 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4415 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4416 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4417 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4418 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4419
4420 // Check that the result fits into a 32-bit integer.
4421 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4422 if (IsSigned) {
4423 // cmp xreg, wreg, sxtw
4424 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4425 Overflow =
4426 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4427 } else {
4428 // tst xreg, #0xffffffff00000000
4429 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4430 Overflow =
4431 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4432 }
4433 break;
4434 }
4435 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4436 // For the 64 bit multiply
4437 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4438 if (IsSigned) {
4439 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4440 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4441 DAG.getConstant(63, DL, MVT::i64));
4442 // It is important that LowerBits is last, otherwise the arithmetic
4443 // shift will not be folded into the compare (SUBS).
4444 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4445 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4446 .getValue(1);
4447 } else {
4448 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4449 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4450 Overflow =
4451 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4452 DAG.getConstant(0, DL, MVT::i64),
4453 UpperBits).getValue(1);
4454 }
4455 break;
4456 }
4457 } // switch (...)
4458
4459 if (Opc) {
4460 SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
4461
4462 // Emit the AArch64 operation with overflow check.
4463 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4464 Overflow = Value.getValue(1);
4465 }
4466 return std::make_pair(Value, Overflow);
4467}
4468
4469SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4470 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4471 !Subtarget->isNeonAvailable()))
4472 return LowerToScalableOp(Op, DAG);
4473
4474 SDValue Sel = Op.getOperand(0);
4475 SDValue Other = Op.getOperand(1);
4476 SDLoc DL(Sel);
4477
4478 // If the operand is an overflow checking operation, invert the condition
4479 // code and kill the Not operation. I.e., transform:
4480 // (xor (overflow_op_bool, 1))
4481 // -->
4482 // (csel 1, 0, invert(cc), overflow_op_bool)
4483 // ... which later gets transformed to just a cset instruction with an
4484 // inverted condition code, rather than a cset + eor sequence.
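// Illustrative example (editorial sketch): xor'ing the i1 overflow result of
// llvm.sadd.with.overflow with true lowers to a single "cset w0, vc" instead
// of "cset w0, vs" followed by "eor w0, w0, #1".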
4485 if (isOverflowIntrOpRes(Sel)) {
4486 // Only lower legal XALUO ops.
4487 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4488 return SDValue();
4489
4490 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4491 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4492 AArch64CC::CondCode CC;
4493 SDValue Value, Overflow;
4494 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4495 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4496 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
4497 CCVal, Overflow);
4498 }
4499 // If neither operand is a SELECT_CC, give up.
4500 if (Sel.getOpcode() != ISD::SELECT_CC)
4501 std::swap(Sel, Other);
4502 if (Sel.getOpcode() != ISD::SELECT_CC)
4503 return Op;
4504
4505 // The folding we want to perform is:
4506 // (xor x, (select_cc a, b, cc, 0, -1) )
4507 // -->
4508 // (csel x, (xor x, -1), cc ...)
4509 //
4510 // The latter will get matched to a CSINV instruction.
4511
4512 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4513 SDValue LHS = Sel.getOperand(0);
4514 SDValue RHS = Sel.getOperand(1);
4515 SDValue TVal = Sel.getOperand(2);
4516 SDValue FVal = Sel.getOperand(3);
4517
4518 // FIXME: This could be generalized to non-integer comparisons.
4519 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4520 return Op;
4521
4522 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4523 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4524
4525 // The values aren't constants, this isn't the pattern we're looking for.
4526 if (!CFVal || !CTVal)
4527 return Op;
4528
4529 // We can commute the SELECT_CC by inverting the condition. This
4530 // might be needed to make this fit into a CSINV pattern.
4531 if (CTVal->isAllOnes() && CFVal->isZero()) {
4532 std::swap(TVal, FVal);
4533 std::swap(CTVal, CFVal);
4534 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4535 }
4536
4537 // If the constants line up, perform the transform!
4538 if (CTVal->isZero() && CFVal->isAllOnes()) {
4539 SDValue CCVal;
4540 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
4541
4542 FVal = Other;
4543 TVal = DAG.getNode(ISD::XOR, DL, Other.getValueType(), Other,
4544 DAG.getAllOnesConstant(DL, Other.getValueType()));
4545
4546 return DAG.getNode(AArch64ISD::CSEL, DL, Sel.getValueType(), FVal, TVal,
4547 CCVal, Cmp);
4548 }
4549
4550 return Op;
4551}
4552
4553// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4554// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4555// sets 'C' bit to 0.
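// Illustrative example (editorial sketch): with Invert == false this emits
// (SUBS value, 1), so C == 1 iff value != 0; with Invert == true it emits
// (SUBS 0, value), so C == 1 iff value == 0.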
4556static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4557 SDLoc DL(Value);
4558 EVT VT = Value.getValueType();
4559 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4560 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4561 SDValue Cmp =
4562 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
4563 return Cmp.getValue(1);
4564}
4565
4566// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4567// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4568static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4569 bool Invert) {
4570 assert(Glue.getResNo() == 1);
4571 SDLoc DL(Glue);
4572 SDValue Zero = DAG.getConstant(0, DL, VT);
4573 SDValue One = DAG.getConstant(1, DL, VT);
4575 SDValue CC = getCondCode(DAG, Cond);
4576 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4577}
4578
4579// Value is 1 if 'V' bit of NZCV is 1, else 0
4580static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4581 assert(Glue.getResNo() == 1);
4582 SDLoc DL(Glue);
4583 SDValue Zero = DAG.getConstant(0, DL, VT);
4584 SDValue One = DAG.getConstant(1, DL, VT);
4586 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4587}
4588
4589// This lowering is inefficient, but it will get cleaned up by
4590// `foldOverflowCheck`
4591static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4592 unsigned Opcode, bool IsSigned) {
4593 EVT VT0 = Op.getValue(0).getValueType();
4594 EVT VT1 = Op.getValue(1).getValueType();
4595
4596 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4597 return SDValue();
4598
4599 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4600 SDValue OpLHS = Op.getOperand(0);
4601 SDValue OpRHS = Op.getOperand(1);
4602 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4603
4604 SDLoc DL(Op);
4605
4606 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
4607 OpRHS, OpCarryIn);
4608
4609 SDValue OutFlag =
4610 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4611 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4612
4613 return DAG.getMergeValues({Sum, OutFlag}, DL);
4614}
4615
4616static SDValue lowerIntNeonIntrinsic(SDValue Op, unsigned Opcode,
4617 SelectionDAG &DAG,
4618 bool LastOperandIsImm = false) {
4619 if (Op.getValueType().isVector())
4620 return SDValue();
4621
4622 SDLoc DL(Op);
4624 const unsigned NumOperands = Op.getNumOperands();
4625 auto getFloatVT = [](EVT VT) {
4626 assert((VT == MVT::i32 || VT == MVT::i64) && "Unexpected VT");
4627 return VT == MVT::i32 ? MVT::f32 : MVT::f64;
4628 };
4629 auto bitcastToFloat = [&](SDValue Val) {
4630 return DAG.getBitcast(getFloatVT(Val.getValueType()), Val);
4631 };
4632
4633 // Skip first operand as it is intrinsic ID.
4634 for (unsigned I = 1; I < NumOperands; ++I) {
4635 SDValue Val = Op.getOperand(I);
4636 const bool KeepInt = LastOperandIsImm && (I == NumOperands - 1);
4637 NewOps.push_back(KeepInt ? Val : bitcastToFloat(Val));
4638 }
4639 EVT OrigVT = Op.getValueType();
4640 SDValue OpNode = DAG.getNode(Opcode, DL, getFloatVT(OrigVT), NewOps);
4641 return DAG.getBitcast(OrigVT, OpNode);
4642}
4643
4645 // Let legalize expand this if it isn't a legal type yet.
4646 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4647 return SDValue();
4648
4649 SDLoc DL(Op);
4650 AArch64CC::CondCode CC;
4651 // The actual operation that sets the overflow or carry flag.
4652 SDValue Value, Overflow;
4653 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4654
4655 // We use 0 and 1 as false and true values.
4656 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4657 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4658
4659 // We use an inverted condition, because the conditional select is inverted
4660 // too. This will allow it to be selected to a single instruction:
4661 // CSINC Wd, WZR, WZR, invert(cond).
4662 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4663 Overflow =
4664 DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
4665
4666 return DAG.getMergeValues({Value, Overflow}, DL);
4667}
4668
4669// Prefetch operands are:
4670// 1: Address to prefetch
4671// 2: bool isWrite
4672// 3: int locality (0 = no locality ... 3 = extreme locality)
4673// 4: bool isDataCache
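// Illustrative example (editorial sketch): llvm.prefetch(p, /*rw=*/0,
// /*locality=*/3, /*cache type=*/1) encodes to PrfOp == 0 and is emitted as
// "prfm pldl1keep, [x0]".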
4674static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4675 SDLoc DL(Op);
4676 unsigned IsWrite = Op.getConstantOperandVal(2);
4677 unsigned Locality = Op.getConstantOperandVal(3);
4678 unsigned IsData = Op.getConstantOperandVal(4);
4679
4680 bool IsStream = !Locality;
4681 // When the locality number is set
4682 if (Locality) {
4683 // The front-end should have filtered out the out-of-range values
4684 assert(Locality <= 3 && "Prefetch locality out-of-range");
4685 // The locality degree is the opposite of the cache speed.
4686 // Put the number the other way around.
4687 // The encoding starts at 0 for level 1
4688 Locality = 3 - Locality;
4689 }
4690
4691 // Build the mask value encoding the expected behavior.
4692 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4693 (!IsData << 3) | // IsDataCache bit
4694 (Locality << 1) | // Cache level bits
4695 (unsigned)IsStream; // Stream bit
4696 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4697 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4698 Op.getOperand(1));
4699}
4700
4701// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z
4702// is a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of
4703// SUBS (AND X Y) Z, which combines better with emitComparison().
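// Illustrative example (editorial sketch):
//   setcc (and x, 0xfff), 16, ult  -->  setcc (and x, 0xff0), 0, eq
// since 16 is a power of 2 and ~(16 - 1) == ~0xf.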
4705 SelectionDAG &DAG, const SDLoc DL) {
4706 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4707 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4708 ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4709 if (LHSConstOp && RHSConst) {
4710 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4711 uint64_t RHSConstant = RHSConst->getZExtValue();
4712 if (isPowerOf2_64(RHSConstant)) {
4713 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4714 LHS =
4715 DAG.getNode(ISD::AND, DL, LHS.getValueType(), LHS.getOperand(0),
4716 DAG.getConstant(NewMaskValue, DL, LHS.getValueType()));
4717 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4718 CC = ISD::SETEQ;
4719 }
4720 }
4721 }
4722}
4723
4724SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4725 SelectionDAG &DAG) const {
4726 EVT VT = Op.getValueType();
4727 if (VT.isScalableVector()) {
4728 SDValue SrcVal = Op.getOperand(0);
4729
4730 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4731 // Break conversion in two with the first part converting to f32 and the
4732 // second using native f32->VT instructions.
4733 SDLoc DL(Op);
4734 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4735 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4736 }
4737
4738 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4739 }
4740
4741 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4742 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4743
4744 bool IsStrict = Op->isStrictFPOpcode();
4745 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4746 EVT Op0VT = Op0.getValueType();
4747 if (VT == MVT::f64) {
4748 // f32->f64 and f16->f64 extends are legal.
4749 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4750 return Op;
4751 // Split bf16->f64 extends into two fpextends.
4752 if (Op0VT == MVT::bf16 && IsStrict) {
4753 SDValue Ext1 =
4754 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4755 {Op0, Op.getOperand(0)});
4756 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4757 {Ext1, Ext1.getValue(1)});
4758 }
4759 if (Op0VT == MVT::bf16)
4760 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4761 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4762 return SDValue();
4763 }
4764
4765 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4766 return SDValue();
4767}
4768
4769SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4770 SelectionDAG &DAG) const {
4771 EVT VT = Op.getValueType();
4772 bool IsStrict = Op->isStrictFPOpcode();
4773 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4774 EVT SrcVT = SrcVal.getValueType();
4775 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4776
4777 if (VT.isScalableVector()) {
4778 // Let common code split the operation.
4779 if (SrcVT == MVT::nxv8f32)
4780 return Op;
4781
4782 if (VT.getScalarType() != MVT::bf16)
4783 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4784
4785 SDLoc DL(Op);
4786 constexpr EVT I32 = MVT::nxv4i32;
4787 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4788
4789 SDValue NaN;
4790 SDValue Narrow;
4791
4792 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4793 if (Subtarget->hasBF16())
4794 return LowerToPredicatedOp(Op, DAG,
4795 AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4796
4797 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4798
4799 // Set the quiet bit.
4800 if (!DAG.isKnownNeverSNaN(SrcVal))
4801 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4802 } else if (SrcVT == MVT::nxv2f64 &&
4803 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4804 // Round to float without introducing rounding errors and try again.
4805 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4806 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4807 Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
4808
4810 if (IsStrict)
4811 NewOps.push_back(Op.getOperand(0));
4812 NewOps.push_back(Narrow);
4813 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4814 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4815 } else
4816 return SDValue();
4817
4818 if (!Trunc) {
4819 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4820 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4821 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4822 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4823 }
4824
4825 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4826 // 0x80000000.
4827 if (NaN) {
4828 EVT I1 = I32.changeElementType(*DAG.getContext(), MVT::i1);
4829 EVT CondVT = VT.changeElementType(*DAG.getContext(), MVT::i1);
4830 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4831 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4832 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4833 }
4834
4835 // Now that we have rounded, shift the bits into position.
4836 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4837 return getSVESafeBitCast(VT, Narrow, DAG);
4838 }
4839
4840 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4841 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4842
4843 // Expand cases where the result type is BF16 but we don't have hardware
4844 // instructions to lower it.
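// A rough sketch of the f32 -> bf16 round-to-nearest-even expansion built
// below (editorial addition, NaN handling omitted):
//   uint32_t Bits = bit_cast<uint32_t>(f);
//   Bits += 0x7fff + ((Bits >> 16) & 1);   // add rounding bias
//   bf16 result = uint16_t(Bits >> 16);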
4845 if (VT.getScalarType() == MVT::bf16 &&
4846 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4847 Subtarget->hasBF16())) {
4848 SDLoc DL(Op);
4849 SDValue Narrow = SrcVal;
4850 SDValue NaN;
4851 EVT I32 = SrcVT.changeElementType(*DAG.getContext(), MVT::i32);
4852 EVT F32 = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
4853 if (SrcVT.getScalarType() == MVT::f32) {
4854 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4855 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4856 if (!NeverSNaN) {
4857 // Set the quiet bit.
4858 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow,
4859 DAG.getConstant(0x400000, DL, I32));
4860 }
4861 } else if (SrcVT.getScalarType() == MVT::f64) {
4862 Narrow = DAG.getNode(AArch64ISD::FCVTXN, DL, F32, Narrow);
4863 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4864 } else {
4865 return SDValue();
4866 }
4867 if (!Trunc) {
4868 SDValue One = DAG.getConstant(1, DL, I32);
4869 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4870 DAG.getShiftAmountConstant(16, I32, DL));
4871 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, One);
4872 SDValue RoundingBias =
4873 DAG.getNode(ISD::ADD, DL, I32, DAG.getConstant(0x7fff, DL, I32), Lsb);
4874 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4875 }
4876
4877 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4878 // 0x80000000.
4879 if (NaN) {
4880 SDValue IsNaN = DAG.getSetCC(
4881 DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4882 SrcVal, SrcVal, ISD::SETUO);
4883 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4884 }
4885
4886 // Now that we have rounded, shift the bits into position.
4887 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4888 DAG.getShiftAmountConstant(16, I32, DL));
4889 if (VT.isVector()) {
4890 EVT I16 = I32.changeVectorElementType(*DAG.getContext(), MVT::i16);
4891 Narrow = DAG.getNode(ISD::TRUNCATE, DL, I16, Narrow);
4892 return DAG.getNode(ISD::BITCAST, DL, VT, Narrow);
4893 }
4894 Narrow = DAG.getNode(ISD::BITCAST, DL, F32, Narrow);
4895 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Narrow);
4896 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, DL)
4897 : Result;
4898 }
4899
4900 if (SrcVT != MVT::f128) {
4901 // Expand cases where the input is a vector bigger than NEON.
4903 return SDValue();
4904
4905 // It's legal except when f128 is involved
4906 return Op;
4907 }
4908
4909 return SDValue();
4910}
4911
4912SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4913 SelectionDAG &DAG) const {
4914 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4915 // Any additional optimization in this function should be recorded
4916 // in the cost tables.
4917 bool IsStrict = Op->isStrictFPOpcode();
4918 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4919 EVT VT = Op.getValueType();
4920
4921 assert(!(IsStrict && VT.isScalableVector()) &&
4922 "Unimplemented SVE support for STRICT_FP_TO_INT!");
4923
4924 // f16 conversions are promoted to f32 when full fp16 is not supported.
4925 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4926 InVT.getVectorElementType() == MVT::bf16) {
4927 EVT NewVT = VT.changeElementType(*DAG.getContext(), MVT::f32);
4928 SDLoc DL(Op);
4929 if (IsStrict) {
4930 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {NewVT, MVT::Other},
4931 {Op.getOperand(0), Op.getOperand(1)});
4932 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4933 {Ext.getValue(1), Ext.getValue(0)});
4934 }
4935 return DAG.getNode(
4936 Op.getOpcode(), DL, Op.getValueType(),
4937 DAG.getNode(ISD::FP_EXTEND, DL, NewVT, Op.getOperand(0)));
4938 }
4939
4940 if (VT.isScalableVector()) {
4941 if (VT.getVectorElementType() == MVT::i1) {
4942 SDLoc DL(Op);
4943 EVT CvtVT = getPromotedVTForPredicate(VT);
4944 SDValue Cvt = DAG.getNode(Op.getOpcode(), DL, CvtVT, Op.getOperand(0));
4945 SDValue Zero = DAG.getConstant(0, DL, CvtVT);
4946 return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
4947 }
4948
4949 // Let common code split the operation.
4950 if (InVT == MVT::nxv8f32)
4951 return Op;
4952
4953 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4954 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4955 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4956 return LowerToPredicatedOp(Op, DAG, Opcode);
4957 }
4958
4959 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4960 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4961 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4962
4963 uint64_t VTSize = VT.getFixedSizeInBits();
4964 uint64_t InVTSize = InVT.getFixedSizeInBits();
4965 if (VTSize < InVTSize) {
4966 SDLoc DL(Op);
4967 if (IsStrict) {
4969 SDValue Cv = DAG.getNode(Op.getOpcode(), DL, {InVT, MVT::Other},
4970 {Op.getOperand(0), Op.getOperand(1)});
4971 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4972 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, DL);
4973 }
4974 SDValue Cv =
4975 DAG.getNode(Op.getOpcode(), DL, InVT.changeVectorElementTypeToInteger(),
4976 Op.getOperand(0));
4977 return DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4978 }
4979
4980 if (VTSize > InVTSize) {
4981 SDLoc DL(Op);
4982 MVT ExtVT =
4985 if (IsStrict) {
4986 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {ExtVT, MVT::Other},
4987 {Op.getOperand(0), Op.getOperand(1)});
4988 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4989 {Ext.getValue(1), Ext.getValue(0)});
4990 }
4991 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, Op.getOperand(0));
4992 return DAG.getNode(Op.getOpcode(), DL, VT, Ext);
4993 }
4994
4995 // Use a scalar operation for conversions between single-element vectors of
4996 // the same size.
4997 if (InVT.getVectorNumElements() == 1) {
4998 SDLoc DL(Op);
4999 SDValue Extract = DAG.getNode(
5001 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, DL, MVT::i64));
5002 EVT ScalarVT = VT.getScalarType();
5003 if (IsStrict)
5004 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
5005 {Op.getOperand(0), Extract});
5006 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
5007 }
5008
5009 // Type changing conversions are illegal.
5010 return Op;
5011}
5012
5013SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
5014 SelectionDAG &DAG) const {
5015 bool IsStrict = Op->isStrictFPOpcode();
5016 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5017
5018 if (SrcVal.getValueType().isVector())
5019 return LowerVectorFP_TO_INT(Op, DAG);
5020
5021 // f16 conversions are promoted to f32 when full fp16 is not supported.
5022 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
5023 SrcVal.getValueType() == MVT::bf16) {
5024 SDLoc DL(Op);
5025 if (IsStrict) {
5026 SDValue Ext =
5027 DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
5028 {Op.getOperand(0), SrcVal});
5029 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
5030 {Ext.getValue(1), Ext.getValue(0)});
5031 }
5032 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
5033 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, SrcVal));
5034 }
5035
5036 if (SrcVal.getValueType() != MVT::f128) {
5037 // It's legal except when f128 is involved
5038 return Op;
5039 }
5040
5041 return SDValue();
5042}
5043
5044SDValue
5045AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
5046 SelectionDAG &DAG) const {
5047 // AArch64 FP-to-int conversions saturate to the destination element size, so
5048 // we can lower common saturating conversions to simple instructions.
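// Illustrative example (editorial sketch): llvm.fptosi.sat.v4i32.v4f32 can be
// emitted as a single "fcvtzs v0.4s, v0.4s", because the instruction already
// saturates to the i32 range.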
5049 SDValue SrcVal = Op.getOperand(0);
5050 EVT SrcVT = SrcVal.getValueType();
5051 EVT DstVT = Op.getValueType();
5052 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5053
5054 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
5055 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
5056 uint64_t SatWidth = SatVT.getScalarSizeInBits();
5057 assert(SatWidth <= DstElementWidth &&
5058 "Saturation width cannot exceed result width");
5059
5060 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
5061 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
5062 // types, so this is hard to reach.
5063 if (DstVT.isScalableVector())
5064 return SDValue();
5065
5066 EVT SrcElementVT = SrcVT.getVectorElementType();
5067
5068 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
5069 SDLoc DL(Op);
5070 SDValue SrcVal2;
5071 if ((SrcElementVT == MVT::f16 &&
5072 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
5073 SrcElementVT == MVT::bf16) {
5074 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
5075 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
5076 // If we are extending to a v8f32, split into two v4f32 to produce legal
5077 // types.
5078 if (F32VT.getSizeInBits() > 128) {
5079 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
5080 F32VT = F32VT.getHalfNumVectorElementsVT();
5081 }
5082 SrcVT = F32VT;
5083 SrcElementVT = MVT::f32;
5084 SrcElementWidth = 32;
5085 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
5086 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
5087 return SDValue();
5088
5089 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
5090 // width and produce a fcvtzu.
5091 if (SatWidth == 64 && SrcElementWidth < 64) {
5092 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
5093 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
5094 SrcVT = F64VT;
5095 SrcElementVT = MVT::f64;
5096 SrcElementWidth = 64;
5097 }
5098 // Cases that we can emit directly.
5099 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
5100 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5101 DAG.getValueType(DstVT.getScalarType()));
5102 if (SrcVal2) {
5103 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
5104 DAG.getValueType(DstVT.getScalarType()));
5105 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
5106 }
5107 return Res;
5108 }
5109
5110 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5111 // result. This is only valid if the legal cvt is larger than the saturate
5112 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
5113 // (at least until sqxtn is selected).
5114 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
5115 return SDValue();
5116
5117 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
5118 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
5119 DAG.getValueType(IntVT.getScalarType()));
5120 SDValue NativeCvt2 =
5121 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
5122 DAG.getValueType(IntVT.getScalarType()))
5123 : SDValue();
5124 SDValue Sat, Sat2;
5125 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5126 SDValue MinC = DAG.getConstant(
5127 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
5128 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
5129 SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
5130 SDValue MaxC = DAG.getConstant(
5131 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
5132 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
5133 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
5134 } else {
5135 SDValue MinC = DAG.getConstant(
5136 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
5137 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
5138 Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
5139 }
5140
5141 if (SrcVal2)
5142 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
5144 Sat, Sat2);
5145
5146 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5147}
5148
5149SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
5150 SelectionDAG &DAG) const {
5151 // AArch64 FP-to-int conversions saturate to the destination register size, so
5152 // we can lower common saturating conversions to simple instructions.
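// Illustrative example (editorial sketch): llvm.fptosi.sat.i32.f32 maps to a
// single "fcvtzs w0, s0"; the saturating semantics come for free.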
5153 SDValue SrcVal = Op.getOperand(0);
5154 EVT SrcVT = SrcVal.getValueType();
5155
5156 if (SrcVT.isVector())
5157 return LowerVectorFP_TO_INT_SAT(Op, DAG);
5158
5159 EVT DstVT = Op.getValueType();
5160 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5161 uint64_t SatWidth = SatVT.getScalarSizeInBits();
5162 uint64_t DstWidth = DstVT.getScalarSizeInBits();
5163 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
5164
5165 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
5166 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
5167 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
5168 SrcVT = MVT::f32;
5169 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
5170 SrcVT != MVT::bf16)
5171 return SDValue();
5172
5173 SDLoc DL(Op);
5174 // Cases that we can emit directly.
5175 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
5176 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
5177 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
5178 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5179 DAG.getValueType(DstVT));
5180
5181 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5182 // result. This is only valid if the legal cvt is larger than the saturate
5183 // width.
5184 if (DstWidth < SatWidth)
5185 return SDValue();
5186
5187 if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
5188 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5189 SDValue CVTf32 =
5190 DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
5191 SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
5192 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
5193 DAG.getValueType(SatVT));
5194 }
5195 SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
5196 return DAG.getBitcast(DstVT, CVTf32);
5197 }
5198
5199 SDValue NativeCvt =
5200 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
5201 SDValue Sat;
5202 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5203 SDValue MinC = DAG.getConstant(
5204 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
5205 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
5206 SDValue MaxC = DAG.getConstant(
5207 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
5208 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
5209 } else {
5210 SDValue MinC = DAG.getConstant(
5211 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
5212 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
5213 }
5214
5215 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5216}
5217
5218SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
5219 SelectionDAG &DAG) const {
5220 EVT VT = Op.getValueType();
5221 SDValue Src = Op.getOperand(0);
5222 SDLoc DL(Op);
5223
5224 assert(VT.isVector() && "Expected vector type");
5225
5226 EVT CastVT = VT.changeVectorElementType(
5227 *DAG.getContext(), Src.getValueType().getVectorElementType());
5228
5229 // Round the floating-point value into a floating-point register with the
5230 // current rounding mode.
5231 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
5232
5233 // Truncate the rounded floating point to an integer.
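// Illustrative example (editorial sketch): for llvm.lrint on <4 x float> this
// is roughly "frintx v0.4s, v0.4s" followed by "fcvtzs v0.4s, v0.4s".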
5234 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
5236}
5237
5238SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5239 SelectionDAG &DAG) const {
5240 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5241 // Any additional optimization in this function should be recorded
5242 // in the cost tables.
5243 bool IsStrict = Op->isStrictFPOpcode();
5244 EVT VT = Op.getValueType();
5245 SDLoc DL(Op);
5246 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5247 EVT InVT = In.getValueType();
5248 unsigned Opc = Op.getOpcode();
5249 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5250
5251 assert(!(IsStrict && VT.isScalableVector()) &&
5252 "Unimplemented SVE support for ISD::STRICT_INT_TO_FP!");
5253
5254 // NOTE: i1->bf16 does not require promotion to f32.
5255 if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
5256 SDValue FalseVal = DAG.getConstantFP(0.0, DL, VT);
5257 SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, DL, VT)
5258 : DAG.getConstantFP(1.0, DL, VT);
5259 return DAG.getNode(ISD::VSELECT, DL, VT, In, TrueVal, FalseVal);
5260 }
5261
5262 // Promote bf16 conversions to f32.
5263 if (VT.getVectorElementType() == MVT::bf16) {
5264 EVT F32 = VT.changeElementType(*DAG.getContext(), MVT::f32);
5265 if (IsStrict) {
5266 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {F32, MVT::Other},
5267 {Op.getOperand(0), In});
5268 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5269 {Op.getValueType(), MVT::Other},
5270 {Val.getValue(1), Val.getValue(0),
5271 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5272 }
5273 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5274 DAG.getNode(Op.getOpcode(), DL, F32, In),
5275 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5276 }
5277
5278 if (VT.isScalableVector()) {
5279 // Let common code split the operation.
5280 if (VT == MVT::nxv8f32)
5281 return Op;
5282
5283 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5284 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5285 return LowerToPredicatedOp(Op, DAG, Opcode);
5286 }
5287
5288 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5289 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5290 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5291
5292 uint64_t VTSize = VT.getFixedSizeInBits();
5293 uint64_t InVTSize = InVT.getFixedSizeInBits();
5294 if (VTSize < InVTSize) {
5295 // AArch64 doesn't have a direct vector instruction to convert
5296 // fixed point to floating point AND narrow it at the same time.
5297 // Additional rounding when the target is f32/f64 causes double
5298 // rounding issues. Conversion to f16 is fine due to narrow width.
5299 bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
5300 bool IsTargetf16 = false;
5301 if (Op.hasOneUse() &&
5302 Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
5303 // Some vector types are split in half during legalization, followed by a
5304 // concatenation and a rounding back to the original vector type. If we end
5305 // up resolving to an f16 type, we shouldn't worry about rounding errors.
5306 SDNode *U = *Op->user_begin();
5307 if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
5308 EVT TmpVT = U->user_begin()->getValueType(0);
5309 if (TmpVT.getScalarType() == MVT::f16)
5310 IsTargetf16 = true;
5311 }
5312 }
5313
5314 if (IsTargetf32 && !IsTargetf16) {
5315 return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
5316 }
5317
5318 MVT CastVT =
5320 InVT.getVectorNumElements());
5321 if (IsStrict) {
5322 In = DAG.getNode(Opc, DL, {CastVT, MVT::Other}, {Op.getOperand(0), In});
5323 return DAG.getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
5324 {In.getValue(1), In.getValue(0),
5325 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5326 }
5327 In = DAG.getNode(Opc, DL, CastVT, In);
5328 return DAG.getNode(ISD::FP_ROUND, DL, VT, In,
5329 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5330 }
5331
5332 if (VTSize > InVTSize) {
5333 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5334 EVT CastVT = VT.changeVectorElementTypeToInteger();
5335 In = DAG.getNode(CastOpc, DL, CastVT, In);
5336 if (IsStrict)
5337 return DAG.getNode(Opc, DL, {VT, MVT::Other}, {Op.getOperand(0), In});
5338 return DAG.getNode(Opc, DL, VT, In);
5339 }
5340
5341 // Use a scalar operation for conversions between single-element vectors of
5342 // the same size.
5343 if (VT.getVectorNumElements() == 1) {
5344 SDValue Extract =
5346 DAG.getConstant(0, DL, MVT::i64));
5347 EVT ScalarVT = VT.getScalarType();
5348 if (IsStrict)
5349 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
5350 {Op.getOperand(0), Extract});
5351 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
5352 }
5353
5354 return Op;
5355}
5356
5357SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5358 SelectionDAG &DAG) const {
5359 if (Op.getValueType().isVector())
5360 return LowerVectorINT_TO_FP(Op, DAG);
5361
5362 bool IsStrict = Op->isStrictFPOpcode();
5363 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5364
5365 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5366 Op->getOpcode() == ISD::SINT_TO_FP;
5367
5368 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5369 SDLoc DL(Op);
5370 if (IsStrict) {
5371 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {PromoteVT, MVT::Other},
5372 {Op.getOperand(0), SrcVal});
5373 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5374 {Op.getValueType(), MVT::Other},
5375 {Val.getValue(1), Val.getValue(0),
5376 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5377 }
5378 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5379 DAG.getNode(Op.getOpcode(), DL, PromoteVT, SrcVal),
5380 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5381 };
5382
5383 if (Op.getValueType() == MVT::bf16) {
5384 unsigned MaxWidth = IsSigned
5385 ? DAG.ComputeMaxSignificantBits(SrcVal)
5386 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
5387 // bf16 conversions are promoted to f32 when converting from i16.
5388 if (MaxWidth <= 24) {
5389 return IntToFpViaPromotion(MVT::f32);
5390 }
5391
5392 // bf16 conversions are promoted to f64 when converting from i32.
5393 if (MaxWidth <= 53) {
5394 return IntToFpViaPromotion(MVT::f64);
5395 }
5396
5397 // We need to be careful about i64 -> bf16.
5398 // Consider an i32 22216703.
5399 // This number cannot be represented exactly as an f32, so an itofp will
5400 // round it to 22216704.0; an fptrunc of that to bf16 then yields 22282240.0.
5401 // However, the correctly rounded bf16 result is 22151168.0.
5402 // We need to use sticky rounding to get this correct.
5403 if (SrcVal.getValueType() == MVT::i64) {
5404 SDLoc DL(Op);
5405 // This algorithm is equivalent to the following:
5406 // uint64_t SrcHi = SrcVal & ~0xfffull;
5407 // uint64_t SrcLo = SrcVal & 0xfffull;
5408 // uint64_t Highest = SrcVal >> 53;
5409 // bool HasHighest = Highest != 0;
5410 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5411 // double Rounded = static_cast<double>(ToRound);
5412 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5413 // uint64_t HasLo = SrcLo != 0;
5414 // bool NeedsAdjustment = HasHighest & HasLo;
5415 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5416 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5417 // return static_cast<__bf16>(Adjusted);
5418 //
5419 // Essentially, what happens is that SrcVal either fits perfectly in a
5420 // double-precision value or it is too big. If it is sufficiently small,
5421 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5422 // ensure that u64 -> double has no rounding error by only using the 52
5423 // MSB of the input. The low order bits will get merged into a sticky bit
5424 // which will avoid issues incurred by double rounding.
5425
5426 // Signed conversion is more or less like so:
5427 // copysign((__bf16)abs(SrcVal), SrcVal)
5428 SDValue SignBit;
5429 if (IsSigned) {
5430 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5431 DAG.getConstant(1ull << 63, DL, MVT::i64));
5432 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5433 }
5434 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5435 DAG.getConstant(~0xfffull, DL, MVT::i64));
5436 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5437 DAG.getConstant(0xfffull, DL, MVT::i64));
5438 SDValue Highest =
5439 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5440 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5441 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5442 SDValue ToRound =
5443 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5444 SDValue Rounded =
5445 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5446 {Op.getOperand(0), ToRound})
5447 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5448
5449 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5450 if (SignBit) {
5451 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5452 }
5453
5454 SDValue HasHighest = DAG.getSetCC(
5455 DL,
5456 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5457 Highest, Zero64, ISD::SETNE);
5458
5459 SDValue HasLo = DAG.getSetCC(
5460 DL,
5461 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5462 SrcLo, Zero64, ISD::SETNE);
5463
5464 SDValue NeedsAdjustment =
5465 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5466 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5467
5468 SDValue AdjustedBits =
5469 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5470 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5471 return IsStrict
5472 ? DAG.getNode(
5474 {Op.getValueType(), MVT::Other},
5475 {Rounded.getValue(1), Adjusted,
5476 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5477 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5478 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5479 }
5480 }
5481
5482 // f16 conversions are promoted to f32 when full fp16 is not supported.
5483 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5484 return IntToFpViaPromotion(MVT::f32);
5485 }
5486
5487 // i128 conversions are libcalls.
5488 if (SrcVal.getValueType() == MVT::i128)
5489 return SDValue();
5490
5491 // Other conversions are legal, unless it's to the completely software-based
5492 // fp128.
5493 if (Op.getValueType() != MVT::f128)
5494 return Op;
5495 return SDValue();
5496}
5497
5498static MVT getSVEContainerType(EVT ContentTy);
5499
5500SDValue
5501AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
5502 SelectionDAG &DAG) const {
5503 assert((Subtarget->hasSVE2() ||
5504 (Subtarget->hasSME() && Subtarget->isStreaming())) &&
5505 "Lowering loop_dependence_raw_mask or loop_dependence_war_mask "
5506 "requires SVE or SME");
5507
5508 SDLoc DL(Op);
5509 EVT VT = Op.getValueType();
5510 unsigned LaneOffset = Op.getConstantOperandVal(3);
5511 unsigned NumElements = VT.getVectorMinNumElements();
5512 uint64_t EltSizeInBytes = Op.getConstantOperandVal(2);
5513
5514 // Lane offsets and other element sizes are not supported by whilewr/rw.
5515 if (LaneOffset != 0 || !is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes))
5516 return SDValue();
5517
5518 EVT EltVT = MVT::getIntegerVT(EltSizeInBytes * 8);
5519 EVT PredVT =
5520 getPackedSVEVectorVT(EltVT).changeElementType(*DAG.getContext(), MVT::i1);
5521
5522 // Legal whilewr/rw (lowered by tablegen matcher).
5523 if (PredVT == VT)
5524 return Op;
5525
5526 // Expand if this mask needs splitting (this will produce a whilelo).
5527 if (NumElements > PredVT.getVectorMinNumElements())
5528 return SDValue();
5529
5530 SDValue Mask =
5531 DAG.getNode(Op.getOpcode(), DL, PredVT, to_vector(Op->op_values()));
5532
5533 if (VT.isFixedLengthVector()) {
5534 EVT WidePredVT =
5535 PredVT.changeElementType(*DAG.getContext(), VT.getScalarType());
5536 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, WidePredVT, Mask);
5537 return convertFromScalableVector(DAG, VT, MaskAsInt);
5538 }
5539
5540 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Mask,
5541 DAG.getConstant(0, DL, MVT::i64));
5542}
5543
5544SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5545 SelectionDAG &DAG) const {
5546 EVT OpVT = Op.getValueType();
5547 EVT ArgVT = Op.getOperand(0).getValueType();
5548
5550 return LowerFixedLengthBitcastToSVE(Op, DAG);
5551
5552 if (OpVT.isScalableVector()) {
5553 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5554
5555 // Handle type legalisation first.
5556 if (!isTypeLegal(ArgVT)) {
5557 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5558 "Expected int->fp bitcast!");
5559
5560 // Bitcasting between unpacked vector types of different element counts is
5561 // not a NOP because the live elements are laid out differently.
5562 // 01234567
5563 // e.g. nxv2i32 = XX??XX??
5564 // nxv4f16 = X?X?X?X?
5565 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5566 return SDValue();
5567
5568 SDValue ExtResult =
5569 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
5570 Op.getOperand(0));
5571 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5572 }
5573
5574 // Bitcasts between legal types with the same element count are legal.
5575 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5576 return Op;
5577
5578 // getSVESafeBitCast does not support casting between unpacked types.
5579 if (!isPackedVectorType(OpVT, DAG))
5580 return SDValue();
5581
5582 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5583 }
5584
5585 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5586 return SDValue();
5587
5588 // Bitcasts between f16 and bf16 are legal.
5589 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5590 return Op;
5591
5592 assert(ArgVT == MVT::i16);
5593 SDLoc DL(Op);
5594
5595 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5596 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5597 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5598}
5599
5600// Returns lane if Op extracts from a two-element vector and lane is constant
5601// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5602static std::optional<uint64_t>
5604 SDNode *OpNode = Op.getNode();
5605 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5606 return std::nullopt;
5607
5608 EVT VT = OpNode->getOperand(0).getValueType();
5610 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5611 return std::nullopt;
5612
5613 return C->getZExtValue();
5614}
5615
5617 bool isSigned) {
5618 EVT VT = N.getValueType();
5619
5620 if (N.getOpcode() != ISD::BUILD_VECTOR)
5621 return false;
5622
5623 for (const SDValue &Elt : N->op_values()) {
5625 unsigned EltSize = VT.getScalarSizeInBits();
5626 unsigned HalfSize = EltSize / 2;
5627 if (isSigned) {
5628 if (!isIntN(HalfSize, C->getSExtValue()))
5629 return false;
5630 } else {
5631 if (!isUIntN(HalfSize, C->getZExtValue()))
5632 return false;
5633 }
5634 continue;
5635 }
5636 return false;
5637 }
5638
5639 return true;
5640}
5641
5643 EVT VT = N.getValueType();
5644 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5645 EVT HalfVT = EVT::getVectorVT(
5646 *DAG.getContext(),
5649 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5650}
5651
5653 return N.getOpcode() == ISD::SIGN_EXTEND ||
5654 N.getOpcode() == ISD::ANY_EXTEND ||
5655 isExtendedBUILD_VECTOR(N, DAG, true);
5656}
5657
5659 return N.getOpcode() == ISD::ZERO_EXTEND ||
5660 N.getOpcode() == ISD::ANY_EXTEND ||
5661 isExtendedBUILD_VECTOR(N, DAG, false);
5662}
5663
5665 unsigned Opcode = N.getOpcode();
5666 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5667 SDValue N0 = N.getOperand(0);
5668 SDValue N1 = N.getOperand(1);
5669 return N0->hasOneUse() && N1->hasOneUse() &&
5670 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5671 }
5672 return false;
5673}
5674
5676 unsigned Opcode = N.getOpcode();
5677 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5678 SDValue N0 = N.getOperand(0);
5679 SDValue N1 = N.getOperand(1);
5680 return N0->hasOneUse() && N1->hasOneUse() &&
5681 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5682 }
5683 return false;
5684}
5685
5686SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5687 SelectionDAG &DAG) const {
5688 // The rounding mode is in bits 23:22 of the FPCR.
5689 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
5690 // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
5691 // so that the shift + and get folded into a bitfield extract.
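// Illustrative example (editorial sketch): FPCR[23:22] == 0b11 (round toward
// zero) gives ((3 << 22) + (1 << 22)) >> 22 == 4, and 4 & 3 == 0, the
// FLT_ROUNDS value for round-toward-zero.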
5692 SDLoc DL(Op);
5693
5694 SDValue Chain = Op.getOperand(0);
5695 SDValue FPCR_64 =
5696 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other},
5697 {Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL,
5698 MVT::i64)});
5699 Chain = FPCR_64.getValue(1);
5700 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64);
5701 SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32,
5702 DAG.getConstant(1U << 22, DL, MVT::i32));
5703 SDValue RMODE = DAG.getNode(ISD::SRL, DL, MVT::i32, FltRounds,
5704 DAG.getConstant(22, DL, MVT::i32));
5705 SDValue AND = DAG.getNode(ISD::AND, DL, MVT::i32, RMODE,
5706 DAG.getConstant(3, DL, MVT::i32));
5707 return DAG.getMergeValues({AND, Chain}, DL);
5708}
5709
5710SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5711 SelectionDAG &DAG) const {
5712 SDLoc DL(Op);
5713 SDValue Chain = Op->getOperand(0);
5714 SDValue RMValue = Op->getOperand(1);
5715
5716 // The rounding mode is in bits 23:22 of the FPCR.
5717 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5718 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5719 // ((arg - 1) & 3) << 22).
5720 //
5721 // The argument of llvm.set.rounding must be within the range [0, 3], so
5722 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5723 // code that generates llvm.set.rounding to ensure this condition.
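// Illustrative example (editorial sketch): llvm.set.rounding(0) (round toward
// zero) computes ((0 - 1) & 3) == 3, so FPCR[23:22] is set to 0b11, the FPCR
// encoding for round-toward-zero.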
5724
5725 // Calculate new value of FPCR[23:22].
5726 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5727 DAG.getConstant(1, DL, MVT::i32));
5728 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5729 DAG.getConstant(0x3, DL, MVT::i32));
5730 RMValue =
5731 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5732 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5733 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5734
5735 // Get current value of FPCR.
5736 SDValue Ops[] = {
5737 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5738 SDValue FPCR =
5739 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5740 Chain = FPCR.getValue(1);
5741 FPCR = FPCR.getValue(0);
5742
5743 // Put the new rounding mode into FPCR[23:22].
5744 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5745 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5746 DAG.getConstant(RMMask, DL, MVT::i64));
5747 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5748 SDValue Ops2[] = {
5749 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5750 FPCR};
5751 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5752}
5753
5754SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5755 SelectionDAG &DAG) const {
5756 SDLoc DL(Op);
5757 SDValue Chain = Op->getOperand(0);
5758
5759 // Get current value of FPCR.
5760 SDValue Ops[] = {
5761 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5762 SDValue FPCR =
5763 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5764 Chain = FPCR.getValue(1);
5765 FPCR = FPCR.getValue(0);
5766
5767 // Truncate FPCR to 32 bits.
5768 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5769
5770 return DAG.getMergeValues({Result, Chain}, DL);
5771}
5772
5773SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5774 SelectionDAG &DAG) const {
5775 SDLoc DL(Op);
5776 SDValue Chain = Op->getOperand(0);
5777 SDValue Mode = Op->getOperand(1);
5778
5779 // Extend the specified value to 64 bits.
5780 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5781
5782 // Set new value of FPCR.
5783 SDValue Ops2[] = {
5784 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5785 FPCR};
5786 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5787}
5788
5789SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5790 SelectionDAG &DAG) const {
5791 SDLoc DL(Op);
5792 SDValue Chain = Op->getOperand(0);
5793
5794 // Get current value of FPCR.
5795 SDValue Ops[] = {
5796 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5797 SDValue FPCR =
5798 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5799 Chain = FPCR.getValue(1);
5800 FPCR = FPCR.getValue(0);
5801
5802 // Clear bits that are not reserved.
5803 SDValue FPSCRMasked = DAG.getNode(
5804 ISD::AND, DL, MVT::i64, FPCR,
5805 DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64));
5806
5807 // Set new value of FPCR.
5808 SDValue Ops2[] = {
5809 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5810 FPSCRMasked};
5811 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5812}
5813
5814static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5815 SDLoc DL, bool &IsMLA) {
5816 bool IsN0SExt = isSignExtended(N0, DAG);
5817 bool IsN1SExt = isSignExtended(N1, DAG);
5818 if (IsN0SExt && IsN1SExt)
5819 return AArch64ISD::SMULL;
5820
5821 bool IsN0ZExt = isZeroExtended(N0, DAG);
5822 bool IsN1ZExt = isZeroExtended(N1, DAG);
5823
5824 if (IsN0ZExt && IsN1ZExt)
5825 return AArch64ISD::UMULL;
5826
5827 // Select UMULL if we can replace the other operand with an extend.
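// That is, if the top half of every element of the non-extended operand is
// known to be zero, that operand behaves as if it had been zero-extended from
// the half-width element type.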
5828 EVT VT = N0.getValueType();
5829 unsigned EltSize = VT.getScalarSizeInBits();
5830 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5831 if (IsN0ZExt || IsN1ZExt) {
5832 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5833 return AArch64ISD::UMULL;
5834 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5835 DAG.MaskedValueIsZero(N1, Mask)) {
5836 // For v2i64 we look more aggressively at both operands being zero, to avoid
5837 // scalarization.
5838 return AArch64ISD::UMULL;
5839 }
5840
5841 if (IsN0SExt || IsN1SExt) {
5842 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5843 return AArch64ISD::SMULL;
5844 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5845 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5846 return AArch64ISD::SMULL;
5847 }
5848
5849 if (!IsN1SExt && !IsN1ZExt)
5850 return 0;
5851
5852 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5853 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5854 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5855 IsMLA = true;
5856 return AArch64ISD::SMULL;
5857 }
5858 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5859 IsMLA = true;
5860 return AArch64ISD::UMULL;
5861 }
5862 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5863 std::swap(N0, N1);
5864 IsMLA = true;
5865 return AArch64ISD::UMULL;
5866 }
5867 return 0;
5868}
5869
5870SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5871 EVT VT = Op.getValueType();
5872
5873 bool OverrideNEON = !Subtarget->isNeonAvailable();
5874 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5875 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5876
5877 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5878 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5879 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5880 "unexpected type for custom-lowering ISD::MUL");
5881 SDValue N0 = Op.getOperand(0);
5882 SDValue N1 = Op.getOperand(1);
5883 bool isMLA = false;
5884 EVT OVT = VT;
5885 if (VT.is64BitVector()) {
5886 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5887 isNullConstant(N0.getOperand(1)) &&
5888 N0.getOperand(0).getValueType().is128BitVector() &&
5889 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5890 isNullConstant(N1.getOperand(1)) &&
5891 N1.getOperand(0).getValueType().is128BitVector()) {
5892 N0 = N0.getOperand(0);
5893 N1 = N1.getOperand(0);
5894 VT = N0.getValueType();
5895 } else {
5896 if (VT == MVT::v1i64) {
5897 if (Subtarget->hasSVE())
5898 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5899 // Fall through to expand this. It is not legal.
5900 return SDValue();
5901 } else
5902 // Other vector multiplications are legal.
5903 return Op;
5904 }
5905 }
5906
5907 SDLoc DL(Op);
5908 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5909
5910 if (!NewOpc) {
5911 if (VT.getVectorElementType() == MVT::i64) {
5912 // If SVE is available then i64 vector multiplications can also be made
5913 // legal.
5914 if (Subtarget->hasSVE())
5915 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5916 // Fall through to expand this. It is not legal.
5917 return SDValue();
5918 } else
5919 // Other vector multiplications are legal.
5920 return Op;
5921 }
5922
5923 // Legalize to a S/UMULL instruction
5924 SDValue Op0;
5925 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5926 if (!isMLA) {
5927 Op0 = skipExtensionForVectorMULL(N0, DAG);
5928 assert(Op0.getValueType().is64BitVector() &&
5929 Op1.getValueType().is64BitVector() &&
5930 "unexpected types for extended operands to VMULL");
5931 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5932 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5933 DAG.getConstant(0, DL, MVT::i64));
5934 }
5935 // Optimizing (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
5936 // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
5937 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
5938 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5939 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5940 EVT Op1VT = Op1.getValueType();
5941 return DAG.getNode(
5942 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5943 DAG.getNode(N0.getOpcode(), DL, VT,
5944 DAG.getNode(NewOpc, DL, VT,
5945 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5946 DAG.getNode(NewOpc, DL, VT,
5947 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5948 DAG.getConstant(0, DL, MVT::i64));
5949}
5950
5951static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5952 int Pattern) {
5953 if (Pattern == AArch64SVEPredPattern::all)
5954 return DAG.getConstant(1, DL, VT);
5955
5956 // When the number of active elements of a pattern matches the scalable vector
5957 // length, we can upgrade the pattern to ALL and emit a splat instead.
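// For example, if the SVE register size is known to be 256 bits, a VL8
// pattern on an nxv4i1 predicate covers every element (4 * 256/128 == 8), so
// it is equivalent to ALL.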
5958 if (unsigned PatNumElts = getNumElementsFromSVEPredPattern(Pattern)) {
5959 const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
5960 unsigned NumElts = VT.getVectorMinNumElements();
5961 unsigned VScale = Subtarget.getSVEVectorSizeInBits() / 128;
5962 if (PatNumElts == (NumElts * VScale))
5963 return DAG.getConstant(1, DL, VT);
5964 }
5965
5966 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5967 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5968}
5969
5970 static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG,
5971 bool IsSigned, bool IsEqual) {
5972 unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
5973 unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
5974
5975 if (!N->getValueType(0).isScalableVector() ||
5976 !isa<ConstantSDNode>(N->getOperand(Op1)))
5977 return SDValue();
5978
5979 SDLoc DL(N);
5980 APInt Y = N->getConstantOperandAPInt(Op1);
5981
5982 // When the second operand is the maximum value, comparisons that include
5983 // equality can never fail and thus we can return an all active predicate.
5984 if (IsEqual)
5985 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5986 return DAG.getConstant(1, DL, N->getValueType(0));
5987
5988 if (!isa<ConstantSDNode>(N->getOperand(Op0)))
5989 return SDValue();
5990
5991 APInt X = N->getConstantOperandAPInt(Op0);
5992
5993 bool Overflow;
5994 APInt NumActiveElems =
5995 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5996
5997 if (Overflow)
5998 return SDValue();
5999
6000 if (IsEqual) {
6001 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
6002 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
6003 : NumActiveElems.uadd_ov(One, Overflow);
6004 if (Overflow)
6005 return SDValue();
6006 }
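// At this point NumActiveElems is the required number of active lanes. For
// example, whilelt(0, 4) producing an nxv4i1 result gives NumActiveElems == 4,
// which maps onto the VL4 predicate pattern selected below.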
6007
6008 std::optional<unsigned> PredPattern =
6009 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
6010 unsigned MinSVEVectorSize = std::max(
6011 DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
6012 unsigned ElementSize = 128 / N->getValueType(0).getVectorMinNumElements();
6013 if (PredPattern != std::nullopt &&
6014 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
6015 return getPTrue(DAG, DL, N->getValueType(0), *PredPattern);
6016
6017 return SDValue();
6018}
6019
6020// Returns a safe bitcast between two scalable vector predicates, where
6021// any newly created lanes from a widening bitcast are defined as zero.
6022 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
6023 SDLoc DL(Op);
6024 EVT InVT = Op.getValueType();
6025
6026 assert(InVT.getVectorElementType() == MVT::i1 &&
6027 VT.getVectorElementType() == MVT::i1 &&
6028 "Expected a predicate-to-predicate bitcast");
6029 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
6030 InVT.isScalableVector() &&
6031 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
6032 "Only expect to cast between legal scalable predicate types!");
6033
6034 // Return the operand if the cast isn't changing type.
6035 if (InVT == VT)
6036 return Op;
6037
6038 // Look through casts to <vscale x 16 x i1> when their input has more lanes
6039 // than VT. This will increase the chances of removing casts that introduce
6040 // new lanes, which have to be explicitly zero'd.
6041 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
6042 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
6043 Op.getOperand(1).getValueType().bitsGT(VT))
6044 Op = Op.getOperand(1);
6045
6046 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
6047
6048 // We only have to zero the lanes if new lanes are being defined, e.g. when
6049 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
6050 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
6051 // we can return here.
6052 if (InVT.bitsGT(VT))
6053 return Reinterpret;
6054
6055 // Check if the other lanes are already known to be zeroed by
6056 // construction.
6057 if (isZeroingInactiveLanes(Op))
6058 return Reinterpret;
6059
6060 // Zero the newly introduced lanes.
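// For example, when widening nxv2i1 to nxv16i1, the all-true nxv2i1 splat
// reinterprets to a mask with one bit set per original lane, so the AND below
// clears the seven newly introduced lanes in each group.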
6061 SDValue Mask = DAG.getConstant(1, DL, InVT);
6062 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
6063 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
6064}
6065
6066SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
6067 SDValue Chain, SDLoc DL,
6068 EVT VT) const {
6069 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
6070 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
6071 SDValue Callee =
6072 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
6073 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
6074 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
6075 TargetLowering::CallLoweringInfo CLI(DAG);
6076 ArgListTy Args;
6077 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
6078 DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy, Callee,
6079 std::move(Args));
6080 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
6081 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
6082 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
6083 Mask);
6084}
6085
6086// Lower an SME LDR/STR ZA intrinsic
6087// Case 1: If the vector number (vecnum) is an immediate in range, it gets
6088// folded into the instruction
6089// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
6090// Case 2: If the vecnum is not an immediate, then it is used to modify the base
6091// and tile slice registers
6092// ldr(%tileslice, %ptr, %vecnum)
6093// ->
6094// %svl = rdsvl
6095// %ptr2 = %ptr + %svl * %vecnum
6096// %tileslice2 = %tileslice + %vecnum
6097// ldr [%tileslice2, 0], [%ptr2, 0]
6098// Case 3: If the vecnum is an immediate out of range, then the same is done as
6099// case 2, but the base and slice registers are modified by the greatest
6100 // multiple of 16 not exceeding the vecnum, and the remainder is folded into the
6101// instruction. This means that successive loads and stores that are offset from
6102// each other can share the same base and slice register updates.
6103// ldr(%tileslice, %ptr, 22)
6104// ldr(%tileslice, %ptr, 23)
6105// ->
6106// %svl = rdsvl
6107 // %ptr2 = %ptr + %svl * 16
6108 // %tileslice2 = %tileslice + 16
6109 // ldr [%tileslice2, 6], [%ptr2, 6]
6110 // ldr [%tileslice2, 7], [%ptr2, 7]
6111// Case 4: If the vecnum is an add of an immediate, then the non-immediate
6112// operand and the immediate can be folded into the instruction, like case 2.
6113// ldr(%tileslice, %ptr, %vecnum + 7)
6114// ldr(%tileslice, %ptr, %vecnum + 8)
6115// ->
6116// %svl = rdsvl
6117// %ptr2 = %ptr + %svl * %vecnum
6118// %tileslice2 = %tileslice + %vecnum
6119// ldr [%tileslice2, 7], [%ptr2, 7]
6120// ldr [%tileslice2, 8], [%ptr2, 8]
6121// Case 5: The vecnum being an add of an immediate out of range is also handled,
6122// in which case the same remainder logic as case 3 is used.
6123 static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
6124 SDLoc DL(N);
6125
6126 SDValue TileSlice = N->getOperand(2);
6127 SDValue Base = N->getOperand(3);
6128 SDValue VecNum = N->getOperand(4);
6129 int32_t ConstAddend = 0;
6130 SDValue VarAddend = VecNum;
6131
6132 // If the vnum is an add of an immediate, we can fold it into the instruction
6133 if (VecNum.getOpcode() == ISD::ADD &&
6134 isa<ConstantSDNode>(VecNum.getOperand(1))) {
6135 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
6136 VarAddend = VecNum.getOperand(0);
6137 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
6138 ConstAddend = ImmNode->getSExtValue();
6139 VarAddend = SDValue();
6140 }
6141
6142 int32_t ImmAddend = ConstAddend % 16;
6143 if (int32_t C = (ConstAddend - ImmAddend)) {
6144 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
6145 VarAddend = VarAddend
6146 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
6147 : CVal;
6148 }
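// For example, ConstAddend == 22 splits into ImmAddend == 6, with the
// remaining 16 folded into VarAddend and applied to the base and tile slice
// below.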
6149
6150 if (VarAddend) {
6151 // Get the vector length that will be multiplied by vnum
6152 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6153 DAG.getConstant(1, DL, MVT::i32));
6154
6155 // Multiply SVL and vnum then add it to the base
6156 SDValue Mul = DAG.getNode(
6157 ISD::MUL, DL, MVT::i64,
6158 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
6159 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
6160 // Just add vnum to the tileslice
6161 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
6162 }
6163
6164 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
6165 DL, MVT::Other,
6166 {/*Chain=*/N.getOperand(0), TileSlice, Base,
6167 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
6168}
6169
6170 static SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
6171 SDLoc DL(Op);
6172 SDValue ID =
6173 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, DL, MVT::i64);
6174
6175 auto Op1 = Op.getOperand(1);
6176 auto Op2 = Op.getOperand(2);
6177 auto Mask = Op.getOperand(3);
6178
6179 EVT Op1VT = Op1.getValueType();
6180 EVT Op2VT = Op2.getValueType();
6181 EVT ResVT = Op.getValueType();
6182
6183 assert((Op1VT.getVectorElementType() == MVT::i8 ||
6184 Op1VT.getVectorElementType() == MVT::i16) &&
6185 "Expected 8-bit or 16-bit characters.");
6186
6187 // Scalable vector type used to wrap operands.
6188 // A single container is enough for both operands because ultimately the
6189 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
6190 EVT OpContainerVT = Op1VT.isScalableVector()
6191 ? Op1VT
6192 : getContainerForFixedLengthVector(DAG, Op1VT);
6193
6194 if (Op2VT.is128BitVector()) {
6195 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
6196 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
6197 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
6198 if (ResVT.isScalableVector())
6199 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, DL, OpContainerVT, Op2,
6200 DAG.getTargetConstant(0, DL, MVT::i64));
6201 } else {
6202 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
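// For example, a v8i8 needle is bitcast to a single i64 element, splatted
// across an nxv2i64 vector, and then reinterpreted as nxv16i8.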
6203 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
6204 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
6205 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
6206 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
6207 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op2IntVT, Op2,
6208 DAG.getConstant(0, DL, MVT::i64));
6209 Op2 = DAG.getSplatVector(Op2PromotedVT, DL, Op2);
6210 Op2 = DAG.getBitcast(OpContainerVT, Op2);
6211 }
6212
6213 // If the result is scalable, we just need to carry out the MATCH.
6214 if (ResVT.isScalableVector())
6215 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResVT, ID, Mask, Op1, Op2);
6216
6217 // If the result is fixed, we can still use MATCH but we need to wrap the
6218 // first operand and the mask in scalable vectors before doing so.
6219
6220 // Wrap the operands.
6221 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
6222 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, Op1VT, Mask);
6223 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6224
6225 // Carry out the match.
6226 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Mask.getValueType(),
6227 ID, Mask, Op1, Op2);
6228
6229 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
6230 // (v16i8/v8i8).
6231 Match = DAG.getNode(ISD::SIGN_EXTEND, DL, OpContainerVT, Match);
6232 Match = convertFromScalableVector(DAG, Op1VT, Match);
6233 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Match);
6234}
6235
6236SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6237 SelectionDAG &DAG) const {
6238 unsigned IntNo = Op.getConstantOperandVal(1);
6239 SDLoc DL(Op);
6240 switch (IntNo) {
6241 default:
6242 return SDValue(); // Don't custom lower most intrinsics.
6243 case Intrinsic::aarch64_prefetch: {
6244 SDValue Chain = Op.getOperand(0);
6245 SDValue Addr = Op.getOperand(2);
6246
6247 unsigned IsWrite = Op.getConstantOperandVal(3);
6248 unsigned Locality = Op.getConstantOperandVal(4);
6249 unsigned IsStream = Op.getConstantOperandVal(5);
6250 unsigned IsData = Op.getConstantOperandVal(6);
6251 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
6252 (!IsData << 3) | // IsDataCache bit
6253 (Locality << 1) | // Cache level bits
6254 (unsigned)IsStream; // Stream bit
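// For example, a read prefetch of data into L1 with the keep policy
// (IsWrite = 0, Locality = 0, IsStream = 0, IsData = 1) encodes as 0b00000,
// i.e. PLDL1KEEP.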
6255
6256 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
6257 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
6258 }
6259 case Intrinsic::aarch64_range_prefetch: {
6260 SDValue Chain = Op.getOperand(0);
6261 SDValue Addr = Op.getOperand(2);
6262
6263 unsigned IsWrite = Op.getConstantOperandVal(3);
6264 unsigned IsStream = Op.getConstantOperandVal(4);
6265 unsigned PrfOp = (IsStream << 2) | IsWrite;
6266
6267 SDValue Metadata = Op.getOperand(5);
6268 return DAG.getNode(AArch64ISD::RANGE_PREFETCH, DL, MVT::Other, Chain,
6269 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr,
6270 Metadata);
6271 }
6272 case Intrinsic::aarch64_sme_str:
6273 case Intrinsic::aarch64_sme_ldr: {
6274 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
6275 }
6276 case Intrinsic::aarch64_sme_za_enable:
6277 return DAG.getNode(
6278 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6279 Op->getOperand(0), // Chain
6280 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6281 case Intrinsic::aarch64_sme_za_disable:
6282 return DAG.getNode(
6283 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6284 Op->getOperand(0), // Chain
6285 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6286 }
6287}
6288
6289SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
6290 SelectionDAG &DAG) const {
6291 unsigned IntNo = Op.getConstantOperandVal(1);
6292 SDLoc DL(Op);
6293 switch (IntNo) {
6294 default:
6295 return SDValue(); // Don't custom lower most intrinsics.
6296 case Intrinsic::aarch64_mops_memset_tag: {
6297 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
6298 SDValue Chain = Node->getChain();
6299 SDValue Dst = Op.getOperand(2);
6300 SDValue Val = Op.getOperand(3);
6301 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
6302 SDValue Size = Op.getOperand(4);
6303 auto Alignment = Node->getMemOperand()->getAlign();
6304 bool IsVol = Node->isVolatile();
6305 auto DstPtrInfo = Node->getPointerInfo();
6306
6307 const auto &SDI =
6308 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
6309 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
6310 Chain, Dst, Val, Size, Alignment, IsVol,
6311 DstPtrInfo, MachinePointerInfo{});
6312
6313 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6314 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6315 // LowerOperationWrapper will complain that the number of results has
6316 // changed.
6317 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6318 }
6319 }
6320}
6321
6322SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6323 SelectionDAG &DAG) const {
6324 unsigned IntNo = Op.getConstantOperandVal(0);
6325 SDLoc DL(Op);
6326 switch (IntNo) {
6327 default: return SDValue(); // Don't custom lower most intrinsics.
6328 case Intrinsic::thread_pointer: {
6329 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6330 return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
6331 }
6332 case Intrinsic::aarch64_sve_whilewr_b:
6333 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6334 Op.getOperand(1), Op.getOperand(2),
6335 DAG.getConstant(1, DL, MVT::i64),
6336 DAG.getConstant(0, DL, MVT::i64));
6337 case Intrinsic::aarch64_sve_whilewr_h:
6338 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6339 Op.getOperand(1), Op.getOperand(2),
6340 DAG.getConstant(2, DL, MVT::i64),
6341 DAG.getConstant(0, DL, MVT::i64));
6342 case Intrinsic::aarch64_sve_whilewr_s:
6343 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6344 Op.getOperand(1), Op.getOperand(2),
6345 DAG.getConstant(4, DL, MVT::i64),
6346 DAG.getConstant(0, DL, MVT::i64));
6347 case Intrinsic::aarch64_sve_whilewr_d:
6348 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6349 Op.getOperand(1), Op.getOperand(2),
6350 DAG.getConstant(8, DL, MVT::i64),
6351 DAG.getConstant(0, DL, MVT::i64));
6352 case Intrinsic::aarch64_sve_whilerw_b:
6353 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6354 Op.getOperand(1), Op.getOperand(2),
6355 DAG.getConstant(1, DL, MVT::i64),
6356 DAG.getConstant(0, DL, MVT::i64));
6357 case Intrinsic::aarch64_sve_whilerw_h:
6358 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6359 Op.getOperand(1), Op.getOperand(2),
6360 DAG.getConstant(2, DL, MVT::i64),
6361 DAG.getConstant(0, DL, MVT::i64));
6362 case Intrinsic::aarch64_sve_whilerw_s:
6363 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6364 Op.getOperand(1), Op.getOperand(2),
6365 DAG.getConstant(4, DL, MVT::i64),
6366 DAG.getConstant(0, DL, MVT::i64));
6367 case Intrinsic::aarch64_sve_whilerw_d:
6368 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6369 Op.getOperand(1), Op.getOperand(2),
6370 DAG.getConstant(8, DL, MVT::i64),
6371 DAG.getConstant(0, DL, MVT::i64));
6372 case Intrinsic::aarch64_neon_abs: {
6373 EVT Ty = Op.getValueType();
6374 if (Ty == MVT::i64) {
6375 SDValue Result =
6376 DAG.getNode(ISD::BITCAST, DL, MVT::v1i64, Op.getOperand(1));
6377 Result = DAG.getNode(ISD::ABS, DL, MVT::v1i64, Result);
6378 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Result);
6379 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6380 return DAG.getNode(ISD::ABS, DL, Ty, Op.getOperand(1));
6381 } else {
6382 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6383 }
6384 }
6385 case Intrinsic::aarch64_neon_pmull64: {
6386 SDValue LHS = Op.getOperand(1);
6387 SDValue RHS = Op.getOperand(2);
6388
6389 std::optional<uint64_t> LHSLane =
6390 getConstantLaneNumOfExtractHalfOperand(LHS);
6391 std::optional<uint64_t> RHSLane =
6392 getConstantLaneNumOfExtractHalfOperand(RHS);
6393
6394 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6395 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6396
6397 // 'aarch64_neon_pmull64' takes i64 parameters, while pmull/pmull2
6398 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6399 // which ISel recognizes better. For example, generate a ldr into d*
6400 // registers as opposed to a GPR load followed by a fmov.
6401 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6402 std::optional<uint64_t> OtherLane,
6403 const SDLoc &DL,
6404 SelectionDAG &DAG) -> SDValue {
6405 // If the operand is a higher half itself, rewrite it to
6406 // extract_high_v2i64; this way aarch64_neon_pmull64 could
6407 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6408 if (NLane == 1)
6409 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6410 N.getOperand(0), DAG.getConstant(1, DL, MVT::i64));
6411
6412 // Operand N is not a higher half but the other operand is.
6413 if (OtherLane == 1) {
6414 // If this operand is a lower half, rewrite it to
6415 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6416 // align lanes of two operands. A roundtrip sequence (to move from lane
6417 // 1 to lane 0) is like this:
6418 // mov x8, v0.d[1]
6419 // fmov d0, x8
6420 if (NLane == 0)
6421 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6422 DAG.getNode(AArch64ISD::DUPLANE64, DL, MVT::v2i64,
6423 N.getOperand(0),
6424 DAG.getConstant(0, DL, MVT::i64)),
6425 DAG.getConstant(1, DL, MVT::i64));
6426
6427 // Otherwise just dup from main to all lanes.
6428 return DAG.getNode(AArch64ISD::DUP, DL, MVT::v1i64, N);
6429 }
6430
6431 // Neither operand is an extract of higher half, so codegen may just use
6432 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
6433 assert(N.getValueType() == MVT::i64 &&
6434 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6435 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, N);
6436 };
6437
6438 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
6439 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);
6440
6441 return DAG.getNode(AArch64ISD::PMULL, DL, Op.getValueType(), LHS, RHS);
6442 }
6443 case Intrinsic::aarch64_neon_smax:
6444 return DAG.getNode(ISD::SMAX, DL, Op.getValueType(), Op.getOperand(1),
6445 Op.getOperand(2));
6446 case Intrinsic::aarch64_neon_umax:
6447 return DAG.getNode(ISD::UMAX, DL, Op.getValueType(), Op.getOperand(1),
6448 Op.getOperand(2));
6449 case Intrinsic::aarch64_neon_smin:
6450 return DAG.getNode(ISD::SMIN, DL, Op.getValueType(), Op.getOperand(1),
6451 Op.getOperand(2));
6452 case Intrinsic::aarch64_neon_umin:
6453 return DAG.getNode(ISD::UMIN, DL, Op.getValueType(), Op.getOperand(1),
6454 Op.getOperand(2));
6455 case Intrinsic::aarch64_neon_scalar_sqxtn:
6456 case Intrinsic::aarch64_neon_scalar_sqxtun:
6457 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6458 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6459 if (Op.getValueType() == MVT::i32)
6460 return DAG.getNode(ISD::BITCAST, DL, MVT::i32,
6461 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
6462 Op.getOperand(0),
6463 DAG.getNode(ISD::BITCAST, DL, MVT::f64,
6464 Op.getOperand(1))));
6465 return SDValue();
6466 }
6467 case Intrinsic::aarch64_neon_sqxtn:
6468 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6469 Op.getOperand(1));
6470 case Intrinsic::aarch64_neon_sqxtun:
6471 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6472 Op.getOperand(1));
6473 case Intrinsic::aarch64_neon_uqxtn:
6474 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6475 Op.getOperand(1));
6476 case Intrinsic::aarch64_neon_sqshrn:
6477 if (Op.getValueType().isVector())
6478 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6479 DAG.getNode(AArch64ISD::VASHR, DL,
6480 Op.getOperand(1).getValueType(),
6481 Op.getOperand(1), Op.getOperand(2)));
6482 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHRN, DAG,
6483 /*LastOperandIsImm=*/true);
6484 case Intrinsic::aarch64_neon_sqshrun:
6485 if (Op.getValueType().isVector())
6486 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6487 DAG.getNode(AArch64ISD::VASHR, DL,
6488 Op.getOperand(1).getValueType(),
6489 Op.getOperand(1), Op.getOperand(2)));
6490 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHRUN, DAG,
6491 /*LastOperandIsImm=*/true);
6492 case Intrinsic::aarch64_neon_uqshrn:
6493 if (Op.getValueType().isVector())
6494 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6495 DAG.getNode(AArch64ISD::VLSHR, DL,
6496 Op.getOperand(1).getValueType(),
6497 Op.getOperand(1), Op.getOperand(2)));
6498 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSHRN, DAG,
6499 /*LastOperandIsImm=*/true);
6500 case Intrinsic::aarch64_neon_sqrshrn:
6501 if (Op.getValueType().isVector())
6502 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6503 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6504 Op.getOperand(1).getValueType(),
6505 Op.getOperand(1), Op.getOperand(2)));
6506 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHRN, DAG,
6507 /*LastOperandIsImm=*/true);
6508 case Intrinsic::aarch64_neon_sqrshrun:
6509 if (Op.getValueType().isVector())
6510 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6511 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6512 Op.getOperand(1).getValueType(),
6513 Op.getOperand(1), Op.getOperand(2)));
6514 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHRUN, DAG,
6515 /*LastOperandIsImm=*/true);
6516 case Intrinsic::aarch64_neon_uqrshrn:
6517 if (Op.getValueType().isVector())
6518 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6519 DAG.getNode(AArch64ISD::URSHR_I, DL,
6520 Op.getOperand(1).getValueType(),
6521 Op.getOperand(1), Op.getOperand(2)));
6522 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQRSHRN, DAG,
6523 /*LastOperandIsImm=*/true);
6524 case Intrinsic::aarch64_neon_sqdmulh:
6525 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQDMULH, DAG);
6526 case Intrinsic::aarch64_neon_sqrdmulh:
6527 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMULH, DAG);
6528 case Intrinsic::aarch64_neon_sqrdmlah:
6529 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMLAH, DAG);
6530 case Intrinsic::aarch64_neon_sqrdmlsh:
6531 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMLSH, DAG);
6532 case Intrinsic::aarch64_neon_sqrshl:
6533 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHL, DAG);
6534 case Intrinsic::aarch64_neon_sqshl:
6535 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHL, DAG);
6536 case Intrinsic::aarch64_neon_uqrshl:
6537 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQRSHL, DAG);
6538 case Intrinsic::aarch64_neon_uqshl:
6539 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSHL, DAG);
6540 case Intrinsic::aarch64_neon_sqadd:
6541 if (Op.getValueType().isVector())
6542 return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6543 Op.getOperand(2));
6544 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQADD, DAG);
6545
6546 case Intrinsic::aarch64_neon_sqsub:
6547 if (Op.getValueType().isVector())
6548 return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6549 Op.getOperand(2));
6550 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSUB, DAG);
6551
6552 case Intrinsic::aarch64_neon_uqadd:
6553 if (Op.getValueType().isVector())
6554 return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6555 Op.getOperand(2));
6556 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQADD, DAG);
6557 case Intrinsic::aarch64_neon_uqsub:
6558 if (Op.getValueType().isVector())
6559 return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6560 Op.getOperand(2));
6561 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSUB, DAG);
6562 case Intrinsic::aarch64_neon_sqdmulls_scalar:
6563 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQDMULL, DAG);
6564 case Intrinsic::aarch64_sve_whilelt:
6565 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6566 /*IsEqual=*/false);
6567 case Intrinsic::aarch64_sve_whilels:
6568 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/false,
6569 /*IsEqual=*/true);
6570 case Intrinsic::aarch64_sve_whilele:
6571 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6572 /*IsEqual=*/true);
6573 case Intrinsic::aarch64_sve_sunpkhi:
6574 return DAG.getNode(AArch64ISD::SUNPKHI, DL, Op.getValueType(),
6575 Op.getOperand(1));
6576 case Intrinsic::aarch64_sve_sunpklo:
6577 return DAG.getNode(AArch64ISD::SUNPKLO, DL, Op.getValueType(),
6578 Op.getOperand(1));
6579 case Intrinsic::aarch64_sve_uunpkhi:
6580 return DAG.getNode(AArch64ISD::UUNPKHI, DL, Op.getValueType(),
6581 Op.getOperand(1));
6582 case Intrinsic::aarch64_sve_uunpklo:
6583 return DAG.getNode(AArch64ISD::UUNPKLO, DL, Op.getValueType(),
6584 Op.getOperand(1));
6585 case Intrinsic::aarch64_sve_clasta_n:
6586 return DAG.getNode(AArch64ISD::CLASTA_N, DL, Op.getValueType(),
6587 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6588 case Intrinsic::aarch64_sve_clastb_n:
6589 return DAG.getNode(AArch64ISD::CLASTB_N, DL, Op.getValueType(),
6590 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6591 case Intrinsic::aarch64_sve_lasta:
6592 return DAG.getNode(AArch64ISD::LASTA, DL, Op.getValueType(),
6593 Op.getOperand(1), Op.getOperand(2));
6594 case Intrinsic::aarch64_sve_lastb:
6595 return DAG.getNode(AArch64ISD::LASTB, DL, Op.getValueType(),
6596 Op.getOperand(1), Op.getOperand(2));
6597 case Intrinsic::aarch64_sve_tbl:
6598 return DAG.getNode(AArch64ISD::TBL, DL, Op.getValueType(), Op.getOperand(1),
6599 Op.getOperand(2));
6600 case Intrinsic::aarch64_sve_trn1:
6601 return DAG.getNode(AArch64ISD::TRN1, DL, Op.getValueType(),
6602 Op.getOperand(1), Op.getOperand(2));
6603 case Intrinsic::aarch64_sve_trn2:
6604 return DAG.getNode(AArch64ISD::TRN2, DL, Op.getValueType(),
6605 Op.getOperand(1), Op.getOperand(2));
6606 case Intrinsic::aarch64_sve_uzp1:
6607 return DAG.getNode(AArch64ISD::UZP1, DL, Op.getValueType(),
6608 Op.getOperand(1), Op.getOperand(2));
6609 case Intrinsic::aarch64_sve_uzp2:
6610 return DAG.getNode(AArch64ISD::UZP2, DL, Op.getValueType(),
6611 Op.getOperand(1), Op.getOperand(2));
6612 case Intrinsic::aarch64_sve_zip1:
6613 return DAG.getNode(AArch64ISD::ZIP1, DL, Op.getValueType(),
6614 Op.getOperand(1), Op.getOperand(2));
6615 case Intrinsic::aarch64_sve_zip2:
6616 return DAG.getNode(AArch64ISD::ZIP2, DL, Op.getValueType(),
6617 Op.getOperand(1), Op.getOperand(2));
6618 case Intrinsic::aarch64_sve_splice:
6619 return DAG.getNode(AArch64ISD::SPLICE, DL, Op.getValueType(),
6620 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6621 case Intrinsic::aarch64_sve_ptrue:
6622 return getPTrue(DAG, DL, Op.getValueType(), Op.getConstantOperandVal(1));
6623 case Intrinsic::aarch64_sve_clz:
6624 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(),
6625 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6626 case Intrinsic::aarch64_sme_cntsd: {
6627 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6628 DAG.getConstant(1, DL, MVT::i32));
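// RDSVL #1 returns the streaming vector length in bytes; the exact shift
// right by 3 converts this to the number of 64-bit (doubleword) elements.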
6629 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
6630 DAG.getConstant(3, DL, MVT::i32), SDNodeFlags::Exact);
6631 }
6632 case Intrinsic::aarch64_sve_cnt: {
6633 SDValue Data = Op.getOperand(3);
6634 // CTPOP only supports integer operands.
6635 if (Data.getValueType().isFloatingPoint())
6636 Data = DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Data);
6637 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, Op.getValueType(),
6638 Op.getOperand(2), Data, Op.getOperand(1));
6639 }
6640 case Intrinsic::aarch64_sve_dupq_lane:
6641 return LowerDUPQLane(Op, DAG);
6642 case Intrinsic::aarch64_sve_convert_from_svbool:
6643 if (Op.getValueType() == MVT::aarch64svcount)
6644 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Op.getOperand(1));
6645 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6646 case Intrinsic::aarch64_sve_convert_to_svbool:
6647 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6648 return DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, Op.getOperand(1));
6649 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6650 case Intrinsic::aarch64_sve_fneg:
6651 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6652 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6653 case Intrinsic::aarch64_sve_frintp:
6654 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, Op.getValueType(),
6655 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6656 case Intrinsic::aarch64_sve_frintm:
6657 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, Op.getValueType(),
6658 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6659 case Intrinsic::aarch64_sve_frinti:
6660 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
6661 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6662 Op.getOperand(1));
6663 case Intrinsic::aarch64_sve_frintx:
6664 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, DL, Op.getValueType(),
6665 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6666 case Intrinsic::aarch64_sve_frint32x:
6667 return DAG.getNode(AArch64ISD::FRINT32_MERGE_PASSTHRU, DL,
6668 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6669 Op.getOperand(1));
6670 case Intrinsic::aarch64_sve_frint64x:
6671 return DAG.getNode(AArch64ISD::FRINT64_MERGE_PASSTHRU, DL,
6672 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6673 Op.getOperand(1));
6674 case Intrinsic::aarch64_sve_frinta:
6675 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, DL, Op.getValueType(),
6676 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6677 case Intrinsic::aarch64_sve_frintn:
6678 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
6679 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6680 Op.getOperand(1));
6681 case Intrinsic::aarch64_sve_frintz:
6682 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, Op.getValueType(),
6683 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6684 case Intrinsic::aarch64_sve_frint32z:
6685 return DAG.getNode(AArch64ISD::FTRUNC32_MERGE_PASSTHRU, DL,
6686 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6687 Op.getOperand(1));
6688 case Intrinsic::aarch64_sve_frint64z:
6689 return DAG.getNode(AArch64ISD::FTRUNC64_MERGE_PASSTHRU, DL,
6690 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6691 Op.getOperand(1));
6692 case Intrinsic::aarch64_sve_ucvtf:
6693 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
6694 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6695 Op.getOperand(1));
6696 case Intrinsic::aarch64_sve_scvtf:
6697 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
6698 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6699 Op.getOperand(1));
6700 case Intrinsic::aarch64_sve_fcvtzu:
6701 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, Op.getValueType(),
6702 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6703 case Intrinsic::aarch64_sve_fcvtzs:
6704 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, Op.getValueType(),
6705 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6706 case Intrinsic::aarch64_sve_fsqrt:
6707 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, Op.getValueType(),
6708 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6709 case Intrinsic::aarch64_sve_frecpx:
6710 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, Op.getValueType(),
6711 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6712 case Intrinsic::aarch64_sve_frecpe_x:
6713 return DAG.getNode(AArch64ISD::FRECPE, DL, Op.getValueType(),
6714 Op.getOperand(1));
6715 case Intrinsic::aarch64_sve_frecps_x:
6716 return DAG.getNode(AArch64ISD::FRECPS, DL, Op.getValueType(),
6717 Op.getOperand(1), Op.getOperand(2));
6718 case Intrinsic::aarch64_sve_frsqrte_x:
6719 return DAG.getNode(AArch64ISD::FRSQRTE, DL, Op.getValueType(),
6720 Op.getOperand(1));
6721 case Intrinsic::aarch64_sve_frsqrts_x:
6722 return DAG.getNode(AArch64ISD::FRSQRTS, DL, Op.getValueType(),
6723 Op.getOperand(1), Op.getOperand(2));
6724 case Intrinsic::aarch64_sve_fabs:
6725 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6726 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6727 case Intrinsic::aarch64_sve_abs:
6728 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6729 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6730 case Intrinsic::aarch64_sve_neg:
6731 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6732 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6733 case Intrinsic::aarch64_sve_insr: {
6734 SDValue Scalar = Op.getOperand(2);
6735 EVT ScalarTy = Scalar.getValueType();
6736 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6737 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
6738
6739 return DAG.getNode(AArch64ISD::INSR, DL, Op.getValueType(),
6740 Op.getOperand(1), Scalar);
6741 }
6742 case Intrinsic::aarch64_sve_rbit:
6743 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
6744 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6745 Op.getOperand(1));
6746 case Intrinsic::aarch64_sve_revb:
6747 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, Op.getValueType(),
6748 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6749 case Intrinsic::aarch64_sve_revh:
6750 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, DL, Op.getValueType(),
6751 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6752 case Intrinsic::aarch64_sve_revw:
6753 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, DL, Op.getValueType(),
6754 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6755 case Intrinsic::aarch64_sve_revd:
6756 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, Op.getValueType(),
6757 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6758 case Intrinsic::aarch64_sve_sxtb:
6759 return DAG.getNode(
6760 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6761 Op.getOperand(2), Op.getOperand(3),
6762 DAG.getValueType(Op.getValueType().changeVectorElementType(
6763 *DAG.getContext(), MVT::i8)),
6764 Op.getOperand(1));
6765 case Intrinsic::aarch64_sve_sxth:
6766 return DAG.getNode(
6767 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6768 Op.getOperand(2), Op.getOperand(3),
6769 DAG.getValueType(Op.getValueType().changeVectorElementType(
6770 *DAG.getContext(), MVT::i16)),
6771 Op.getOperand(1));
6772 case Intrinsic::aarch64_sve_sxtw:
6773 return DAG.getNode(
6774 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6775 Op.getOperand(2), Op.getOperand(3),
6776 DAG.getValueType(Op.getValueType().changeVectorElementType(
6777 *DAG.getContext(), MVT::i32)),
6778 Op.getOperand(1));
6779 case Intrinsic::aarch64_sve_uxtb:
6780 return DAG.getNode(
6781 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6782 Op.getOperand(2), Op.getOperand(3),
6783 DAG.getValueType(Op.getValueType().changeVectorElementType(
6784 *DAG.getContext(), MVT::i8)),
6785 Op.getOperand(1));
6786 case Intrinsic::aarch64_sve_uxth:
6787 return DAG.getNode(
6788 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6789 Op.getOperand(2), Op.getOperand(3),
6790 DAG.getValueType(Op.getValueType().changeVectorElementType(
6791 *DAG.getContext(), MVT::i16)),
6792 Op.getOperand(1));
6793 case Intrinsic::aarch64_sve_uxtw:
6794 return DAG.getNode(
6795 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6796 Op.getOperand(2), Op.getOperand(3),
6797 DAG.getValueType(Op.getValueType().changeVectorElementType(
6798 *DAG.getContext(), MVT::i32)),
6799 Op.getOperand(1));
6800 case Intrinsic::localaddress: {
6801 const auto &MF = DAG.getMachineFunction();
6802 const auto *RegInfo = Subtarget->getRegisterInfo();
6803 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6804 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg,
6805 Op.getSimpleValueType());
6806 }
6807
6808 case Intrinsic::eh_recoverfp: {
6809 // FIXME: This needs to be implemented to correctly handle highly aligned
6810 // stack objects. For now we simply return the incoming FP. Refer to D53541
6811 // for more details.
6812 SDValue FnOp = Op.getOperand(1);
6813 SDValue IncomingFPOp = Op.getOperand(2);
6814 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6815 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6816 if (!Fn)
6818 "llvm.eh.recoverfp must take a function as the first argument");
6819 return IncomingFPOp;
6820 }
6821 case Intrinsic::aarch64_neon_vsri:
6822 case Intrinsic::aarch64_neon_vsli:
6823 case Intrinsic::aarch64_sve_sri:
6824 case Intrinsic::aarch64_sve_sli: {
6825 EVT Ty = Op.getValueType();
6826
6827 if (!Ty.isVector())
6828 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6829
6830 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6831
6832 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6833 IntNo == Intrinsic::aarch64_sve_sri;
6834 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6835 return DAG.getNode(Opcode, DL, Ty, Op.getOperand(1), Op.getOperand(2),
6836 Op.getOperand(3));
6837 }
6838
6839 case Intrinsic::aarch64_neon_srhadd:
6840 case Intrinsic::aarch64_neon_urhadd:
6841 case Intrinsic::aarch64_neon_shadd:
6842 case Intrinsic::aarch64_neon_uhadd: {
6843 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6844 IntNo == Intrinsic::aarch64_neon_shadd);
6845 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6846 IntNo == Intrinsic::aarch64_neon_urhadd);
6847 unsigned Opcode = IsSignedAdd
6848 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6849 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6850 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6851 Op.getOperand(2));
6852 }
6853 case Intrinsic::aarch64_neon_saddlp:
6854 case Intrinsic::aarch64_neon_uaddlp: {
6855 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6856 ? AArch64ISD::UADDLP
6857 : AArch64ISD::SADDLP;
6858 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1));
6859 }
6860 case Intrinsic::aarch64_neon_sdot:
6861 case Intrinsic::aarch64_neon_udot:
6862 case Intrinsic::aarch64_sve_sdot:
6863 case Intrinsic::aarch64_sve_udot: {
6864 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6865 IntNo == Intrinsic::aarch64_sve_udot)
6866 ? AArch64ISD::UDOT
6867 : AArch64ISD::SDOT;
6868 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6869 Op.getOperand(2), Op.getOperand(3));
6870 }
6871 case Intrinsic::aarch64_neon_usdot:
6872 case Intrinsic::aarch64_sve_usdot: {
6873 return DAG.getNode(AArch64ISD::USDOT, DL, Op.getValueType(),
6874 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6875 }
6876 case Intrinsic::aarch64_neon_saddlv:
6877 case Intrinsic::aarch64_neon_uaddlv: {
6878 EVT OpVT = Op.getOperand(1).getValueType();
6879 EVT ResVT = Op.getValueType();
6880 assert(
6881 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6882 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6883 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6884 "Unexpected aarch64_neon_u/saddlv type");
6885 (void)OpVT;
6886 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6887 SDValue ADDLV = DAG.getNode(
6888 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6889 : AArch64ISD::SADDLV,
6890 DL, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6891 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6892 ISD::EXTRACT_VECTOR_ELT, DL, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6893 ADDLV, DAG.getConstant(0, DL, MVT::i64));
6894 return EXTRACT_VEC_ELT;
6895 }
6896 case Intrinsic::experimental_cttz_elts: {
6897 SDValue CttzOp = Op.getOperand(1);
6898 EVT VT = CttzOp.getValueType();
6899 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6900
6901 if (VT.isFixedLengthVector()) {
6902 // We can use SVE instructions to lower this intrinsic by first creating
6903 // an SVE predicate register mask from the fixed-width vector.
6904 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6905 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, CttzOp);
6906 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6907 }
6908
6909 SDValue NewCttzElts =
6910 DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, CttzOp);
6911 return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
6912 }
6913 case Intrinsic::experimental_vector_match: {
6914 return LowerVectorMatch(Op, DAG);
6915 }
6916 case Intrinsic::aarch64_cls:
6917 case Intrinsic::aarch64_cls64:
6918 SDValue Res = DAG.getNode(ISD::CTLS, DL, Op.getOperand(1).getValueType(),
6919 Op.getOperand(1));
6920 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Res);
6921 }
6922}
6923
6924bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6925 if (VT.getVectorElementType() == MVT::i8 ||
6926 VT.getVectorElementType() == MVT::i16) {
6927 EltTy = MVT::i32;
6928 return true;
6929 }
6930 return false;
6931}
6932
6933bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6934 EVT DataVT) const {
6935 const EVT IndexVT = Extend.getOperand(0).getValueType();
6936 // SVE only supports implicit extension of 32-bit indices.
6937 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6938 return false;
6939
6940 // Indices cannot be smaller than the main data type.
6941 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6942 return false;
6943
6944 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6945 // element container type, which would violate the previous clause.
6946 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6947}
6948
6949/// Helper function to check if a small vector load can be optimized.
6950 static bool isEligibleForSmallVectorLoadOpt(LoadSDNode *LD,
6951 const AArch64Subtarget &Subtarget) {
6952 if (!Subtarget.isNeonAvailable())
6953 return false;
6954 if (LD->isVolatile())
6955 return false;
6956
6957 EVT MemVT = LD->getMemoryVT();
6958 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && MemVT != MVT::v2i16)
6959 return false;
6960
6961 Align Alignment = LD->getAlign();
6962 Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue());
6963 if (Subtarget.requiresStrictAlign() && Alignment < RequiredAlignment)
6964 return false;
6965
6966 return true;
6967}
6968
6969bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6970 EVT ExtVT = ExtVal.getValueType();
6971 // Small, illegal vectors can be extended inreg.
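// (For example, an extending load of v4i8 can typically be done as a single
// 32-bit FPR load followed by in-register extends, rather than four scalar
// loads.)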
6972 if (auto *Load = dyn_cast<LoadSDNode>(ExtVal.getOperand(0))) {
6973 if (ExtVT.isFixedLengthVector() && ExtVT.getStoreSizeInBits() <= 128 &&
6974 isEligibleForSmallVectorLoadOpt(Load, *Subtarget))
6975 return true;
6976 }
6977 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6978 return false;
6979
6980 // It may be worth creating extending masked loads if there are multiple
6981 // masked loads using the same predicate. That way we'll end up creating
6982 // extending masked loads that may then get split by the legaliser. This
6983 // results in just one set of predicate unpacks at the start, instead of
6984 // multiple sets of vector unpacks after each load.
6985 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6986 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6987 // Disable extending masked loads for fixed-width for now, since the code
6988 // quality doesn't look great.
6989 if (!ExtVT.isScalableVector())
6990 return false;
6991
6992 unsigned NumExtMaskedLoads = 0;
6993 for (auto *U : Ld->getMask()->users())
6994 if (isa<MaskedLoadSDNode>(U))
6995 NumExtMaskedLoads++;
6996
6997 if (NumExtMaskedLoads <= 1)
6998 return false;
6999 }
7000 }
7001
7002 EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
7003 return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
7004 PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
7005}
7006
7007unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
7008 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
7009 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
7010 AArch64ISD::GLD1_MERGE_ZERO},
7011 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
7012 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
7013 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
7014 AArch64ISD::GLD1_MERGE_ZERO},
7015 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
7016 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
7017 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
7018 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
7019 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
7020 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
7021 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
7022 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
7023 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
7024 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
7025 };
7026 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
7027 return AddrModes.find(Key)->second;
7028}
7029
7030unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
7031 switch (Opcode) {
7032 default:
7033 llvm_unreachable("unimplemented opcode");
7034 return Opcode;
7035 case AArch64ISD::GLD1_MERGE_ZERO:
7036 return AArch64ISD::GLD1S_MERGE_ZERO;
7037 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
7038 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
7039 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
7040 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
7041 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
7042 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
7043 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
7044 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
7045 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
7046 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
7047 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
7048 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
7049 }
7050}
7051
7052SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
7053 SelectionDAG &DAG) const {
7054 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
7055
7056 SDLoc DL(Op);
7057 SDValue Chain = MGT->getChain();
7058 SDValue PassThru = MGT->getPassThru();
7059 SDValue Mask = MGT->getMask();
7060 SDValue BasePtr = MGT->getBasePtr();
7061 SDValue Index = MGT->getIndex();
7062 SDValue Scale = MGT->getScale();
7063 EVT VT = Op.getValueType();
7064 EVT MemVT = MGT->getMemoryVT();
7065 ISD::LoadExtType ExtType = MGT->getExtensionType();
7066 ISD::MemIndexType IndexType = MGT->getIndexType();
7067
7068 // SVE supports only zero (and therefore undef) passthrough values; everything
7069 // else must be handled manually by an explicit select on the load's output.
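// That is, a gather with passthru %pt is emitted as an undef-passthru gather
// followed by (select %mask, %load, %pt).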
7070 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
7071 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
7072 SDValue Load =
7073 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
7074 MGT->getMemOperand(), IndexType, ExtType);
7075 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
7076 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
7077 }
7078
7079 bool IsScaled = MGT->isIndexScaled();
7080 bool IsSigned = MGT->isIndexSigned();
7081
7082 // SVE supports only an index scaled by sizeof(MemVT.elt); everything else
7083 // must be calculated beforehand.
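// For example, a gather of i32 elements with a scale of 8 is rewritten with
// scale 1 and an index shifted left by 3.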
7084 uint64_t ScaleVal = Scale->getAsZExtVal();
7085 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
7086 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
7087 EVT IndexVT = Index.getValueType();
7088 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
7089 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
7090 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
7091
7092 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
7093 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
7094 MGT->getMemOperand(), IndexType, ExtType);
7095 }
7096
7097 // Lower fixed length gather to a scalable equivalent.
7098 if (VT.isFixedLengthVector()) {
7099 assert(Subtarget->useSVEForFixedLengthVectors() &&
7100 "Cannot lower when not using SVE for fixed vectors!");
7101
7102 // NOTE: Handle floating-point as if integer then bitcast the result.
7103 EVT DataVT = VT.changeVectorElementTypeToInteger();
7104 MemVT = MemVT.changeVectorElementTypeToInteger();
7105
7106 // Find the smallest integer fixed length vector we can use for the gather.
7107 EVT PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i32);
7108 if (DataVT.getVectorElementType() == MVT::i64 ||
7109 Index.getValueType().getVectorElementType() == MVT::i64 ||
7110 Mask.getValueType().getVectorElementType() == MVT::i64)
7111 PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i64);
7112
7113 // Promote vector operands except for passthrough, which we know is either
7114 // undef or zero, and thus best constructed directly.
7115 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7116 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
7117 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
7118
7119 // A promoted result type forces the need for an extending load.
7120 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
7121 ExtType = ISD::EXTLOAD;
7122
7123 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
7124
7125 // Convert fixed length vector operands to scalable.
7126 MemVT = ContainerVT.changeVectorElementType(*DAG.getContext(),
7127 MemVT.getVectorElementType());
7128 Index = convertToScalableVector(DAG, ContainerVT, Index);
7129 Mask = convertFixedMaskToScalableVector(Mask, DAG);
7130 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
7131 : DAG.getConstant(0, DL, ContainerVT);
7132
7133 // Emit equivalent scalable vector gather.
7134 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
7135 SDValue Load =
7136 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
7137 Ops, MGT->getMemOperand(), IndexType, ExtType);
7138
7139 // Extract fixed length data then convert to the required result type.
7140 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
7141 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
7142 if (VT.isFloatingPoint())
7143 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
7144
7145 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7146 }
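  // Sketch of the fixed-length path above for a hypothetical v4i16 gather
  // (names illustrative): the index and mask are promoted to i32 elements,
  // all operands are widened into scalable containers (e.g. nxv4i32), the
  // gather is emitted as an extending scalable load, and the result is
  // extracted and truncated back to v4i16 (bitcast back if the original
  // type was floating point).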
7147
7148 // Everything else is legal.
7149 return Op;
7150}
7151
7152SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
7153 SelectionDAG &DAG) const {
7154 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
7155
7156 SDLoc DL(Op);
7157 SDValue Chain = MSC->getChain();
7158 SDValue StoreVal = MSC->getValue();
7159 SDValue Mask = MSC->getMask();
7160 SDValue BasePtr = MSC->getBasePtr();
7161 SDValue Index = MSC->getIndex();
7162 SDValue Scale = MSC->getScale();
7163 EVT VT = StoreVal.getValueType();
7164 EVT MemVT = MSC->getMemoryVT();
7165 ISD::MemIndexType IndexType = MSC->getIndexType();
7166 bool Truncating = MSC->isTruncatingStore();
7167
7168 bool IsScaled = MSC->isIndexScaled();
7169 bool IsSigned = MSC->isIndexSigned();
7170
7171 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
7172 // must be calculated beforehand.
7173 uint64_t ScaleVal = Scale->getAsZExtVal();
7174 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
7175 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
7176 EVT IndexVT = Index.getValueType();
7177 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
7178 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
7179 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
7180
7181 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
7182 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
7183 MSC->getMemOperand(), IndexType, Truncating);
7184 }
7185
7186 // Lower fixed length scatter to a scalable equivalent.
7187 if (VT.isFixedLengthVector()) {
7188 assert(Subtarget->useSVEForFixedLengthVectors() &&
7189 "Cannot lower when not using SVE for fixed vectors!");
7190
7191 // Once bitcast we treat floating-point scatters as if integer.
7192 if (VT.isFloatingPoint()) {
7194 MemVT = MemVT.changeVectorElementTypeToInteger();
7195 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
7196 }
7197
7198 // Find the smallest integer fixed length vector we can use for the scatter.
7199 EVT PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i32);
7200 if (VT.getVectorElementType() == MVT::i64 ||
7201 Index.getValueType().getVectorElementType() == MVT::i64 ||
7202 Mask.getValueType().getVectorElementType() == MVT::i64)
7203 PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i64);
7204
7205 // Promote vector operands.
7206 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7207 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
7208 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
7209 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
7210
7211 // A promoted value type forces the need for a truncating store.
7212 if (PromotedVT != VT)
7213 Truncating = true;
7214
7215 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
7216
7217 // Convert fixed length vector operands to scalable.
7218 MemVT = ContainerVT.changeVectorElementType(*DAG.getContext(),
7219 MemVT.getVectorElementType());
7220 Index = convertToScalableVector(DAG, ContainerVT, Index);
7222 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
7223
7224 // Emit equivalent scalable vector scatter.
7225 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
7226 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
7227 MSC->getMemOperand(), IndexType, Truncating);
7228 }
7229
7230 // Everything else is legal.
7231 return Op;
7232}
7233
7234SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
7235 SDLoc DL(Op);
7236 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
7237 assert(LoadNode && "Expected custom lowering of a masked load node");
7238 EVT VT = Op->getValueType(0);
7239
7240 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
7241 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
7242
7243 SDValue PassThru = LoadNode->getPassThru();
7244 SDValue Mask = LoadNode->getMask();
7245
7246 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
7247 return Op;
7248
7250 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
7251 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
7252 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
7253 LoadNode->getExtensionType());
7254
7255 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
7256
7257 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7258}
7259
7260// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
7262 EVT VT, EVT MemVT,
7263 SelectionDAG &DAG) {
7264 assert(VT.isVector() && "VT should be a vector type");
7265 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
7266
7267 SDValue Value = ST->getValue();
7268
7269 // We first extend the promoted v4i16 to v8i16, truncate it to v8i8, and
7270 // extract the 32-bit lane that represents the v4i8 subvector. This
7271 // optimizes the store to:
7272 //
7273 // xtn v0.8b, v0.8h
7274 // str s0, [x0]
7275
7276 SDValue Undef = DAG.getUNDEF(MVT::i16);
7277 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
7278 {Undef, Undef, Undef, Undef});
7279
7280 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
7281 Value, UndefVec);
7282 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
7283
7284 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
7285 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
7286 Trunc, DAG.getConstant(0, DL, MVT::i64));
7287
7288 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
7289 ST->getBasePtr(), ST->getMemOperand());
7290}
7291
7293 SDLoc DL(Op);
7294 SDValue Src = Op.getOperand(0);
7295 MVT DestVT = Op.getSimpleValueType();
7296 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7298
7299 unsigned SrcAS = N->getSrcAddressSpace();
7300 unsigned DestAS = N->getDestAddressSpace();
7301 assert(SrcAS != DestAS &&
7302 "addrspacecast must be between different address spaces");
7303 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
7304 TLI.getTargetMachine().getPointerSize(DestAS) &&
7305 "addrspacecast must be between different ptr sizes");
7306 (void)TLI;
7307
7308 if (SrcAS == ARM64AS::PTR32_SPTR) {
7309 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, Src,
7310 DAG.getTargetConstant(0, DL, DestVT));
7311 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
7312 return DAG.getNode(ISD::ZERO_EXTEND, DL, DestVT, Src,
7313 DAG.getTargetConstant(0, DL, DestVT));
7314 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
7315 (DestAS == ARM64AS::PTR32_UPTR)) {
7316 SDValue Ext = DAG.getAnyExtOrTrunc(Src, DL, DestVT);
7317 SDValue Trunc = DAG.getZeroExtendInReg(Ext, DL, DestVT);
7318 return Trunc;
7319 } else {
7320 return Src;
7321 }
7322}
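// Illustrative summary of the cases handled above: a cast from the signed
// 32-bit pointer address space sign-extends the pointer to 64 bits, a cast
// from the unsigned 32-bit space zero-extends it, a cast to either 32-bit
// space is lowered so that only the low 32 bits of the pointer are kept,
// and any other cast is a no-op.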
7323
7324// Custom lowering for stores, vector or scalar, truncating or not. Currently
7325// we only custom lower truncating stores from v4i16 to v4i8 and volatile
7326// stores of i128.
7327SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
7328 SelectionDAG &DAG) const {
7329 SDLoc Dl(Op);
7330 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
7331 assert (StoreNode && "Can only custom lower store nodes");
7332
7333 SDValue Value = StoreNode->getValue();
7334
7335 EVT VT = Value.getValueType();
7336 EVT MemVT = StoreNode->getMemoryVT();
7337
7338 if (VT.isVector()) {
7340 VT,
7341 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
7342 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
7343
7344 unsigned AS = StoreNode->getAddressSpace();
7345 Align Alignment = StoreNode->getAlign();
7346 if (Alignment < MemVT.getStoreSize() &&
7347 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
7348 StoreNode->getMemOperand()->getFlags(),
7349 nullptr)) {
7350 return scalarizeVectorStore(StoreNode, DAG);
7351 }
7352
7353 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
7354 MemVT == MVT::v4i8) {
7355 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
7356 }
7357 // 256-bit non-temporal stores can be lowered to STNP. Do this as part of
7358 // the custom lowering, as there are no unpaired non-temporal stores and
7359 // legalization will break up 256-bit inputs.
7360 //
7361 // Currently, STNP lowering can only keep or increase code size, so we do
7362 // not apply it when optimizing for code size.
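    // For example (illustrative): a non-temporal store of a 256-bit <4 x i64>
    // value is split into its low and high <2 x i64> halves here and emitted
    // as a single
    //   stnp q0, q1, [x0]
    // instead of being broken up by generic legalization.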
7363 ElementCount EC = MemVT.getVectorElementCount();
7364 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
7365 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
7366 !DAG.shouldOptForSize() &&
7367 (MemVT.getScalarSizeInBits() == 8u ||
7368 MemVT.getScalarSizeInBits() == 16u ||
7369 MemVT.getScalarSizeInBits() == 32u ||
7370 MemVT.getScalarSizeInBits() == 64u)) {
7371 SDValue Lo =
7374 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
7375 SDValue Hi =
7378 StoreNode->getValue(),
7379 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
7381 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
7382 {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
7383 DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
7384 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7385 return Result;
7386 }
7387 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
7388 return LowerStore128(Op, DAG);
7389 } else if (MemVT == MVT::i64x8) {
7390 SDValue Value = StoreNode->getValue();
7391 assert(Value->getValueType(0) == MVT::i64x8);
7392 SDValue Chain = StoreNode->getChain();
7393 SDValue Base = StoreNode->getBasePtr();
7394 EVT PtrVT = Base.getValueType();
7395 for (unsigned i = 0; i < 8; i++) {
7396 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
7397 Value, DAG.getConstant(i, Dl, MVT::i32));
7398 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
7399 DAG.getConstant(i * 8, Dl, PtrVT));
7400 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
7401 StoreNode->getBaseAlign());
7402 }
7403 return Chain;
7404 }
7405
7406 return SDValue();
7407}
7408
7409/// Lower atomic or volatile 128-bit stores to a single STP instruction.
7410SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
7411 SelectionDAG &DAG) const {
7412 MemSDNode *StoreNode = cast<MemSDNode>(Op);
7413 assert(StoreNode->getMemoryVT() == MVT::i128);
7414 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
7415
7416 bool IsStoreRelease =
7418 if (StoreNode->isAtomic())
7419 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
7420 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
7423
7424 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
7425 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
7426 ? StoreNode->getOperand(1)
7427 : StoreNode->getOperand(2);
7428 SDLoc DL(Op);
7429 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
7430 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
7431 if (DAG.getDataLayout().isBigEndian())
7432 std::swap(StoreValue.first, StoreValue.second);
7434 Opcode, DL, DAG.getVTList(MVT::Other),
7435 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
7436 StoreNode->getBasePtr()},
7437 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7438 return Result;
7439}
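// A minimal sketch of the lowering above (register choices illustrative):
// the i128 value is split into two i64 halves, swapped on big-endian, and
// emitted as one paired store, e.g.
//   stp   x1, x2, [x0]     // plain volatile/atomic i128 store
//   stilp x1, x2, [x0]     // when a store-release is required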
7440
7441/// Helper function to optimize loads of extended small vectors.
7442/// These patterns would otherwise get scalarized into inefficient sequences.
7444 const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
7445 if (!isEligibleForSmallVectorLoadOpt(Load, Subtarget))
7446 return SDValue();
7447
7448 EVT MemVT = Load->getMemoryVT();
7449 EVT ResVT = Load->getValueType(0);
7450 unsigned NumElts = ResVT.getVectorNumElements();
7451 unsigned DstEltBits = ResVT.getScalarSizeInBits();
7452 unsigned SrcEltBits = MemVT.getScalarSizeInBits();
7453
7454 unsigned ExtOpcode;
7455 switch (Load->getExtensionType()) {
7456 case ISD::EXTLOAD:
7457 case ISD::ZEXTLOAD:
7458 ExtOpcode = ISD::ZERO_EXTEND;
7459 break;
7460 case ISD::SEXTLOAD:
7461 ExtOpcode = ISD::SIGN_EXTEND;
7462 break;
7463 case ISD::NON_EXTLOAD:
7464 return SDValue();
7465 }
7466
7467 SDLoc DL(Load);
7468 SDValue Chain = Load->getChain();
7469 SDValue BasePtr = Load->getBasePtr();
7470 const MachinePointerInfo &PtrInfo = Load->getPointerInfo();
7471 Align Alignment = Load->getAlign();
7472
7473 // Load the data as an FP scalar to avoid issues with integer loads.
7474 unsigned LoadBits = MemVT.getStoreSizeInBits();
7475 MVT ScalarLoadType = MVT::getFloatingPointVT(LoadBits);
7476 SDValue ScalarLoad =
7477 DAG.getLoad(ScalarLoadType, DL, Chain, BasePtr, PtrInfo, Alignment);
7478
7479 MVT ScalarToVecTy = MVT::getVectorVT(ScalarLoadType, 128 / LoadBits);
7480 SDValue ScalarToVec =
7481 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarToVecTy, ScalarLoad);
7482 MVT BitcastTy =
7483 MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits), 128 / SrcEltBits);
7484 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, BitcastTy, ScalarToVec);
7485
7486 SDValue Res = Bitcast;
7487 unsigned CurrentEltBits = Res.getValueType().getScalarSizeInBits();
7488 unsigned CurrentNumElts = Res.getValueType().getVectorNumElements();
7489 while (CurrentEltBits < DstEltBits) {
7490 if (Res.getValueSizeInBits() >= 128) {
7491 CurrentNumElts = CurrentNumElts / 2;
7492 MVT ExtractVT =
7493 MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
7494 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Res,
7495 DAG.getConstant(0, DL, MVT::i64));
7496 }
7497 CurrentEltBits = CurrentEltBits * 2;
7498 MVT ExtVT =
7499 MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
7500 Res = DAG.getNode(ExtOpcode, DL, ExtVT, Res);
7501 }
7502
7503 if (CurrentNumElts != NumElts) {
7504 MVT FinalVT = MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), NumElts);
7505 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FinalVT, Res,
7506 DAG.getConstant(0, DL, MVT::i64));
7507 }
7508
7509 return DAG.getMergeValues({Res, ScalarLoad.getValue(1)}, DL);
7510}
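// A hedged example of the optimization above (the exact sequence depends on
// the source and destination types): a zero-extending load from <4 x i8> to
// <4 x i32> can be emitted roughly as
//   ldr   s0, [x0]             // one 32-bit FP load covering all four bytes
//   ushll v0.8h, v0.8b, #0     // widen i8  -> i16
//   ushll v0.4s, v0.4h, #0     // widen i16 -> i32
// rather than being scalarized into four byte loads.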
7511
7512SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
7513 SelectionDAG &DAG) const {
7514 SDLoc DL(Op);
7515 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
7516 assert(LoadNode && "Expected custom lowering of a load node");
7517
7518 if (SDValue Result = tryLowerSmallVectorExtLoad(LoadNode, DAG))
7519 return Result;
7520
7521 if (LoadNode->getMemoryVT() == MVT::i64x8) {
7523 SDValue Base = LoadNode->getBasePtr();
7524 SDValue Chain = LoadNode->getChain();
7525 EVT PtrVT = Base.getValueType();
7526 for (unsigned i = 0; i < 8; i++) {
7527 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7528 DAG.getConstant(i * 8, DL, PtrVT));
7529 SDValue Part =
7530 DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
7531 LoadNode->getBaseAlign());
7532 Ops.push_back(Part);
7533 Chain = SDValue(Part.getNode(), 1);
7534 }
7535 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7536 return DAG.getMergeValues({Loaded, Chain}, DL);
7537 }
7538
7539 return SDValue();
7540}
7541
7542SDValue AArch64TargetLowering::LowerFixedLengthVectorCompressToSVE(
7543 SDValue Op, SelectionDAG &DAG) const {
7544 SDLoc DL(Op);
7545 EVT VT = Op.getValueType();
7546
7547 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
7548 SDValue Vec = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
7549 SDValue Mask = convertFixedMaskToScalableVector(Op.getOperand(1), DAG);
7550 SDValue Passthru =
7551 convertToScalableVector(DAG, ContainerVT, Op.getOperand(2));
7552
7553 SDValue Result =
7554 DAG.getNode(ISD::VECTOR_COMPRESS, DL, ContainerVT, Vec, Mask, Passthru);
7555 return convertFromScalableVector(DAG, VT, Result);
7556}
7557
7558SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7559 SelectionDAG &DAG) const {
7560 EVT VT = Op.getValueType();
7561 if (!Subtarget->isSVEAvailable())
7562 return SDValue();
7563
7564 if (VT.isFixedLengthVector())
7565 return LowerFixedLengthVectorCompressToSVE(Op, DAG);
7566
7567 SDLoc DL(Op);
7568 SDValue Vec = Op.getOperand(0);
7569 SDValue Mask = Op.getOperand(1);
7570 SDValue Passthru = Op.getOperand(2);
7571 EVT MaskVT = Mask.getValueType();
7572
7573 SDValue Compressed = DAG.getNode(
7575 DAG.getTargetConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask,
7576 Vec);
7577
7578 // COMPACT fills the remaining lanes with 0s, so if the passthru is all 0s, do nothing here.
7579 if (Passthru.isUndef() ||
7581 return Compressed;
7582
7583 SDValue CntActive = DAG.getNode(
7584 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7585 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask,
7586 Mask);
7587
7588 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7589 SDValue CompressedMask =
7590 DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT, Zero, CntActive);
7591
7592 return DAG.getNode(ISD::VSELECT, DL, VT, CompressedMask, Compressed,
7593 Passthru);
7594}
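// Sketch of the scalable path above (SVE mnemonics, register choices
// illustrative): the data is compacted, the active lanes are counted, a
// predicate covering that many leading lanes is built, and the result is
// selected over the passthru:
//   compact z0.s, p0, z0.s
//   cntp    x8, p0, p0.s
//   whilelo p1.s, xzr, x8
//   sel     z0.s, p1, z0.s, z1.s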
7595
7596// Generate SUBS and CSEL for integer abs.
7597SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7598 MVT VT = Op.getSimpleValueType();
7599
7600 if (VT.isVector())
7601 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7602
7603 SDLoc DL(Op);
7604 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, VT);
7605
7606 // Generate SUBS & CSEL.
7607 SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7608 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7609 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7610 getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
7611}
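// For a scalar i64 abs the nodes built above typically select to something
// like (illustrative):
//   cmp  x0, #0
//   cneg x0, x0, mi
// i.e. compare against zero and conditionally negate when negative.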
7612
7614 SDValue Chain = Op.getOperand(0);
7615 SDValue Cond = Op.getOperand(1);
7616 SDValue Dest = Op.getOperand(2);
7617
7619 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7620 SDLoc DL(Op);
7621 SDValue CCVal = getCondCode(DAG, CC);
7622 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
7623 Cmp);
7624 }
7625
7626 return SDValue();
7627}
7628
7629// Treat FSHR with a constant shift amount as a legal operation; otherwise it
7630// is expanded. FSHL is converted to FSHR before deciding what to do with it.
7632 SDValue Shifts = Op.getOperand(2);
7633 // Check if the shift amount is a constant and normalise it to [0, SrcBitLen).
7634 // If the opcode is FSHL, convert it to FSHR.
7635 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7636 SDLoc DL(Op);
7637 MVT VT = Op.getSimpleValueType();
7638 unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
7639
7640 if (Op.getOpcode() == ISD::FSHL) {
7641 if (NewShiftNo == 0)
7642 return Op.getOperand(0);
7643
7644 NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
7645 return DAG.getNode(
7646 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7647 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7648 }
7649
7650 if (Op.getOpcode() == ISD::FSHR) {
7651 if (NewShiftNo == 0)
7652 return Op.getOperand(1);
7653
7654 if (ShiftNo->getZExtValue() == NewShiftNo)
7655 return Op;
7656
7657 // Rewrite using the normalised shift amount.
7658 return DAG.getNode(
7659 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7660 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7661 }
7662 }
7663
7664 return SDValue();
7665}
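// Worked example of the normalisation above (i64, illustrative): a constant
// fshl(a, b, 5) is rewritten as fshr(a, b, 64 - 5) = fshr(a, b, 59), while a
// shift amount that normalises to 0 simply returns the first operand for
// FSHL and the second operand for FSHR.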
7666
7668 SDValue X = Op.getOperand(0);
7669 EVT XScalarTy = X.getValueType();
7670 SDValue Exp = Op.getOperand(1);
7671
7672 SDLoc DL(Op);
7673 EVT XVT, ExpVT;
7674 switch (Op.getSimpleValueType().SimpleTy) {
7675 default:
7676 return SDValue();
7677 case MVT::bf16:
7678 case MVT::f16:
7679 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7680 [[fallthrough]];
7681 case MVT::f32:
7682 XVT = MVT::nxv4f32;
7683 ExpVT = MVT::nxv4i32;
7684 break;
7685 case MVT::f64:
7686 XVT = MVT::nxv2f64;
7687 ExpVT = MVT::nxv2i64;
7688 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7689 break;
7690 }
7691
7692 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7693 SDValue VX =
7694 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7695 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7696 DAG.getUNDEF(ExpVT), Exp, Zero);
7697 SDValue VPg = DAG.getConstant(
7698 1, DL, XVT.changeVectorElementType(*DAG.getContext(), MVT::i1));
7699 SDValue FScale = DAG.getNode(
7701 DAG.getTargetConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), VPg,
7702 VX, VExp);
7703 SDValue Final =
7704 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7705 if (X.getValueType() != XScalarTy)
7706 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7707 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7708 return Final;
7709}
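// Sketch of the lowering above for f64 (illustrative): the value and the
// exponent are inserted into lane 0 of nxv2f64/nxv2i64 vectors, scaled with
// the SVE FSCALE intrinsic under an all-true predicate, and lane 0 of the
// result is extracted back out; f16/bf16 inputs are extended to f32 first
// and rounded back at the end.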
7710
7711SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7712 SelectionDAG &DAG) const {
7713 return Op.getOperand(0);
7714}
7715
7716SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7717 SelectionDAG &DAG) const {
7718 SDValue Chain = Op.getOperand(0);
7719 SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
7720 SDValue FPtr = Op.getOperand(2); // nested function
7721 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7722
7723 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7724
7725 // ldr NestReg, .+16
7726 // ldr x17, .+20
7727 // br x17
7728 // .word 0
7729 // .nest: .qword nest
7730 // .fptr: .qword fptr
7731 SDValue OutChains[5];
7732
7733 const Function *Func =
7734 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7735 CallingConv::ID CC = Func->getCallingConv();
7736 unsigned NestReg;
7737
7738 switch (CC) {
7739 default:
7740 NestReg = 0x0f; // X15
7741 break;
7743 // Must be kept in sync with AArch64CallingConv.td
7744 NestReg = 0x04; // X4
7745 break;
7746 }
7747
7748 const char FptrReg = 0x11; // X17
7749
7750 SDValue Addr = Trmp;
7751
7752 SDLoc DL(Op);
7753 OutChains[0] = DAG.getStore(
7754 Chain, DL, DAG.getConstant(0x58000080u | NestReg, DL, MVT::i32), Addr,
7755 MachinePointerInfo(TrmpAddr));
7756
7757 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7758 DAG.getConstant(4, DL, MVT::i64));
7759 OutChains[1] = DAG.getStore(
7760 Chain, DL, DAG.getConstant(0x580000b0u | FptrReg, DL, MVT::i32), Addr,
7761 MachinePointerInfo(TrmpAddr, 4));
7762
7763 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7764 DAG.getConstant(8, DL, MVT::i64));
7765 OutChains[2] =
7766 DAG.getStore(Chain, DL, DAG.getConstant(0xd61f0220u, DL, MVT::i32), Addr,
7767 MachinePointerInfo(TrmpAddr, 8));
7768
7769 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7770 DAG.getConstant(16, DL, MVT::i64));
7771 OutChains[3] =
7772 DAG.getStore(Chain, DL, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
7773
7774 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7775 DAG.getConstant(24, DL, MVT::i64));
7776 OutChains[4] =
7777 DAG.getStore(Chain, DL, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
7778
7779 SDValue StoreToken = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
7780
7781 SDValue EndOfTrmp = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7782 DAG.getConstant(12, DL, MVT::i64));
7783
7784 // Call clear cache on the trampoline instructions.
7785 return DAG.getNode(ISD::CLEAR_CACHE, DL, MVT::Other, StoreToken, Trmp,
7786 EndOfTrmp);
7787}
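// Illustrative layout of the trampoline written above (byte offsets from the
// start; encodings taken from the constants used in the stores):
//   +0:  0x58000080 | NestReg   ldr <nest reg>, .+16
//   +4:  0x580000b0 | FptrReg   ldr x17, .+20
//   +8:  0xd61f0220             br  x17
//   +16: the 'nest' parameter value
//   +24: the nested function pointer
// followed by a CLEAR_CACHE over the 12 bytes of code.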
7788
7789SDValue AArch64TargetLowering::LowerFMUL(SDValue Op, SelectionDAG &DAG) const {
7790 SDLoc DL(Op);
7791 EVT VT = Op.getValueType();
7792 if (VT.getScalarType() != MVT::bf16 ||
7793 (Subtarget->hasSVEB16B16() &&
7794 Subtarget->isNonStreamingSVEorSME2Available()))
7795 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7796
7797 assert(Subtarget->hasBF16() && "Expected +bf16 for custom FMUL lowering");
7798 assert((VT == MVT::nxv4bf16 || VT == MVT::nxv8bf16 || VT == MVT::v8bf16) &&
7799 "Unexpected FMUL VT");
7800
7801 auto MakeGetIntrinsic = [&](Intrinsic::ID IID) {
7802 return [&, IID](EVT VT, auto... Ops) {
7803 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
7804 DAG.getConstant(IID, DL, MVT::i32), Ops...);
7805 };
7806 };
7807
7808 auto Reinterpret = [&](SDValue Value, EVT VT) {
7809 EVT SrcVT = Value.getValueType();
7810 if (VT == SrcVT)
7811 return Value;
7812 if (SrcVT.isFixedLengthVector())
7813 return convertToScalableVector(DAG, VT, Value);
7814 if (VT.isFixedLengthVector())
7815 return convertFromScalableVector(DAG, VT, Value);
7816 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Value);
7817 };
7818
7819 bool UseSVEBFMLAL = VT.isScalableVector();
7820 auto FCVT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvt_bf16f32_v2);
7821 auto FCVTNT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2);
7822
7823 // Note: The NEON BFMLAL[BT] reads even/odd lanes like the SVE variant.
7824 // This does not match BFCVTN[2], so we use SVE to convert back to bf16.
7825 auto BFMLALB =
7826 MakeGetIntrinsic(UseSVEBFMLAL ? Intrinsic::aarch64_sve_bfmlalb
7827 : Intrinsic::aarch64_neon_bfmlalb);
7828 auto BFMLALT =
7829 MakeGetIntrinsic(UseSVEBFMLAL ? Intrinsic::aarch64_sve_bfmlalt
7830 : Intrinsic::aarch64_neon_bfmlalt);
7831
7832 EVT AccVT = UseSVEBFMLAL ? MVT::nxv4f32 : MVT::v4f32;
7833 SDValue Zero = DAG.getNeutralElement(ISD::FADD, DL, AccVT, Op->getFlags());
7834 SDValue Pg = getPredicateForVector(DAG, DL, AccVT);
7835
7836 // Lower bf16 FMUL as a pair (VT == [nx]v8bf16) of BFMLAL top/bottom
7837 // instructions. These result in two f32 vectors, which can be converted back
7838 // to bf16 with FCVT and FCVTNT.
7839 SDValue LHS = Op.getOperand(0);
7840 SDValue RHS = Op.getOperand(1);
7841
7842 // All SVE intrinsics expect to operate on full bf16 vector types.
7843 if (UseSVEBFMLAL) {
7844 LHS = Reinterpret(LHS, MVT::nxv8bf16);
7845 RHS = Reinterpret(RHS, MVT::nxv8bf16);
7846 }
7847
7848 SDValue BottomF32 = Reinterpret(BFMLALB(AccVT, Zero, LHS, RHS), MVT::nxv4f32);
7849 SDValue BottomBF16 =
7850 FCVT(MVT::nxv8bf16, DAG.getPOISON(MVT::nxv8bf16), Pg, BottomF32);
7851 // Note: nxv4bf16 only uses even lanes.
7852 if (VT == MVT::nxv4bf16)
7853 return Reinterpret(BottomBF16, VT);
7854
7855 SDValue TopF32 = Reinterpret(BFMLALT(AccVT, Zero, LHS, RHS), MVT::nxv4f32);
7856 SDValue TopBF16 = FCVTNT(MVT::nxv8bf16, BottomBF16, Pg, TopF32);
7857 return Reinterpret(TopBF16, VT);
7858}
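// A sketch of the bf16 FMUL expansion above for a full vector (illustrative):
//   BottomF32 = BFMLALB(zero-acc, a, b)   // even bf16 lanes multiplied in f32
//   TopF32    = BFMLALT(zero-acc, a, b)   // odd  bf16 lanes multiplied in f32
//   Result    = FCVTNT(FCVT(BottomF32), TopF32)  // narrow and re-interleave
// so each lane is multiplied in f32 precision and converted back to bf16.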
7859
7860SDValue AArch64TargetLowering::LowerFMA(SDValue Op, SelectionDAG &DAG) const {
7861 SDValue OpA = Op->getOperand(0);
7862 SDValue OpB = Op->getOperand(1);
7863 SDValue OpC = Op->getOperand(2);
7864 EVT VT = Op.getValueType();
7865 SDLoc DL(Op);
7866
7867 assert(VT.isVector() && "Scalar fma lowering should be handled by patterns");
7868
7869 // Bail early if we're definitely not looking to merge FNEGs into the FMA.
7870 if (VT != MVT::v8f16 && VT != MVT::v4f32 && VT != MVT::v2f64)
7871 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7872
7873 if (OpC.getOpcode() != ISD::FNEG)
7874 return useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())
7875 ? LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED)
7876 : Op; // Fallback to NEON lowering.
7877
7878 // Convert FMA/FNEG nodes to SVE to enable the following patterns:
7879 // fma(a, b, neg(c)) -> fnmls(a, b, c)
7880 // fma(neg(a), b, neg(c)) -> fnmla(a, b, c)
7881 // fma(a, neg(b), neg(c)) -> fnmla(a, b, c)
7882 SDValue Pg = getPredicateForVector(DAG, DL, VT);
7883 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
7884
7885 auto ConvertToScalableFnegMt = [&](SDValue Op) {
7886 if (Op.getOpcode() == ISD::FNEG)
7887 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7888 return convertToScalableVector(DAG, ContainerVT, Op);
7889 };
7890
7891 OpA = ConvertToScalableFnegMt(OpA);
7892 OpB = ConvertToScalableFnegMt(OpB);
7893 OpC = ConvertToScalableFnegMt(OpC);
7894
7895 SDValue ScalableRes =
7896 DAG.getNode(AArch64ISD::FMA_PRED, DL, ContainerVT, Pg, OpA, OpB, OpC);
7897 return convertFromScalableVector(DAG, VT, ScalableRes);
7898}
7899
7901 SelectionDAG &DAG) const {
7902 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7903 LLVM_DEBUG(Op.dump());
7904
7905 switch (Op.getOpcode()) {
7906 default:
7907 llvm_unreachable("unimplemented operand");
7908 return SDValue();
7911 return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
7912 case ISD::BITCAST:
7913 return LowerBITCAST(Op, DAG);
7914 case ISD::GlobalAddress:
7915 return LowerGlobalAddress(Op, DAG);
7917 return LowerGlobalTLSAddress(Op, DAG);
7919 return LowerPtrAuthGlobalAddress(Op, DAG);
7921 return LowerADJUST_TRAMPOLINE(Op, DAG);
7923 return LowerINIT_TRAMPOLINE(Op, DAG);
7924 case ISD::SETCC:
7925 case ISD::STRICT_FSETCC:
7927 return LowerSETCC(Op, DAG);
7928 case ISD::SETCCCARRY:
7929 return LowerSETCCCARRY(Op, DAG);
7930 case ISD::BRCOND:
7931 return LowerBRCOND(Op, DAG);
7932 case ISD::BR_CC:
7933 return LowerBR_CC(Op, DAG);
7934 case ISD::SELECT:
7935 return LowerSELECT(Op, DAG);
7936 case ISD::SELECT_CC:
7937 return LowerSELECT_CC(Op, DAG);
7938 case ISD::JumpTable:
7939 return LowerJumpTable(Op, DAG);
7940 case ISD::BR_JT:
7941 return LowerBR_JT(Op, DAG);
7942 case ISD::BRIND:
7943 return LowerBRIND(Op, DAG);
7944 case ISD::ConstantPool:
7945 return LowerConstantPool(Op, DAG);
7946 case ISD::BlockAddress:
7947 return LowerBlockAddress(Op, DAG);
7948 case ISD::VASTART:
7949 return LowerVASTART(Op, DAG);
7950 case ISD::VACOPY:
7951 return LowerVACOPY(Op, DAG);
7952 case ISD::VAARG:
7953 return LowerVAARG(Op, DAG);
7954 case ISD::UADDO_CARRY:
7955 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7956 case ISD::USUBO_CARRY:
7957 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7958 case ISD::SADDO_CARRY:
7959 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7960 case ISD::SSUBO_CARRY:
7961 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7962 case ISD::SADDO:
7963 case ISD::UADDO:
7964 case ISD::SSUBO:
7965 case ISD::USUBO:
7966 case ISD::SMULO:
7967 case ISD::UMULO:
7968 return LowerXALUO(Op, DAG);
7969 case ISD::FADD:
7970 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7971 case ISD::FSUB:
7972 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7973 case ISD::FMUL:
7974 return LowerFMUL(Op, DAG);
7975 case ISD::FMA:
7976 return LowerFMA(Op, DAG);
7977 case ISD::FDIV:
7978 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
7979 case ISD::FNEG:
7980 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7981 case ISD::FCEIL:
7982 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
7983 case ISD::FFLOOR:
7984 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7985 case ISD::FNEARBYINT:
7986 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7987 case ISD::FRINT:
7988 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
7989 case ISD::FROUND:
7990 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
7991 case ISD::FROUNDEVEN:
7992 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7993 case ISD::FTRUNC:
7994 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7995 case ISD::FSQRT:
7996 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
7997 case ISD::FABS:
7998 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
7999 case ISD::FP_ROUND:
8001 return LowerFP_ROUND(Op, DAG);
8002 case ISD::FP_EXTEND:
8004 return LowerFP_EXTEND(Op, DAG);
8005 case ISD::FRAMEADDR:
8006 return LowerFRAMEADDR(Op, DAG);
8007 case ISD::SPONENTRY:
8008 return LowerSPONENTRY(Op, DAG);
8009 case ISD::RETURNADDR:
8010 return LowerRETURNADDR(Op, DAG);
8012 return LowerADDROFRETURNADDR(Op, DAG);
8014 return LowerCONCAT_VECTORS(Op, DAG);
8016 return LowerINSERT_VECTOR_ELT(Op, DAG);
8018 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
8019 case ISD::BUILD_VECTOR:
8020 return LowerBUILD_VECTOR(Op, DAG);
8023 return LowerEXTEND_VECTOR_INREG(Op, DAG);
8025 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
8027 return LowerVECTOR_SHUFFLE(Op, DAG);
8028 case ISD::SPLAT_VECTOR:
8029 return LowerSPLAT_VECTOR(Op, DAG);
8031 return LowerEXTRACT_SUBVECTOR(Op, DAG);
8033 return LowerINSERT_SUBVECTOR(Op, DAG);
8034 case ISD::SDIV:
8035 case ISD::UDIV:
8036 return LowerDIV(Op, DAG);
8037 case ISD::SMIN:
8038 case ISD::UMIN:
8039 case ISD::SMAX:
8040 case ISD::UMAX:
8041 return LowerMinMax(Op, DAG);
8042 case ISD::SRA:
8043 case ISD::SRL:
8044 case ISD::SHL:
8045 return LowerVectorSRA_SRL_SHL(Op, DAG);
8046 case ISD::SHL_PARTS:
8047 case ISD::SRL_PARTS:
8048 case ISD::SRA_PARTS:
8049 return LowerShiftParts(Op, DAG);
8050 case ISD::CTPOP:
8051 case ISD::PARITY:
8052 return LowerCTPOP_PARITY(Op, DAG);
8053 case ISD::FCOPYSIGN:
8054 return LowerFCOPYSIGN(Op, DAG);
8055 case ISD::OR:
8056 return LowerVectorOR(Op, DAG);
8057 case ISD::XOR:
8058 return LowerXOR(Op, DAG);
8059 case ISD::PREFETCH:
8060 return LowerPREFETCH(Op, DAG);
8061 case ISD::SINT_TO_FP:
8062 case ISD::UINT_TO_FP:
8065 return LowerINT_TO_FP(Op, DAG);
8066 case ISD::FP_TO_SINT:
8067 case ISD::FP_TO_UINT:
8070 return LowerFP_TO_INT(Op, DAG);
8073 return LowerFP_TO_INT_SAT(Op, DAG);
8074 case ISD::GET_ROUNDING:
8075 return LowerGET_ROUNDING(Op, DAG);
8076 case ISD::SET_ROUNDING:
8077 return LowerSET_ROUNDING(Op, DAG);
8078 case ISD::GET_FPMODE:
8079 return LowerGET_FPMODE(Op, DAG);
8080 case ISD::SET_FPMODE:
8081 return LowerSET_FPMODE(Op, DAG);
8082 case ISD::RESET_FPMODE:
8083 return LowerRESET_FPMODE(Op, DAG);
8084 case ISD::MUL:
8085 return LowerMUL(Op, DAG);
8086 case ISD::MULHS:
8087 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
8088 case ISD::MULHU:
8089 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
8091 return LowerINTRINSIC_W_CHAIN(Op, DAG);
8093 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
8095 return LowerINTRINSIC_VOID(Op, DAG);
8096 case ISD::ATOMIC_STORE:
8097 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
8098 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
8099 return LowerStore128(Op, DAG);
8100 }
8101 return SDValue();
8102 case ISD::STORE:
8103 return LowerSTORE(Op, DAG);
8104 case ISD::MSTORE:
8105 return LowerMSTORE(Op, DAG);
8106 case ISD::MGATHER:
8107 return LowerMGATHER(Op, DAG);
8108 case ISD::MSCATTER:
8109 return LowerMSCATTER(Op, DAG);
8111 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
8112 case ISD::VECREDUCE_ADD:
8113 case ISD::VECREDUCE_AND:
8114 case ISD::VECREDUCE_OR:
8115 case ISD::VECREDUCE_XOR:
8125 return LowerVECREDUCE(Op, DAG);
8126 case ISD::VECREDUCE_MUL:
8128 return LowerVECREDUCE_MUL(Op, DAG);
8130 return LowerATOMIC_LOAD_AND(Op, DAG);
8132 return LowerDYNAMIC_STACKALLOC(Op, DAG);
8133 case ISD::VSCALE:
8134 return LowerVSCALE(Op, DAG);
8136 return LowerVECTOR_COMPRESS(Op, DAG);
8137 case ISD::ANY_EXTEND:
8138 case ISD::SIGN_EXTEND:
8139 case ISD::ZERO_EXTEND:
8140 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
8141 case ISD::ADDRSPACECAST:
8142 return LowerADDRSPACECAST(Op, DAG);
8144 // Only custom lower when ExtraVT has a legal byte based element type.
8145 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
8146 EVT ExtraEltVT = ExtraVT.getVectorElementType();
8147 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
8148 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
8149 return SDValue();
8150
8151 return LowerToPredicatedOp(Op, DAG,
8152 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
8153 }
8154 case ISD::TRUNCATE:
8155 return LowerTRUNCATE(Op, DAG);
8156 case ISD::MLOAD:
8157 return LowerMLOAD(Op, DAG);
8158 case ISD::LOAD:
8159 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
8160 !Subtarget->isNeonAvailable()))
8161 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
8162 return LowerLOAD(Op, DAG);
8163 case ISD::ADD:
8164 case ISD::AND:
8165 case ISD::SUB:
8166 return LowerToScalableOp(Op, DAG);
8167 case ISD::FMAXIMUM:
8168 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
8169 case ISD::FMAXNUM:
8170 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
8171 case ISD::FMINIMUM:
8172 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
8173 case ISD::FMINNUM:
8174 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
8175 case ISD::VSELECT:
8176 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
8177 case ISD::ABS:
8178 return LowerABS(Op, DAG);
8179 case ISD::ABDS:
8180 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
8181 case ISD::ABDU:
8182 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
8183 case ISD::AVGFLOORS:
8184 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
8185 case ISD::AVGFLOORU:
8186 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
8187 case ISD::AVGCEILS:
8188 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
8189 case ISD::AVGCEILU:
8190 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
8191 case ISD::BITREVERSE:
8192 return LowerBitreverse(Op, DAG);
8193 case ISD::BSWAP:
8194 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
8195 case ISD::CTLZ:
8196 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
8197 case ISD::CTTZ:
8198 return LowerCTTZ(Op, DAG);
8201 return LowerVECTOR_SPLICE(Op, DAG);
8203 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
8205 return LowerVECTOR_INTERLEAVE(Op, DAG);
8207 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
8208 case ISD::LRINT:
8209 case ISD::LLRINT:
8210 if (Op.getValueType().isVector())
8211 return LowerVectorXRINT(Op, DAG);
8212 [[fallthrough]];
8213 case ISD::LROUND:
8214 case ISD::LLROUND: {
8215 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
8216 Op.getOperand(0).getValueType() == MVT::bf16) &&
8217 "Expected custom lowering of rounding operations only for f16");
8218 SDLoc DL(Op);
8219 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
8220 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
8221 }
8222 case ISD::STRICT_LROUND:
8224 case ISD::STRICT_LRINT:
8225 case ISD::STRICT_LLRINT: {
8226 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
8227 Op.getOperand(1).getValueType() == MVT::bf16) &&
8228 "Expected custom lowering of rounding operations only for f16");
8229 SDLoc DL(Op);
8230 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
8231 {Op.getOperand(0), Op.getOperand(1)});
8232 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
8233 {Ext.getValue(1), Ext.getValue(0)});
8234 }
8235 case ISD::WRITE_REGISTER: {
8236 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
8237 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
8238 SDLoc DL(Op);
8239
8240 SDValue Chain = Op.getOperand(0);
8241 SDValue SysRegName = Op.getOperand(1);
8242 std::pair<SDValue, SDValue> Pair =
8243 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
8244
8245 // chain = MSRR(chain, sysregname, lo, hi)
8246 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
8247 SysRegName, Pair.first, Pair.second);
8248
8249 return Result;
8250 }
8251 case ISD::FSHL:
8252 case ISD::FSHR:
8253 return LowerFunnelShift(Op, DAG);
8254 case ISD::FLDEXP:
8255 return LowerFLDEXP(Op, DAG);
8257 return LowerVECTOR_HISTOGRAM(Op, DAG);
8262 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
8263 }
8264}
8265
8267 return !Subtarget->useSVEForFixedLengthVectors();
8268}
8269
8271 EVT VT, bool OverrideNEON) const {
8272 if (!VT.isFixedLengthVector() || !VT.isSimple())
8273 return false;
8274
8275 // Don't use SVE for vectors we cannot scalarize if required.
8276 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
8277 // Fixed length predicates should be promoted to i8.
8278 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
8279 case MVT::i1:
8280 default:
8281 return false;
8282 case MVT::i8:
8283 case MVT::i16:
8284 case MVT::i32:
8285 case MVT::i64:
8286 case MVT::f16:
8287 case MVT::f32:
8288 case MVT::f64:
8289 break;
8290 }
8291
8292 // NEON-sized vectors can be emulated using SVE instructions.
8293 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
8294 return Subtarget->isSVEorStreamingSVEAvailable();
8295
8296 // Ensure NEON MVTs only belong to a single register class.
8297 if (VT.getFixedSizeInBits() <= 128)
8298 return false;
8299
8300 // Ensure wider than NEON code generation is enabled.
8301 if (!Subtarget->useSVEForFixedLengthVectors())
8302 return false;
8303
8304 // Don't use SVE for types that don't fit.
8305 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
8306 return false;
8307
8308 // TODO: Perhaps an artificial restriction, but worth having whilst getting
8309 // the base fixed length SVE support in place.
8310 if (!VT.isPow2VectorType())
8311 return false;
8312
8313 return true;
8314}
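// For example (assuming a subtarget whose minimum SVE vector length is at
// least 256 bits and which enables SVE for fixed-length vectors): v8i32 is
// wider than NEON, fits the minimum SVE register size and has a power-of-two
// element count, so it returns true; a 128-bit type such as v16i8 returns
// false unless OverrideNEON is set and (streaming) SVE is available.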
8315
8316//===----------------------------------------------------------------------===//
8317// Calling Convention Implementation
8318//===----------------------------------------------------------------------===//
8319
8320static unsigned getIntrinsicID(const SDNode *N) {
8321 unsigned Opcode = N->getOpcode();
8322 switch (Opcode) {
8323 default:
8326 unsigned IID = N->getConstantOperandVal(0);
8327 if (IID < Intrinsic::num_intrinsics)
8328 return IID;
8330 }
8331 }
8332}
8333
8335 SDValue N1) const {
8336 if (!N0.hasOneUse())
8337 return false;
8338
8339 unsigned IID = getIntrinsicID(N1.getNode());
8340 // Avoid reassociating expressions that can be lowered to smlal/umlal.
8341 if (IID == Intrinsic::aarch64_neon_umull ||
8342 N1.getOpcode() == AArch64ISD::UMULL ||
8343 IID == Intrinsic::aarch64_neon_smull ||
8344 N1.getOpcode() == AArch64ISD::SMULL)
8345 return N0.getOpcode() != ISD::ADD;
8346
8347 return true;
8348}
8349
8350/// Selects the correct CCAssignFn for a given CallingConvention value.
8352 bool IsVarArg) const {
8353 switch (CC) {
8354 default:
8355 reportFatalUsageError("unsupported calling convention");
8356 case CallingConv::GHC:
8357 return CC_AArch64_GHC;
8359 // The VarArg implementation makes assumptions about register
8360 // argument passing that do not hold for preserve_none, so we
8361 // instead fall back to C argument passing.
8362 // The non-vararg case is handled in the CC function itself.
8363 if (!IsVarArg)
8365 [[fallthrough]];
8366 case CallingConv::C:
8367 case CallingConv::Fast:
8371 case CallingConv::Swift:
8373 case CallingConv::Tail:
8374 case CallingConv::GRAAL:
8375 if (Subtarget->isTargetWindows()) {
8376 if (IsVarArg) {
8377 if (Subtarget->isWindowsArm64EC())
8380 }
8381 return CC_AArch64_Win64PCS;
8382 }
8383 if (!Subtarget->isTargetDarwin())
8384 return CC_AArch64_AAPCS;
8385 if (!IsVarArg)
8386 return CC_AArch64_DarwinPCS;
8387 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
8389 case CallingConv::Win64:
8390 if (IsVarArg) {
8391 if (Subtarget->isWindowsArm64EC())
8394 }
8395 return CC_AArch64_Win64PCS;
8397 if (Subtarget->isWindowsArm64EC())
8405 return CC_AArch64_AAPCS;
8410 }
8411}
8412
8413CCAssignFn *
8415 switch (CC) {
8416 default:
8417 return RetCC_AArch64_AAPCS;
8421 if (Subtarget->isWindowsArm64EC())
8423 return RetCC_AArch64_AAPCS;
8424 }
8425}
8426
8427static bool isPassedInFPR(EVT VT) {
8428 return VT.isFixedLengthVector() ||
8429 (VT.isFloatingPoint() && !VT.isScalableVector());
8430}
8431
8433 AArch64FunctionInfo &FuncInfo,
8434 SelectionDAG &DAG) {
8435 if (!FuncInfo.hasZT0SpillSlotIndex())
8436 FuncInfo.setZT0SpillSlotIndex(MFI.CreateSpillStackObject(64, Align(16)));
8437
8438 return DAG.getFrameIndex(
8439 FuncInfo.getZT0SpillSlotIndex(),
8441}
8442
8443// Emit a call to __arm_sme_save or __arm_sme_restore.
8445 SelectionDAG &DAG,
8447 SDValue Chain, bool IsSave) {
8450 FuncInfo->setSMESaveBufferUsed();
8452 Args.emplace_back(
8453 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
8455
8456 RTLIB::Libcall LC =
8457 IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
8458 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
8459 SDValue Callee =
8460 DAG.getExternalSymbol(LCImpl, TLI.getPointerTy(DAG.getDataLayout()));
8461 auto *RetTy = Type::getVoidTy(*DAG.getContext());
8463 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8464 DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy, Callee,
8465 std::move(Args));
8466 return TLI.LowerCallTo(CLI).second;
8467}
8468
8470 const AArch64TargetLowering &TLI,
8471 const AArch64RegisterInfo &TRI,
8472 AArch64FunctionInfo &FuncInfo,
8473 SelectionDAG &DAG) {
8474 // Conditionally restore the lazy save using a pseudo node.
8475 RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
8476 TPIDR2Object &TPIDR2 = FuncInfo.getTPIDR2Obj();
8477
8478 RTLIB::LibcallImpl LibcallImpl = DAG.getLibcalls().getLibcallImpl(LC);
8479 SDValue RegMask = DAG.getRegisterMask(TRI.getCallPreservedMask(
8480 DAG.getMachineFunction(),
8481 DAG.getLibcalls().getLibcallImplCallingConv(LibcallImpl)));
8482 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8483 LibcallImpl, TLI.getPointerTy(DAG.getDataLayout()));
8484 SDValue TPIDR2_EL0 = DAG.getNode(
8485 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain,
8486 DAG.getTargetConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8487 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8488 // RESTORE_ZA pseudo.
8489 SDValue Glue;
8490 SDValue TPIDR2Block = DAG.getFrameIndex(
8491 TPIDR2.FrameIndex,
8493 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, TPIDR2Block, Glue);
8494 Chain =
8495 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8496 {Chain, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8497 RestoreRoutine, RegMask, Chain.getValue(1)});
8498 // Finally reset the TPIDR2_EL0 register to 0.
8499 Chain = DAG.getNode(
8500 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8501 DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8502 DAG.getConstant(0, DL, MVT::i64));
8503 TPIDR2.Uses++;
8504 return Chain;
8505}
8506
8507SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
8508 SelectionDAG &DAG) const {
8509 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8510 SDValue Glue = Chain.getValue(1);
8511
8512 MachineFunction &MF = DAG.getMachineFunction();
8513 auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>();
8514 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
8515 const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
8516
8517 SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs();
8518
8519 // The following conditions are true on entry to an exception handler:
8520 // - PSTATE.SM is 0.
8521 // - PSTATE.ZA is 0.
8522 // - TPIDR2_EL0 is null.
8523 // See:
8524 // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
8525 //
8526 // Therefore, if the function that contains this exception handler is a
8527 // streaming[-compatible] function, we must re-enable streaming mode.
8528 //
8529 // These mode changes are usually optimized away in catch blocks as they
8530 // occur before the __cxa_begin_catch (which is a non-streaming function),
8531 // but are necessary in some cases (such as for cleanups).
8532 //
8533 // Additionally, if the function has ZA or ZT0 state, we must restore it.
8534
8535 // [COND_]SMSTART SM
8536 if (SMEFnAttrs.hasStreamingInterfaceOrBody())
8537 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
8538 /*Glue*/ Glue, AArch64SME::Always);
8539 else if (SMEFnAttrs.hasStreamingCompatibleInterface())
8540 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
8542
8543 if (getTM().useNewSMEABILowering())
8544 return Chain;
8545
8546 if (SMEFnAttrs.hasAgnosticZAInterface()) {
8547 // Restore full ZA
8548 Chain = emitSMEStateSaveRestore(*this, DAG, &FuncInfo, DL, Chain,
8549 /*IsSave=*/false);
8550 } else if (SMEFnAttrs.hasZAState() || SMEFnAttrs.hasZT0State()) {
8551 // SMSTART ZA
8552 Chain = DAG.getNode(
8553 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
8554 DAG.getTargetConstant(int32_t(AArch64SVCR::SVCRZA), DL, MVT::i32));
8555
8556 // Restore ZT0
8557 if (SMEFnAttrs.hasZT0State()) {
8558 SDValue ZT0FrameIndex =
8559 getZT0FrameIndex(MF.getFrameInfo(), FuncInfo, DAG);
8560 Chain =
8561 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8562 {Chain, DAG.getConstant(0, DL, MVT::i32), ZT0FrameIndex});
8563 }
8564
8565 // Restore ZA
8566 if (SMEFnAttrs.hasZAState())
8567 Chain = emitRestoreZALazySave(Chain, DL, *this, TRI, FuncInfo, DAG);
8568 }
8569
8570 return Chain;
8571}
8572
8573SDValue AArch64TargetLowering::LowerFormalArguments(
8574 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
8575 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
8576 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
8577 MachineFunction &MF = DAG.getMachineFunction();
8578 const Function &F = MF.getFunction();
8579 MachineFrameInfo &MFI = MF.getFrameInfo();
8580 bool IsWin64 =
8581 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8582 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
8583 (isVarArg && Subtarget->isWindowsArm64EC());
8584 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8585
8587 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
8589 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
8590 FuncInfo->setIsSVECC(true);
8591
8592 // Assign locations to all of the incoming arguments.
8594 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
8595
8596 // At this point, Ins[].VT may already be promoted to i32. To correctly
8597 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
8598 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
8599 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
8600 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
8601 // LocVT.
8602 unsigned NumArgs = Ins.size();
8603 Function::const_arg_iterator CurOrigArg = F.arg_begin();
8604 unsigned CurArgIdx = 0;
8605 bool UseVarArgCC = false;
8606 if (IsWin64)
8607 UseVarArgCC = isVarArg;
8608
8609 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
8610
8611 for (unsigned i = 0; i != NumArgs; ++i) {
8612 MVT ValVT = Ins[i].VT;
8613 if (Ins[i].isOrigArg()) {
8614 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
8615 CurArgIdx = Ins[i].getOrigArgIndex();
8616
8617 // Get type of the original argument.
8618 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
8619 /*AllowUnknown*/ true);
8620 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
8621 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8622 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8623 ValVT = MVT::i8;
8624 else if (ActualMVT == MVT::i16)
8625 ValVT = MVT::i16;
8626 }
8627 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags,
8628 Ins[i].OrigTy, CCInfo);
8629 assert(!Res && "Call operand has unhandled type");
8630 (void)Res;
8631 }
8632
8633 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
8634 bool IsLocallyStreaming =
8635 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
8636 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8637 SDValue Glue = Chain.getValue(1);
8638
8639 unsigned ExtraArgLocs = 0;
8640 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
8641 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8642
8643 if (Ins[i].Flags.isByVal()) {
8644 // Byval is used for HFAs in the PCS, but the system should work in a
8645 // non-compliant manner for larger structs.
8646 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8647 int Size = Ins[i].Flags.getByValSize();
8648 unsigned NumRegs = (Size + 7) / 8;
8649
8650 // FIXME: This works on big-endian for composite byvals, which are the common
8651 // case. It should work for fundamental types too.
8652 unsigned FrameIdx =
8653 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
8654 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
8655 InVals.push_back(FrameIdxN);
8656
8657 continue;
8658 }
8659
8660 if (Ins[i].Flags.isSwiftAsync())
8661 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
8662
8663 SDValue ArgValue;
8664 if (VA.isRegLoc()) {
8665 // Arguments stored in registers.
8666 EVT RegVT = VA.getLocVT();
8667 const TargetRegisterClass *RC;
8668
8669 if (RegVT == MVT::i32)
8670 RC = &AArch64::GPR32RegClass;
8671 else if (RegVT == MVT::i64)
8672 RC = &AArch64::GPR64RegClass;
8673 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
8674 RC = &AArch64::FPR16RegClass;
8675 else if (RegVT == MVT::f32)
8676 RC = &AArch64::FPR32RegClass;
8677 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
8678 RC = &AArch64::FPR64RegClass;
8679 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
8680 RC = &AArch64::FPR128RegClass;
8681 else if (RegVT.isScalableVector() &&
8682 RegVT.getVectorElementType() == MVT::i1) {
8683 FuncInfo->setIsSVECC(true);
8684 RC = &AArch64::PPRRegClass;
8685 } else if (RegVT == MVT::aarch64svcount) {
8686 FuncInfo->setIsSVECC(true);
8687 RC = &AArch64::PPRRegClass;
8688 } else if (RegVT.isScalableVector()) {
8689 FuncInfo->setIsSVECC(true);
8690 RC = &AArch64::ZPRRegClass;
8691 } else
8692 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
8693
8694 // Transform the arguments in physical registers into virtual ones.
8695 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
8696
8697 if (IsLocallyStreaming) {
8698 // LocallyStreamingFunctions must insert the SMSTART in the correct
8699 // position, so we use Glue to ensure no instructions can be scheduled
8700 // between the chain of:
8701 // t0: ch,glue = EntryNode
8702 // t1: res,ch,glue = CopyFromReg
8703 // ...
8704 // tn: res,ch,glue = CopyFromReg t(n-1), ..
8705 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
8706 // ^^^^^^
8707 // This will be the new Chain/Root node.
8708 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
8709 Glue = ArgValue.getValue(2);
8710 if (isPassedInFPR(ArgValue.getValueType())) {
8711 ArgValue =
8712 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8713 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
8714 {ArgValue, Glue});
8715 Glue = ArgValue.getValue(1);
8716 }
8717 } else
8718 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
8719
8720 // If this is an 8, 16 or 32-bit value, it is really passed promoted
8721 // to 64 bits. Insert an assert[sz]ext to capture this, then
8722 // truncate to the right size.
8723 switch (VA.getLocInfo()) {
8724 default:
8725 llvm_unreachable("Unknown loc info!");
8726 case CCValAssign::Full:
8727 break;
8729 assert(
8730 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8731 "Indirect arguments should be scalable on most subtargets");
8732 break;
8733 case CCValAssign::BCvt:
8734 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
8735 break;
8736 case CCValAssign::AExt:
8737 case CCValAssign::SExt:
8738 case CCValAssign::ZExt:
8739 break;
8741 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
8742 DAG.getConstant(32, DL, RegVT));
8743 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
8744 break;
8745 }
8746 } else { // VA.isRegLoc()
8747 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8748 unsigned ArgOffset = VA.getLocMemOffset();
8749 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8750 ? VA.getLocVT().getSizeInBits()
8751 : VA.getValVT().getSizeInBits()) / 8;
8752
8753 uint32_t BEAlign = 0;
8754 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8755 !Ins[i].Flags.isInConsecutiveRegs())
8756 BEAlign = 8 - ArgSize;
8757
8758 SDValue FIN;
8759 MachinePointerInfo PtrInfo;
8760 if (StackViaX4) {
8761 // In both the ARM64EC varargs convention and the thunk convention,
8762 // arguments on the stack are accessed relative to x4, not sp. In
8763 // the thunk convention, there's an additional offset of 32 bytes
8764 // to account for the shadow store.
8765 unsigned ObjOffset = ArgOffset + BEAlign;
8766 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8767 ObjOffset += 32;
8768 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8769 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8770 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8771 DAG.getConstant(ObjOffset, DL, MVT::i64));
8773 } else {
8774 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8775
8776 // Create load nodes to retrieve arguments from the stack.
8777 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8778 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8779 }
8780
8781 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
8782 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8783 MVT MemVT = VA.getValVT();
8784
8785 switch (VA.getLocInfo()) {
8786 default:
8787 break;
8788 case CCValAssign::Trunc:
8789 case CCValAssign::BCvt:
8790 MemVT = VA.getLocVT();
8791 break;
8792 case CCValAssign::Indirect:
8793 assert(
8794 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8795 "Indirect arguments should be scalable on most subtargets");
8796 MemVT = VA.getLocVT();
8797 break;
8798 case CCValAssign::SExt:
8799 ExtType = ISD::SEXTLOAD;
8800 break;
8801 case CCValAssign::ZExt:
8802 ExtType = ISD::ZEXTLOAD;
8803 break;
8804 case CCValAssign::AExt:
8805 ExtType = ISD::EXTLOAD;
8806 break;
8807 }
8808
8809 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8810 MemVT);
8811 }
8812
8813 if (VA.getLocInfo() == CCValAssign::Indirect) {
8814 assert((VA.getValVT().isScalableVT() ||
8815 Subtarget->isWindowsArm64EC()) &&
8816 "Indirect arguments should be scalable on most subtargets");
8817
8818 TypeSize PartSize = VA.getValVT().getStoreSize();
8819 unsigned NumParts = 1;
8820 if (Ins[i].Flags.isInConsecutiveRegs()) {
8821 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8822 ++NumParts;
8823 }
8824
8825 MVT PartLoad = VA.getValVT();
8826 SDValue Ptr = ArgValue;
8827
8828 // Ensure we generate all loads for each tuple part, whilst updating the
8829 // pointer after each load correctly using vscale.
8830 while (NumParts > 0) {
8831 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8832 InVals.push_back(ArgValue);
8833 NumParts--;
8834 if (NumParts > 0) {
8835 SDValue BytesIncrement =
8836 DAG.getTypeSize(DL, Ptr.getValueType(), PartSize);
8837 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8838 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8839 ExtraArgLocs++;
8840 i++;
8841 }
8842 }
8843 } else {
8844 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8845 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8846 ArgValue, DAG.getValueType(MVT::i32));
8847
8848 // i1 arguments are zero-extended to i8 by the caller. Emit a
8849 // hint to reflect this.
8850 if (Ins[i].isOrigArg()) {
8851 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8852 if (OrigArg->getType()->isIntegerTy(1)) {
8853 if (!Ins[i].Flags.isZExt()) {
8854 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8855 ArgValue.getValueType(), ArgValue);
8856 }
8857 }
8858 }
8859
8860 InVals.push_back(ArgValue);
8861 }
8862 }
8863 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8864
8865 if (Attrs.hasStreamingCompatibleInterface()) {
8866 SDValue EntryPStateSM =
8867 DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
8868 DAG.getVTList(MVT::i64, MVT::Other), {Chain});
8869
8870 // Copy the value to a virtual register, and save that in FuncInfo.
8871 Register EntryPStateSMReg =
8872 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8873 Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
8874 EntryPStateSM);
8875 FuncInfo->setPStateSMReg(EntryPStateSMReg);
8876 }
8877
8878 // Insert the SMSTART if this is a locally streaming function and
8879 // make sure it is Glued to the last CopyFromReg value.
8880 if (IsLocallyStreaming) {
8881 if (Attrs.hasStreamingCompatibleInterface())
8882 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8883 AArch64SME::IfCallerIsNonStreaming);
8884 else
8885 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8886 AArch64SME::Always);
8887
8888 // Ensure that the SMSTART happens after the CopyWithChain such that its
8889 // chain result is used.
8890 for (unsigned I=0; I<InVals.size(); ++I) {
8891 Register Reg = MF.getRegInfo().createVirtualRegister(
8892 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8893 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8894 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8895 InVals[I].getValueType());
8896 }
8897 }
8898
8899 // varargs
8900 if (isVarArg) {
8902 if (!Subtarget->isTargetDarwin() || IsWin64) {
8903 // The AAPCS variadic function ABI is identical to the non-variadic
8904 // one. As a result there may be more arguments in registers and we
8905 // should save them for future reference.
8906 // Win64 variadic functions also pass arguments in registers, but all
8907 // float arguments are passed in integer registers.
8908 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8909 }
8910
8911 // This will point to the next argument passed via stack.
8912 unsigned VarArgsOffset = CCInfo.getStackSize();
8913 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8914 VarArgsOffset =
8915 alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8916 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8917 FuncInfo->setVarArgsStackIndex(
8918 MFI.CreateFixedObject(4, VarArgsOffset, true));
8919 }
8920
8921 if (MFI.hasMustTailInVarArgFunc()) {
8922 SmallVector<MVT, 2> RegParmTypes;
8923 RegParmTypes.push_back(MVT::i64);
8924 RegParmTypes.push_back(MVT::f128);
8925 // Compute the set of forwarded registers. The rest are scratch.
8926 SmallVectorImpl<ForwardedRegister> &Forwards =
8927 FuncInfo->getForwardedMustTailRegParms();
8928 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8929 CC_AArch64_AAPCS);
8930
8931 // Conservatively forward X8, since it might be used for aggregate return.
8932 if (!CCInfo.isAllocated(AArch64::X8)) {
8933 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8934 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8935 }
8936 }
8937 }
8938
8939 // On Windows, InReg pointers must be returned, so record the pointer in a
8940 // virtual register at the start of the function so it can be returned in the
8941 // epilogue.
8942 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8943 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8944 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8945 Ins[I].Flags.isInReg()) &&
8946 Ins[I].Flags.isSRet()) {
8947 assert(!FuncInfo->getSRetReturnReg());
8948
8949 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8950 Register Reg =
8951 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
8952 FuncInfo->setSRetReturnReg(Reg);
8953
8954 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8955 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8956 break;
8957 }
8958 }
8959 }
8960
8961 unsigned StackArgSize = CCInfo.getStackSize();
8962 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8963 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8964 // This is a non-standard ABI so by fiat I say we're allowed to make full
8965 // use of the stack area to be popped, which must be aligned to 16 bytes in
8966 // any case:
8967 StackArgSize = alignTo(StackArgSize, 16);
8968
8969 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8970 // a multiple of 16.
8971 FuncInfo->setArgumentStackToRestore(StackArgSize);
8972
8973 // This realignment carries over to the available bytes below. Our own
8974 // callers will guarantee the space is free by giving an aligned value to
8975 // CALLSEQ_START.
8976 }
8977 // Even if we're not expected to free up the space, it's useful to know how
8978 // much is there while considering tail calls (because we can reuse it).
8979 FuncInfo->setBytesInStackArgArea(StackArgSize);
8980
8981 if (Subtarget->hasCustomCallingConv())
8982 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
8983
8984 if (getTM().useNewSMEABILowering()) {
8985 if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
8986 SDValue Size;
8987 if (Attrs.hasZAState()) {
8988 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8989 DAG.getConstant(1, DL, MVT::i32));
8990 Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8991 } else if (Attrs.hasAgnosticZAInterface()) {
8992 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
8993 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
8994
8995 SDValue Callee =
8996 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
8997 auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
8998 TargetLowering::CallLoweringInfo CLI(DAG);
8999 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
9000 DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy, Callee,
9001 {});
9002 std::tie(Size, Chain) = LowerCallTo(CLI);
9003 }
9004 if (Size) {
9005 SDValue Buffer = DAG.getNode(
9006 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
9007 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
9008 Chain = Buffer.getValue(1);
9009
9010 Register BufferPtr =
9011 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
9012 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
9013 Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
9014 DAG.getVTList(MVT::Other), Chain);
9015 FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
9016 MFI.CreateVariableSizedObject(Align(16), nullptr);
9017 }
9018 }
9019 } else {
9020 // Old SME ABI lowering (deprecated):
9021 // Create a 16 Byte TPIDR2 object. The dynamic buffer
9022 // will be expanded and stored in the static object later using a
9023 // pseudonode.
9024 if (Attrs.hasZAState()) {
9025 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9026 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
9027 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9028 DAG.getConstant(1, DL, MVT::i32));
9029 SDValue Buffer;
9030 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
9031 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
9032 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
9033 } else {
9034 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
9035 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
9036 DAG.getVTList(MVT::i64, MVT::Other),
9037 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
9038 MFI.CreateVariableSizedObject(Align(16), nullptr);
9039 }
9040 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9041 DAG.getConstant(1, DL, MVT::i32));
9042 Chain = DAG.getNode(
9043 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
9044 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
9045 /*Num save slices*/ NumZaSaveSlices});
9046 } else if (Attrs.hasAgnosticZAInterface()) {
9047 // Call __arm_sme_state_size().
9048 SDValue BufferSize =
9049 DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL,
9050 DAG.getVTList(MVT::i64, MVT::Other), Chain);
9051 Chain = BufferSize.getValue(1);
9052 SDValue Buffer;
9053 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
9054 Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
9055 DAG.getVTList(MVT::i64, MVT::Other),
9056 {Chain, BufferSize});
9057 } else {
9058 // Allocate space dynamically.
9059 Buffer = DAG.getNode(
9060 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
9061 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
9062 MFI.CreateVariableSizedObject(Align(16), nullptr);
9063 }
9064 // Copy the value to a virtual register, and save that in FuncInfo.
9065 Register BufferPtr =
9066 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
9067 FuncInfo->setSMESaveBufferAddr(BufferPtr);
9068 Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer);
9069 }
9070 }
9071
9072 if (CallConv == CallingConv::PreserveNone) {
9073 for (const ISD::InputArg &I : Ins) {
9074 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
9075 I.Flags.isSwiftAsync()) {
9076 MachineFunction &MF = DAG.getMachineFunction();
9077 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9078 MF.getFunction(),
9079 "Swift attributes can't be used with preserve_none",
9080 DL.getDebugLoc()));
9081 break;
9082 }
9083 }
9084 }
9085
9086 return Chain;
9087}
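// [Editorial sketch -- not part of the LLVM sources.] A minimal translation
// unit whose formal arguments exercise several of the paths handled above,
// assuming an AArch64 AAPCS target; all names below are invented for
// illustration. `Flag` relies on the caller zero-extending i1 to i8, which is
// why only an ASSERT_ZEXT_BOOL hint is emitted; `Small` arrives promoted to a
// wider register type and is narrowed back via the Assert[SZ]Ext/truncate
// sequence in the LocInfo switch; the trailing "..." makes the unnamed
// argument registers get spilled by saveVarArgRegisters() below.
#include <cstdarg>

extern "C" long example_formal_args(bool Flag, short Small, int Named, ...) {
  va_list Args;
  va_start(Args, Named);
  long Next = va_arg(Args, long); // read one unnamed argument from the save area
  va_end(Args);
  return Flag ? Small + Named : Next;
}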
9088
9089void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
9090 SelectionDAG &DAG,
9091 const SDLoc &DL,
9092 SDValue &Chain) const {
9093 MachineFunction &MF = DAG.getMachineFunction();
9094 MachineFrameInfo &MFI = MF.getFrameInfo();
9095 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9096 auto PtrVT = getPointerTy(DAG.getDataLayout());
9097 Function &F = MF.getFunction();
9098 bool IsWin64 =
9099 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
9100
9101 SmallVector<SDValue, 8> MemOps;
9102
9103 auto GPRArgRegs = AArch64::getGPRArgRegs();
9104 unsigned NumGPRArgRegs = GPRArgRegs.size();
9105 if (Subtarget->isWindowsArm64EC()) {
9106 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
9107 // functions.
9108 NumGPRArgRegs = 4;
9109 }
9110 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
9111
9112 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
9113 int GPRIdx = 0;
9114 if (GPRSaveSize != 0) {
9115 if (IsWin64) {
9116 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
9117 if (GPRSaveSize & 15)
9118 // The extra size here, if triggered, will always be 8.
9119 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
9120 } else
9121 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
9122
9123 SDValue FIN;
9124 if (Subtarget->isWindowsArm64EC()) {
9125 // With the Arm64EC ABI, we reserve the save area as usual, but we
9126 // compute its address relative to x4. For a normal AArch64->AArch64
9127 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
9128 // different address.
9129 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
9130 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
9131 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
9132 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
9133 } else {
9134 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
9135 }
9136
9137 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
9138 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
9139 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
9140 SDValue Store =
9141 DAG.getStore(Val.getValue(1), DL, Val, FIN,
9142 IsWin64 ? MachinePointerInfo::getFixedStack(
9143 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
9144 : MachinePointerInfo::getStack(MF, i * 8));
9145 MemOps.push_back(Store);
9146 FIN =
9147 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
9148 }
9149 }
9150 FuncInfo->setVarArgsGPRIndex(GPRIdx);
9151 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
9152
9153 if (Subtarget->hasFPARMv8() && !IsWin64) {
9154 auto FPRArgRegs = AArch64::getFPRArgRegs();
9155 const unsigned NumFPRArgRegs = FPRArgRegs.size();
9156 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
9157
9158 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
9159 int FPRIdx = 0;
9160 if (FPRSaveSize != 0) {
9161 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
9162
9163 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
9164
9165 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
9166 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
9167 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
9168
9169 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
9170 MachinePointerInfo::getStack(MF, i * 16));
9171 MemOps.push_back(Store);
9172 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
9173 DAG.getConstant(16, DL, PtrVT));
9174 }
9175 }
9176 FuncInfo->setVarArgsFPRIndex(FPRIdx);
9177 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
9178 }
9179
9180 if (!MemOps.empty()) {
9181 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
9182 }
9183}
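// [Editorial sketch -- not part of the LLVM sources.] The size of the GPR
// register-save area built above is simply 8 bytes per argument register not
// consumed by named parameters. A standalone restatement of that arithmetic,
// assuming the usual AAPCS set of eight GPR argument registers x0-x7 (the
// Arm64EC path above uses only four):
constexpr unsigned gprVarArgSaveBytes(unsigned FirstVariadicGPR,
                                      unsigned NumGPRArgRegs = 8) {
  // Every remaining argument register is spilled so that va_arg can later
  // walk it in memory.
  return 8 * (NumGPRArgRegs - FirstVariadicGPR);
}
// e.g. "int f(int, int, ...)" names two GPR arguments, so x2-x7 are saved:
static_assert(gprVarArgSaveBytes(2) == 48, "six registers, 8 bytes each");
static_assert(gprVarArgSaveBytes(8) == 0, "no unnamed registers, no save area");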
9184
9185/// LowerCallResult - Lower the result values of a call into the
9186/// appropriate copies out of appropriate physical registers.
9187SDValue AArch64TargetLowering::LowerCallResult(
9188 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
9189 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
9190 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
9191 SDValue ThisVal, bool RequiresSMChange) const {
9192 DenseMap<unsigned, SDValue> CopiedRegs;
9193 // Copy all of the result registers out of their specified physreg.
9194 for (unsigned i = 0; i != RVLocs.size(); ++i) {
9195 CCValAssign VA = RVLocs[i];
9196
9197 // Pass 'this' value directly from the argument to return value, to avoid
9198 // reg unit interference
9199 if (i == 0 && isThisReturn) {
9200 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
9201 "unexpected return calling convention register assignment");
9202 InVals.push_back(ThisVal);
9203 continue;
9204 }
9205
9206 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
9207 // allows one use of a physreg per block.
9208 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
9209 if (!Val) {
9210 Val =
9211 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
9212 Chain = Val.getValue(1);
9213 InGlue = Val.getValue(2);
9214 CopiedRegs[VA.getLocReg()] = Val;
9215 }
9216
9217 switch (VA.getLocInfo()) {
9218 default:
9219 llvm_unreachable("Unknown loc info!");
9220 case CCValAssign::Full:
9221 break;
9222 case CCValAssign::BCvt:
9223 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
9224 break;
9225 case CCValAssign::AExtUpper:
9226 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
9227 DAG.getConstant(32, DL, VA.getLocVT()));
9228 [[fallthrough]];
9229 case CCValAssign::AExt:
9230 [[fallthrough]];
9231 case CCValAssign::ZExt:
9232 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
9233 break;
9234 }
9235
9236 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
9237 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9238 DAG.getVTList(Val.getValueType(), MVT::Glue), Val);
9239
9240 InVals.push_back(Val);
9241 }
9242
9243 return Chain;
9244}
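// [Editorial sketch -- not part of the LLVM sources.] The AExtUpper case
// above recovers a 32-bit value that was packed into the upper half of a
// 64-bit location register: shift right by 32, then zero-extend-or-truncate
// to the value type. The same arithmetic on plain integers, for illustration:
#include <cstdint>

constexpr uint32_t upperHalf(uint64_t PackedLoc) {
  return static_cast<uint32_t>(PackedLoc >> 32); // SRL by 32, then truncate
}
static_assert(upperHalf(0xdeadbeef00000000ULL) == 0xdeadbeefu,
              "value lives in bits [63:32] of the location register");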
9245
9246/// Return true if the calling convention is one that we can guarantee TCO for.
9247static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
9248 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
9249 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
9250}
9251
9252 /// Return true if we might ever do TCO for calls with this calling convention.
9253 static bool mayTailCallThisCC(CallingConv::ID CC) {
9254 switch (CC) {
9255 case CallingConv::C:
9256 case CallingConv::AArch64_SVE_VectorCall:
9257 case CallingConv::PreserveMost:
9258 case CallingConv::PreserveAll:
9259 case CallingConv::PreserveNone:
9260 case CallingConv::Swift:
9261 case CallingConv::SwiftTail:
9262 case CallingConv::Tail:
9263 case CallingConv::Fast:
9264 return true;
9265 default:
9266 return false;
9267 }
9268}
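// [Editorial sketch -- not part of the LLVM sources.] The two helpers above
// split the tail-call question in two: mayTailCallThisCC asks whether a
// convention could ever be tail-called, while canGuaranteeTCO only returns
// true for fastcc under the guaranteed-tail-call option and for the
// tail/swifttail conventions. A self-contained restatement of that split with
// an invented enum, mirroring the switches for illustration only:
enum class ExampleCC { C, Fast, Tail, SwiftTail, GHC };

constexpr bool exampleMayTailCall(ExampleCC CC) {
  return CC == ExampleCC::C || CC == ExampleCC::Fast ||
         CC == ExampleCC::Tail || CC == ExampleCC::SwiftTail;
}
constexpr bool exampleGuaranteesTCO(ExampleCC CC, bool TailCallOpt) {
  return (CC == ExampleCC::Fast && TailCallOpt) || CC == ExampleCC::Tail ||
         CC == ExampleCC::SwiftTail;
}
static_assert(exampleMayTailCall(ExampleCC::Fast) &&
                  !exampleGuaranteesTCO(ExampleCC::Fast, /*TailCallOpt=*/false),
              "fastcc may be tail-called, but only guaranteed with the option");
static_assert(!exampleMayTailCall(ExampleCC::GHC),
              "conventions outside the list fall through to 'false'");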
9269
9270/// Return true if the call convention supports varargs
9271/// Currently only those that pass varargs like the C
9272/// calling convention does are eligible
9273/// Calling conventions listed in this function must also
9274 /// be properly handled in AArch64Subtarget::isCallingConvWin64
9275 static bool callConvSupportsVarArgs(CallingConv::ID CC) {
9276 switch (CC) {
9277 case CallingConv::C:
9278 case CallingConv::Win64:
9279 // SVE vector call is only partially supported, but it should
9280 // support named arguments being passed. Any arguments being passed
9281 // as varargs are still unsupported.
9282 case CallingConv::AArch64_SVE_VectorCall:
9283 return true;
9284 default:
9285 return false;
9286 }
9287}
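// [Editorial sketch -- not part of the LLVM sources.] Only conventions that
// pass variadic arguments the way the C convention does are accepted above,
// so a plain variadic call such as the one below lowers normally, whereas
// passing an SVE vector through "..." is rejected later in LowerCall with
// report_fatal_error. Illustration only; names are invented:
#include <cstdarg>

extern "C" double sum_doubles(int Count, ...) {
  va_list Args;
  va_start(Args, Count);
  double Sum = 0.0;
  for (int I = 0; I < Count; ++I)
    Sum += va_arg(Args, double); // unnamed FP args come from q0-q7 or the stack
  va_end(Args);
  return Sum;
}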
9288
9289 static void analyzeCallOperands(const AArch64TargetLowering &TLI,
9290 const AArch64Subtarget *Subtarget,
9291 const TargetLowering::CallLoweringInfo &CLI,
9292 CCState &CCInfo) {
9293 const SelectionDAG &DAG = CLI.DAG;
9294 CallingConv::ID CalleeCC = CLI.CallConv;
9295 bool IsVarArg = CLI.IsVarArg;
9296 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9297 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
9298
9299 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
9300 // for the shadow store.
9301 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
9302 CCInfo.AllocateStack(32, Align(16));
9303
9304 unsigned NumArgs = Outs.size();
9305 for (unsigned i = 0; i != NumArgs; ++i) {
9306 MVT ArgVT = Outs[i].VT;
9307 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
9308
9309 bool UseVarArgCC = false;
9310 if (IsVarArg) {
9311 // On Windows, the fixed arguments in a vararg call are passed in GPRs
9312 // too, so use the vararg CC to force them to integer registers.
9313 if (IsCalleeWin64) {
9314 UseVarArgCC = true;
9315 } else {
9316 UseVarArgCC = ArgFlags.isVarArg();
9317 }
9318 }
9319
9320 if (!UseVarArgCC) {
9321 // Get type of the original argument.
9322 EVT ActualVT =
9323 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
9324 /*AllowUnknown*/ true);
9325 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
9326 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
9327 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
9328 ArgVT = MVT::i8;
9329 else if (ActualMVT == MVT::i16)
9330 ArgVT = MVT::i16;
9331 }
9332
9333 // FIXME: CCAssignFnForCall should be called once, for the call and not per
9334 // argument. This logic should exactly mirror LowerFormalArguments.
9335 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
9336 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
9337 Outs[i].OrigTy, CCInfo);
9338 assert(!Res && "Call operand has unhandled type");
9339 (void)Res;
9340 }
9341}
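// [Editorial sketch -- not part of the LLVM sources.] One detail of the loop
// above, restated standalone with invented names (not LLVM API): small
// integer arguments keep a narrow LocVT (i1/i8 -> i8, i16 -> i16), so that
// they are later stored and reloaded as bytes or halfwords rather than as
// full registers when they end up on the stack.
enum class ExampleVT { i1, i8, i16, i32, i64 };

constexpr ExampleVT exampleNarrowLocVT(ExampleVT Actual, ExampleVT Default) {
  if (Actual == ExampleVT::i1 || Actual == ExampleVT::i8)
    return ExampleVT::i8;
  if (Actual == ExampleVT::i16)
    return ExampleVT::i16;
  return Default;
}
static_assert(exampleNarrowLocVT(ExampleVT::i1, ExampleVT::i32) == ExampleVT::i8,
              "bool arguments are assigned as bytes");
static_assert(exampleNarrowLocVT(ExampleVT::i64, ExampleVT::i64) == ExampleVT::i64,
              "full-width arguments are left alone");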
9342
9343 static SMECallAttrs
9344 getSMECallAttrs(const Function &Caller, const RTLIB::RuntimeLibcallsInfo &RTLCI,
9345 const TargetLowering::CallLoweringInfo &CLI) {
9346 if (CLI.CB)
9347 return SMECallAttrs(*CLI.CB, &RTLCI);
9348 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9349 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), RTLCI));
9350 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal));
9351}
9352
9353bool AArch64TargetLowering::isEligibleForTailCallOptimization(
9354 const CallLoweringInfo &CLI) const {
9355 CallingConv::ID CalleeCC = CLI.CallConv;
9356 if (!mayTailCallThisCC(CalleeCC))
9357 return false;
9358
9359 SDValue Callee = CLI.Callee;
9360 bool IsVarArg = CLI.IsVarArg;
9361 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9362 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9363 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9364 const SelectionDAG &DAG = CLI.DAG;
9365 MachineFunction &MF = DAG.getMachineFunction();
9366 const Function &CallerF = MF.getFunction();
9367 CallingConv::ID CallerCC = CallerF.getCallingConv();
9368
9369 // SME Streaming functions are not eligible for TCO as they may require
9370 // the streaming mode or ZA/ZT0 to be restored after returning from the call.
9371 SMECallAttrs CallAttrs =
9372 getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);
9373 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
9374 CallAttrs.requiresPreservingAllZAState() ||
9375 CallAttrs.requiresPreservingZT0() ||
9376 CallAttrs.caller().hasStreamingBody())
9377 return false;
9378
9379 // Functions using the C or Fast calling convention that have an SVE signature
9380 // preserve more registers and should assume the SVE_VectorCall CC.
9381 // The check for matching callee-saved regs will determine whether it is
9382 // eligible for TCO.
9383 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
9384 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
9385 CallerCC = CallingConv::AArch64_SVE_VectorCall;
9386
9387 bool CCMatch = CallerCC == CalleeCC;
9388
9389 // When using the Windows calling convention on a non-windows OS, we want
9390 // to back up and restore X18 in such functions; we can't do a tail call
9391 // from those functions.
9392 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
9393 CalleeCC != CallingConv::Win64)
9394 return false;
9395
9396 // Byval parameters hand the function a pointer directly into the stack area
9397 // we want to reuse during a tail call. Working around this *is* possible (see
9398 // X86) but less efficient and uglier in LowerCall.
9399 for (Function::const_arg_iterator i = CallerF.arg_begin(),
9400 e = CallerF.arg_end();
9401 i != e; ++i) {
9402 if (i->hasByValAttr())
9403 return false;
9404
9405 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
9406 // In this case, it is necessary to save X0/X1 in the callee and return it
9407 // in X0. Tail call opt may interfere with this, so we disable tail call
9408 // opt when the caller has an "inreg" attribute -- except if the callee
9409 // also has that attribute on the same argument, and the same value is
9410 // passed.
9411 if (i->hasInRegAttr()) {
9412 unsigned ArgIdx = i - CallerF.arg_begin();
9413 if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
9414 return false;
9415 AttributeSet Attrs = CLI.CB->getParamAttributes(ArgIdx);
9416 if (!Attrs.hasAttribute(Attribute::InReg) ||
9417 !Attrs.hasAttribute(Attribute::StructRet) || !i->hasStructRetAttr() ||
9418 CLI.CB->getArgOperand(ArgIdx) != i) {
9419 return false;
9420 }
9421 }
9422 }
9423
9424 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
9425 return CCMatch;
9426
9427 // Externally-defined functions with weak linkage should not be
9428 // tail-called on AArch64 when the OS does not support dynamic
9429 // pre-emption of symbols, as the AAELF spec requires normal calls
9430 // to undefined weak functions to be replaced with a NOP or jump to the
9431 // next instruction. The behaviour of branch instructions in this
9432 // situation (as used for tail calls) is implementation-defined, so we
9433 // cannot rely on the linker replacing the tail call with a return.
9434 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9435 const GlobalValue *GV = G->getGlobal();
9436 const Triple &TT = getTargetMachine().getTargetTriple();
9437 if (GV->hasExternalWeakLinkage() &&
9438 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
9439 return false;
9440 }
9441
9442 // Now we search for cases where we can use a tail call without changing the
9443 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
9444 // concept.
9445
9446 // I want anyone implementing a new calling convention to think long and hard
9447 // about this assert.
9448 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
9449 report_fatal_error("Unsupported variadic calling convention");
9450
9451 LLVMContext &C = *DAG.getContext();
9452 // Check that the call results are passed in the same way.
9453 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
9454 CCAssignFnForCall(CalleeCC, IsVarArg),
9455 CCAssignFnForCall(CallerCC, IsVarArg)))
9456 return false;
9457 // The callee has to preserve all registers the caller needs to preserve.
9458 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9459 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
9460 if (!CCMatch) {
9461 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
9462 if (Subtarget->hasCustomCallingConv()) {
9463 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
9464 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
9465 }
9466 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
9467 return false;
9468 }
9469
9470 // Nothing more to check if the callee is taking no arguments
9471 if (Outs.empty())
9472 return true;
9473
9474 SmallVector<CCValAssign, 16> ArgLocs;
9475 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
9476
9477 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9478
9479 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
9480 // When the call is musttail, the additional checks have already been done,
9481 // so we can safely skip this check.
9481 // At least two cases here: if caller is fastcc then we can't have any
9482 // memory arguments (we'd be expected to clean up the stack afterwards). If
9483 // caller is C then we could potentially use its argument area.
9484
9485 // FIXME: for now we take the most conservative of these in both cases:
9486 // disallow all variadic memory operands.
9487 for (const CCValAssign &ArgLoc : ArgLocs)
9488 if (!ArgLoc.isRegLoc())
9489 return false;
9490 }
9491
9492 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9493
9494 // If any of the arguments is passed indirectly, it must be SVE, so the
9495 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
9496 // allocate space on the stack. That is why we determine this explicitly
9497 // here: if any argument is passed indirectly, the call cannot be a tailcall.
9498 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
9499 assert((A.getLocInfo() != CCValAssign::Indirect ||
9500 A.getValVT().isScalableVector() ||
9501 Subtarget->isWindowsArm64EC()) &&
9502 "Expected value to be scalable");
9503 return A.getLocInfo() == CCValAssign::Indirect;
9504 }))
9505 return false;
9506
9507 // If the stack arguments for this call do not fit into our own save area then
9508 // the call cannot be made tail.
9509 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
9510 return false;
9511
9512 const MachineRegisterInfo &MRI = MF.getRegInfo();
9513 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
9514 return false;
9515
9516 return true;
9517}
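// [Editorial sketch -- not part of the LLVM sources.] One of the rejections
// above at the source level, assuming the GNU weak attribute on an ELF
// target; the names are invented. A call to an externally-defined weak
// function must stay a normal call: the linker may replace a call to an
// undefined weak symbol with a NOP or fall-through, which is only defined for
// ordinary calls, not for the branch used by a tail call.
extern "C" void optional_hook() __attribute__((weak));

extern "C" void run_hook() {
  if (optional_hook) // the symbol may be absent at link time
    optional_hook(); // tail position, but weak linkage on ELF blocks TCO
}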
9518
9519SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
9520 SelectionDAG &DAG,
9521 MachineFrameInfo &MFI,
9522 int ClobberedFI) const {
9523 SmallVector<SDValue, 8> ArgChains;
9524 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
9525 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
9526
9527 // Include the original chain at the beginning of the list. When this is
9528 // used by target LowerCall hooks, this helps legalize find the
9529 // CALLSEQ_BEGIN node.
9530 ArgChains.push_back(Chain);
9531
9532 // Add a chain value for each stack argument corresponding
9533 for (SDNode *U : DAG.getEntryNode().getNode()->users())
9534 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
9535 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
9536 if (FI->getIndex() < 0) {
9537 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
9538 int64_t InLastByte = InFirstByte;
9539 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
9540
9541 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
9542 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
9543 ArgChains.push_back(SDValue(L, 1));
9544 }
9545
9546 // Build a tokenfactor for all the chains.
9547 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
9548}
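// [Editorial sketch -- not part of the LLVM sources.] The loop above only
// chains in loads whose byte range overlaps the clobbered fixed object. The
// overlap test on [FirstByte, LastByte] vs. [InFirstByte, InLastByte],
// restated on plain integers for illustration:
constexpr bool byteRangesOverlap(long FirstByte, long LastByte,
                                 long InFirstByte, long InLastByte) {
  return (InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
         (FirstByte <= InFirstByte && InFirstByte <= LastByte);
}
static_assert(byteRangesOverlap(0, 7, 4, 11), "ranges share bytes 4..7");
static_assert(!byteRangesOverlap(0, 7, 8, 15), "adjacent but disjoint ranges");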
9549
9550bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
9551 bool TailCallOpt) const {
9552 return (CallCC == CallingConv::Fast && TailCallOpt) ||
9553 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
9554}
9555
9556// Check if the value is zero-extended from i1 to i8
9557static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
9558 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
9559 if (SizeInBits < 8)
9560 return false;
9561
9562 APInt RequiredZero(SizeInBits, 0xFE);
9563 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
9564 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
9565 return ZExtBool;
9566}
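// [Editorial sketch -- not part of the LLVM sources.] checkZExtBool asks
// whether every bit of the low byte except bit 0 is already known to be zero
// (mask 0xFE), in which case the explicit trunc+zext pair can be skipped.
// The same predicate on a plain known-zero mask, for illustration:
#include <cstdint>

constexpr bool knownZExtBool(uint8_t KnownZeroBits) {
  return (KnownZeroBits & 0xFEu) == 0xFEu; // bits 1..7 must be known zero
}
static_assert(knownZExtBool(0xFE), "only bit 0 may vary");
static_assert(!knownZExtBool(0x7E), "bit 7 unknown => must re-zero-extend");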
9567
9568void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9569 SDNode *Node) const {
9570 // Live-in physreg copies that are glued to SMSTART are applied as
9571 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
9572 // register allocator to pass call args in callee saved regs, without extra
9573 // copies to avoid these fake clobbers of actually-preserved GPRs.
9574 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
9575 MI.getOpcode() == AArch64::MSRpstatePseudo) {
9576 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
9577 if (MachineOperand &MO = MI.getOperand(I);
9578 MO.isReg() && MO.isImplicit() && MO.isDef() &&
9579 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
9580 AArch64::GPR64RegClass.contains(MO.getReg())))
9581 MI.removeOperand(I);
9582
9583 // The SVE vector length can change when entering/leaving streaming mode.
9584 // FPMR is set to 0 when entering/leaving streaming mode.
9585 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
9586 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
9587 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9588 /*IsImplicit=*/true));
9589 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
9590 /*IsImplicit=*/true));
9591 MI.addOperand(MachineOperand::CreateReg(AArch64::FPMR, /*IsDef=*/true,
9592 /*IsImplicit=*/true));
9593 }
9594 }
9595
9596 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
9597 // have nothing to do with VG, were it not that they are used to materialise a
9598 // frame-address. If they contain a frame-index to a scalable vector, this
9599 // will likely require an ADDVL instruction to materialise the address, thus
9600 // reading VG.
9601 const MachineFunction &MF = *MI.getMF();
9602 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
9603 (MI.getOpcode() == AArch64::ADDXri ||
9604 MI.getOpcode() == AArch64::SUBXri)) {
9605 const MachineOperand &MO = MI.getOperand(1);
9606 if (MO.isFI() && MF.getFrameInfo().hasScalableStackID(MO.getIndex()))
9607 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9608 /*IsImplicit=*/true));
9609 }
9610}
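// [Editorial sketch -- not part of the LLVM sources; assumes a toolchain with
// the SME ACLE keyword attributes and the sme target feature enabled.] The
// implicit VG (and FPMR) operands added above matter because the effective
// vector length differs between streaming and non-streaming mode: a
// non-streaming caller of a streaming function gets an smstart/smstop pair
// around the call, and those registers must be treated as modified across it.
extern "C" void sme_kernel(float *Data, int N) __arm_streaming;

extern "C" void sme_wrapper(float *Data, int N) {
  sme_kernel(Data, N); // smstart sm ... bl sme_kernel ... smstop sm
}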
9611
9612 SDValue AArch64TargetLowering::changeStreamingMode(
9613 SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
9614 unsigned Condition, bool InsertVectorLengthCheck) const {
9615 MachineFunction &MF = DAG.getMachineFunction();
9616 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9617 FuncInfo->setHasStreamingModeChanges(true);
9618
9619 auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
9620 SmallVector<SDValue, 2> Ops = {Chain};
9621 if (InGlue)
9622 Ops.push_back(InGlue);
9623 return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
9624 DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9625 };
9626
9627 if (InsertVectorLengthCheck && Enable) {
9628 // Non-streaming -> Streaming
9629 // Insert vector length check before smstart
9630 SDValue CheckVL = GetCheckVL(Chain, InGlue);
9631 Chain = CheckVL.getValue(0);
9632 InGlue = CheckVL.getValue(1);
9633 }
9634
9635 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9636 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
9637 SDValue MSROp =
9638 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
9639 SmallVector<SDValue> Ops = {Chain, MSROp};
9640 unsigned Opcode;
9641 if (Condition != AArch64SME::Always) {
9642 Register PStateReg = FuncInfo->getPStateSMReg();
9643 assert(PStateReg.isValid() && "PStateSM Register is invalid");
9644 SDValue PStateSM =
9645 DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
9646 // Use chain and glue from the CopyFromReg.
9647 Ops[0] = PStateSM.getValue(1);
9648 InGlue = PStateSM.getValue(2);
9649 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
9650 Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
9651 Ops.push_back(ConditionOp);
9652 Ops.push_back(PStateSM);
9653 } else {
9654 Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
9655 }
9656 Ops.push_back(RegMask);
9657
9658 if (InGlue)
9659 Ops.push_back(InGlue);
9660
9661 SDValue SMChange =
9662 DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9663
9664 if (!InsertVectorLengthCheck || Enable)
9665 return SMChange;
9666
9667 // Streaming -> Non-streaming
9668 // Insert vector length check after smstop since we cannot read VL
9669 // in streaming mode
9670 return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
9671}
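// [Editorial sketch -- not part of the LLVM sources.] When a vector-length
// check is requested, it has to execute outside streaming mode (per the
// comments above, VL cannot be read in streaming mode): before the SMSTART
// when entering streaming mode, and after the SMSTOP when leaving it. A
// minimal restatement of that ordering with invented callbacks:
template <typename EmitVLCheck, typename EmitSMChange>
void orderVectorLengthCheck(bool EnteringStreamingMode, EmitVLCheck Check,
                            EmitSMChange Change) {
  if (EnteringStreamingMode) {
    Check();  // still non-streaming here
    Change(); // SMSTART
  } else {
    Change(); // SMSTOP
    Check();  // back in non-streaming mode
  }
}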
9672
9673 static AArch64SME::ToggleCondition
9674 getSMToggleCondition(const SMECallAttrs &CallAttrs) {
9675 if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
9676 CallAttrs.caller().hasStreamingBody())
9677 return AArch64SME::Always;
9678 if (CallAttrs.callee().hasNonStreamingInterface())
9679 return AArch64SME::IfCallerIsStreaming;
9680 if (CallAttrs.callee().hasStreamingInterface())
9681 return AArch64SME::IfCallerIsNonStreaming;
9682
9683 llvm_unreachable("Unsupported attributes");
9684}
9685
9686 /// Check whether a stack argument requires lowering in a tail call.
9687 static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
9688 const CCValAssign &VA, SDValue Arg,
9689 ISD::ArgFlagsTy Flags, int CallOffset) {
9690 // FIXME: We should be able to handle this case, but it's not clear how to.
9691 if (Flags.isZExt() || Flags.isSExt())
9692 return true;
9693
9694 for (;;) {
9695 // Look through nodes that don't alter the bits of the incoming value.
9696 unsigned Op = Arg.getOpcode();
9697 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
9698 Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
9699 Arg = Arg.getOperand(0);
9700 continue;
9701 }
9702 break;
9703 }
9704
9705 // If the argument is a load from the same immutable stack slot, we can reuse
9706 // it.
9707 if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
9708 if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
9709 const MachineFrameInfo &MFI = MF.getFrameInfo();
9710 int FI = FINode->getIndex();
9711 if (!MFI.isImmutableObjectIndex(FI))
9712 return true;
9713 if (CallOffset != MFI.getObjectOffset(FI))
9714 return true;
9715 uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
9716 if (SizeInBits / 8 != static_cast<uint64_t>(MFI.getObjectSize(FI)))
9717 return true;
9718 return false;
9719 }
9720 }
9721
9722 return true;
9723}
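// [Editorial sketch -- not part of the LLVM sources.] After peeling off the
// value-preserving wrappers, the incoming stack argument may only be reused
// when it is a load from the same immutable fixed slot, at the same offset
// and of the same size. A standalone restatement of those three conditions
// with an invented summary type (not LLVM API):
struct IncomingSlot {
  bool Immutable;
  long OffsetBytes;
  long SizeBytes;
};

constexpr bool canReuseIncomingSlot(IncomingSlot Slot, long CallOffsetBytes,
                                    long LoadSizeBytes) {
  return Slot.Immutable && Slot.OffsetBytes == CallOffsetBytes &&
         Slot.SizeBytes == LoadSizeBytes;
}
static_assert(canReuseIncomingSlot({true, 16, 8}, 16, 8),
              "same immutable slot, same offset, same size");
static_assert(!canReuseIncomingSlot({true, 16, 8}, 24, 8),
              "different offset forces the store to be emitted");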
9724
9725/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
9726/// and add input and output parameter nodes.
9727SDValue
9728AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
9729 SmallVectorImpl<SDValue> &InVals) const {
9730 SelectionDAG &DAG = CLI.DAG;
9731 SDLoc &DL = CLI.DL;
9732 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9733 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9734 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9735 SDValue Chain = CLI.Chain;
9736 SDValue Callee = CLI.Callee;
9737 bool &IsTailCall = CLI.IsTailCall;
9738 CallingConv::ID &CallConv = CLI.CallConv;
9739 bool IsVarArg = CLI.IsVarArg;
9740 const CallBase *CB = CLI.CB;
9741
9742 MachineFunction &MF = DAG.getMachineFunction();
9743 MachineFunction::CallSiteInfo CSInfo;
9744 bool IsThisReturn = false;
9745
9746 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9747 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9748 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
9749 bool IsSibCall = false;
9750 bool GuardWithBTI = false;
9751
9752 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
9753 !Subtarget->noBTIAtReturnTwice()) {
9754 GuardWithBTI = FuncInfo->branchTargetEnforcement();
9755 }
9756
9757 // Analyze operands of the call, assigning locations to each operand.
9758 SmallVector<CCValAssign, 16> ArgLocs;
9759 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
9760
9761 if (IsVarArg) {
9762 unsigned NumArgs = Outs.size();
9763
9764 for (unsigned i = 0; i != NumArgs; ++i) {
9765 if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
9766 report_fatal_error("Passing SVE types to variadic functions is "
9767 "currently not supported");
9768 }
9769 }
9770
9771 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9772
9773 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9774 // Assign locations to each value returned by this call.
9775 SmallVector<CCValAssign, 16> RVLocs;
9776 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
9777 *DAG.getContext());
9778 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
9779
9780 // Set type id for call site info.
9781 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
9782 CSInfo = MachineFunction::CallSiteInfo(*CB);
9783
9784 // Check callee args/returns for SVE registers and set calling convention
9785 // accordingly.
9786 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
9787 auto HasSVERegLoc = [](CCValAssign &Loc) {
9788 if (!Loc.isRegLoc())
9789 return false;
9790 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
9791 AArch64::PPRRegClass.contains(Loc.getLocReg());
9792 };
9793 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
9794 CallConv = CallingConv::AArch64_SVE_VectorCall;
9795 }
9796
9797 // Determine whether we need any streaming mode changes.
9798 SMECallAttrs CallAttrs =
9799 getSMECallAttrs(MF.getFunction(), getRuntimeLibcallsInfo(), CLI);
9800
9801 std::optional<unsigned> ZAMarkerNode;
9802 bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
9803
9804 if (UseNewSMEABILowering) {
9805 if (CallAttrs.requiresLazySave() ||
9806 CallAttrs.requiresPreservingAllZAState())
9807 ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
9808 else if (CallAttrs.requiresPreservingZT0())
9809 ZAMarkerNode = AArch64ISD::REQUIRES_ZT0_SAVE;
9810 else if (CallAttrs.caller().hasZAState() ||
9811 CallAttrs.caller().hasZT0State())
9812 ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
9813 }
9814
9815 if (IsTailCall) {
9816 // Check if it's really possible to do a tail call.
9817 IsTailCall = isEligibleForTailCallOptimization(CLI);
9818
9819 // A sibling call is one where we're under the usual C ABI and not planning
9820 // to change that but can still do a tail call:
9821 if (!ZAMarkerNode && !TailCallOpt && IsTailCall &&
9822 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
9823 IsSibCall = true;
9824
9825 if (IsTailCall)
9826 ++NumTailCalls;
9827 }
9828
9829 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9830 report_fatal_error("failed to perform tail call elimination on a call "
9831 "site marked musttail");
9832
9833 // Get a count of how many bytes are to be pushed on the stack.
9834 unsigned NumBytes = CCInfo.getStackSize();
9835
9836 if (IsSibCall) {
9837 // Since we're not changing the ABI to make this a tail call, the memory
9838 // operands are already available in the caller's incoming argument space.
9839 NumBytes = 0;
9840 }
9841
9842 // FPDiff is the byte offset of the call's argument area from the callee's.
9843 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9844 // by this amount for a tail call. In a sibling call it must be 0 because the
9845 // caller will deallocate the entire stack and the callee still expects its
9846 // arguments to begin at SP+0. Completely unused for non-tail calls.
9847 int FPDiff = 0;
9848
9849 if (IsTailCall && !IsSibCall) {
9850 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9851
9852 // Since callee will pop argument stack as a tail call, we must keep the
9853 // popped size 16-byte aligned.
9854 NumBytes = alignTo(NumBytes, 16);
9855
9856 // FPDiff will be negative if this tail call requires more space than we
9857 // would automatically have in our incoming argument space. Positive if we
9858 // can actually shrink the stack.
9859 FPDiff = NumReusableBytes - NumBytes;
9860
9861 // Update the required reserved area if this is the tail call requiring the
9862 // most argument stack space.
9863 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9864 FuncInfo->setTailCallReservedStack(-FPDiff);
9865
9866 // The stack pointer must be 16-byte aligned at all times it's used for a
9867 // memory operation, which in practice means at *all* times and in
9868 // particular across call boundaries. Therefore our own arguments started at
9869 // a 16-byte aligned SP and the delta applied for the tail call should
9870 // satisfy the same constraint.
9871 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9872 }
9873
9874 auto DescribeCallsite =
9875 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9876 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9877 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9878 R << ore::NV("Callee", ES->getSymbol());
9879 else if (CLI.CB && CLI.CB->getCalledFunction())
9880 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9881 else
9882 R << "unknown callee";
9883 R << "'";
9884 return R;
9885 };
9886
9887 bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
9888 bool RequiresSaveAllZA =
9889 !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
9890 if (RequiresLazySave) {
9891 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9892 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9893 TPIDR2.FrameIndex,
9894 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9895 Chain = DAG.getNode(
9896 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9897 DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9898 TPIDR2ObjAddr);
9899 OptimizationRemarkEmitter ORE(&MF.getFunction());
9900 ORE.emit([&]() {
9901 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9902 CLI.CB)
9903 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9904 &MF.getFunction());
9905 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9906 });
9907 } else if (RequiresSaveAllZA) {
9908 assert(!CallAttrs.callee().hasSharedZAInterface() &&
9909 "Cannot share state that may not exist");
9910 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9911 /*IsSave=*/true);
9912 }
9913
9914 bool RequiresSMChange = CallAttrs.requiresSMChange();
9915 if (RequiresSMChange) {
9916 OptimizationRemarkEmitter ORE(&MF.getFunction());
9917 ORE.emit([&]() {
9918 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9919 CLI.CB)
9920 : OptimizationRemarkAnalysis("sme", "SMETransition",
9921 &MF.getFunction());
9922 DescribeCallsite(R) << " requires a streaming mode transition";
9923 return R;
9924 });
9925 }
9926
9927 SDValue ZTFrameIdx;
9928 MachineFrameInfo &MFI = MF.getFrameInfo();
9929 bool ShouldPreserveZT0 =
9930 !UseNewSMEABILowering && CallAttrs.requiresPreservingZT0();
9931
9932 // If the caller has ZT0 state which will not be preserved by the callee,
9933 // spill ZT0 before the call.
9934 if (ShouldPreserveZT0) {
9935 ZTFrameIdx = getZT0FrameIndex(MFI, *FuncInfo, DAG);
9936
9937 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9938 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9939 }
9940
9941 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
9942 // PSTATE.ZA before the call if there is no lazy-save active.
9943 bool DisableZA =
9944 !UseNewSMEABILowering && CallAttrs.requiresDisablingZABeforeCall();
9945 assert((!DisableZA || !RequiresLazySave) &&
9946 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9947
9948 if (DisableZA)
9949 Chain = DAG.getNode(
9950 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
9951 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
9952
9953 // Adjust the stack pointer for the new arguments... and mark ZA uses.
9954 // These operations are automatically eliminated by the prolog/epilog pass
9955 assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START");
9956 if (!IsSibCall) {
9957 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9958 if (ZAMarkerNode) {
9959 // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to, simply
9960 // using a chain can result in incorrect scheduling. The markers refer to
9961 // the position just before the CALLSEQ_START (though occur after as
9962 // CALLSEQ_START lacks in-glue).
9963 Chain =
9964 DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other, MVT::Glue),
9965 {Chain, Chain.getValue(1)});
9966 }
9967 }
9968
9969 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9970 getPointerTy(DAG.getDataLayout()));
9971
9972 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
9973 SmallSet<unsigned, 8> RegsUsed;
9974 SmallVector<SDValue, 8> MemOpChains;
9975 auto PtrVT = getPointerTy(DAG.getDataLayout());
9976
9977 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9978 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9979 for (const auto &F : Forwards) {
9980 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
9981 RegsToPass.emplace_back(F.PReg, Val);
9982 }
9983 }
9984
9985 // Walk the register/memloc assignments, inserting copies/loads.
9986 unsigned ExtraArgLocs = 0;
9987 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9988 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9989 SDValue Arg = OutVals[i];
9990 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9991
9992 // Promote the value if needed.
9993 switch (VA.getLocInfo()) {
9994 default:
9995 llvm_unreachable("Unknown loc info!");
9996 case CCValAssign::Full:
9997 break;
9998 case CCValAssign::SExt:
9999 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
10000 break;
10001 case CCValAssign::ZExt:
10002 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
10003 break;
10004 case CCValAssign::AExt:
10005 if (Outs[i].ArgVT == MVT::i1) {
10006 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
10007 //
10008 // Check if we actually have to do this, because the value may
10009 // already be zero-extended.
10010 //
10011 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
10012 // and rely on DAGCombiner to fold this, because the following
10013 // (anyext i32) is combined with (zext i8) in DAG.getNode:
10014 //
10015 // (ext (zext x)) -> (zext x)
10016 //
10017 // This will give us (zext i32), which we cannot remove, so
10018 // try to check this beforehand.
10019 if (!checkZExtBool(Arg, DAG)) {
10020 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
10021 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
10022 }
10023 }
10024 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
10025 break;
10026 case CCValAssign::AExtUpper:
10027 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
10028 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
10029 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
10030 DAG.getConstant(32, DL, VA.getLocVT()));
10031 break;
10032 case CCValAssign::BCvt:
10033 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
10034 break;
10035 case CCValAssign::Trunc:
10036 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10037 break;
10038 case CCValAssign::FPExt:
10039 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
10040 break;
10041 case CCValAssign::Indirect: {
10042 bool isScalable = VA.getValVT().isScalableVT();
10043 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
10044 "Indirect arguments should be scalable on most subtargets");
10045
10046 TypeSize StoreSize = VA.getValVT().getStoreSize();
10047 TypeSize PartSize = StoreSize;
10048 unsigned NumParts = 1;
10049 if (Outs[i].Flags.isInConsecutiveRegs()) {
10050 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
10051 ++NumParts;
10052 StoreSize *= NumParts;
10053 }
10054
10055 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
10056 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
10057 MachineFrameInfo &MFI = MF.getFrameInfo();
10058 int FI =
10059 MFI.CreateStackObject(StoreSize.getKnownMinValue(), Alignment, false);
10060 if (isScalable) {
10061 bool IsPred = VA.getValVT() == MVT::aarch64svcount ||
10062 VA.getValVT().getVectorElementType() == MVT::i1;
10063 MFI.setStackID(FI, IsPred ? TargetStackID::ScalablePredicateVector
10064 : TargetStackID::ScalableVector);
10065 }
10066
10067 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
10068 SDValue Ptr = DAG.getFrameIndex(
10069 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
10070 SDValue SpillSlot = Ptr;
10071
10072 // Ensure we generate all stores for each tuple part, whilst updating the
10073 // pointer after each store correctly using vscale.
10074 while (NumParts) {
10075 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
10076 MemOpChains.push_back(Store);
10077
10078 NumParts--;
10079 if (NumParts > 0) {
10080 SDValue BytesIncrement =
10081 DAG.getTypeSize(DL, Ptr.getValueType(), PartSize);
10082 MPI = MachinePointerInfo(MPI.getAddrSpace());
10083 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
10084 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
10085 ExtraArgLocs++;
10086 i++;
10087 }
10088 }
10089
10090 Arg = SpillSlot;
10091 break;
10092 }
10093
10094 if (VA.isRegLoc()) {
10095 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
10096 Outs[0].VT == MVT::i64) {
10097 assert(VA.getLocVT() == MVT::i64 &&
10098 "unexpected calling convention register assignment");
10099 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
10100 "unexpected use of 'returned'");
10101 IsThisReturn = true;
10102 }
10103 if (RegsUsed.count(VA.getLocReg())) {
10104 // If this register has already been used then we're trying to pack
10105 // parts of an [N x i32] into an X-register. The extension type will
10106 // take care of putting the two halves in the right place but we have to
10107 // combine them.
10108 SDValue &Bits =
10109 llvm::find_if(RegsToPass,
10110 [=](const std::pair<unsigned, SDValue> &Elt) {
10111 return Elt.first == VA.getLocReg();
10112 })
10113 ->second;
10114 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10115 // Call site info is used for function's parameter entry value
10116 // tracking. For now we track only simple cases when parameter
10117 // is transferred through whole register.
10118 llvm::erase_if(CSInfo.ArgRegPairs,
10119 [&VA](MachineFunction::ArgRegPair ArgReg) {
10120 return ArgReg.Reg == VA.getLocReg();
10121 });
10122 } else {
10123 // Add an extra level of indirection for streaming mode changes by
10124 // using a pseudo copy node that cannot be rematerialised between a
10125 // smstart/smstop and the call by the simple register coalescer.
10126 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
10127 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10128 DAG.getVTList(Arg.getValueType(), MVT::Glue), Arg);
10129 RegsToPass.emplace_back(VA.getLocReg(), Arg);
10130 RegsUsed.insert(VA.getLocReg());
10131 const TargetOptions &Options = DAG.getTarget().Options;
10132 if (Options.EmitCallSiteInfo)
10133 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
10134 }
10135 } else {
10136 assert(VA.isMemLoc());
10137
10138 SDValue DstAddr;
10139 MachinePointerInfo DstInfo;
10140
10141 // FIXME: This works on big-endian for composite byvals, which are the
10142 // common case. It should also work for fundamental types too.
10143 uint32_t BEAlign = 0;
10144 unsigned OpSize;
10145 if (VA.getLocInfo() == CCValAssign::Indirect ||
10146 VA.getLocInfo() == CCValAssign::Trunc)
10147 OpSize = VA.getLocVT().getFixedSizeInBits();
10148 else
10149 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
10150 : VA.getValVT().getSizeInBits();
10151 OpSize = (OpSize + 7) / 8;
10152 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
10153 !Flags.isInConsecutiveRegs()) {
10154 if (OpSize < 8)
10155 BEAlign = 8 - OpSize;
10156 }
10157 unsigned LocMemOffset = VA.getLocMemOffset();
10158 int32_t Offset = LocMemOffset + BEAlign;
10159
10160 if (IsTailCall) {
10161 // When the frame pointer is perfectly aligned for the tail call and the
10162 // same stack argument is passed down intact, we can reuse it.
10163 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
10164 continue;
10165
10166 Offset = Offset + FPDiff;
10167 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
10168
10169 DstAddr = DAG.getFrameIndex(FI, PtrVT);
10170 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
10171
10172 // Make sure any stack arguments overlapping with where we're storing
10173 // are loaded before this eventual operation. Otherwise they'll be
10174 // clobbered.
10175 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
10176 } else {
10177 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
10178
10179 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
10180 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
10181 }
10182
10183 if (Outs[i].Flags.isByVal()) {
10184 SDValue SizeNode =
10185 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
10186 SDValue Cpy = DAG.getMemcpy(
10187 Chain, DL, DstAddr, Arg, SizeNode,
10188 Outs[i].Flags.getNonZeroByValAlign(),
10189 /*isVol = */ false, /*AlwaysInline = */ false,
10190 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
10191
10192 MemOpChains.push_back(Cpy);
10193 } else {
10194 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
10195 // promoted to a legal register type i32, we should truncate Arg back to
10196 // i1/i8/i16.
10197 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
10198 VA.getValVT() == MVT::i16)
10199 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
10200
10201 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
10202 MemOpChains.push_back(Store);
10203 }
10204 }
10205 }
10206
10207 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
10208 !(CLI.CB && CLI.CB->isMustTailCall())) {
10209 SDValue ParamPtr = StackPtr;
10210 if (IsTailCall) {
10211 // Create a dummy object at the top of the stack that can be used to get
10212 // the SP after the epilogue
10213 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
10214 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
10215 }
10216
10217 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
10218 // describing the argument list. x4 contains the address of the
10219 // first stack parameter. x5 contains the size in bytes of all parameters
10220 // passed on the stack.
10221 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
10222 RegsToPass.emplace_back(AArch64::X5,
10223 DAG.getConstant(NumBytes, DL, MVT::i64));
10224 }
10225
10226 if (!MemOpChains.empty())
10227 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
10228
10229 SDValue InGlue;
10230 if (RequiresSMChange) {
10231 bool InsertVectorLengthCheck =
10233 Chain = changeStreamingMode(
10234 DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
10235 getSMToggleCondition(CallAttrs), InsertVectorLengthCheck);
10236 InGlue = Chain.getValue(1);
10237 }
10238
10239 // Build a sequence of copy-to-reg nodes chained together with token chain
10240 // and flag operands which copy the outgoing args into the appropriate regs.
10241 for (auto &RegToPass : RegsToPass) {
10242 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
10243 RegToPass.second, InGlue);
10244 InGlue = Chain.getValue(1);
10245 }
10246
10247 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
10248 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
10249 // node so that legalize doesn't hack it.
10250 const GlobalValue *CalledGlobal = nullptr;
10251 unsigned OpFlags = 0;
10252 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
10253 CalledGlobal = G->getGlobal();
10254 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
10255 getTargetMachine());
10256 if (OpFlags & AArch64II::MO_GOT) {
10257 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
10258 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
10259 } else {
10260 const GlobalValue *GV = G->getGlobal();
10261 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
10262 }
10263 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
10264 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
10265 Subtarget->isTargetMachO()) ||
10266 MF.getFunction().getParent()->getRtLibUseGOT();
10267 const char *Sym = S->getSymbol();
10268 if (UseGot) {
10269 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
10270 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
10271 } else {
10272 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
10273 }
10274 }
10275
10276 // We don't usually want to end the call-sequence here because we would tidy
10277 // the frame up *after* the call, however in the ABI-changing tail-call case
10278 // we've carefully laid out the parameters so that when sp is reset they'll be
10279 // in the correct location.
10280 if (IsTailCall && !IsSibCall) {
10281 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
10282 InGlue = Chain.getValue(1);
10283 }
10284
10285 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
10286
10287 std::vector<SDValue> Ops;
10288 Ops.push_back(Chain);
10289 Ops.push_back(Callee);
10290
10291 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
10292 // be expanded to the call, directly followed by a special marker sequence and
10293 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
10294 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
10295 assert(!IsTailCall &&
10296 "tail calls cannot be marked with clang.arc.attachedcall");
10297 Opc = AArch64ISD::CALL_RVMARKER;
10298
10299 // Add a target global address for the retainRV/claimRV runtime function
10300 // just before the call target.
10301 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
10302 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
10303 Ops.insert(Ops.begin() + 1, GA);
10304
10305 // We may or may not need to emit both the marker and the retain/claim call.
10306 // Tell the pseudo expansion using an additional boolean op.
10307 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CLI.CB);
10308 SDValue DoEmitMarker =
10309 DAG.getTargetConstant(ShouldEmitMarker, DL, MVT::i32);
10310 Ops.insert(Ops.begin() + 2, DoEmitMarker);
10311 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10312 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
10313 } else if (GuardWithBTI) {
10314 Opc = AArch64ISD::CALL_BTI;
10315 }
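 // Illustrative IR that reaches the CALL_RVMARKER path above (hypothetical
 // function names; a sketch, not taken from this file):
 //   %obj = call ptr @make_object() [ "clang.arc.attachedcall"(
 //              ptr @objc_retainAutoreleasedReturnValue) ]
 // The pseudo later expands to the call, optionally the marker instruction
 // (mov x29, x29 on AArch64), and the retainRV/claimRV call, kept adjacent
 // so the ObjC runtime can recognise the sequence.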
10316
10317 if (IsTailCall) {
10318 // Each tail call may have to adjust the stack by a different amount, so
10319 // this information must travel along with the operation for eventual
10320 // consumption by emitEpilogue.
10321 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
10322 }
10323
10324 if (CLI.PAI) {
10325 const uint64_t Key = CLI.PAI->Key;
10326 assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
10327 "Invalid auth call key");
10328
10329 // Split the discriminator into address/integer components.
10330 SDValue AddrDisc, IntDisc;
10331 std::tie(IntDisc, AddrDisc) =
10332 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
10333
10334 if (Opc == AArch64ISD::CALL_RVMARKER)
10335 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
10336 else
10337 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
10338 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
10339 Ops.push_back(IntDisc);
10340 Ops.push_back(AddrDisc);
10341 }
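 // Illustrative source of CLI.PAI (hypothetical discriminator value): an IR
 // call such as
 //   call void %fp() [ "ptrauth"(i32 0, i64 1234) ]
 // arrives here with Key == IA and an integer discriminator of 1234, and the
 // AUTH_CALL / AUTH_TC_RETURN node is eventually selected to an
 // authenticating branch (e.g. BLRAA).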
10342
10343 // Add argument registers to the end of the list so that they are known live
10344 // into the call.
10345 for (auto &RegToPass : RegsToPass)
10346 Ops.push_back(DAG.getRegister(RegToPass.first,
10347 RegToPass.second.getValueType()));
10348
10349 // Add a register mask operand representing the call-preserved registers.
10350 const uint32_t *Mask;
10351 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10352 if (IsThisReturn) {
10353 // For 'this' returns, use the X0-preserving mask if applicable
10354 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
10355 if (!Mask) {
10356 IsThisReturn = false;
10357 Mask = TRI->getCallPreservedMask(MF, CallConv);
10358 }
10359 } else
10360 Mask = TRI->getCallPreservedMask(MF, CallConv);
10361
10362 if (Subtarget->hasCustomCallingConv())
10363 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
10364
10365 if (TRI->isAnyArgRegReserved(MF))
10366 TRI->emitReservedArgRegCallError(MF);
10367
10368 assert(Mask && "Missing call preserved mask for calling convention");
10369 Ops.push_back(DAG.getRegisterMask(Mask));
10370
10371 if (InGlue.getNode())
10372 Ops.push_back(InGlue);
10373
10374 if (CLI.DeactivationSymbol)
10375 Ops.push_back(DAG.getDeactivationSymbol(CLI.DeactivationSymbol));
10376
10377 // If we're doing a tail call, use a TC_RETURN here rather than an
10378 // actual call instruction.
10379 if (IsTailCall) {
10380 MF.getFrameInfo().setHasTailCall();
10381 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
10382 if (IsCFICall)
10383 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10384
10385 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
10386 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
10387 if (CalledGlobal &&
10388 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10389 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
10390 return Ret;
10391 }
10392
10393 // Returns a chain and a flag for retval copy to use.
10394 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
10395 if (IsCFICall)
10396 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10397
10398 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
10399 InGlue = Chain.getValue(1);
10400 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
10401 if (CalledGlobal &&
10402 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10403 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
10404
10405 uint64_t CalleePopBytes =
10406 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
10407
10408 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
10409 InGlue = Chain.getValue(1);
10410
10411 // Handle result values, copying them out of physregs into vregs that we
10412 // return.
10413 SDValue Result = LowerCallResult(
10414 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
10415 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
10416
10417 if (!Ins.empty())
10418 InGlue = Result.getValue(Result->getNumValues() - 1);
10419
10420 if (RequiresSMChange) {
10421 Result = changeStreamingMode(
10422 DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
10423 getSMToggleCondition(CallAttrs));
10424 }
10425
10426 if (!UseNewSMEABILowering &&
10427 (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall()))
10428 // Unconditionally resume ZA.
10429 Result = DAG.getNode(
10430 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
10431 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
10432
10433 if (ShouldPreserveZT0)
10434 Result =
10435 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
10436 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
10437
10438 if (RequiresLazySave) {
10439 Result = emitRestoreZALazySave(Result, DL, *this, *TRI, *FuncInfo, DAG);
10440 } else if (RequiresSaveAllZA) {
10441 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
10442 /*IsSave=*/false);
10443 }
10444
10445 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
10446 RequiresSaveAllZA) {
10447 for (unsigned I = 0; I < InVals.size(); ++I) {
10448 // The smstart/smstop is chained as part of the call, but when the
10449 // resulting chain is discarded (which happens when the call is not part
10450 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
10451 // smstart/smstop is chained to the result value. We can do that by doing
10452 // a vreg -> vreg copy.
10453 Register Reg = MF.getRegInfo().createVirtualRegister(
10454 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
10455 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
10456 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
10457 InVals[I].getValueType());
10458 }
10459 }
10460
10461 if (CallConv == CallingConv::PreserveNone) {
10462 for (const ISD::OutputArg &O : Outs) {
10463 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
10464 O.Flags.isSwiftAsync()) {
10465 MachineFunction &MF = DAG.getMachineFunction();
10466 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10467 MF.getFunction(),
10468 "Swift attributes can't be used with preserve_none",
10469 DL.getDebugLoc()));
10470 break;
10471 }
10472 }
10473 }
10474
10475 return Result;
10476}
10477
10478bool AArch64TargetLowering::CanLowerReturn(
10479 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
10480 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
10481 const Type *RetTy) const {
10482 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10483 SmallVector<CCValAssign, 16> RVLocs;
10484 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
10485 return CCInfo.CheckReturn(Outs, RetCC);
10486}
10487
10488SDValue
10489AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
10490 bool isVarArg,
10491 const SmallVectorImpl<ISD::OutputArg> &Outs,
10492 const SmallVectorImpl<SDValue> &OutVals,
10493 const SDLoc &DL, SelectionDAG &DAG) const {
10494 auto &MF = DAG.getMachineFunction();
10495 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10496
10497 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10498 SmallVector<CCValAssign, 16> RVLocs;
10499 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
10500 CCInfo.AnalyzeReturn(Outs, RetCC);
10501
10502 // Copy the result values into the output registers.
10503 SDValue Glue;
10504 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
10505 SmallSet<unsigned, 4> RegsUsed;
10506 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
10507 ++i, ++realRVLocIdx) {
10508 CCValAssign &VA = RVLocs[i];
10509 assert(VA.isRegLoc() && "Can only return in registers!");
10510 SDValue Arg = OutVals[realRVLocIdx];
10511
10512 switch (VA.getLocInfo()) {
10513 default:
10514 llvm_unreachable("Unknown loc info!");
10515 case CCValAssign::Full:
10516 if (Outs[i].ArgVT == MVT::i1) {
10517 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
10518 // value. This is strictly redundant on Darwin (which uses "zeroext
10519 // i1"), but will be optimised out before ISel.
10520 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
10521 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
10522 }
10523 break;
10524 case CCValAssign::BCvt:
10525 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
10526 break;
10527 case CCValAssign::AExt:
10528 case CCValAssign::ZExt:
10529 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10530 break;
10531 case CCValAssign::AExtUpper:
10532 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
10533 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10534 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
10535 DAG.getConstant(32, DL, VA.getLocVT()));
10536 break;
10537 }
10538
10539 if (RegsUsed.count(VA.getLocReg())) {
10540 SDValue &Bits =
10541 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
10542 return Elt.first == VA.getLocReg();
10543 })->second;
10544 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10545 } else {
10546 RetVals.emplace_back(VA.getLocReg(), Arg);
10547 RegsUsed.insert(VA.getLocReg());
10548 }
10549 }
10550
10551 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10552
10553 // Emit SMSTOP before returning from a locally streaming function
10554 SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
10555 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
10556 if (FuncAttrs.hasStreamingCompatibleInterface())
10557 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10558 /*Glue*/ SDValue(),
10560 else
10561 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10562 /*Glue*/ SDValue(), AArch64SME::Always);
10563 Glue = Chain.getValue(1);
10564 }
10565
10566 SmallVector<SDValue, 4> RetOps(1, Chain);
10567 for (auto &RetVal : RetVals) {
10568 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
10569 isPassedInFPR(RetVal.second.getValueType()))
10570 RetVal.second =
10571 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10572 DAG.getVTList(RetVal.second.getValueType(), MVT::Glue),
10573 RetVal.second);
10574 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
10575 Glue = Chain.getValue(1);
10576 RetOps.push_back(
10577 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
10578 }
10579
10580 // The Windows AArch64 ABI requires that, when returning a struct by value,
10581 // we copy the sret argument into X0 for the return.
10582 // We saved the argument into a virtual register in the entry block,
10583 // so now we copy the value out and into X0.
10584 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
10585 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
10586 getPointerTy(MF.getDataLayout()));
10587
10588 unsigned RetValReg = AArch64::X0;
10589 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
10590 RetValReg = AArch64::X8;
10591 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
10592 Glue = Chain.getValue(1);
10593
10594 RetOps.push_back(
10595 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
10596 }
10597
10598 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
10599 if (I) {
10600 for (; *I; ++I) {
10601 if (AArch64::GPR64RegClass.contains(*I))
10602 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
10603 else if (AArch64::FPR64RegClass.contains(*I))
10604 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
10605 else
10606 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
10607 }
10608 }
10609
10610 RetOps[0] = Chain; // Update chain.
10611
10612 // Add the glue if we have it.
10613 if (Glue.getNode())
10614 RetOps.push_back(Glue);
10615
10616 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10617 // ARM64EC entry thunks use a special return sequence: instead of a regular
10618 // "ret" instruction, they need to explicitly call the emulator.
10619 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10620 SDValue Arm64ECRetDest =
10621 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
10622 Arm64ECRetDest =
10623 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
10624 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
10625 MachinePointerInfo());
10626 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
10627 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
10628 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
10629 }
10630
10631 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
10632}
10633
10634//===----------------------------------------------------------------------===//
10635// Other Lowering Code
10636//===----------------------------------------------------------------------===//
10637
10638SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
10639 SelectionDAG &DAG,
10640 unsigned Flag) const {
10641 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
10642 N->getOffset(), Flag);
10643}
10644
10645SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
10646 SelectionDAG &DAG,
10647 unsigned Flag) const {
10648 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
10649}
10650
10651SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
10652 SelectionDAG &DAG,
10653 unsigned Flag) const {
10654 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
10655 N->getOffset(), Flag);
10656}
10657
10658SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
10659 SelectionDAG &DAG,
10660 unsigned Flag) const {
10661 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
10662}
10663
10664SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
10665 SelectionDAG &DAG,
10666 unsigned Flag) const {
10667 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
10668}
10669
10670// (loadGOT sym)
10671template <class NodeTy>
10672SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
10673 unsigned Flags) const {
10674 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
10675 SDLoc DL(N);
10676 EVT Ty = getPointerTy(DAG.getDataLayout());
10677 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
10678 // FIXME: Once remat is capable of dealing with instructions with register
10679 // operands, expand this into two nodes instead of using a wrapper node.
10680 if (DAG.getMachineFunction()
10681 .getInfo<AArch64FunctionInfo>()
10682 ->hasELFSignedGOT())
10683 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
10684 0);
10685 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
10686}
10687
10688// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
10689template <class NodeTy>
10690SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
10691 unsigned Flags) const {
10692 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
10693 SDLoc DL(N);
10694 EVT Ty = getPointerTy(DAG.getDataLayout());
10695 const unsigned char MO_NC = AArch64II::MO_NC;
10696 return DAG.getNode(
10697 AArch64ISD::WrapperLarge, DL, Ty,
10698 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
10699 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
10700 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
10701 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
10702}
10703
10704// (addlow (adrp %hi(sym)) %lo(sym))
10705template <class NodeTy>
10706SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
10707 unsigned Flags) const {
10708 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
10709 SDLoc DL(N);
10710 EVT Ty = getPointerTy(DAG.getDataLayout());
10711 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
10712 SDValue Lo = getTargetNode(N, Ty, DAG,
10713 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
10714 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
10715 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
10716}
10717
10718// (adr sym)
10719template <class NodeTy>
10720SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
10721 unsigned Flags) const {
10722 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
10723 SDLoc DL(N);
10724 EVT Ty = getPointerTy(DAG.getDataLayout());
10725 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
10726 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
10727}
10728
10729SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
10730 SelectionDAG &DAG) const {
10731 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
10732 const GlobalValue *GV = GN->getGlobal();
10733 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
10734
10735 if (OpFlags != AArch64II::MO_NO_FLAG)
10736 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
10737 "unexpected offset in global node");
10738
10739 // This also catches the large code model case for Darwin, and tiny code
10740 // model with got relocations.
10741 if ((OpFlags & AArch64II::MO_GOT) != 0) {
10742 return getGOT(GN, DAG, OpFlags);
10743 }
10744
10745 SDValue Result;
10746 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10747 !getTargetMachine().isPositionIndependent()) {
10748 Result = getAddrLarge(GN, DAG, OpFlags);
10749 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
10750 Result = getAddrTiny(GN, DAG, OpFlags);
10751 } else {
10752 Result = getAddr(GN, DAG, OpFlags);
10753 }
10754 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10755 SDLoc DL(GN);
10756 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
10757 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
10758 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
10759 return Result;
10760}
10761
10762/// Convert a TLS address reference into the correct sequence of loads
10763/// and calls to compute the variable's address (for Darwin, currently) and
10764/// return an SDValue containing the final node.
10765
10766/// Darwin only has one TLS scheme which must be capable of dealing with the
10767/// fully general situation, in the worst case. This means:
10768/// + "extern __thread" declaration.
10769/// + Defined in a possibly unknown dynamic library.
10770///
10771/// The general system is that each __thread variable has a [3 x i64] descriptor
10772/// which contains information used by the runtime to calculate the address. The
10773/// only part of this the compiler needs to know about is the first xword, which
10774/// contains a function pointer that must be called with the address of the
10775/// entire descriptor in "x0".
10776///
10777/// Since this descriptor may be in a different unit, in general even the
10778/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10779/// is:
10780/// adrp x0, _var@TLVPPAGE
10781/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10782/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10783/// ; the function pointer
10784/// blr x1 ; Uses descriptor address in x0
10785/// ; Address of _var is now in x0.
10786///
10787/// If the address of _var's descriptor *is* known to the linker, then it can
10788/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10789/// a slight efficiency gain.
10790SDValue
10791AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10792 SelectionDAG &DAG) const {
10793 assert(Subtarget->isTargetDarwin() &&
10794 "This function expects a Darwin target");
10795
10796 SDLoc DL(Op);
10797 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10798 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10799 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10800
10801 SDValue TLVPAddr =
10802 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10803 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10804
10805 // The first entry in the descriptor is a function pointer that we must call
10806 // to obtain the address of the variable.
10807 SDValue Chain = DAG.getEntryNode();
10808 SDValue FuncTLVGet = DAG.getLoad(
10809 PtrMemVT, DL, Chain, DescAddr,
10810 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
10811 Align(PtrMemVT.getSizeInBits() / 8),
10812 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
10813 Chain = FuncTLVGet.getValue(1);
10814
10815 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10816 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10817
10818 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10819 MFI.setAdjustsStack(true);
10820
10821 // TLS calls preserve all registers except those that absolutely must be
10822 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10823 // silly).
10824 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10825 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10826 if (Subtarget->hasCustomCallingConv())
10827 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10828
10829 // Finally, we can make the call. This is just a degenerate version of a
10830 // normal AArch64 call node: x0 takes the address of the descriptor, and
10831 // returns the address of the variable in this thread.
10832 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10833
10834 unsigned Opcode = AArch64ISD::CALL;
10836 Ops.push_back(Chain);
10837 Ops.push_back(FuncTLVGet);
10838
10839 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10840 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10841 Opcode = AArch64ISD::AUTH_CALL;
10842 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10843 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10844 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10845 }
10846
10847 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10848 Ops.push_back(DAG.getRegisterMask(Mask));
10849 Ops.push_back(Chain.getValue(1));
10850 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10851 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10852}
10853
10854/// Convert a thread-local variable reference into a sequence of instructions to
10855/// compute the variable's address for the local exec TLS model of ELF targets.
10856/// The sequence depends on the maximum TLS area size.
10857SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10858 SDValue ThreadBase,
10859 const SDLoc &DL,
10860 SelectionDAG &DAG) const {
10861 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10862 SDValue TPOff, Addr;
10863
10864 switch (DAG.getTarget().Options.TLSSize) {
10865 default:
10866 llvm_unreachable("Unexpected TLS size");
10867
10868 case 12: {
10869 // mrs x0, TPIDR_EL0
10870 // add x0, x0, :tprel_lo12:a
10871 SDValue Var = DAG.getTargetGlobalAddress(
10872 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10873 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10874 Var,
10875 DAG.getTargetConstant(0, DL, MVT::i32)),
10876 0);
10877 }
10878
10879 case 24: {
10880 // mrs x0, TPIDR_EL0
10881 // add x0, x0, :tprel_hi12:a
10882 // add x0, x0, :tprel_lo12_nc:a
10883 SDValue HiVar = DAG.getTargetGlobalAddress(
10884 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10885 SDValue LoVar = DAG.getTargetGlobalAddress(
10886 GV, DL, PtrVT, 0,
10887 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10888 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10889 HiVar,
10890 DAG.getTargetConstant(0, DL, MVT::i32)),
10891 0);
10892 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10893 LoVar,
10894 DAG.getTargetConstant(0, DL, MVT::i32)),
10895 0);
10896 }
10897
10898 case 32: {
10899 // mrs x1, TPIDR_EL0
10900 // movz x0, #:tprel_g1:a
10901 // movk x0, #:tprel_g0_nc:a
10902 // add x0, x1, x0
10903 SDValue HiVar = DAG.getTargetGlobalAddress(
10904 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10905 SDValue LoVar = DAG.getTargetGlobalAddress(
10906 GV, DL, PtrVT, 0,
10907 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10908 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10909 DAG.getTargetConstant(16, DL, MVT::i32)),
10910 0);
10911 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10912 DAG.getTargetConstant(0, DL, MVT::i32)),
10913 0);
10914 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10915 }
10916
10917 case 48: {
10918 // mrs x1, TPIDR_EL0
10919 // movz x0, #:tprel_g2:a
10920 // movk x0, #:tprel_g1_nc:a
10921 // movk x0, #:tprel_g0_nc:a
10922 // add x0, x1, x0
10923 SDValue HiVar = DAG.getTargetGlobalAddress(
10924 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10925 SDValue MiVar = DAG.getTargetGlobalAddress(
10926 GV, DL, PtrVT, 0,
10927 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10928 SDValue LoVar = DAG.getTargetGlobalAddress(
10929 GV, DL, PtrVT, 0,
10930 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10931 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10932 DAG.getTargetConstant(32, DL, MVT::i32)),
10933 0);
10934 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10935 DAG.getTargetConstant(16, DL, MVT::i32)),
10936 0);
10937 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10938 DAG.getTargetConstant(0, DL, MVT::i32)),
10939 0);
10940 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10941 }
10942 }
10943}
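// Illustrative driver usage selecting the cases above (the flag name and
// defaults are assumptions about the usual way TLSSize is set, not taken
// from this file):
//   clang --target=aarch64-linux-gnu -mtls-size=12 t.c   // single add, 4KiB
//   clang --target=aarch64-linux-gnu -mtls-size=24 t.c   // add+add, 16MiB
//   clang --target=aarch64-linux-gnu -mtls-size=32 t.c   // movz+movk, 4GiB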
10944
10945/// When accessing thread-local variables under either the general-dynamic or
10946/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10947/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10948/// is a function pointer to carry out the resolution.
10949///
10950/// The sequence is:
10951/// adrp x0, :tlsdesc:var
10952/// ldr x1, [x0, #:tlsdesc_lo12:var]
10953/// add x0, x0, #:tlsdesc_lo12:var
10954/// .tlsdesccall var
10955/// blr x1
10956/// (TPIDR_EL0 offset now in x0)
10957///
10958/// The above sequence must be produced unscheduled, to enable the linker to
10959/// optimize/relax this sequence.
10960/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10961/// above sequence, and expanded really late in the compilation flow, to ensure
10962/// the sequence is produced as per above.
10963SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10964 const SDLoc &DL,
10965 SelectionDAG &DAG) const {
10966 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10967 auto &MF = DAG.getMachineFunction();
10968 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10969
10970 SDValue Glue;
10971 SDValue Chain = DAG.getEntryNode();
10972 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10973
10974 SMECallAttrs TLSCallAttrs(FuncInfo->getSMEFnAttrs(), {}, SMEAttrs::Normal);
10975 bool RequiresSMChange = TLSCallAttrs.requiresSMChange();
10976
10977 auto ChainAndGlue = [](SDValue Chain) -> std::pair<SDValue, SDValue> {
10978 return {Chain, Chain.getValue(1)};
10979 };
10980
10981 if (RequiresSMChange)
10982 std::tie(Chain, Glue) =
10983 ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/false, Chain, Glue,
10984 getSMToggleCondition(TLSCallAttrs)));
10985
10986 unsigned Opcode =
10987 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10988 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10989 : AArch64ISD::TLSDESC_CALLSEQ;
10990 SDValue Ops[] = {Chain, SymAddr, Glue};
10991 std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
10992 Opcode, DL, NodeTys, Glue ? ArrayRef(Ops) : ArrayRef(Ops).drop_back()));
10993
10994 if (TLSCallAttrs.requiresLazySave())
10995 std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
10996 AArch64ISD::REQUIRES_ZA_SAVE, DL, NodeTys, {Chain, Chain.getValue(1)}));
10997
10998 if (RequiresSMChange)
10999 std::tie(Chain, Glue) =
11000 ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
11001 getSMToggleCondition(TLSCallAttrs)));
11002
11003 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
11004}
11005
11006SDValue
11007AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
11008 SelectionDAG &DAG) const {
11009 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
11010
11011 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
11012 AArch64FunctionInfo *MFI =
11013 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11014
11015 TLSModel::Model Model = MFI->hasELFSignedGOT()
11016 ? TLSModel::GeneralDynamic
11017 : getTargetMachine().getTLSModel(GA->getGlobal());
11018
11019 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
11020 if (Model == TLSModel::LocalDynamic)
11021 Model = TLSModel::GeneralDynamic;
11022 }
11023
11024 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
11025 Model != TLSModel::LocalExec)
11026 report_fatal_error("ELF TLS only supported in small memory model or "
11027 "in local exec TLS model");
11028 // Different choices can be made for the maximum size of the TLS area for a
11029 // module. For the small address model, the default TLS size is 16MiB and the
11030 // maximum TLS size is 4GiB.
11031 // FIXME: add tiny and large code model support for TLS access models other
11032 // than local exec. We currently generate the same code as small for tiny,
11033 // which may be larger than needed.
11034
11035 SDValue TPOff;
11036 EVT PtrVT = getPointerTy(DAG.getDataLayout());
11037 SDLoc DL(Op);
11038 const GlobalValue *GV = GA->getGlobal();
11039
11040 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
11041
11042 if (Model == TLSModel::LocalExec) {
11043 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
11044 } else if (Model == TLSModel::InitialExec) {
11045 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
11046 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
11047 } else if (Model == TLSModel::LocalDynamic) {
11048 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
11049 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
11050 // the beginning of the module's TLS region, followed by a DTPREL offset
11051 // calculation.
11052
11053 // These accesses will need deduplicating if there's more than one.
11054 MFI->incNumLocalDynamicTLSAccesses();
11055
11056 // The call needs a relocation too for linker relaxation. It doesn't make
11057 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
11058 // the address.
11059 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
11060 AArch64II::MO_TLS);
11061
11062 // Now we can calculate the offset from TPIDR_EL0 to this module's
11063 // thread-local area.
11064 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
11065
11066 // Now use :dtprel_whatever: operations to calculate this variable's offset
11067 // in its thread-storage area.
11068 SDValue HiVar = DAG.getTargetGlobalAddress(
11069 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
11070 SDValue LoVar = DAG.getTargetGlobalAddress(
11071 GV, DL, MVT::i64, 0,
11072 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
11073
11074 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
11075 DAG.getTargetConstant(0, DL, MVT::i32)),
11076 0);
11077 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
11078 DAG.getTargetConstant(0, DL, MVT::i32)),
11079 0);
11080 } else if (Model == TLSModel::GeneralDynamic) {
11081 // The call needs a relocation too for linker relaxation. It doesn't make
11082 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
11083 // the address.
11084 SDValue SymAddr =
11085 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
11086
11087 // Finally we can make a call to calculate the offset from tpidr_el0.
11088 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
11089 } else
11090 llvm_unreachable("Unsupported ELF TLS access model");
11091
11092 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
11093}
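// End-to-end sketch (illustrative, not generated output): for
//   __thread int counter;
//   int *get(void) { return &counter; }
// compiled with -fPIC, the general-dynamic path above wraps the symbol in a
// TLSDESC_CALLSEQ pseudo, and the final address is TPIDR_EL0 plus the offset
// the descriptor call returns in x0.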
11094
11095SDValue
11096AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
11097 SelectionDAG &DAG) const {
11098 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
11099
11100 SDValue Chain = DAG.getEntryNode();
11101 EVT PtrVT = getPointerTy(DAG.getDataLayout());
11102 SDLoc DL(Op);
11103
11104 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
11105
11106 // Load the ThreadLocalStoragePointer from the TEB
11107 // A pointer to the TLS array is located at offset 0x58 from the TEB.
11108 SDValue TLSArray =
11109 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
11110 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
11111 Chain = TLSArray.getValue(1);
11112
11113 // Load the TLS index from the C runtime;
11114 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
11115 // This also does the same as LOADgot, but using a generic i32 load,
11116 // while LOADgot only loads i64.
11117 SDValue TLSIndexHi =
11118 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
11119 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
11120 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
11121 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
11122 SDValue TLSIndex =
11123 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
11124 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
11125 Chain = TLSIndex.getValue(1);
11126
11127 // The pointer to the thread's TLS data area lives in the TLS array at
11128 // index _tls_index, i.e. at a byte offset of _tls_index * 8 into TLSArray.
11129 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
11130 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
11131 DAG.getConstant(3, DL, PtrVT));
11132 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
11133 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
11134 MachinePointerInfo());
11135 Chain = TLS.getValue(1);
11136
11137 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
11138 const GlobalValue *GV = GA->getGlobal();
11139 SDValue TGAHi = DAG.getTargetGlobalAddress(
11140 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
11141 SDValue TGALo = DAG.getTargetGlobalAddress(
11142 GV, DL, PtrVT, 0,
11143 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
11144
11145 // Add the offset from the start of the .tls section (section base).
11146 SDValue Addr =
11147 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
11148 DAG.getTargetConstant(0, DL, MVT::i32)),
11149 0);
11150 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
11151 return Addr;
11152}
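// Rough C model of the address computation built above (for exposition only;
// it assumes the documented TEB layout with the TLS array pointer at 0x58):
//   char **TlsArray = *(char ***)((char *)Teb + 0x58);
//   char *TlsBase = TlsArray[_tls_index];
//   void *VarAddr = TlsBase + /* secrel offset of the variable */ Offset;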
11153
11154SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
11155 SelectionDAG &DAG) const {
11156 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
11157 if (DAG.getTarget().useEmulatedTLS())
11158 return LowerToTLSEmulatedModel(GA, DAG);
11159
11160 if (Subtarget->isTargetDarwin())
11161 return LowerDarwinGlobalTLSAddress(Op, DAG);
11162 if (Subtarget->isTargetELF())
11163 return LowerELFGlobalTLSAddress(Op, DAG);
11164 if (Subtarget->isTargetWindows())
11165 return LowerWindowsGlobalTLSAddress(Op, DAG);
11166
11167 llvm_unreachable("Unexpected platform trying to use TLS");
11168}
11169
11170//===----------------------------------------------------------------------===//
11171// PtrAuthGlobalAddress lowering
11172//
11173// We have 3 lowering alternatives to choose from:
11174// - MOVaddrPAC: similar to MOVaddr, with added PAC.
11175// If the GV doesn't need a GOT load (i.e., is locally defined)
11176// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
11177//
11178// - LOADgotPAC: similar to LOADgot, with added PAC.
11179// If the GV needs a GOT load, materialize the pointer using the usual
11180// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
11181// section is assumed to be read-only (for example, via relro mechanism). See
11182// LowerMOVaddrPAC.
11183//
11184// - LOADauthptrstatic: similar to LOADgot, but use a
11185// special stub slot instead of a GOT slot.
11186// Load a signed pointer for symbol 'sym' from a stub slot named
11187// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
11188// resolving. This usually lowers to adrp+ldr, but also emits an entry into
11189// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
11190//
11191// All 3 are pseudos that are expand late to longer sequences: this lets us
11192// provide integrity guarantees on the to-be-signed intermediate values.
11193//
11194// LOADauthptrstatic is undesirable because it requires a large section filled
11195// with often similarly-signed pointers, making it a good harvesting target.
11196// Thus, it's only used for ptrauth references to extern_weak to avoid null
11197// checks.
11198
11199 static SDValue LowerPtrAuthGlobalAddressStatically(
11200 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
11201 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
11202 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
11203 assert(TGN->getGlobal()->hasExternalWeakLinkage());
11204
11205 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
11206 // offset alone as a pointer if the symbol wasn't available, which would
11207 // probably break null checks in users. Ptrauth complicates things further:
11208 // error out.
11209 if (TGN->getOffset() != 0)
11210 report_fatal_error(
11211 "unsupported non-zero offset in weak ptrauth global reference");
11212
11213 if (!isNullConstant(AddrDiscriminator))
11214 report_fatal_error("unsupported weak addr-div ptrauth global");
11215
11216 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
11217 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
11218 {TGA, Key, Discriminator}),
11219 0);
11220}
11221
11222SDValue
11223AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
11224 SelectionDAG &DAG) const {
11225 SDValue Ptr = Op.getOperand(0);
11226 uint64_t KeyC = Op.getConstantOperandVal(1);
11227 SDValue AddrDiscriminator = Op.getOperand(2);
11228 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
11229 EVT VT = Op.getValueType();
11230 SDLoc DL(Op);
11231
11232 if (KeyC > AArch64PACKey::LAST)
11233 report_fatal_error("key in ptrauth global out of range [0, " +
11234 Twine((int)AArch64PACKey::LAST) + "]");
11235
11236 // Blend only works if the integer discriminator is 16-bit wide.
11237 if (!isUInt<16>(DiscriminatorC))
11238 report_fatal_error(
11239 "constant discriminator in ptrauth global out of range [0, 0xffff]");
11240
11241 // Choosing between 3 lowering alternatives is target-specific.
11242 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
11243 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
11244
11245 int64_t PtrOffsetC = 0;
11246 if (Ptr.getOpcode() == ISD::ADD) {
11247 PtrOffsetC = Ptr.getConstantOperandVal(1);
11248 Ptr = Ptr.getOperand(0);
11249 }
11250 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
11251 const GlobalValue *PtrGV = PtrN->getGlobal();
11252
11253 // Classify the reference to determine whether it needs a GOT load.
11254 const unsigned OpFlags =
11255 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
11256 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
11257 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
11258 "unsupported non-GOT op flags on ptrauth global reference");
11259
11260 // Fold any offset into the GV; our pseudos expect it there.
11261 PtrOffsetC += PtrN->getOffset();
11262 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
11263 /*TargetFlags=*/0);
11264 assert(PtrN->getTargetFlags() == 0 &&
11265 "unsupported target flags on ptrauth global");
11266
11267 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
11268 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
11269 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
11270 ? AddrDiscriminator
11271 : DAG.getRegister(AArch64::XZR, MVT::i64);
11272
11273 // No GOT load needed -> MOVaddrPAC
11274 if (!NeedsGOTLoad) {
11275 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
11276 return SDValue(
11277 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
11278 {TPtr, Key, TAddrDiscriminator, Discriminator}),
11279 0);
11280 }
11281
11282 // GOT load -> LOADgotPAC
11283 // Note that we disallow extern_weak refs to avoid null checks later.
11284 if (!PtrGV->hasExternalWeakLinkage())
11285 return SDValue(
11286 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
11287 {TPtr, Key, TAddrDiscriminator, Discriminator}),
11288 0);
11289
11290 // extern_weak ref -> LOADauthptrstatic
11291 return LowerPtrAuthGlobalAddressStatically(
11292 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
11293 DAG);
11294}
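// Illustrative source form (hypothetical symbol and discriminator): the IR
// constant
//   ptrauth (ptr @callee, i32 0, i64 42)
// reaches this lowering with KeyC == 0 (IA) and DiscriminatorC == 42; a
// locally defined @callee takes the MOVaddrPAC path, a GOT-resolved one
// takes LOADgotPAC, and an extern_weak one falls back to LOADauthptrstatic.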
11295
11296// Looks through \param Val to determine the bit that can be used to
11297// check the sign of the value. It returns the unextended value and
11298// the sign bit position.
11299std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
11300 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
11301 return {Val.getOperand(0),
11302 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
11303 1};
11304
11305 if (Val.getOpcode() == ISD::SIGN_EXTEND)
11306 return {Val.getOperand(0),
11307 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
11308
11309 return {Val, Val.getValueSizeInBits() - 1};
11310}
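// For example, given (sign_extend_inreg %x, i8) this returns {%x, 7}: testing
// bit 7 of the unextended value is equivalent to testing the sign of the
// extended value, which is what the TB(N)Z lowerings below rely on.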
11311
11312// Op is an SDValue that is being compared to 0. If the comparison is a bit
11313// test, optimize it to a TBZ or TBNZ.
11315 SDValue Dest, unsigned Opcode,
11316 SelectionDAG &DAG) {
11317 if (Op.getOpcode() != ISD::AND)
11318 return SDValue();
11319
11320 // See if we can use a TBZ to fold in an AND as well.
11321 // TBZ has a smaller branch displacement than CBZ. If the offset is
11322 // out of bounds, a late MI-layer pass rewrites branches.
11323 // 403.gcc is an example that hits this case.
11324 if (isa<ConstantSDNode>(Op.getOperand(1)) &&
11325 isPowerOf2_64(Op.getConstantOperandVal(1))) {
11326 SDValue Test = Op.getOperand(0);
11327 uint64_t Mask = Op.getConstantOperandVal(1);
11328 return DAG.getNode(Opcode, DL, MVT::Other, Chain, Test,
11329 DAG.getConstant(Log2_64(Mask), DL, MVT::i64), Dest);
11330 }
11331
11332 if (Op.getOperand(0).getOpcode() == ISD::SHL) {
11333 auto Op00 = Op.getOperand(0).getOperand(0);
11334 if (isa<ConstantSDNode>(Op00) && Op00->getAsZExtVal() == 1) {
11335 auto Shr = DAG.getNode(ISD::SRL, DL, Op00.getValueType(),
11336 Op.getOperand(1), Op.getOperand(0).getOperand(1));
11337 return DAG.getNode(Opcode, DL, MVT::Other, Chain, Shr,
11338 DAG.getConstant(0, DL, MVT::i64), Dest);
11339 }
11340 }
11341
11342 return SDValue();
11343}
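// Illustrative effect (a sketch of the first pattern above): for
//   if ((x & 0x40) == 0) goto dest;
// the AND against a power of two is folded into a single "tbz xN, #6, dest"
// instead of an and+cbz pair.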
11344
11345SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
11346 SDValue Chain = Op.getOperand(0);
11347 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
11348 SDValue LHS = Op.getOperand(2);
11349 SDValue RHS = Op.getOperand(3);
11350 SDValue Dest = Op.getOperand(4);
11351 SDLoc DL(Op);
11352
11353 MachineFunction &MF = DAG.getMachineFunction();
11354 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
11355 // will not be produced, as they are conditional branch instructions that do
11356 // not set flags.
11357 bool ProduceNonFlagSettingCondBr =
11358 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
11359
11360 // Handle f128 first, since lowering it will result in comparing the return
11361 // value of a libcall against zero, which is just what the rest of LowerBR_CC
11362 // is expecting to deal with.
11363 if (LHS.getValueType() == MVT::f128) {
11364 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
11365
11366 // If softenSetCCOperands returned a scalar, we need to compare the result
11367 // against zero to select between true and false values.
11368 if (!RHS.getNode()) {
11369 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11370 CC = ISD::SETNE;
11371 }
11372 }
11373
11374 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
11375 // instruction.
11376 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
11377 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
11378 // Only lower legal XALUO ops.
11379 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
11380 return SDValue();
11381
11382 // The actual operation with overflow check.
11383 AArch64CC::CondCode OFCC;
11384 SDValue Value, Overflow;
11385 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
11386
11387 if (CC == ISD::SETNE)
11388 OFCC = getInvertedCondCode(OFCC);
11389 SDValue CCVal = getCondCode(DAG, OFCC);
11390
11391 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11392 Overflow);
11393 }
11394
11395 if (LHS.getValueType().isInteger()) {
11396 assert((LHS.getValueType() == RHS.getValueType()) &&
11397 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11398
11399 // If the RHS of the comparison is zero, we can potentially fold this
11400 // to a specialized branch.
11401 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11402 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
11403 if (CC == ISD::SETEQ) {
11404 if (SDValue Result =
11405 optimizeBitTest(DL, LHS, Chain, Dest, AArch64ISD::TBZ, DAG))
11406 return Result;
11407
11408 return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
11409 } else if (CC == ISD::SETNE) {
11410 if (SDValue Result =
11411 optimizeBitTest(DL, LHS, Chain, Dest, AArch64ISD::TBNZ, DAG))
11412 return Result;
11413
11414 return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
11415 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
11416 // Don't combine AND since emitComparison converts the AND to an ANDS
11417 // (a.k.a. TST) and the test in the test bit and branch instruction
11418 // becomes redundant. This would also increase register pressure.
11419 uint64_t SignBitPos;
11420 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11421 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, LHS,
11422 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11423 }
11424 }
11425 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
11426 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
11427 // Don't combine AND since emitComparison converts the AND to an ANDS
11428 // (a.k.a. TST) and the test in the test bit and branch instruction
11429 // becomes redundant. This would also increase register pressure.
11430 uint64_t SignBitPos;
11431 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11432 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, LHS,
11433 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11434 }
11435
11436 // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
11437 // larger branch displacement but do prefer CB over cmp + br.
11438 if (Subtarget->hasCMPBR() &&
11440 ProduceNonFlagSettingCondBr) {
11441 SDValue Cond =
11443 return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS,
11444 Dest);
11445 }
11446
11447 SDValue CCVal;
11448 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11449 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11450 Cmp);
11451 }
11452
11453 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
11454 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11455
11456 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11457 // clean. Some of them require two branches to implement.
11458 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11459 AArch64CC::CondCode CC1, CC2;
11460 changeFPCCToAArch64CC(CC, CC1, CC2);
11461 SDValue CC1Val = getCondCode(DAG, CC1);
11462 SDValue BR1 =
11463 DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp);
11464 if (CC2 != AArch64CC::AL) {
11465 SDValue CC2Val = getCondCode(DAG, CC2);
11466 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val,
11467 Cmp);
11468 }
11469
11470 return BR1;
11471}
11472
11473SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
11474 SelectionDAG &DAG) const {
11475 if (!Subtarget->isNeonAvailable() &&
11476 !Subtarget->useSVEForFixedLengthVectors())
11477 return SDValue();
11478
11479 EVT VT = Op.getValueType();
11480 EVT IntVT = VT.changeTypeToInteger();
11481 SDLoc DL(Op);
11482
11483 SDValue In1 = Op.getOperand(0);
11484 SDValue In2 = Op.getOperand(1);
11485 EVT SrcVT = In2.getValueType();
11486
11487 if (!SrcVT.bitsEq(VT))
11488 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
11489
11490 if (VT.isScalableVector())
11491 IntVT =
11492 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
11493
11494 if (VT.isFixedLengthVector() &&
11495 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
11496 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
11497
11498 In1 = convertToScalableVector(DAG, ContainerVT, In1);
11499 In2 = convertToScalableVector(DAG, ContainerVT, In2);
11500
11501 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
11502 return convertFromScalableVector(DAG, VT, Res);
11503 }
11504
11505 // With SVE, but without Neon, extend the scalars to scalable vectors and use
11506 // a SVE FCOPYSIGN.
11507 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
11508 Subtarget->isSVEorStreamingSVEAvailable()) {
11509 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
11510 return SDValue();
11511 EVT SVT = getPackedSVEVectorVT(VT);
11512
11513 SDValue Ins1 =
11514 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In1,
11515 DAG.getConstant(0, DL, MVT::i64));
11516 SDValue Ins2 =
11517 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In2,
11518 DAG.getConstant(0, DL, MVT::i64));
11519 SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
11520 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS,
11521 DAG.getConstant(0, DL, MVT::i64));
11522 }
11523
11524 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
11525 if (VT.isScalableVector())
11526 return getSVESafeBitCast(VT, Op, DAG);
11527
11528 return DAG.getBitcast(VT, Op);
11529 };
11530
11531 SDValue VecVal1, VecVal2;
11532 EVT VecVT;
11533 auto SetVecVal = [&](int Idx = -1) {
11534 if (!VT.isVector()) {
11535 VecVal1 =
11536 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
11537 VecVal2 =
11538 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
11539 } else {
11540 VecVal1 = BitCast(VecVT, In1, DAG);
11541 VecVal2 = BitCast(VecVT, In2, DAG);
11542 }
11543 };
11544 if (VT.isVector()) {
11545 VecVT = IntVT;
11546 SetVecVal();
11547 } else if (VT == MVT::f64) {
11548 VecVT = MVT::v2i64;
11549 SetVecVal(AArch64::dsub);
11550 } else if (VT == MVT::f32) {
11551 VecVT = MVT::v4i32;
11552 SetVecVal(AArch64::ssub);
11553 } else if (VT == MVT::f16 || VT == MVT::bf16) {
11554 VecVT = MVT::v8i16;
11555 SetVecVal(AArch64::hsub);
11556 } else {
11557 llvm_unreachable("Invalid type for copysign!");
11558 }
11559
11560 unsigned BitWidth = In1.getScalarValueSizeInBits();
11561 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
11562
11563 // We want to materialize a mask with every bit but the high bit set, but the
11564 // AdvSIMD immediate moves cannot materialize that in a single instruction for
11565 // 64-bit elements. Instead, materialize all bits set and then negate that.
11566 if (VT == MVT::f64 || VT == MVT::v2f64) {
11567 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
11568 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
11569 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
11570 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
11571 }
11572
11573 SDValue BSP =
11574 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
11575 if (VT == MVT::f16 || VT == MVT::bf16)
11576 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
11577 if (VT == MVT::f32)
11578 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
11579 if (VT == MVT::f64)
11580 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
11581
11582 return BitCast(VT, BSP, DAG);
11583}
11584
11585SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
11586 SelectionDAG &DAG) const {
11587 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11588 Attribute::NoImplicitFloat))
11589 return SDValue();
11590
11591 EVT VT = Op.getValueType();
11592 if (VT.isScalableVector() ||
11593 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
11594 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
11595
11596 bool IsParity = Op.getOpcode() == ISD::PARITY;
11597 SDValue Val = Op.getOperand(0);
11598 SDLoc DL(Op);
11599
11600 // For i32, a general parity computation using EORs is more efficient than
11601 // routing the value through the floating-point/SIMD registers.
11602 if (VT == MVT::i32 && IsParity)
11603 return SDValue();
11604
11605 if (Subtarget->isSVEorStreamingSVEAvailable()) {
11606 if (VT == MVT::i32 || VT == MVT::i64) {
11607 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
11608 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
11609 DAG.getUNDEF(ContainerVT), Val,
11610 DAG.getVectorIdxConstant(0, DL));
11611 Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
11612 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
11613 DAG.getVectorIdxConstant(0, DL));
11614 if (IsParity)
11615 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11616 return Val;
11617 }
11618
11619 if (VT == MVT::i128) {
11620 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
11621 Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
11622 Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
11623 Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
11624 Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
11625 Val = DAG.getZExtOrTrunc(Val, DL, VT);
11626 if (IsParity)
11627 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11628 return Val;
11629 }
11630 }
11631
11632 if (!Subtarget->isNeonAvailable())
11633 return SDValue();
11634
11635 // There is no scalar popcount instruction, but GPR popcount can
11636 // be lowered efficiently to the following sequence that uses
11637 // AdvSIMD registers/instructions, as long as the copies to/from
11638 // the AdvSIMD registers are cheap.
11639 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
11640 // CNT V0.8B, V0.8B // 8xbyte pop-counts
11641 // ADDV B0, V0.8B // sum 8xbyte pop-counts
11642 // FMOV X0, D0 // copy result back to integer reg
11643 if (VT == MVT::i32 || VT == MVT::i64) {
11644 if (VT == MVT::i32)
11645 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
11646 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
11647
11648 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
11649 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
11650 AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
11651 VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
11652 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
11653 DAG.getConstant(0, DL, MVT::i64));
11654 if (IsParity)
11655 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11656 return AddV;
11657 } else if (VT == MVT::i128) {
11658 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
11659
11660 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
11661 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
11662 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
11663 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
11664 DAG.getConstant(0, DL, MVT::i64));
11665 AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
11666 if (IsParity)
11667 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11668 return AddV;
11669 }
11670
11671 assert(!IsParity && "ISD::PARITY of vector types not supported");
11672
11673 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
11674 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
11675 "Unexpected type for custom ctpop lowering");
11676
11677 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
11678 Val = DAG.getBitcast(VT8Bit, Val);
11679 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
11680
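// Hedged illustration (assumed codegen): with +dotprod, a v4i32 ctpop can sum
// the per-byte counts with a UDOT against an all-ones vector, roughly
//   cnt  v0.16b, v0.16b
//   movi v1.16b, #1
//   movi v2.4s,  #0
//   udot v2.4s,  v0.16b, v1.16b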
11681 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
11682 VT.getVectorNumElements() >= 2) {
11683 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
11684 SDValue Zeros = DAG.getConstant(0, DL, DT);
11685 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
11686
11687 if (VT == MVT::v2i64) {
11688 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11689 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
11690 } else if (VT == MVT::v2i32) {
11691 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11692 } else if (VT == MVT::v4i32) {
11693 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11694 } else {
11695 llvm_unreachable("Unexpected type for custom ctpop lowering");
11696 }
11697
11698 return Val;
11699 }
11700
11701 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
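// For example (illustrative): a v2i32 ctpop widens the v8i8 counts twice,
// v8i8 -> v4i16 -> v2i32, via UADDLP, while a v8i16 ctpop needs only the
// single v16i8 -> v8i16 step.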
11702 unsigned EltSize = 8;
11703 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
11704 while (EltSize != VT.getScalarSizeInBits()) {
11705 EltSize *= 2;
11706 NumElts /= 2;
11707 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
11708 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
11709 }
11710
11711 return Val;
11712}
11713
11714SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
11715 EVT VT = Op.getValueType();
11716 assert(VT.isScalableVector() ||
11717 useSVEForFixedLengthVectorVT(
11718 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
11719
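// Selection sketch (assumption, for illustration only): since
// CTTZ(x) == CTLZ(BITREVERSE(x)), a scalar cttz is expected to become
//   rbit x8, x0
//   clz  x0, x8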
11720 SDLoc DL(Op);
11721 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
11722 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
11723}
11724
11725SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
11726 SelectionDAG &DAG) const {
11727
11728 EVT VT = Op.getValueType();
11729 SDLoc DL(Op);
11730 unsigned Opcode = Op.getOpcode();
11731 ISD::CondCode CC;
11732 switch (Opcode) {
11733 default:
11734 llvm_unreachable("Wrong instruction");
11735 case ISD::SMAX:
11736 CC = ISD::SETGT;
11737 break;
11738 case ISD::SMIN:
11739 CC = ISD::SETLT;
11740 break;
11741 case ISD::UMAX:
11742 CC = ISD::SETUGT;
11743 break;
11744 case ISD::UMIN:
11745 CC = ISD::SETULT;
11746 break;
11747 }
11748
11749 // Note: This lowering only overrides NEON for v1i64 and v2i64, where we
11750 // prefer using SVE if available.
11751 if (VT.isScalableVector() ||
11752 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) {
11753 switch (Opcode) {
11754 default:
11755 llvm_unreachable("Wrong instruction");
11756 case ISD::SMAX:
11757 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
11758 case ISD::SMIN:
11759 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
11760 case ISD::UMAX:
11761 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
11762 case ISD::UMIN:
11763 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
11764 }
11765 }
11766
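// Otherwise use the generic expansion, e.g. (illustrative)
// smax(a, b) -> select (setgt a, b), a, b, which for scalars typically
// selects to a CMP followed by a CSEL.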
11767 SDValue Op0 = Op.getOperand(0);
11768 SDValue Op1 = Op.getOperand(1);
11769 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
11770 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
11771}
11772
11773SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
11774 SelectionDAG &DAG) const {
11775 EVT VT = Op.getValueType();
11776
11777 if (VT.isScalableVector() ||
11778 useSVEForFixedLengthVectorVT(
11779 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
11780 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
11781
11782 SDLoc DL(Op);
11783 SDValue REVB;
11784 MVT VST;
11785
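// Strategy sketch (illustrative): byte-reverse within each element with
// REV32/REV64, then bit-reverse every byte with a v8i8/v16i8 BITREVERSE
// (RBIT), e.g. for v4i32: rev32 v0.16b, v0.16b ; rbit v0.16b, v0.16b.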
11786 switch (VT.getSimpleVT().SimpleTy) {
11787 default:
11788 llvm_unreachable("Invalid type for bitreverse!");
11789
11790 case MVT::v2i32: {
11791 VST = MVT::v8i8;
11792 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11793
11794 break;
11795 }
11796
11797 case MVT::v4i32: {
11798 VST = MVT::v16i8;
11799 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11800
11801 break;
11802 }
11803
11804 case MVT::v1i64: {
11805 VST = MVT::v8i8;
11806 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11807
11808 break;
11809 }
11810
11811 case MVT::v2i64: {
11812 VST = MVT::v16i8;
11813 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11814
11815 break;
11816 }
11817 }
11818
11819 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
11820 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
11821}
11822
11823 // Check whether N forms a continuous comparison sequence, i.e. an OR tree whose leaves are XORs.
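// For example (illustrative), code of the memcmp shape
//   ((a0 ^ b0) | (a1 ^ b1) | ...) == 0
// forms such a chain; each (xor ai, bi) leaf is collected into WorkList.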
11824static bool
11825isOrXorChain(SDValue N, unsigned &Num,
11826 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
11827 if (Num == MaxXors)
11828 return false;
11829
11830 // Skip the one-use zext
11831 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
11832 N = N->getOperand(0);
11833
11834 // The leaf node must be XOR
11835 if (N->getOpcode() == ISD::XOR) {
11836 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
11837 Num++;
11838 return true;
11839 }
11840
11841 // All the non-leaf nodes must be OR.
11842 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
11843 return false;
11844
11845 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
11846 isOrXorChain(N->getOperand(1), Num, WorkList))
11847 return true;
11848 return false;
11849}
11850
11851 // Transform chains of ORs and XORs, which are usually produced by outlined memcmp/bcmp.
11852 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
11853 SDValue LHS = N->getOperand(0);
11854 SDValue RHS = N->getOperand(1);
11855 SDLoc DL(N);
11856 EVT VT = N->getValueType(0);
11857 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
11858
11859 // Only handle integer compares.
11860 if (N->getOpcode() != ISD::SETCC)
11861 return SDValue();
11862
11863 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11864 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
11865 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
11866 unsigned NumXors = 0;
11867 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
11868 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
11869 isOrXorChain(LHS, NumXors, WorkList)) {
11870 SDValue XOR0, XOR1;
11871 std::tie(XOR0, XOR1) = WorkList[0];
11872 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11873 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11874 for (unsigned I = 1; I < WorkList.size(); I++) {
11875 std::tie(XOR0, XOR1) = WorkList[I];
11876 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11877 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11878 }
11879
11880 // Exit early by inverting the condition, which helps reduce indentation.
11881 return Cmp;
11882 }
11883
11884 return SDValue();
11885}
11886
11887SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11888
11889 if (Op.getValueType().isVector())
11890 return LowerVSETCC(Op, DAG);
11891
11892 bool IsStrict = Op->isStrictFPOpcode();
11893 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11894 unsigned OpNo = IsStrict ? 1 : 0;
11895 SDValue Chain;
11896 if (IsStrict)
11897 Chain = Op.getOperand(0);
11898 SDValue LHS = Op.getOperand(OpNo + 0);
11899 SDValue RHS = Op.getOperand(OpNo + 1);
11900 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11901 SDLoc DL(Op);
11902
11903 // We chose ZeroOrOneBooleanContents, so use zero and one.
11904 EVT VT = Op.getValueType();
11905 SDValue TVal = DAG.getConstant(1, DL, VT);
11906 SDValue FVal = DAG.getConstant(0, DL, VT);
11907
11908 // Handle f128 first, since one possible outcome is a normal integer
11909 // comparison which gets picked up by the next if statement.
11910 if (LHS.getValueType() == MVT::f128) {
11911 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS, Chain,
11912 IsSignaling);
11913
11914 // If softenSetCCOperands returned a scalar, use it.
11915 if (!RHS.getNode()) {
11916 assert(LHS.getValueType() == Op.getValueType() &&
11917 "Unexpected setcc expansion!");
11918 return IsStrict ? DAG.getMergeValues({LHS, Chain}, DL) : LHS;
11919 }
11920 }
11921
11922 if (LHS.getValueType().isInteger()) {
11923 if (Subtarget->hasCSSC() && CC == ISD::SETNE && isNullConstant(RHS)) {
11924 SDValue One = DAG.getConstant(1, DL, LHS.getValueType());
11925 SDValue UMin = DAG.getNode(ISD::UMIN, DL, LHS.getValueType(), LHS, One);
11926 SDValue Res = DAG.getZExtOrTrunc(UMin, DL, VT);
11927 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11928 }
11929 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11930
11931 SDValue CCVal;
11932 SDValue Cmp = getAArch64Cmp(
11933 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, DL);
11934
11935 // Note that we inverted the condition above, so we reverse the order of
11936 // the true and false operands here. This will allow the setcc to be
11937 // matched to a single CSINC instruction.
11938 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CCVal, Cmp);
11939 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11940 }
11941
11942 // Now we know we're dealing with FP values.
11943 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11944 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11945
11946 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11947 // and do the comparison.
11948 SDValue Cmp;
11949 if (IsStrict)
11950 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
11951 else
11952 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11953
11954 AArch64CC::CondCode CC1, CC2;
11955 changeFPCCToAArch64CC(CC, CC1, CC2);
11956 SDValue Res;
11957 if (CC2 == AArch64CC::AL) {
11958 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11959 CC2);
11960 SDValue CC1Val = getCondCode(DAG, CC1);
11961
11962 // Note that we inverted the condition above, so we reverse the order of
11963 // the true and false operands here. This will allow the setcc to be
11964 // matched to a single CSINC instruction.
11965 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CC1Val, Cmp);
11966 } else {
11967 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11968 // totally clean. Some of them require two CSELs to implement. As is in
11969 // this case, we emit the first CSEL and then emit a second using the output
11970 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11971
11972 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11973 SDValue CC1Val = getCondCode(DAG, CC1);
11974 SDValue CS1 =
11975 DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11976
11977 SDValue CC2Val = getCondCode(DAG, CC2);
11978 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
11979 }
11980 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res;
11981}
11982
11983SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11984 SelectionDAG &DAG) const {
11985
11986 SDValue LHS = Op.getOperand(0);
11987 SDValue RHS = Op.getOperand(1);
11988 EVT VT = LHS.getValueType();
11989 if (VT != MVT::i32 && VT != MVT::i64)
11990 return SDValue();
11991
11992 SDLoc DL(Op);
11993 SDValue Carry = Op.getOperand(2);
11994 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11995 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
11996 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
11997 LHS, RHS, InvCarry);
11998
11999 EVT OpVT = Op.getValueType();
12000 SDValue TVal = DAG.getConstant(1, DL, OpVT);
12001 SDValue FVal = DAG.getConstant(0, DL, OpVT);
12002
12003 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
12004 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
12005 SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv));
12006 // Inputs are swapped because the condition is inverted. This will allow
12007 // matching with a single CSINC instruction.
12008 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
12009 Cmp.getValue(1));
12010}
12011
12012/// Emit vector comparison for floating-point values, producing a mask.
12013 static SDValue emitVectorComparison(SDValue LHS, SDValue RHS,
12014 AArch64CC::CondCode CC, bool NoNans, EVT VT,
12015 const SDLoc &DL, SelectionDAG &DAG) {
12016 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
12017 "function only supposed to emit natural comparisons");
12018
12019 switch (CC) {
12020 default:
12021 return SDValue();
12022 case AArch64CC::NE: {
12023 SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
12024 // Use vector semantics for the inversion to potentially save a copy between
12025 // SIMD and regular registers.
12026 if (!LHS.getValueType().isVector()) {
12027 EVT VecVT =
12028 EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
12029 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12030 SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
12031 DAG.getUNDEF(VecVT), Fcmeq, Zero);
12032 SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
12033 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
12034 }
12035 return DAG.getNOT(DL, Fcmeq, VT);
12036 }
12037 case AArch64CC::EQ:
12038 return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
12039 case AArch64CC::GE:
12040 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
12041 case AArch64CC::GT:
12042 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
12043 case AArch64CC::LE:
12044 if (!NoNans)
12045 return SDValue();
12046 // If we ignore NaNs then we can use the LS implementation.
12047 [[fallthrough]];
12048 case AArch64CC::LS:
12049 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
12050 case AArch64CC::LT:
12051 if (!NoNans)
12052 return SDValue();
12053 // If we ignore NaNs then we can use the MI implementation.
12054 [[fallthrough]];
12055 case AArch64CC::MI:
12056 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
12057 }
12058}
12059
12060/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
12061/// values are scalars, try to emit a mask generating vector instruction.
12062 static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
12063 SDValue FVal, ISD::CondCode CC, bool NoNaNs,
12064 const SDLoc &DL, SelectionDAG &DAG) {
12065 assert(!LHS.getValueType().isVector());
12066 assert(!RHS.getValueType().isVector());
12067
12068 auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
12069 auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
12070 if (!CTVal || !CFVal)
12071 return {};
12072 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
12073 !(CTVal->isZero() && CFVal->isAllOnes()))
12074 return {};
12075
12076 if (CTVal->isZero())
12077 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12078
12079 EVT VT = TVal.getValueType();
12080 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
12081 return {};
12082
12083 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
12084 bool OneNaN = false;
12085 if (LHS == RHS) {
12086 OneNaN = true;
12087 } else if (DAG.isKnownNeverNaN(RHS)) {
12088 OneNaN = true;
12089 RHS = LHS;
12090 } else if (DAG.isKnownNeverNaN(LHS)) {
12091 OneNaN = true;
12092 LHS = RHS;
12093 }
12094 if (OneNaN)
12095 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
12096 }
12097
12098 AArch64CC::CondCode CC1;
12099 AArch64CC::CondCode CC2;
12100 bool ShouldInvert = false;
12101 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
12102 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
12103 SDValue Cmp2;
12104 if (CC2 != AArch64CC::AL) {
12105 Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
12106 if (!Cmp2)
12107 return {};
12108 }
12109 if (!Cmp2 && !ShouldInvert)
12110 return Cmp;
12111
12112 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
12113 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12114 Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT), Cmp,
12115 Zero);
12116 if (Cmp2) {
12117 Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT),
12118 Cmp2, Zero);
12119 Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
12120 }
12121 if (ShouldInvert)
12122 Cmp = DAG.getNOT(DL, Cmp, VecVT);
12123 Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
12124 return Cmp;
12125}
12126
12127SDValue AArch64TargetLowering::LowerSELECT_CC(
12128 ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
12129 iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags,
12130 const SDLoc &DL, SelectionDAG &DAG) const {
12131 // Handle f128 first, because it will result in a comparison of some RTLIB
12132 // call result against zero.
12133 if (LHS.getValueType() == MVT::f128) {
12134 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
12135
12136 // If softenSetCCOperands returned a scalar, we need to compare the result
12137 // against zero to select between true and false values.
12138 if (!RHS.getNode()) {
12139 RHS = DAG.getConstant(0, DL, LHS.getValueType());
12140 CC = ISD::SETNE;
12141 }
12142 }
12143
12144 // Also handle f16, for which we need to do a f32 comparison.
12145 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
12146 LHS.getValueType() == MVT::bf16) {
12147 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
12148 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
12149 }
12150
12151 // Next, handle integers.
12152 if (LHS.getValueType().isInteger()) {
12153 assert((LHS.getValueType() == RHS.getValueType()) &&
12154 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
12155
12156 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
12157 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
12158 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
12159
12160 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
12161 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
12162 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
12163 // Both require less instructions than compare and conditional select.
12164 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
12165 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
12166 LHS.getValueType() == RHS.getValueType()) {
12167 EVT VT = LHS.getValueType();
12168 SDValue Shift =
12169 DAG.getNode(ISD::SRA, DL, VT, LHS,
12170 DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));
12171
12172 if (CC == ISD::SETGT)
12173 Shift = DAG.getNOT(DL, Shift, VT);
12174
12175 return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
12176 }
12177
12178 // Check for sign bit test patterns that can use TST optimization.
12179 // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval)
12180 // -> TST %operand, sign_bit; CSEL
12181 // (SELECT_CC setlt, sign_extend, 0, tval, fval)
12182 // -> TST %operand, sign_bit; CSEL
12183 if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() &&
12184 (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG ||
12185 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
12186
12187 uint64_t SignBitPos;
12188 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
12189 EVT TestVT = LHS.getValueType();
12190 SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT);
12191 SDValue TST =
12192 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32),
12193 LHS, SignBitConst);
12194
12195 SDValue Flags = TST.getValue(1);
12196 return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal,
12197 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags);
12198 }
12199
12200 // Canonicalise absolute difference patterns:
12201 // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
12202 // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
12203 //
12204 // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
12205 // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
12206 // The second forms can be matched into subs+cneg.
12207 // NOTE: Drop poison generating flags from the negated operand to avoid
12208 // inadvertently propagating poison after the canonicalisation.
12209 if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
12210 if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
12211 FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) {
12212 TVal->dropFlags(SDNodeFlags::PoisonGeneratingFlags);
12213 FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
12214 } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
12215 FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) {
12216 FVal->dropFlags(SDNodeFlags::PoisonGeneratingFlags);
12217 TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
12218 }
12219 }
12220
12221 unsigned Opcode = AArch64ISD::CSEL;
12222
12223 // If both the TVal and the FVal are constants, see if we can swap them in
12224 // order to form a CSINV or CSINC out of them.
12225 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
12226 std::swap(TVal, FVal);
12227 std::swap(CTVal, CFVal);
12228 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12229 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
12230 std::swap(TVal, FVal);
12231 std::swap(CTVal, CFVal);
12232 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12233 } else if (TVal.getOpcode() == ISD::XOR) {
12234 // If TVal is a NOT we want to swap TVal and FVal so that we can match
12235 // with a CSINV rather than a CSEL.
12236 if (isAllOnesConstant(TVal.getOperand(1))) {
12237 std::swap(TVal, FVal);
12238 std::swap(CTVal, CFVal);
12239 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12240 }
12241 } else if (TVal.getOpcode() == ISD::SUB) {
12242 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
12243 // that we can match with a CSNEG rather than a CSEL.
12244 if (isNullConstant(TVal.getOperand(0))) {
12245 std::swap(TVal, FVal);
12246 std::swap(CTVal, CFVal);
12247 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12248 }
12249 } else if (CTVal && CFVal) {
12250 const int64_t TrueVal = CTVal->getSExtValue();
12251 const int64_t FalseVal = CFVal->getSExtValue();
12252 bool Swap = false;
12253
12254 // If both TVal and FVal are constants, see if FVal is the
12255 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
12256 // instead of a CSEL in that case.
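// For instance (illustrative): constant pairs {x+1, x} map to CSINC, {x, ~x}
// to CSINV and {x, -x} to CSNEG, so "cc ? 5 : 4" needs only
//   mov w8, #4 ; csinc w0, w8, w8, <cond>   // condition possibly inverted
// rather than materializing both constants.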
12257 if (TrueVal == ~FalseVal) {
12258 Opcode = AArch64ISD::CSINV;
12259 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
12260 TrueVal == -FalseVal) {
12261 Opcode = AArch64ISD::CSNEG;
12262 } else if (TVal.getValueType() == MVT::i32) {
12263 // If our operands are only 32-bit wide, make sure we use 32-bit
12264 // arithmetic for the check whether we can use CSINC. This ensures that
12265 // the addition in the check will wrap around properly in case there is
12266 // an overflow (which would not be the case if we do the check with
12267 // 64-bit arithmetic).
12268 const uint32_t TrueVal32 = CTVal->getZExtValue();
12269 const uint32_t FalseVal32 = CFVal->getZExtValue();
12270
12271 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
12272 Opcode = AArch64ISD::CSINC;
12273
12274 if (TrueVal32 > FalseVal32) {
12275 Swap = true;
12276 }
12277 }
12278 } else {
12279 // 64-bit check whether we can use CSINC.
12280 const uint64_t TrueVal64 = TrueVal;
12281 const uint64_t FalseVal64 = FalseVal;
12282
12283 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
12284 Opcode = AArch64ISD::CSINC;
12285
12286 if (TrueVal > FalseVal) {
12287 Swap = true;
12288 }
12289 }
12290 }
12291
12292 // Swap TVal and FVal if necessary.
12293 if (Swap) {
12294 std::swap(TVal, FVal);
12295 std::swap(CTVal, CFVal);
12296 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12297 }
12298
12299 if (Opcode != AArch64ISD::CSEL) {
12300 // Drop FVal since we can get its value by simply inverting/negating
12301 // TVal.
12302 FVal = TVal;
12303 }
12304 }
12305
12306 // Avoid materializing a constant when possible by reusing a known value in
12307 // a register. However, don't perform this optimization if the known value
12308 // is one, zero or negative one in the case of a CSEL. We can always
12309 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
12310 // FVal, respectively.
12311 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
12312 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
12313 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
12314 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
12315 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
12316 // "a != C ? x : a" to avoid materializing C.
12317 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
12318 TVal = LHS;
12319 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
12320 FVal = LHS;
12321 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
12322 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
12323 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
12324 // avoid materializing C.
12325 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
12326 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
12327 Opcode = AArch64ISD::CSINV;
12328 TVal = LHS;
12329 FVal = DAG.getConstant(0, DL, FVal.getValueType());
12330 }
12331 }
12332
12333 SDValue CCVal;
12334 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
12335 EVT VT = TVal.getValueType();
12336 return DAG.getNode(Opcode, DL, VT, TVal, FVal, CCVal, Cmp);
12337 }
12338
12339 // Now we know we're dealing with FP values.
12340 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
12341 LHS.getValueType() == MVT::f64);
12342 assert(LHS.getValueType() == RHS.getValueType());
12343 EVT VT = TVal.getValueType();
12344
12345 // If the purpose of the comparison is to select between all ones
12346 // or all zeros, try to use a vector comparison because the operands are
12347 // already stored in SIMD registers.
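// Illustrative example (assumed selection): a scalar
// "select (setcc ogt s0, s1), -1, 0" whose only users are DUPs can become a
// single FCMGT on the FP registers, avoiding an FCMP + CSETM + fmov round
// trip through the GPRs.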
12348 if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
12349 switch (U->getOpcode()) {
12350 default:
12351 return false;
12352 case ISD::INSERT_VECTOR_ELT:
12353 case ISD::SCALAR_TO_VECTOR:
12354 case AArch64ISD::DUP:
12355 return true;
12356 }
12357 })) {
12358 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
12359 SDValue VectorCmp =
12360 emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
12361 if (VectorCmp)
12362 return VectorCmp;
12363 }
12364
12365 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
12366
12367 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
12368 // clean. Some of them require two CSELs to implement.
12369 AArch64CC::CondCode CC1, CC2;
12370 changeFPCCToAArch64CC(CC, CC1, CC2);
12371
12372 if (Flags.hasNoSignedZeros()) {
12373 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
12374 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
12375 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
12376 if (RHSVal && RHSVal->isZero()) {
12377 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
12378 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
12379
12380 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
12381 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
12382 TVal = LHS;
12383 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
12384 CFVal && CFVal->isZero() &&
12385 FVal.getValueType() == LHS.getValueType())
12386 FVal = LHS;
12387 }
12388 }
12389
12390 // Emit first, and possibly only, CSEL.
12391 SDValue CC1Val = getCondCode(DAG, CC1);
12392 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
12393
12394 // If we need a second CSEL, emit it, using the output of the first as the
12395 // RHS. We're effectively OR'ing the two CC's together.
12396 if (CC2 != AArch64CC::AL) {
12397 SDValue CC2Val = getCondCode(DAG, CC2);
12398 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
12399 }
12400
12401 // Otherwise, return the output of the first CSEL.
12402 return CS1;
12403}
12404
12405SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
12406 SelectionDAG &DAG) const {
12407 EVT Ty = Op.getValueType();
12408 auto Idx = Op.getConstantOperandAPInt(2);
12409 int64_t IdxVal = Idx.getSExtValue();
12410 assert(Ty.isScalableVector() &&
12411 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
12412
12413 // We can use the splice instruction for certain index values where we are
12414 // able to efficiently generate the correct predicate. The index will be
12415 // inverted and used directly as the input to the ptrue instruction, i.e.
12416 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
12417 // splice predicate. However, we can only do this if we can guarantee that
12418 // there are enough elements in the vector, hence we check the index <= min
12419 // number of elements.
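// For example (illustrative): a splice with index -2 would use "ptrue p0.<T>,
// vl2", reverse that predicate, and hand it to the SPLICE node below; this is
// only valid when the vector is known to contain at least two elements.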
12420 std::optional<unsigned> PredPattern;
12421 if (Ty.isScalableVector() && Op.getOpcode() == ISD::VECTOR_SPLICE_RIGHT &&
12422 (PredPattern = getSVEPredPatternFromNumElements(IdxVal)) !=
12423 std::nullopt) {
12424 SDLoc DL(Op);
12425
12426 // Create a predicate where all but the last -IdxVal elements are false.
12427 EVT PredVT = Ty.changeVectorElementType(*DAG.getContext(), MVT::i1);
12428 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
12429 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
12430
12431 // Now splice the two inputs together using the predicate.
12432 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
12433 Op.getOperand(1));
12434 }
12435
12436 // We can select to an EXT instruction when indexing the first 256 bytes.
12437 unsigned BlockSize = AArch64::SVEBitsPerBlock / Ty.getVectorMinNumElements();
12438 if (Op.getOpcode() == ISD::VECTOR_SPLICE_LEFT &&
12439 (IdxVal * BlockSize / 8) < 256)
12440 return Op;
12441
12442 return SDValue();
12443}
12444
12445SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
12446 SelectionDAG &DAG) const {
12447 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
12448 SDValue LHS = Op.getOperand(0);
12449 SDValue RHS = Op.getOperand(1);
12450 SDValue TVal = Op.getOperand(2);
12451 SDValue FVal = Op.getOperand(3);
12452 SDNodeFlags Flags = Op->getFlags();
12453 SDLoc DL(Op);
12454 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
12455}
12456
12457SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
12458 SelectionDAG &DAG) const {
12459 SDValue CCVal = Op->getOperand(0);
12460 SDValue TVal = Op->getOperand(1);
12461 SDValue FVal = Op->getOperand(2);
12462 SDLoc DL(Op);
12463
12464 EVT Ty = Op.getValueType();
12465 if (Ty == MVT::aarch64svcount) {
12466 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
12467 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
12468 SDValue Sel =
12469 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
12470 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
12471 }
12472
12473 if (Ty.isScalableVector()) {
12474 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
12475 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
12476 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12477 }
12478
12479 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
12480 // FIXME: Ideally this would be the same as above using i1 types, however
12481 // for the moment we can't deal with fixed i1 vector types properly, so
12482 // instead extend the predicate to a result type sized integer vector.
12483 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
12484 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
12485 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
12486 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
12487 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12488 }
12489
12490 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
12491 // instruction.
12492 if (ISD::isOverflowIntrOpRes(CCVal)) {
12493 // Only lower legal XALUO ops.
12494 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
12495 return SDValue();
12496
12497 AArch64CC::CondCode OFCC;
12498 SDValue Value, Overflow;
12499 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
12500 SDValue CCVal = getCondCode(DAG, OFCC);
12501
12502 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
12503 CCVal, Overflow);
12504 }
12505
12506 // Lower it the same way as we would lower a SELECT_CC node.
12507 ISD::CondCode CC;
12508 SDValue LHS, RHS;
12509 if (CCVal.getOpcode() == ISD::SETCC) {
12510 LHS = CCVal.getOperand(0);
12511 RHS = CCVal.getOperand(1);
12512 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
12513 } else {
12514 LHS = CCVal;
12515 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
12516 CC = ISD::SETNE;
12517 }
12518
12519 // If we are lowering an f16 or bf16 and we do not have full fp16 support,
12520 // convert to f32 in order to use FCSELSrrr.
12521 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12522 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12523 DAG.getUNDEF(MVT::f32), TVal);
12524 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12525 DAG.getUNDEF(MVT::f32), FVal);
12526 }
12527
12528 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
12529 Op->getFlags(), DL, DAG);
12530
12531 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12532 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
12533 }
12534
12535 return Res;
12536}
12537
12538SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
12539 SelectionDAG &DAG) const {
12540 // Jump table entries as PC relative offsets. No additional tweaking
12541 // is necessary here. Just get the address of the jump table.
12542 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12543
12544 CodeModel::Model CM = getTargetMachine().getCodeModel();
12545 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
12546 !Subtarget->isTargetMachO())
12547 return getAddrLarge(JT, DAG);
12548 if (CM == CodeModel::Tiny)
12549 return getAddrTiny(JT, DAG);
12550 return getAddr(JT, DAG);
12551}
12552
12553SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
12554 SelectionDAG &DAG) const {
12555 // Jump table entries as PC relative offsets. No additional tweaking
12556 // is necessary here. Just get the address of the jump table.
12557 SDLoc DL(Op);
12558 SDValue JT = Op.getOperand(1);
12559 SDValue Entry = Op.getOperand(2);
12560 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
12561
12562 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12563 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
12564
12565 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
12566 // sequence later, to guarantee the integrity of the intermediate values.
12568 "aarch64-jump-table-hardening")) {
12570 if (Subtarget->isTargetMachO()) {
12571 if (CM != CodeModel::Small && CM != CodeModel::Large)
12572 report_fatal_error("Unsupported code-model for hardened jump-table");
12573 } else {
12574 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
12575 assert(Subtarget->isTargetELF() &&
12576 "jump table hardening only supported on MachO/ELF");
12577 if (CM != CodeModel::Small)
12578 report_fatal_error("Unsupported code-model for hardened jump-table");
12579 }
12580
12581 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
12582 Entry, SDValue());
12583 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
12584 DAG.getTargetJumpTable(JTI, MVT::i32),
12585 X16Copy.getValue(0), X16Copy.getValue(1));
12586 return SDValue(B, 0);
12587 }
12588
12589 SDNode *Dest =
12590 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
12591 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
12592 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
12593 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
12594}
12595
12596SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
12597 SDValue Chain = Op.getOperand(0);
12598 SDValue Dest = Op.getOperand(1);
12599
12600 // BR_JT is lowered to BRIND, but the later lowering is specific to indirectbr
12601 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
12602 if (Dest->isMachineOpcode() &&
12603 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
12604 return SDValue();
12605
12606 const MachineFunction &MF = DAG.getMachineFunction();
12607 std::optional<uint16_t> BADisc =
12608 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(MF.getFunction());
12609 if (!BADisc)
12610 return SDValue();
12611
12612 SDLoc DL(Op);
12613
12614 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12615 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
12616 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12617
12618 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
12619 {Dest, Key, Disc, AddrDisc, Chain});
12620 return SDValue(BrA, 0);
12621}
12622
12623SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
12624 SelectionDAG &DAG) const {
12625 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
12626 CodeModel::Model CM = getTargetMachine().getCodeModel();
12627 if (CM == CodeModel::Large) {
12628 // Use the GOT for the large code model on iOS.
12629 if (Subtarget->isTargetMachO()) {
12630 return getGOT(CP, DAG);
12631 }
12633 return getAddrLarge(CP, DAG);
12634 } else if (CM == CodeModel::Tiny) {
12635 return getAddrTiny(CP, DAG);
12636 }
12637 return getAddr(CP, DAG);
12638}
12639
12640SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
12641 SelectionDAG &DAG) const {
12642 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
12643 const BlockAddress *BA = BAN->getBlockAddress();
12644
12645 if (std::optional<uint16_t> BADisc =
12646 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
12647 *BA->getFunction())) {
12648 SDLoc DL(Op);
12649
12650 // This isn't cheap, but BRIND is rare.
12651 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
12652
12653 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12654
12655 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
12656 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12657
12658 SDNode *MOV =
12659 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
12660 {TargetBA, Key, AddrDisc, Disc});
12661 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
12662 SDValue(MOV, 1));
12663 }
12664
12665 CodeModel::Model CM = getTargetMachine().getCodeModel();
12666 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
12668 return getAddrLarge(BAN, DAG);
12669 } else if (CM == CodeModel::Tiny) {
12670 return getAddrTiny(BAN, DAG);
12671 }
12672 return getAddr(BAN, DAG);
12673}
12674
12675SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
12676 SelectionDAG &DAG) const {
12677 AArch64FunctionInfo *FuncInfo =
12678 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12679
12680 SDLoc DL(Op);
12681 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
12682 getPointerTy(DAG.getDataLayout()));
12683 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
12684 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12685 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12686 MachinePointerInfo(SV));
12687}
12688
12689SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
12690 SelectionDAG &DAG) const {
12691 MachineFunction &MF = DAG.getMachineFunction();
12692 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12693
12694 SDLoc DL(Op);
12695 SDValue FR;
12696 if (Subtarget->isWindowsArm64EC()) {
12697 // With the Arm64EC ABI, we compute the address of the varargs save area
12698 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
12699 // but calls from an entry thunk can pass in a different address.
12700 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
12701 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
12702 uint64_t StackOffset;
12703 if (FuncInfo->getVarArgsGPRSize() > 0)
12704 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
12705 else
12706 StackOffset = FuncInfo->getVarArgsStackOffset();
12707 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
12708 DAG.getConstant(StackOffset, DL, MVT::i64));
12709 } else {
12710 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
12711 ? FuncInfo->getVarArgsGPRIndex()
12712 : FuncInfo->getVarArgsStackIndex(),
12713 getPointerTy(DAG.getDataLayout()));
12714 }
12715 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12716 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12717 MachinePointerInfo(SV));
12718}
12719
12720SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
12721 SelectionDAG &DAG) const {
12722 // The layout of the va_list struct is specified in the AArch64 Procedure Call
12723 // Standard, section B.3.
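// Rough sketch of that layout for orientation (the offsets match the stores
// emitted below):
//   struct va_list {
//     void *__stack;   // offset 0
//     void *__gr_top;  // offset 8  (4 on ILP32)
//     void *__vr_top;  // offset 16 (8 on ILP32)
//     int   __gr_offs; // offset 24 (12 on ILP32)
//     int   __vr_offs; // offset 28 (16 on ILP32)
//   };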
12724 MachineFunction &MF = DAG.getMachineFunction();
12725 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12726 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12727 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12728 auto PtrVT = getPointerTy(DAG.getDataLayout());
12729 SDLoc DL(Op);
12730
12731 SDValue Chain = Op.getOperand(0);
12732 SDValue VAList = Op.getOperand(1);
12733 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12734 SmallVector<SDValue, 4> MemOps;
12735
12736 // void *__stack at offset 0
12737 unsigned Offset = 0;
12738 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
12739 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
12740 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
12741 MachinePointerInfo(SV), Align(PtrSize)));
12742
12743 // void *__gr_top at offset 8 (4 on ILP32)
12744 Offset += PtrSize;
12745 int GPRSize = FuncInfo->getVarArgsGPRSize();
12746 if (GPRSize > 0) {
12747 SDValue GRTop, GRTopAddr;
12748
12749 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12750 DAG.getConstant(Offset, DL, PtrVT));
12751
12752 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
12753 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
12754 DAG.getSignedConstant(GPRSize, DL, PtrVT));
12755 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
12756
12757 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
12758 MachinePointerInfo(SV, Offset),
12759 Align(PtrSize)));
12760 }
12761
12762 // void *__vr_top at offset 16 (8 on ILP32)
12763 Offset += PtrSize;
12764 int FPRSize = FuncInfo->getVarArgsFPRSize();
12765 if (FPRSize > 0) {
12766 SDValue VRTop, VRTopAddr;
12767 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12768 DAG.getConstant(Offset, DL, PtrVT));
12769
12770 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
12771 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
12772 DAG.getSignedConstant(FPRSize, DL, PtrVT));
12773 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
12774
12775 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
12776 MachinePointerInfo(SV, Offset),
12777 Align(PtrSize)));
12778 }
12779
12780 // int __gr_offs at offset 24 (12 on ILP32)
12781 Offset += PtrSize;
12782 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12783 DAG.getConstant(Offset, DL, PtrVT));
12784 MemOps.push_back(
12785 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
12786 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12787
12788 // int __vr_offs at offset 28 (16 on ILP32)
12789 Offset += 4;
12790 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12791 DAG.getConstant(Offset, DL, PtrVT));
12792 MemOps.push_back(
12793 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
12794 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12795
12796 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
12797}
12798
12799SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
12800 SelectionDAG &DAG) const {
12801 MachineFunction &MF = DAG.getMachineFunction();
12802 Function &F = MF.getFunction();
12803
12804 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
12805 return LowerWin64_VASTART(Op, DAG);
12806 else if (Subtarget->isTargetDarwin())
12807 return LowerDarwin_VASTART(Op, DAG);
12808 else
12809 return LowerAAPCS_VASTART(Op, DAG);
12810}
12811
12812SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
12813 SelectionDAG &DAG) const {
12814 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
12815 // pointer.
12816 SDLoc DL(Op);
12817 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12818 unsigned VaListSize =
12819 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
12820 ? PtrSize
12821 : Subtarget->isTargetILP32() ? 20 : 32;
12822 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
12823 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12824
12825 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
12826 DAG.getConstant(VaListSize, DL, MVT::i32),
12827 Align(PtrSize), false, false, /*CI=*/nullptr,
12828 std::nullopt, MachinePointerInfo(DestSV),
12829 MachinePointerInfo(SrcSV));
12830}
12831
12832SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
12833 assert(Subtarget->isTargetDarwin() &&
12834 "automatic va_arg instruction only works on Darwin");
12835
12836 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12837 EVT VT = Op.getValueType();
12838 SDLoc DL(Op);
12839 SDValue Chain = Op.getOperand(0);
12840 SDValue Addr = Op.getOperand(1);
12841 MaybeAlign Align(Op.getConstantOperandVal(3));
12842 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
12843 auto PtrVT = getPointerTy(DAG.getDataLayout());
12844 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12845 SDValue VAList =
12846 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
12847 Chain = VAList.getValue(1);
12848 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
12849
12850 if (VT.isScalableVector())
12851 report_fatal_error("Passing SVE types to variadic functions is "
12852 "currently not supported");
12853
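// Round the pointer up to the requested alignment (illustrative):
//   VAList = (VAList + Align - 1) & -Align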
12854 if (Align && *Align > MinSlotSize) {
12855 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12856 DAG.getConstant(Align->value() - 1, DL, PtrVT));
12857 VAList =
12858 DAG.getNode(ISD::AND, DL, PtrVT, VAList,
12859 DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT));
12860 }
12861
12862 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
12863 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
12864
12865 // Scalar integer and FP values smaller than 64 bits are implicitly extended
12866 // up to 64 bits. At the very least, we have to increase the striding of the
12867 // vaargs list to match this, and for FP values we need to introduce
12868 // FP_ROUND nodes as well.
12869 if (VT.isInteger() && !VT.isVector())
12870 ArgSize = std::max(ArgSize, MinSlotSize);
12871 bool NeedFPTrunc = false;
12872 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
12873 ArgSize = 8;
12874 NeedFPTrunc = true;
12875 }
12876
12877 // Increment the pointer, VAList, to the next vaarg
12878 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12879 DAG.getConstant(ArgSize, DL, PtrVT));
12880 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
12881
12882 // Store the incremented VAList to the legalized pointer
12883 SDValue APStore =
12884 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
12885
12886 // Load the actual argument out of the pointer VAList
12887 if (NeedFPTrunc) {
12888 // Load the value as an f64.
12889 SDValue WideFP =
12890 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
12891 // Round the value down to an f32.
12892 SDValue NarrowFP =
12893 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
12894 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
12895 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
12896 // Merge the rounded value with the chain output of the load.
12897 return DAG.getMergeValues(Ops, DL);
12898 }
12899
12900 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
12901}
12902
12903SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
12904 SelectionDAG &DAG) const {
12905 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12906 MFI.setFrameAddressIsTaken(true);
12907
12908 EVT VT = Op.getValueType();
12909 SDLoc DL(Op);
12910 unsigned Depth = Op.getConstantOperandVal(0);
12911 SDValue FrameAddr =
12912 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
12913 while (Depth--)
12914 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
12915 MachinePointerInfo());
12916
12917 if (Subtarget->isTargetILP32())
12918 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
12919 DAG.getValueType(VT));
12920
12921 return FrameAddr;
12922}
12923
12924SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
12925 SelectionDAG &DAG) const {
12926 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12927
12928 EVT VT = getPointerTy(DAG.getDataLayout());
12929 int FI = MFI.CreateFixedObject(4, 0, false);
12930 return DAG.getFrameIndex(FI, VT);
12931}
12932
12933#define GET_REGISTER_MATCHER
12934#include "AArch64GenAsmMatcher.inc"
12935
12936// FIXME? Maybe this could be a TableGen attribute on some registers and
12937// this table could be generated automatically from RegInfo.
12938Register AArch64TargetLowering::
12939getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
12940 Register Reg = MatchRegisterName(RegName);
12941 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
12942 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
12943 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
12944 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
12945 !MRI->isReservedReg(MF, Reg))
12946 Reg = Register();
12947 }
12948 return Reg;
12949}
12950
12951SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
12952 SelectionDAG &DAG) const {
12953 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
12954
12955 EVT VT = Op.getValueType();
12956 SDLoc DL(Op);
12957
12958 SDValue FrameAddr =
12959 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
12960 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
12961
12962 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
12963}
12964
12965SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
12966 SelectionDAG &DAG) const {
12967 MachineFunction &MF = DAG.getMachineFunction();
12968 MachineFrameInfo &MFI = MF.getFrameInfo();
12969 MFI.setReturnAddressIsTaken(true);
12970
12971 EVT VT = Op.getValueType();
12972 SDLoc DL(Op);
12973 unsigned Depth = Op.getConstantOperandVal(0);
12974 SDValue ReturnAddress;
12975 if (Depth) {
12976 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
12977 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
12978 ReturnAddress = DAG.getLoad(
12979 VT, DL, DAG.getEntryNode(),
12980 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
12981 } else {
12982 // Return LR, which contains the return address. Mark it an implicit
12983 // live-in.
12984 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
12985 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
12986 }
12987
12988 // The XPACLRI instruction assembles to a hint-space instruction before
12989 // Armv8.3-A therefore this instruction can be safely used for any pre
12990 // Armv8.3-A architectures. On Armv8.3-A and onwards XPACI is available so use
12991 // that instead.
12992 SDNode *St;
12993 if (Subtarget->hasPAuth()) {
12994 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
12995 } else {
12996 // XPACLRI operates on LR therefore we must move the operand accordingly.
12997 SDValue Chain =
12998 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
12999 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
13000 }
13001 return SDValue(St, 0);
13002}
13003
13004 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
13005 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
13006SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
13007 SelectionDAG &DAG) const {
13008 SDValue Lo, Hi;
13009 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
13010 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
13011}
13012
13013 bool AArch64TargetLowering::isOffsetFoldingLegal(
13014 const GlobalAddressSDNode *GA) const {
13015 // Offsets are folded in the DAG combine rather than here so that we can
13016 // intelligently choose an offset based on the uses.
13017 return false;
13018}
13019
13020 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
13021 bool OptForSize) const {
13022 bool IsLegal = false;
13023 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
13024 // 16-bit case when target has full fp16 support.
13025 // We encode bf16 bit patterns as if they were fp16. This results in very
13026 // strange looking assembly but should populate the register with appropriate
13027 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
13028 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
13029 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
13030 // FIXME: We should be able to handle f128 as well with a clever lowering.
13031 const APInt ImmInt = Imm.bitcastToAPInt();
13032 if (VT == MVT::f64)
13033 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
13034 else if (VT == MVT::f32)
13035 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
13036 else if (VT == MVT::f16 || VT == MVT::bf16)
13037 IsLegal =
13038 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
13039 Imm.isPosZero();
13040
13041 // If we can not materialize in immediate field for fmov, check if the
13042 // value can be encoded as the immediate operand of a logical instruction.
13043 // The immediate value will be created with either MOVZ, MOVN, or ORR.
13044 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
13045 // generate that fmov.
13046 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
13047 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
13048 // however the mov+fmov sequence is always better because of the reduced
13049 // cache pressure. The timings are still the same if you consider
13050 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
13051 // movw+movk is fused). So we limit up to 2 instructions at most.
13052 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
13053 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
13054 assert(Insn.size() <= 4 &&
13055 "Should be able to build any value with at most 4 moves");
13056 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
13057 IsLegal = Insn.size() <= Limit;
13058 }
13059
13060 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
13061 << " imm value: "; Imm.dump(););
13062 return IsLegal;
13063}
13064
13065//===----------------------------------------------------------------------===//
13066// AArch64 Optimization Hooks
13067//===----------------------------------------------------------------------===//
13068
13069static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
13070 SDValue Operand, SelectionDAG &DAG,
13071 int &ExtraSteps) {
13072 EVT VT = Operand.getValueType();
13073 if ((ST->hasNEON() &&
13074 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
13075 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
13076 VT == MVT::v4f32)) ||
13077 (ST->hasSVE() &&
13078 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
13079 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
13080 // For the reciprocal estimates, convergence is quadratic, so the number
13081 // of digits is doubled after each iteration. In ARMv8, the accuracy of
13082 // the initial estimate is 2^-8. Thus the number of extra steps to refine
13083 // the result for float (23 mantissa bits) is 2 and for double (52
13084 // mantissa bits) is 3.
13085 constexpr unsigned AccurateBits = 8;
13086 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
13087 ExtraSteps = DesiredBits <= AccurateBits
13088 ? 0
13089 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
13090 }
13091
13092 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
13093 }
13094
13095 return SDValue();
13096}
13097
13098SDValue
13099AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
13100 const DenormalMode &Mode) const {
13101 SDLoc DL(Op);
13102 EVT VT = Op.getValueType();
13103 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
13104 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
13105 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
13106}
13107
13108SDValue
13109AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
13110 SelectionDAG &DAG) const {
13111 return Op;
13112}
13113
13114SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
13115 SelectionDAG &DAG, int Enabled,
13116 int &ExtraSteps,
13117 bool &UseOneConst,
13118 bool Reciprocal) const {
13119 if (Enabled == ReciprocalEstimate::Enabled ||
13120 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
13121 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
13122 DAG, ExtraSteps)) {
13123 SDLoc DL(Operand);
13124 EVT VT = Operand.getValueType();
13125
13126 // Ensure nodes can be recognized by isAssociativeAndCommutative.
13127 SDNodeFlags Flags =
13128 SDNodeFlags::AllowReassociation;
13129
13130 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
13131 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
13132 for (int i = ExtraSteps; i > 0; --i) {
13133 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
13134 Flags);
13135 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
13136 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
13137 }
13138 if (!Reciprocal)
13139 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
13140
13141 ExtraSteps = 0;
13142 return Estimate;
13143 }
13144
13145 return SDValue();
13146}
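// A minimal scalar model (hypothetical helper, not used by the lowering) of
// the refinement loop above. FRSQRTE/FRSQRTS are approximated with plain C++
// arithmetic, and the perturbed start value merely stands in for the roughly
// 8-bit-accurate hardware estimate; assumes <cmath> is available.
static double modelRSqrtEstimate(double X, int ExtraSteps, bool Reciprocal) {
  double E = (1.0 / std::sqrt(X)) * (1.0 + 1e-3); // stand-in for FRSQRTE
  for (int i = ExtraSteps; i > 0; --i) {
    double Step = E * E;             // FMUL
    Step = 0.5 * (3.0 - X * Step);   // FRSQRTS semantics: 0.5 * (3 - M * N)
    E = E * Step;                    // FMUL
  }
  return Reciprocal ? E : X * E;     // sqrt(X) == X * rsqrt(X)
}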
13147
13148SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
13149 SelectionDAG &DAG, int Enabled,
13150 int &ExtraSteps) const {
13152 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
13153 DAG, ExtraSteps)) {
13154 SDLoc DL(Operand);
13155 EVT VT = Operand.getValueType();
13156
13158
13159 // Newton reciprocal iteration: E * (2 - X * E)
13160 // AArch64 reciprocal iteration instruction: (2 - M * N)
13161 for (int i = ExtraSteps; i > 0; --i) {
13162 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
13163 Estimate, Flags);
13164 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
13165 }
13166
13167 ExtraSteps = 0;
13168 return Estimate;
13169 }
13170
13171 return SDValue();
13172}
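// Companion scalar model (hypothetical helper, not used by the lowering) of
// the reciprocal refinement above: each step computes E * (2 - X * E), with
// FRECPS supplying the (2 - M * N) part.
static double modelRecipEstimate(double X, int ExtraSteps) {
  double E = (1.0 / X) * (1.0 + 1e-3); // stand-in for FRECPE's estimate
  for (int i = ExtraSteps; i > 0; --i) {
    double Step = 2.0 - X * E;         // FRECPS semantics: (2 - M * N)
    E = E * Step;                      // FMUL
  }
  return E;
}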
13173
13174//===----------------------------------------------------------------------===//
13175// AArch64 Inline Assembly Support
13176//===----------------------------------------------------------------------===//
13177
13178// Table of Constraints
13179// TODO: This is the current set of constraints supported by ARM for the
13180 // compiler; not all of them may make sense.
13181//
13182// r - A general register
13183// w - An FP/SIMD register of some size in the range v0-v31
13184// x - An FP/SIMD register of some size in the range v0-v15
13185// I - Constant that can be used with an ADD instruction
13186// J - Constant that can be used with a SUB instruction
13187// K - Constant that can be used with a 32-bit logical instruction
13188// L - Constant that can be used with a 64-bit logical instruction
13189// M - Constant that can be used as a 32-bit MOV immediate
13190// N - Constant that can be used as a 64-bit MOV immediate
13191// Q - A memory reference with base register and no offset
13192// S - A symbolic address
13193// Y - Floating point constant zero
13194// Z - Integer constant zero
13195//
13196// Note that general register operands will be output using their 64-bit x
13197// register name, whatever the size of the variable, unless the asm operand
13198// is prefixed by the %w modifier. Floating-point and SIMD register operands
13199// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
13200// %q modifier.
13201const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
13202 // At this point, we have to lower this constraint to something else, so we
13203 // lower it to an "r" or "w". However, by doing this we will force the result
13204 // to be in register, while the X constraint is much more permissive.
13205 //
13206 // Although we are correct (we are free to emit anything, without
13207 // constraints), we might break use cases that would expect us to be more
13208 // efficient and emit something else.
13209 if (!Subtarget->hasFPARMv8())
13210 return "r";
13211
13212 if (ConstraintVT.isFloatingPoint())
13213 return "w";
13214
13215 if (ConstraintVT.isVector() &&
13216 (ConstraintVT.getSizeInBits() == 64 ||
13217 ConstraintVT.getSizeInBits() == 128))
13218 return "w";
13219
13220 return "r";
13221}
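// Illustrative GNU inline-asm usage (hypothetical user code, not part of the
// lowering) of the constraints and modifiers documented above: "r" selects a
// general register, printed as x<n> unless the %w modifier asks for the
// 32-bit name, and "w" selects an FP/SIMD register, printed here with the %s
// modifier.
static inline uint32_t exampleAddOne(uint32_t A) {
  uint32_t R;
  asm("add %w0, %w1, #1" : "=r"(R) : "r"(A));
  return R;
}
static inline float exampleDoubleFloat(float F) {
  float R;
  asm("fadd %s0, %s1, %s1" : "=w"(R) : "w"(F));
  return R;
}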
13222
13224
13225// Returns a {Reg, RegisterClass} tuple if the constraint is
13226// a specific predicate register.
13227//
13228 // For a constraint like "{pn3}", the default path in
13229// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
13230// suitable register class for this register is "PPRorPNR", after which it
13231// determines that nxv16i1 is an appropriate type for the constraint, which is
13232// not what we want. The code here pre-empts this by matching the register
13233// explicitly.
13234static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
13236 if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
13237 (Constraint[1] != 'p' && Constraint[1] != 'z'))
13238 return std::nullopt;
13239
13240 bool IsPredicate = Constraint[1] == 'p';
13241 Constraint = Constraint.substr(2, Constraint.size() - 3);
13242 bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
13243 if (IsPredicateAsCount)
13244 Constraint = Constraint.drop_front(1);
13245
13246 unsigned V;
13247 if (Constraint.getAsInteger(10, V) || V > 31)
13248 return std::nullopt;
13249
13250 if (IsPredicateAsCount)
13251 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
13252 if (IsPredicate)
13253 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
13254 return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
13255}
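// A few concrete inputs (hypothetical helper, not used by the lowering)
// showing what the parser above accepts and rejects:
//   "{p3}"  -> {AArch64::P0 + 3,  PPRRegClass}  (predicate)
//   "{pn5}" -> {AArch64::PN0 + 5, PNRRegClass}  (predicate-as-counter)
//   "{z12}" -> {AArch64::Z0 + 12, ZPRRegClass}  (SVE data register)
//   "{p32}", "{q0}" -> std::nullopt
static void exampleSVERegConstraints() {
  assert(parseSVERegAsConstraint("{p3}")->first == AArch64::P0 + 3);
  assert(parseSVERegAsConstraint("{pn5}")->first == AArch64::PN0 + 5);
  assert(parseSVERegAsConstraint("{z12}")->first == AArch64::Z0 + 12);
  assert(!parseSVERegAsConstraint("{p32}") && !parseSVERegAsConstraint("{q0}"));
}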
13256
13257static std::optional<PredicateConstraint>
13260 .Case("Uph", PredicateConstraint::Uph)
13263 .Default(std::nullopt);
13264}
13265
13266static const TargetRegisterClass *
13268 if (VT != MVT::aarch64svcount &&
13269 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
13270 return nullptr;
13271
13272 switch (Constraint) {
13274 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
13275 : &AArch64::PPR_p8to15RegClass;
13277 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
13278 : &AArch64::PPR_3bRegClass;
13280 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
13281 : &AArch64::PPRRegClass;
13282 }
13283
13284 llvm_unreachable("Missing PredicateConstraint!");
13285}
13286
13288
13289static std::optional<ReducedGprConstraint>
13292 .Case("Uci", ReducedGprConstraint::Uci)
13294 .Default(std::nullopt);
13295}
13296
13297static const TargetRegisterClass *
13299 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
13300 return nullptr;
13301
13302 switch (Constraint) {
13304 return &AArch64::MatrixIndexGPR32_8_11RegClass;
13306 return &AArch64::MatrixIndexGPR32_12_15RegClass;
13307 }
13308
13309 llvm_unreachable("Missing ReducedGprConstraint!");
13310}
13311
13312 // The set of cc codes supported is from
13313// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
13316 .Case("{@cchi}", AArch64CC::HI)
13317 .Case("{@cccs}", AArch64CC::HS)
13318 .Case("{@cclo}", AArch64CC::LO)
13319 .Case("{@ccls}", AArch64CC::LS)
13320 .Case("{@cccc}", AArch64CC::LO)
13321 .Case("{@cceq}", AArch64CC::EQ)
13322 .Case("{@ccgt}", AArch64CC::GT)
13323 .Case("{@ccge}", AArch64CC::GE)
13324 .Case("{@cclt}", AArch64CC::LT)
13325 .Case("{@ccle}", AArch64CC::LE)
13326 .Case("{@cchs}", AArch64CC::HS)
13327 .Case("{@ccne}", AArch64CC::NE)
13328 .Case("{@ccvc}", AArch64CC::VC)
13329 .Case("{@ccpl}", AArch64CC::PL)
13330 .Case("{@ccvs}", AArch64CC::VS)
13331 .Case("{@ccmi}", AArch64CC::MI)
13333 return Cond;
13334}
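// Example use (hypothetical user code) of the flag-output constraints parsed
// above, following GCC's "Flag Output Operands" extension: the asm leaves its
// result in NZCV and the compiler materialises the requested condition.
static inline bool exampleFlagsEqual(long A, long B) {
  bool Eq;
  asm("cmp %1, %2" : "=@cceq"(Eq) : "r"(A), "r"(B));
  return Eq;
}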
13335
13336/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
13337/// WZR, invert(<cond>)'.
13339 SelectionDAG &DAG) {
13340 return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
13341 DAG.getConstant(0, DL, MVT::i32),
13342 DAG.getConstant(0, DL, MVT::i32),
13343 getCondCode(DAG, getInvertedCondCode(CC)), NZCV);
13344}
13345
13346// Lower @cc flag output via getSETCC.
13347SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
13348 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
13349 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
13350 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
13351 if (Cond == AArch64CC::Invalid)
13352 return SDValue();
13353 // The output variable should be a scalar integer.
13354 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
13355 OpInfo.ConstraintVT.getSizeInBits() < 8)
13356 report_fatal_error("Flag output operand is of invalid type");
13357
13358 // Get the NZCV register. Only update the chain when the CopyFromReg is glued.
13359 if (Glue.getNode()) {
13360 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
13361 Chain = Glue.getValue(1);
13362 } else
13363 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
13364 // Extract CC code.
13365 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
13366
13368
13369 // Truncate or ZERO_EXTEND based on value types.
13370 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
13371 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
13372 else
13373 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
13374
13375 return Result;
13376}
13377
13378/// getConstraintType - Given a constraint letter, return the type of
13379/// constraint it is for this target.
13381AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
13382 if (Constraint.size() == 1) {
13383 switch (Constraint[0]) {
13384 default:
13385 break;
13386 case 'x':
13387 case 'w':
13388 case 'y':
13389 return C_RegisterClass;
13390 // An address with a single base register. Due to the way we
13391 // currently handle addresses it is the same as 'r'.
13392 case 'Q':
13393 return C_Memory;
13394 case 'I':
13395 case 'J':
13396 case 'K':
13397 case 'L':
13398 case 'M':
13399 case 'N':
13400 case 'Y':
13401 case 'Z':
13402 return C_Immediate;
13403 case 'z':
13404 case 'S': // A symbol or label reference with a constant offset
13405 return C_Other;
13406 }
13407 } else if (parsePredicateConstraint(Constraint))
13408 return C_RegisterClass;
13409 else if (parseReducedGprConstraint(Constraint))
13410 return C_RegisterClass;
13411 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
13412 return C_Other;
13413 return TargetLowering::getConstraintType(Constraint);
13414}
13415
13416/// Examine constraint type and operand type and determine a weight value.
13417/// This object must already have been set up with the operand type
13418/// and the current alternative constraint selected.
13420AArch64TargetLowering::getSingleConstraintMatchWeight(
13421 AsmOperandInfo &info, const char *constraint) const {
13423 Value *CallOperandVal = info.CallOperandVal;
13424 // If we don't have a value, we can't do a match,
13425 // but allow it at the lowest weight.
13426 if (!CallOperandVal)
13427 return CW_Default;
13428 Type *type = CallOperandVal->getType();
13429 // Look at the constraint type.
13430 switch (*constraint) {
13431 default:
13433 break;
13434 case 'x':
13435 case 'w':
13436 case 'y':
13437 if (type->isFloatingPointTy() || type->isVectorTy())
13438 weight = CW_Register;
13439 break;
13440 case 'z':
13441 weight = CW_Constant;
13442 break;
13443 case 'U':
13444 if (parsePredicateConstraint(constraint) ||
13445 parseReducedGprConstraint(constraint))
13446 weight = CW_Register;
13447 break;
13448 }
13449 return weight;
13450}
13451
13452std::pair<unsigned, const TargetRegisterClass *>
13453AArch64TargetLowering::getRegForInlineAsmConstraint(
13454 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
13455 if (Constraint.size() == 1) {
13456 switch (Constraint[0]) {
13457 case 'r':
13458 if (VT.isScalableVector())
13459 return std::make_pair(0U, nullptr);
13460 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
13461 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
13462 if (VT.getFixedSizeInBits() == 64)
13463 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
13464 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
13465 case 'w': {
13466 if (!Subtarget->hasFPARMv8())
13467 break;
13468 if (VT.isScalableVector()) {
13469 if (VT.getVectorElementType() != MVT::i1)
13470 return std::make_pair(0U, &AArch64::ZPRRegClass);
13471 return std::make_pair(0U, nullptr);
13472 }
13473 if (VT == MVT::Other)
13474 break;
13475 uint64_t VTSize = VT.getFixedSizeInBits();
13476 if (VTSize == 16)
13477 return std::make_pair(0U, &AArch64::FPR16RegClass);
13478 if (VTSize == 32)
13479 return std::make_pair(0U, &AArch64::FPR32RegClass);
13480 if (VTSize == 64)
13481 return std::make_pair(0U, &AArch64::FPR64RegClass);
13482 if (VTSize == 128)
13483 return std::make_pair(0U, &AArch64::FPR128RegClass);
13484 break;
13485 }
13486 // The instructions that this constraint is designed for can
13487 // only take 128-bit registers so just use that regclass.
13488 case 'x':
13489 if (!Subtarget->hasFPARMv8())
13490 break;
13491 if (VT.isScalableVector())
13492 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
13493 if (VT.getSizeInBits() == 128)
13494 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
13495 break;
13496 case 'y':
13497 if (!Subtarget->hasFPARMv8())
13498 break;
13499 if (VT.isScalableVector())
13500 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
13501 break;
13502 }
13503 } else {
13504 if (const auto P = parseSVERegAsConstraint(Constraint)) {
13505 // SME functions that are not in streaming mode should
13506 // still observe clobbers of Z-registers by clobbering
13507 // the lower 128 bits of those registers.
13508 if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
13509 !Subtarget->isSVEorStreamingSVEAvailable())
13510 return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
13511 &AArch64::FPR128RegClass);
13512 return *P;
13513 }
13514 if (const auto PC = parsePredicateConstraint(Constraint))
13515 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
13516 return std::make_pair(0U, RegClass);
13517
13518 if (const auto RGC = parseReducedGprConstraint(Constraint))
13519 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
13520 return std::make_pair(0U, RegClass);
13521 }
13522 if (StringRef("{cc}").equals_insensitive(Constraint) ||
13524 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
13525
13526 if (Constraint == "{za}") {
13527 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
13528 }
13529
13530 if (Constraint == "{zt0}") {
13531 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
13532 }
13533
13534 // Use the default implementation in TargetLowering to convert the register
13535 // constraint into a member of a register class.
13536 std::pair<unsigned, const TargetRegisterClass *> Res;
13538
13539 // Not found as a standard register?
13540 if (!Res.second) {
13541 unsigned Size = Constraint.size();
13542 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
13543 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
13544 int RegNo;
13545 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
13546 if (!Failed && RegNo >= 0 && RegNo <= 31) {
13547 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
13548 // By default we'll emit v0-v31 for this unless there's a modifier,
13549 // in which case we'll emit the correctly sized register.
13550 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
13551 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
13552 Res.second = &AArch64::FPR64RegClass;
13553 } else {
13554 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
13555 Res.second = &AArch64::FPR128RegClass;
13556 }
13557 }
13558 }
13559 }
13560
13561 if (Res.second && !Subtarget->hasFPARMv8() &&
13562 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
13563 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
13564 return std::make_pair(0U, nullptr);
13565
13566 return Res;
13567}
13568
13570 llvm::Type *Ty,
13571 bool AllowUnknown) const {
13572 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
13573 return EVT(MVT::i64x8);
13574
13575 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
13576}
13577
13578/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
13579/// vector. If it is invalid, don't add anything to Ops.
13580void AArch64TargetLowering::LowerAsmOperandForConstraint(
13581 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
13582 SelectionDAG &DAG) const {
13583 SDValue Result;
13584
13585 // Currently only support length 1 constraints.
13586 if (Constraint.size() != 1)
13587 return;
13588
13589 char ConstraintLetter = Constraint[0];
13590 switch (ConstraintLetter) {
13591 default:
13592 break;
13593
13594 // This set of constraints deals with valid constants for various instructions.
13595 // Validate and return a target constant for them if we can.
13596 case 'z': {
13597 // 'z' maps to xzr or wzr so it needs an input of 0.
13598 if (!isNullConstant(Op))
13599 return;
13600
13601 if (Op.getValueType() == MVT::i64)
13602 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
13603 else
13604 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
13605 break;
13606 }
13607 case 'S':
13608 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
13609 // supported for PIC while "s" isn't, making "s" less useful. We implement
13610 // "S" but not "s".
13612 break;
13613
13614 case 'I':
13615 case 'J':
13616 case 'K':
13617 case 'L':
13618 case 'M':
13619 case 'N':
13621 if (!C)
13622 return;
13623
13624 // Grab the value and do some validation.
13625 uint64_t CVal = C->getZExtValue();
13626 switch (ConstraintLetter) {
13627 // The I constraint applies only to simple ADD or SUB immediate operands:
13628 // i.e. 0 to 4095 with optional shift by 12
13629 // The J constraint applies only to ADD or SUB immediates that would be
13630 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
13631 // instruction [or vice versa], in other words -1 to -4095 with optional
13632 // left shift by 12.
13633 case 'I':
13634 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
13635 break;
13636 return;
13637 case 'J': {
13638 uint64_t NVal = -C->getSExtValue();
13639 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
13640 CVal = C->getSExtValue();
13641 break;
13642 }
13643 return;
13644 }
13645 // The K and L constraints apply *only* to logical immediates, including
13646 // what used to be the MOVI alias for ORR (though the MOVI alias has now
13647 // been removed and MOV should be used). So these constraints have to
13648 // distinguish between bit patterns that are valid 32-bit or 64-bit
13649 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
13650 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
13651 // versa.
13652 case 'K':
13653 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13654 break;
13655 return;
13656 case 'L':
13657 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13658 break;
13659 return;
13660 // The M and N constraints are a superset of K and L respectively, for use
13661 // with the MOV (immediate) alias. As well as the logical immediates they
13662 // also match 32 or 64-bit immediates that can be loaded either using a
13663 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
13664 // (M) or 64-bit 0x1234000000000000 (N) etc.
13665 // As a note some of this code is liberally stolen from the asm parser.
13666 case 'M': {
13667 if (!isUInt<32>(CVal))
13668 return;
13669 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13670 break;
13671 if ((CVal & 0xFFFF) == CVal)
13672 break;
13673 if ((CVal & 0xFFFF0000ULL) == CVal)
13674 break;
13675 uint64_t NCVal = ~(uint32_t)CVal;
13676 if ((NCVal & 0xFFFFULL) == NCVal)
13677 break;
13678 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13679 break;
13680 return;
13681 }
13682 case 'N': {
13683 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13684 break;
13685 if ((CVal & 0xFFFFULL) == CVal)
13686 break;
13687 if ((CVal & 0xFFFF0000ULL) == CVal)
13688 break;
13689 if ((CVal & 0xFFFF00000000ULL) == CVal)
13690 break;
13691 if ((CVal & 0xFFFF000000000000ULL) == CVal)
13692 break;
13693 uint64_t NCVal = ~CVal;
13694 if ((NCVal & 0xFFFFULL) == NCVal)
13695 break;
13696 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13697 break;
13698 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
13699 break;
13700 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
13701 break;
13702 return;
13703 }
13704 default:
13705 return;
13706 }
13707
13708 // All assembler immediates are 64-bit integers.
13709 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
13710 break;
13711 }
13712
13713 if (Result.getNode()) {
13714 Ops.push_back(Result);
13715 return;
13716 }
13717
13718 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
13719}
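// Hypothetical user code exercising one of the immediate constraints
// validated above: "I" accepts an ADD/SUB immediate in the range 0..4095,
// optionally shifted left by 12 ("J", "K", "L", "M" and "N" gate the other
// immediate forms in the same way).
static inline unsigned long exampleAddImm(unsigned long X) {
  asm("add %0, %0, %1" : "+r"(X) : "I"(4095));
  return X;
}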
13720
13721//===----------------------------------------------------------------------===//
13722// AArch64 Advanced SIMD Support
13723//===----------------------------------------------------------------------===//
13724
13725/// WidenVector - Given a value in the V64 register class, produce the
13726/// equivalent value in the V128 register class.
13728 EVT VT = V64Reg.getValueType();
13729 unsigned NarrowSize = VT.getVectorNumElements();
13730 MVT EltTy = VT.getVectorElementType().getSimpleVT();
13731 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
13732 SDLoc DL(V64Reg);
13733
13734 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
13735 V64Reg, DAG.getConstant(0, DL, MVT::i64));
13736}
13737
13738/// getExtFactor - Determine the adjustment factor for the position when
13739/// generating an "extract from vector registers" instruction.
13740static unsigned getExtFactor(SDValue &V) {
13741 EVT EltType = V.getValueType().getVectorElementType();
13742 return EltType.getSizeInBits() / 8;
13743}
13744
13745// Check if a vector is built from one vector via extracted elements of
13746// another together with an AND mask, ensuring that all elements fit
13747// within range. This can be reconstructed using AND and NEON's TBL1.
13749 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13750 SDLoc DL(Op);
13751 EVT VT = Op.getValueType();
13752 assert(!VT.isScalableVector() &&
13753 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13754
13755 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
13756 // directly to TBL1.
13757 if (VT != MVT::v16i8 && VT != MVT::v8i8)
13758 return SDValue();
13759
13760 unsigned NumElts = VT.getVectorNumElements();
13761 assert((NumElts == 8 || NumElts == 16) &&
13762 "Need to have exactly 8 or 16 elements in vector.");
13763
13764 SDValue SourceVec;
13765 SDValue MaskSourceVec;
13766 SmallVector<SDValue, 16> AndMaskConstants;
13767
13768 for (unsigned i = 0; i < NumElts; ++i) {
13769 SDValue V = Op.getOperand(i);
13770 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13771 return SDValue();
13772
13773 SDValue OperandSourceVec = V.getOperand(0);
13774 if (!SourceVec)
13775 SourceVec = OperandSourceVec;
13776 else if (SourceVec != OperandSourceVec)
13777 return SDValue();
13778
13779 // This only looks at shuffles with elements that are
13780 // a) truncated by a constant AND mask extracted from a mask vector, or
13781 // b) extracted directly from a mask vector.
13782 SDValue MaskSource = V.getOperand(1);
13783 if (MaskSource.getOpcode() == ISD::AND) {
13784 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
13785 return SDValue();
13786
13787 AndMaskConstants.push_back(MaskSource.getOperand(1));
13788 MaskSource = MaskSource->getOperand(0);
13789 } else if (!AndMaskConstants.empty()) {
13790 // Either all or no operands should have an AND mask.
13791 return SDValue();
13792 }
13793
13794 // An ANY_EXTEND may be inserted between the AND and the source vector
13795 // extraction. We don't care about that, so we can just skip it.
13796 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
13797 MaskSource = MaskSource.getOperand(0);
13798
13799 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13800 return SDValue();
13801
13802 SDValue MaskIdx = MaskSource.getOperand(1);
13803 if (!isa<ConstantSDNode>(MaskIdx) ||
13804 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
13805 return SDValue();
13806
13807 // We only apply this if all elements come from the same vector with the
13808 // same vector type.
13809 if (!MaskSourceVec) {
13810 MaskSourceVec = MaskSource->getOperand(0);
13811 if (MaskSourceVec.getValueType() != VT)
13812 return SDValue();
13813 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
13814 return SDValue();
13815 }
13816 }
13817
13818 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
13819 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
13820 // insert, we know that the index in the mask must be smaller than the number
13821 // of elements in the source, or we would have an out-of-bounds access.
13822 if (NumElts == 8)
13823 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, SourceVec,
13824 DAG.getUNDEF(VT));
13825
13826 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
13827 if (!AndMaskConstants.empty())
13828 MaskSourceVec = DAG.getNode(ISD::AND, DL, VT, MaskSourceVec,
13829 DAG.getBuildVector(VT, DL, AndMaskConstants));
13830
13831 return DAG.getNode(
13833 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
13834 SourceVec, MaskSourceVec);
13835}
13836
13837// Gather data to see if the operation can be modelled as a
13838// shuffle in combination with VEXTs.
13840 SelectionDAG &DAG) const {
13841 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13842 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
13843 SDLoc DL(Op);
13844 EVT VT = Op.getValueType();
13845 assert(!VT.isScalableVector() &&
13846 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13847 unsigned NumElts = VT.getVectorNumElements();
13848
13849 struct ShuffleSourceInfo {
13850 SDValue Vec;
13851 unsigned MinElt;
13852 unsigned MaxElt;
13853
13854 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
13855 // be compatible with the shuffle we intend to construct. As a result
13856 // ShuffleVec will be some sliding window into the original Vec.
13857 SDValue ShuffleVec;
13858
13859 // Code should guarantee that element i in Vec starts at element
13860 // "WindowBase + i * WindowScale" in ShuffleVec.
13861 int WindowBase;
13862 int WindowScale;
13863
13864 ShuffleSourceInfo(SDValue Vec)
13865 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
13866 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
13867
13868 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
13869 };
13870
13871 // First gather all vectors used as an immediate source for this BUILD_VECTOR
13872 // node.
13874 for (unsigned i = 0; i < NumElts; ++i) {
13875 SDValue V = Op.getOperand(i);
13876 if (V.isUndef())
13877 continue;
13878 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13879 !isa<ConstantSDNode>(V.getOperand(1)) ||
13880 V.getOperand(0).getValueType().isScalableVector()) {
13881 LLVM_DEBUG(
13882 dbgs() << "Reshuffle failed: "
13883 "a shuffle can only come from building a vector from "
13884 "various elements of other fixed-width vectors, provided "
13885 "their indices are constant\n");
13886 return SDValue();
13887 }
13888
13889 // Add this element source to the list if it's not already there.
13890 SDValue SourceVec = V.getOperand(0);
13891 auto Source = find(Sources, SourceVec);
13892 if (Source == Sources.end())
13893 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
13894
13895 // Update the minimum and maximum lane number seen.
13896 unsigned EltNo = V.getConstantOperandVal(1);
13897 Source->MinElt = std::min(Source->MinElt, EltNo);
13898 Source->MaxElt = std::max(Source->MaxElt, EltNo);
13899 }
13900
13901 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
13902 // better than moving to/from gpr registers for larger vectors.
13903 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
13904 // Construct a mask for the tbl. We may need to adjust the index for types
13905 // larger than i8.
13907 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
13908 for (unsigned I = 0; I < NumElts; ++I) {
13909 SDValue V = Op.getOperand(I);
13910 if (V.isUndef()) {
13911 for (unsigned OF = 0; OF < OutputFactor; OF++)
13912 Mask.push_back(-1);
13913 continue;
13914 }
13915 // Set the Mask lanes adjusted for the size of the input and output
13916 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
13917 // output element, adjusted in their positions per input and output types.
13918 unsigned Lane = V.getConstantOperandVal(1);
13919 for (unsigned S = 0; S < Sources.size(); S++) {
13920 if (V.getOperand(0) == Sources[S].Vec) {
13921 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
13922 unsigned InputBase = 16 * S + Lane * InputSize / 8;
13923 for (unsigned OF = 0; OF < OutputFactor; OF++)
13924 Mask.push_back(InputBase + OF);
13925 break;
13926 }
13927 }
13928 }
13929
13930 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
13931 // v16i8, and the TBLMask
13932 SmallVector<SDValue, 16> TBLOperands;
13933 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
13934 ? Intrinsic::aarch64_neon_tbl3
13935 : Intrinsic::aarch64_neon_tbl4,
13936 DL, MVT::i32));
13937 for (unsigned i = 0; i < Sources.size(); i++) {
13938 SDValue Src = Sources[i].Vec;
13939 EVT SrcVT = Src.getValueType();
13940 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
13941 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
13942 "Expected a legally typed vector");
13943 if (SrcVT.is64BitVector())
13944 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Src,
13945 DAG.getUNDEF(MVT::v8i8));
13946 TBLOperands.push_back(Src);
13947 }
13948
13950 for (unsigned i = 0; i < Mask.size(); i++)
13951 TBLMask.push_back(DAG.getConstant(Mask[i], DL, MVT::i32));
13952 assert((Mask.size() == 8 || Mask.size() == 16) &&
13953 "Expected a v8i8 or v16i8 Mask");
13954 TBLOperands.push_back(DAG.getBuildVector(
13955 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, TBLMask));
13956
13957 SDValue Shuffle =
13959 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
13960 return DAG.getBitcast(VT, Shuffle);
13961 }
13962
13963 if (Sources.size() > 2) {
13964 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
13965 << "sensible when at most two source vectors are "
13966 << "involved\n");
13967 return SDValue();
13968 }
13969
13970 // Find out the smallest element size among result and two sources, and use
13971 // it as element size to build the shuffle_vector.
13972 EVT SmallestEltTy = VT.getVectorElementType();
13973 for (auto &Source : Sources) {
13974 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
13975 if (SrcEltTy.bitsLT(SmallestEltTy)) {
13976 SmallestEltTy = SrcEltTy;
13977 }
13978 }
13979 unsigned ResMultiplier =
13980 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13981 uint64_t VTSize = VT.getFixedSizeInBits();
13982 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
13983 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
13984
13985 // If the source vector is too wide or too narrow, we may nevertheless be able
13986 // to construct a compatible shuffle either by concatenating it with UNDEF or
13987 // extracting a suitable range of elements.
13988 for (auto &Src : Sources) {
13989 EVT SrcVT = Src.ShuffleVec.getValueType();
13990
13991 TypeSize SrcVTSize = SrcVT.getSizeInBits();
13992 if (SrcVTSize == TypeSize::getFixed(VTSize))
13993 continue;
13994
13995 // This stage of the search produces a source with the same element type as
13996 // the original, but with a total width matching the BUILD_VECTOR output.
13997 EVT EltVT = SrcVT.getVectorElementType();
13998 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
13999 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
14000
14001 if (SrcVTSize.getFixedValue() < VTSize) {
14002 assert(2 * SrcVTSize == VTSize);
14003 // We can pad out the smaller vector for free by concatenating it with
14004 // UNDEF, so it can be used directly in the shuffle.
14005 Src.ShuffleVec =
14006 DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Src.ShuffleVec,
14007 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
14008 continue;
14009 }
14010
14011 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
14012 LLVM_DEBUG(
14013 dbgs() << "Reshuffle failed: result vector too small to extract\n");
14014 return SDValue();
14015 }
14016
14017 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
14018 LLVM_DEBUG(
14019 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
14020 return SDValue();
14021 }
14022
14023 if (Src.MinElt >= NumSrcElts) {
14024 // The extraction can just take the second half
14025 Src.ShuffleVec =
14026 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14027 DAG.getConstant(NumSrcElts, DL, MVT::i64));
14028 Src.WindowBase = -NumSrcElts;
14029 } else if (Src.MaxElt < NumSrcElts) {
14030 // The extraction can just take the first half
14031 Src.ShuffleVec =
14032 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14033 DAG.getConstant(0, DL, MVT::i64));
14034 } else {
14035 // An actual VEXT is needed
14036 SDValue VEXTSrc1 =
14037 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14038 DAG.getConstant(0, DL, MVT::i64));
14039 SDValue VEXTSrc2 =
14040 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14041 DAG.getConstant(NumSrcElts, DL, MVT::i64));
14042 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
14043
14044 if (!SrcVT.is64BitVector()) {
14045 LLVM_DEBUG(
14046 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
14047 "for SVE vectors.");
14048 return SDValue();
14049 }
14050
14051 Src.ShuffleVec =
14052 DAG.getNode(AArch64ISD::EXT, DL, DestVT, VEXTSrc1, VEXTSrc2,
14053 DAG.getConstant(Imm, DL, MVT::i32));
14054 Src.WindowBase = -Src.MinElt;
14055 }
14056 }
14057
14058 // Another possible incompatibility occurs from the vector element types. We
14059 // can fix this by bitcasting the source vectors to the same type we intend
14060 // for the shuffle.
14061 for (auto &Src : Sources) {
14062 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
14063 if (SrcEltTy == SmallestEltTy)
14064 continue;
14065 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
14066 if (DAG.getDataLayout().isBigEndian()) {
14067 Src.ShuffleVec =
14068 DAG.getNode(AArch64ISD::NVCAST, DL, ShuffleVT, Src.ShuffleVec);
14069 } else {
14070 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Src.ShuffleVec);
14071 }
14072 Src.WindowScale =
14073 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
14074 Src.WindowBase *= Src.WindowScale;
14075 }
14076
14077 // Final check before we try to actually produce a shuffle.
14078 LLVM_DEBUG({
14079 for (auto Src : Sources)
14080 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
14081 });
14082
14083 // The stars all align, our next step is to produce the mask for the shuffle.
14084 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
14085 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
14086 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
14087 SDValue Entry = Op.getOperand(i);
14088 if (Entry.isUndef())
14089 continue;
14090
14091 auto Src = find(Sources, Entry.getOperand(0));
14092 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
14093
14094 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
14095 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
14096 // segment.
14097 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
14098 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
14099 VT.getScalarSizeInBits());
14100 int LanesDefined = BitsDefined / BitsPerShuffleLane;
14101
14102 // This source is expected to fill ResMultiplier lanes of the final shuffle,
14103 // starting at the appropriate offset.
14104 int *LaneMask = &Mask[i * ResMultiplier];
14105
14106 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
14107 ExtractBase += NumElts * (Src - Sources.begin());
14108 for (int j = 0; j < LanesDefined; ++j)
14109 LaneMask[j] = ExtractBase + j;
14110 }
14111
14112 // Final check before we try to produce nonsense...
14113 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
14114 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
14115 return SDValue();
14116 }
14117
14118 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
14119 for (unsigned i = 0; i < Sources.size(); ++i)
14120 ShuffleOps[i] = Sources[i].ShuffleVec;
14121
14122 SDValue Shuffle =
14123 DAG.getVectorShuffle(ShuffleVT, DL, ShuffleOps[0], ShuffleOps[1], Mask);
14124 SDValue V;
14125 if (DAG.getDataLayout().isBigEndian()) {
14126 V = DAG.getNode(AArch64ISD::NVCAST, DL, VT, Shuffle);
14127 } else {
14128 V = DAG.getNode(ISD::BITCAST, DL, VT, Shuffle);
14129 }
14130
14131 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
14132 dbgs() << "Reshuffle, creating node: "; V.dump(););
14133
14134 return V;
14135}
14136
14137 // Check if an EXT instruction can handle the shuffle mask when the
14138// vector sources of the shuffle are the same.
14139static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
14140 unsigned NumElts = VT.getVectorNumElements();
14141
14142 // Assume that the first shuffle index is not UNDEF. Fail if it is.
14143 if (M[0] < 0)
14144 return false;
14145
14146 Imm = M[0];
14147
14148 // If this is a VEXT shuffle, the immediate value is the index of the first
14149 // element. The other shuffle indices must be the successive elements after
14150 // the first one.
14151 unsigned ExpectedElt = Imm;
14152 for (unsigned i = 1; i < NumElts; ++i) {
14153 // Increment the expected index. If it wraps around, just follow it
14154 // back to index zero and keep going.
14155 ++ExpectedElt;
14156 if (ExpectedElt == NumElts)
14157 ExpectedElt = 0;
14158
14159 if (M[i] < 0)
14160 continue; // ignore UNDEF indices
14161 if (ExpectedElt != static_cast<unsigned>(M[i]))
14162 return false;
14163 }
14164
14165 return true;
14166}
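// Worked example (hypothetical helper, not used by the lowering): a
// single-source v8i8 mask is accepted only if it is a pure rotation, e.g.
// <3,4,5,6,7,0,1,2> yields Imm = 3, while <3,5,7,1,...> is rejected.
static void exampleSingletonEXTMask() {
  unsigned Imm = 0;
  int Rotation[] = {3, 4, 5, 6, 7, 0, 1, 2};
  bool IsEXT = isSingletonEXTMask(Rotation, MVT::v8i8, Imm); // true, Imm == 3
  int NotARotation[] = {3, 5, 7, 1, 3, 5, 7, 1};
  bool IsEXT2 = isSingletonEXTMask(NotARotation, MVT::v8i8, Imm); // false
  (void)IsEXT;
  (void)IsEXT2;
}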
14167
14168// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
14169// v4i32s. This is really a truncate, which we can construct out of (legal)
14170// concats and truncate nodes.
14172 if (V.getValueType() != MVT::v16i8)
14173 return SDValue();
14174 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
14175
14176 for (unsigned X = 0; X < 4; X++) {
14177 // Check the first item in each group is an extract from lane 0 of a v4i32
14178 // or v4i16.
14179 SDValue BaseExt = V.getOperand(X * 4);
14180 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14181 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
14182 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
14183 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
14184 BaseExt.getConstantOperandVal(1) != 0)
14185 return SDValue();
14186 SDValue Base = BaseExt.getOperand(0);
14187 // And check the other items are extracts from the same vector.
14188 for (unsigned Y = 1; Y < 4; Y++) {
14189 SDValue Ext = V.getOperand(X * 4 + Y);
14190 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14191 Ext.getOperand(0) != Base ||
14193 Ext.getConstantOperandVal(1) != Y)
14194 return SDValue();
14195 }
14196 }
14197
14198 // Turn the buildvector into a series of truncates and concats, which will
14199 // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
14200 // concatenated together to produce 2 v8i16s. These are both truncated and
14201 // concatenated together.
14202 SDLoc DL(V);
14203 SDValue Trunc[4] = {
14204 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
14205 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
14206 for (SDValue &V : Trunc)
14207 if (V.getValueType() == MVT::v4i32)
14208 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
14209 SDValue Concat0 =
14210 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
14211 SDValue Concat1 =
14212 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
14213 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
14214 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
14215 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
14216}
14217
14218 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
14219 /// element width than the vector lane type. If that is the case, the function
14220 /// returns true and writes the value of the DUP instruction lane operand into
14221 /// DupLaneOp.
14222static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
14223 unsigned &DupLaneOp) {
14224 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
14225 "Only possible block sizes for wide DUP are: 16, 32, 64");
14226
14227 if (BlockSize <= VT.getScalarSizeInBits())
14228 return false;
14229 if (BlockSize % VT.getScalarSizeInBits() != 0)
14230 return false;
14231 if (VT.getSizeInBits() % BlockSize != 0)
14232 return false;
14233
14234 size_t SingleVecNumElements = VT.getVectorNumElements();
14235 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
14236 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
14237
14238 // We are looking for masks like
14239 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
14240 // might be replaced by 'undefined'. BlockIndices will eventually contain
14241 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
14242 // for the above examples)
14243 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
14244 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
14245 for (size_t I = 0; I < NumEltsPerBlock; I++) {
14246 int Elt = M[BlockIndex * NumEltsPerBlock + I];
14247 if (Elt < 0)
14248 continue;
14249 // For now we don't support shuffles that use the second operand
14250 if ((unsigned)Elt >= SingleVecNumElements)
14251 return false;
14252 if (BlockElts[I] < 0)
14253 BlockElts[I] = Elt;
14254 else if (BlockElts[I] != Elt)
14255 return false;
14256 }
14257
14258 // We found a candidate block (possibly with some undefs). It must be a
14259 // sequence of consecutive integers starting with a value divisible by
14260 // NumEltsPerBlock with some values possibly replaced by undef-s.
14261
14262 // Find first non-undef element
14263 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
14264 assert(FirstRealEltIter != BlockElts.end() &&
14265 "Shuffle with all-undefs must have been caught by previous cases, "
14266 "e.g. isSplat()");
14267 if (FirstRealEltIter == BlockElts.end()) {
14268 DupLaneOp = 0;
14269 return true;
14270 }
14271
14272 // Index of FirstRealElt in BlockElts
14273 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
14274
14275 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
14276 return false;
14277 // BlockElts[0] must have the following value if it isn't undef:
14278 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
14279
14280 // Check the first element
14281 if (Elt0 % NumEltsPerBlock != 0)
14282 return false;
14283 // Check that the sequence indeed consists of consecutive integers (modulo
14284 // undefs)
14285 for (size_t I = 0; I < NumEltsPerBlock; I++)
14286 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
14287 return false;
14288
14289 DupLaneOp = Elt0 / NumEltsPerBlock;
14290 return true;
14291}
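// Worked example (hypothetical helper, not used by the lowering): the v8i16
// mask <2,3,2,3,2,3,2,3> repeats the 2-element block {2,3}, so with
// BlockSize = 32 we get NumEltsPerBlock = 2 and Elt0 = 2, i.e. a wide DUP of
// 32-bit lane 1; undef entries inside the blocks are tolerated.
static void exampleWideDUPMask() {
  int Mask[] = {2, 3, 2, 3, 2, 3, 2, 3};
  unsigned DupLane = 0;
  bool IsWideDUP = isWideDUPMask(Mask, MVT::v8i16, /*BlockSize=*/32, DupLane);
  (void)IsWideDUP; // expected: true, with DupLane == 1
}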
14292
14293 // Check if an EXT instruction can handle the shuffle mask when the
14294// vector sources of the shuffle are different.
14295static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
14296 unsigned &Imm) {
14297 // Look for the first non-undef element.
14298 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
14299
14300 // Benefit from APInt to handle overflow when calculating expected element.
14301 unsigned NumElts = VT.getVectorNumElements();
14302 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
14303 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
14304 /*implicitTrunc=*/true);
14305 // The following shuffle indices must be the successive elements after the
14306 // first real element.
14307 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
14308 return Elt != ExpectedElt++ && Elt >= 0;
14309 });
14310 if (FoundWrongElt)
14311 return false;
14312
14313 // The index of an EXT is the first element if it is not UNDEF.
14314 // Watch out for the beginning UNDEFs. The EXT index should be the expected
14315 // value of the first element. E.g.
14316 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
14317 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
14318 // ExpectedElt is the last mask index plus 1.
14319 Imm = ExpectedElt.getZExtValue();
14320
14321 // There are two different cases that require reversing the input vectors.
14322 // For example, for vector <4 x i32> we have the following cases,
14323 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
14324 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
14325 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
14326 // reversing the two input vectors.
14327 if (Imm < NumElts)
14328 ReverseEXT = true;
14329 else
14330 Imm -= NumElts;
14331
14332 return true;
14333}
14334
14335/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
14336/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14337/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
14338static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14339 unsigned NumElts = VT.getVectorNumElements();
14340 if (NumElts % 2 != 0)
14341 return false;
14342 WhichResult = (M[0] == 0 ? 0 : 1);
14343 unsigned Idx = WhichResult * NumElts / 2;
14344 for (unsigned i = 0; i != NumElts; i += 2) {
14345 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
14346 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
14347 return false;
14348 Idx += 1;
14349 }
14350
14351 return true;
14352}
14353
14354/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
14355/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14356 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
14357static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14358 unsigned Half = VT.getVectorNumElements() / 2;
14359 WhichResult = (M[0] == 0 ? 0 : 1);
14360 for (unsigned j = 0; j != 2; ++j) {
14361 unsigned Idx = WhichResult;
14362 for (unsigned i = 0; i != Half; ++i) {
14363 int MIdx = M[i + j * Half];
14364 if (MIdx >= 0 && (unsigned)MIdx != Idx)
14365 return false;
14366 Idx += 2;
14367 }
14368 }
14369
14370 return true;
14371}
14372
14373/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
14374/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14375/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
14376static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14377 unsigned NumElts = VT.getVectorNumElements();
14378 if (NumElts % 2 != 0)
14379 return false;
14380 WhichResult = (M[0] == 0 ? 0 : 1);
14381 for (unsigned i = 0; i < NumElts; i += 2) {
14382 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
14383 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
14384 return false;
14385 }
14386 return true;
14387}
14388
14389static bool isINSMask(ArrayRef<int> M, int NumInputElements,
14390 bool &DstIsLeft, int &Anomaly) {
14391 if (M.size() != static_cast<size_t>(NumInputElements))
14392 return false;
14393
14394 int NumLHSMatch = 0, NumRHSMatch = 0;
14395 int LastLHSMismatch = -1, LastRHSMismatch = -1;
14396
14397 for (int i = 0; i < NumInputElements; ++i) {
14398 if (M[i] == -1) {
14399 ++NumLHSMatch;
14400 ++NumRHSMatch;
14401 continue;
14402 }
14403
14404 if (M[i] == i)
14405 ++NumLHSMatch;
14406 else
14407 LastLHSMismatch = i;
14408
14409 if (M[i] == i + NumInputElements)
14410 ++NumRHSMatch;
14411 else
14412 LastRHSMismatch = i;
14413 }
14414
14415 if (NumLHSMatch == NumInputElements - 1) {
14416 DstIsLeft = true;
14417 Anomaly = LastLHSMismatch;
14418 return true;
14419 } else if (NumRHSMatch == NumInputElements - 1) {
14420 DstIsLeft = false;
14421 Anomaly = LastRHSMismatch;
14422 return true;
14423 }
14424
14425 return false;
14426}
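// Worked example (hypothetical helper, not used by the lowering): for a
// 4-element shuffle, mask <0,5,2,3> matches the LHS everywhere except lane 1,
// whose value 5 names RHS lane 1, so the shuffle can be selected as an INS
// such as "ins v0.s[1], v1.s[1]".
static void exampleINSMask() {
  int Mask[] = {0, 5, 2, 3};
  bool DstIsLeft = false;
  int Anomaly = -1;
  bool IsINS = isINSMask(Mask, /*NumInputElements=*/4, DstIsLeft, Anomaly);
  (void)IsINS; // expected: true, with DstIsLeft == true and Anomaly == 1
}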
14427
14428static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
14429 if (VT.getSizeInBits() != 128)
14430 return false;
14431
14432 unsigned NumElts = VT.getVectorNumElements();
14433
14434 for (int I = 0, E = NumElts / 2; I != E; I++) {
14435 if (Mask[I] != I)
14436 return false;
14437 }
14438
14439 int Offset = NumElts / 2;
14440 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
14441 if (Mask[I] != I + SplitLHS * Offset)
14442 return false;
14443 }
14444
14445 return true;
14446}
14447
14449 SDLoc DL(Op);
14450 EVT VT = Op.getValueType();
14451 SDValue V0 = Op.getOperand(0);
14452 SDValue V1 = Op.getOperand(1);
14453 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14454
14457 return SDValue();
14458
14459 bool SplitV0 = V0.getValueSizeInBits() == 128;
14460
14461 if (!isConcatMask(Mask, VT, SplitV0))
14462 return SDValue();
14463
14464 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14465 if (SplitV0) {
14466 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
14467 DAG.getConstant(0, DL, MVT::i64));
14468 }
14469 if (V1.getValueSizeInBits() == 128) {
14470 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
14471 DAG.getConstant(0, DL, MVT::i64));
14472 }
14473 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
14474}
14475
14476/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
14477/// the specified operations to build the shuffle. ID is the perfect-shuffle
14478 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
14479 /// table entry and LHS/RHS are the immediate inputs for this stage of the
14480 /// shuffle.
14482 unsigned PFEntry, SDValue LHS,
14483 SDValue RHS, SelectionDAG &DAG,
14484 const SDLoc &DL) {
14485 unsigned OpNum = (PFEntry >> 26) & 0x0F;
14486 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
14487 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
14488
14489 enum {
14490 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
14491 OP_VREV,
14492 OP_VDUP0,
14493 OP_VDUP1,
14494 OP_VDUP2,
14495 OP_VDUP3,
14496 OP_VEXT1,
14497 OP_VEXT2,
14498 OP_VEXT3,
14499 OP_VUZPL, // VUZP, left result
14500 OP_VUZPR, // VUZP, right result
14501 OP_VZIPL, // VZIP, left result
14502 OP_VZIPR, // VZIP, right result
14503 OP_VTRNL, // VTRN, left result
14504 OP_VTRNR, // VTRN, right result
14505 OP_MOVLANE // Move lane. RHSID is the lane to move into
14506 };
14507
14508 if (OpNum == OP_COPY) {
14509 if (LHSID == (1 * 9 + 2) * 9 + 3)
14510 return LHS;
14511 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
14512 return RHS;
14513 }
14514
14515 if (OpNum == OP_MOVLANE) {
14516 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
14517 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
14518 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
14519 Elt = 3 - Elt;
14520 while (Elt > 0) {
14521 ID /= 9;
14522 Elt--;
14523 }
14524 return (ID % 9 == 8) ? -1 : ID % 9;
14525 };
14526
14527 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
14528 // get the lane to move from the PFID, which is always from the
14529 // original vectors (V1 or V2).
14531 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
14532 EVT VT = OpLHS.getValueType();
14533 assert(RHSID < 8 && "Expected a lane index for RHSID!");
14534 unsigned ExtLane = 0;
14535 SDValue Input;
14536
14537 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
14538 // convert into a higher type.
14539 if (RHSID & 0x4) {
14540 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
14541 if (MaskElt == -1)
14542 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
14543 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14544 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
14545 Input = MaskElt < 2 ? V1 : V2;
14546 if (VT.getScalarSizeInBits() == 16) {
14547 Input = DAG.getBitcast(MVT::v2f32, Input);
14548 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
14549 } else {
14550 assert(VT.getScalarSizeInBits() == 32 &&
14551 "Expected 16 or 32 bit shuffle elements");
14552 Input = DAG.getBitcast(MVT::v2f64, Input);
14553 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
14554 }
14555 } else {
14556 int MaskElt = getPFIDLane(ID, RHSID);
14557 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14558 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
14559 Input = MaskElt < 4 ? V1 : V2;
14560 // Be careful about creating illegal types. Use f16 instead of i16.
14561 if (VT == MVT::v4i16) {
14562 Input = DAG.getBitcast(MVT::v4f16, Input);
14563 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
14564 }
14565 }
14567 Input.getValueType().getVectorElementType(),
14568 Input, DAG.getVectorIdxConstant(ExtLane, DL));
14569 SDValue Ins =
14570 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Input.getValueType(), OpLHS,
14571 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, DL));
14572 return DAG.getBitcast(VT, Ins);
14573 }
14574
14575 SDValue OpLHS, OpRHS;
14576 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
14577 RHS, DAG, DL);
14578 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
14579 RHS, DAG, DL);
14580 EVT VT = OpLHS.getValueType();
14581
14582 switch (OpNum) {
14583 default:
14584 llvm_unreachable("Unknown shuffle opcode!");
14585 case OP_VREV:
14586 // VREV divides the vector in half and swaps within the half.
14587 if (VT.getVectorElementType() == MVT::i32 ||
14588 VT.getVectorElementType() == MVT::f32)
14589 return DAG.getNode(AArch64ISD::REV64, DL, VT, OpLHS);
14590 // vrev <4 x i16> -> REV32
14591 if (VT.getVectorElementType() == MVT::i16 ||
14592 VT.getVectorElementType() == MVT::f16 ||
14593 VT.getVectorElementType() == MVT::bf16)
14594 return DAG.getNode(AArch64ISD::REV32, DL, VT, OpLHS);
14595 // vrev <4 x i8> -> REV16
14596 assert(VT.getVectorElementType() == MVT::i8);
14597 return DAG.getNode(AArch64ISD::REV16, DL, VT, OpLHS);
14598 case OP_VDUP0:
14599 case OP_VDUP1:
14600 case OP_VDUP2:
14601 case OP_VDUP3: {
14602 EVT EltTy = VT.getVectorElementType();
14603 unsigned Opcode;
14604 if (EltTy == MVT::i8)
14605 Opcode = AArch64ISD::DUPLANE8;
14606 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
14607 Opcode = AArch64ISD::DUPLANE16;
14608 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
14609 Opcode = AArch64ISD::DUPLANE32;
14610 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
14611 Opcode = AArch64ISD::DUPLANE64;
14612 else
14613 llvm_unreachable("Invalid vector element type?");
14614
14615 if (VT.getSizeInBits() == 64)
14616 OpLHS = WidenVector(OpLHS, DAG);
14617 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, DL, MVT::i64);
14618 return DAG.getNode(Opcode, DL, VT, OpLHS, Lane);
14619 }
14620 case OP_VEXT1:
14621 case OP_VEXT2:
14622 case OP_VEXT3: {
14623 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
14624 return DAG.getNode(AArch64ISD::EXT, DL, VT, OpLHS, OpRHS,
14625 DAG.getConstant(Imm, DL, MVT::i32));
14626 }
14627 case OP_VUZPL:
14628 return DAG.getNode(AArch64ISD::UZP1, DL, VT, OpLHS, OpRHS);
14629 case OP_VUZPR:
14630 return DAG.getNode(AArch64ISD::UZP2, DL, VT, OpLHS, OpRHS);
14631 case OP_VZIPL:
14632 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, OpLHS, OpRHS);
14633 case OP_VZIPR:
14634 return DAG.getNode(AArch64ISD::ZIP2, DL, VT, OpLHS, OpRHS);
14635 case OP_VTRNL:
14636 return DAG.getNode(AArch64ISD::TRN1, DL, VT, OpLHS, OpRHS);
14637 case OP_VTRNR:
14638 return DAG.getNode(AArch64ISD::TRN2, DL, VT, OpLHS, OpRHS);
14639 }
14640}
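// Worked example of the base-9 lane packing consumed above: each of the four
// lanes is one base-9 digit, with 8 standing for an undef lane. The LHS
// identity <0,1,2,3> therefore packs as ((0*9+1)*9+2)*9+3 == 102 and the RHS
// identity <4,5,6,7> as ((4*9+5)*9+6)*9+7 == 3382, which are exactly the two
// values the OP_COPY case accepts; getPFIDLane simply peels these digits back
// off to recover a single lane.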
14641
14643 SelectionDAG &DAG) {
14644 // Check to see if we can use the TBL instruction.
14645 SDValue V1 = Op.getOperand(0);
14646 SDValue V2 = Op.getOperand(1);
14647 SDLoc DL(Op);
14648
14649 EVT EltVT = Op.getValueType().getVectorElementType();
14650 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
14651
14652 bool Swap = false;
14653 if (V1.isUndef() || isZerosVector(V1.getNode())) {
14654 std::swap(V1, V2);
14655 Swap = true;
14656 }
14657
14658 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
14659 // out of range values with 0s. We do need to make sure that any out-of-range
14660 // values are really out-of-range for a v16i8 vector.
14661 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
14662 MVT IndexVT = MVT::v8i8;
14663 unsigned IndexLen = 8;
14664 if (Op.getValueSizeInBits() == 128) {
14665 IndexVT = MVT::v16i8;
14666 IndexLen = 16;
14667 }
14668
14670 for (int Val : ShuffleMask) {
14671 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
14672 unsigned Offset = Byte + Val * BytesPerElt;
14673 if (Swap)
14674 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
14675 if (IsUndefOrZero && Offset >= IndexLen)
14676 Offset = 255;
14677 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
14678 }
14679 }
14680
14681 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
14682 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
14683
14684 SDValue Shuffle;
14685 if (IsUndefOrZero) {
14686 if (IndexLen == 8)
14687 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
14688 Shuffle = DAG.getNode(
14689 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14690 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
14691 V1Cst,
14692 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14693 } else {
14694 if (IndexLen == 8) {
14695 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
14696 Shuffle = DAG.getNode(
14697 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14698 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
14699 V1Cst,
14700 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14701 } else {
14702 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
14703 // cannot currently represent the register constraints on the input
14704 // table registers.
14705 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
14706 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
14707 // IndexLen));
14708 Shuffle = DAG.getNode(
14709 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14710 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32),
14711 V1Cst, V2Cst,
14712 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14713 }
14714 }
14715 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
14716}
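// Worked example (for illustration): shuffling two v4i16 vectors with mask
// <0, 5, 1, 6> uses BytesPerElt = 2, so each mask element Val expands to byte
// indices Val*2 and Val*2+1, giving 0,1, 10,11, 2,3, 12,13. Indices 8..15
// select bytes of V2 once both sources are concatenated into one v16i8 table;
// if V2 were undef or zero, those indices would instead be forced to 255 so
// that TBL1 writes zeros for them.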
14717
14718static unsigned getDUPLANEOp(EVT EltType) {
14719 if (EltType == MVT::i8)
14720 return AArch64ISD::DUPLANE8;
14721 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
14722 return AArch64ISD::DUPLANE16;
14723 if (EltType == MVT::i32 || EltType == MVT::f32)
14724 return AArch64ISD::DUPLANE32;
14725 if (EltType == MVT::i64 || EltType == MVT::f64)
14726 return AArch64ISD::DUPLANE64;
14727
14728 llvm_unreachable("Invalid vector element type?");
14729}
14730
14731static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
14732 unsigned Opcode, SelectionDAG &DAG) {
14733 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
14734 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
14735 // Match: dup (bitcast (extract_subv X, C)), LaneC
14736 if (BitCast.getOpcode() != ISD::BITCAST ||
14737 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
14738 return false;
14739
14740 // The extract index must align in the destination type. That may not
14741 // happen if the bitcast is from narrow to wide type.
14742 SDValue Extract = BitCast.getOperand(0);
14743 unsigned ExtIdx = Extract.getConstantOperandVal(1);
14744 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
14745 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
14746 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
14747 if (ExtIdxInBits % CastedEltBitWidth != 0)
14748 return false;
14749
14750 // Can't handle cases where vector size is not 128-bit
14751 if (!Extract.getOperand(0).getValueType().is128BitVector())
14752 return false;
14753
14754 // Update the lane value by offsetting with the scaled extract index.
14755 LaneC += ExtIdxInBits / CastedEltBitWidth;
14756
14757 // Determine the casted vector type of the wide vector input.
14758 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
14759 // Examples:
14760 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
14761 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
14762 unsigned SrcVecNumElts =
14763 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
14764 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getVectorElementType(),
14765 SrcVecNumElts);
14766 return true;
14767 };
14768 MVT CastVT;
14769 if (getScaledOffsetDup(V, Lane, CastVT)) {
14770 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
14771 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14772 V.getOperand(0).getValueType().is128BitVector()) {
14773 // The lane is incremented by the index of the extract.
14774 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
14775 Lane += V.getConstantOperandVal(1);
14776 V = V.getOperand(0);
14777 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
14778 // The lane is decremented if we are splatting from the 2nd operand.
14779 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
14780 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
14781 Lane -= Idx * VT.getVectorNumElements() / 2;
14782 V = WidenVector(V.getOperand(Idx), DAG);
14783 } else if (VT.getSizeInBits() == 64) {
14784 // Widen the operand to 128-bit register with undef.
14785 V = WidenVector(V, DAG);
14786 }
14787 return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
14788}
14789
14790// Try to widen element type to get a new mask value for a better permutation
14791// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
14792// UZP1/2, TRN1/2, REV, INS, etc.
14793// For example:
14794// shufflevector <4 x i32> %a, <4 x i32> %b,
14795// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
14796// is equivalent to:
14797// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
14798// Finally, we can get:
14799// mov v0.d[0], v1.d[1]
14800static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
14801 SDLoc DL(Op);
14802 EVT VT = Op.getValueType();
14803 EVT ScalarVT = VT.getVectorElementType();
14804 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
14805 SDValue V0 = Op.getOperand(0);
14806 SDValue V1 = Op.getOperand(1);
14807 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14808
14809 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
14810 // We need to make sure the wider element type is legal. Thus, ElementSize
14811 // should not be larger than 32 bits, and the i1 type should also be excluded.
14812 if (ElementSize > 32 || ElementSize == 1)
14813 return SDValue();
14814
14815 SmallVector<int, 8> NewMask;
14816 if (widenShuffleMaskElts(Mask, NewMask)) {
14817 MVT NewEltVT = VT.isFloatingPoint()
14818 ? MVT::getFloatingPointVT(ElementSize * 2)
14819 : MVT::getIntegerVT(ElementSize * 2);
14820 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14821 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14822 V0 = DAG.getBitcast(NewVT, V0);
14823 V1 = DAG.getBitcast(NewVT, V1);
14824 return DAG.getBitcast(VT,
14825 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
14826 }
14827 }
14828
14829 return SDValue();
14830}
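// Worked example (for illustration): the <4 x i32> mask <6, 7, 2, 3> above
// widens because each pair is consecutive and starts on an even index:
// (6,7) -> 3 and (2,3) -> 1, producing the <2 x i64> mask <3, 1> that maps to
// a single "mov v0.d[0], v1.d[1]".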
14831
14832// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
14833static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
14834 ArrayRef<int> ShuffleMask,
14835 SelectionDAG &DAG) {
14836 SDValue Tbl1 = Op->getOperand(0);
14837 SDValue Tbl2 = Op->getOperand(1);
14838 SDLoc DL(Op);
14839 SDValue Tbl2ID =
14840 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i64);
14841
14842 EVT VT = Op.getValueType();
14843 if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14844 Tbl1.getOperand(0) != Tbl2ID ||
14845 Tbl2.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14846 Tbl2.getOperand(0) != Tbl2ID)
14847 return SDValue();
14848
14849 if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
14850 return SDValue();
14851
14852 SDValue Mask1 = Tbl1.getOperand(3);
14853 SDValue Mask2 = Tbl2.getOperand(3);
14854 if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
14855 Mask2.getOpcode() != ISD::BUILD_VECTOR)
14856 return SDValue();
14857
14858 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
14859 for (unsigned I = 0; I < 16; I++) {
14860 if (ShuffleMask[I] < 16)
14861 TBLMaskParts[I] = Mask1.getOperand(ShuffleMask[I]);
14862 else {
14863 auto *C = dyn_cast<ConstantSDNode>(Mask2.getOperand(ShuffleMask[I] - 16));
14864 if (!C)
14865 return SDValue();
14866 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, DL, MVT::i32);
14867 }
14868 }
14869
14870 SDValue TBLMask = DAG.getBuildVector(VT, DL, TBLMaskParts);
14871 SDValue ID =
14872 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, DL, MVT::i64);
14873
14874 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
14875 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
14876 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
14877}
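// Sketch of the mask rewrite above (for illustration): result bytes whose
// shuffle index is < 16 reuse the first tbl2's mask entry unchanged, while
// indices >= 16 take the second tbl2's entry rebased by +32, since in the
// combined tbl4 the second pair of table registers covers bytes 32..63 of the
// 64-byte lookup table.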
14878
14879SDValue
14880AArch64TargetLowering::LowerEXTEND_VECTOR_INREG(SDValue Op,
14881 SelectionDAG &DAG) const {
14882 SDLoc DL(Op);
14883 EVT VT = Op.getValueType();
14884 assert(VT.isScalableVector() && "Unexpected result type!");
14885
14886 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG;
14887 unsigned UnpackOpcode = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14888
14889 // Repeatedly unpack Val until the result is of the desired type.
14890 SDValue Val = Op.getOperand(0);
14891 switch (Val.getSimpleValueType().SimpleTy) {
14892 default:
14893 return SDValue();
14894 case MVT::nxv16i8:
14895 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv8i16, Val);
14896 if (VT == MVT::nxv8i16)
14897 break;
14898 [[fallthrough]];
14899 case MVT::nxv8i16:
14900 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv4i32, Val);
14901 if (VT == MVT::nxv4i32)
14902 break;
14903 [[fallthrough]];
14904 case MVT::nxv4i32:
14905 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv2i64, Val);
14906 assert(VT == MVT::nxv2i64 && "Unexpected result type!");
14907 break;
14908 }
14909
14910 return Val;
14911}
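// Example of the unpack chain (for illustration): sign_extend_vector_inreg
// from nxv16i8 to nxv4i32 takes two steps, SUNPKLO nxv16i8 -> nxv8i16 and
// then SUNPKLO nxv8i16 -> nxv4i32, each step widening the low half of the
// lanes until the result type is reached.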
14912
14913// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
14914// but we don't have an appropriate instruction,
14915// so custom-lower it as ZIP1-with-zeros.
14916SDValue
14917AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
14918 SelectionDAG &DAG) const {
14919 SDLoc DL(Op);
14920 EVT VT = Op.getValueType();
14921
14922 if (VT.isScalableVector())
14923 return LowerEXTEND_VECTOR_INREG(Op, DAG);
14924
14925 SDValue SrcOp = Op.getOperand(0);
14926 EVT SrcVT = SrcOp.getValueType();
14927 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
14928 "Unexpected extension factor.");
14929 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
14930 // FIXME: support multi-step zipping?
14931 if (Scale != 2)
14932 return SDValue();
14933 SDValue Zeros = DAG.getConstant(0, DL, SrcVT);
14934 return DAG.getBitcast(VT,
14935 DAG.getNode(AArch64ISD::ZIP1, DL, SrcVT, SrcOp, Zeros));
14936}
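// Example (for illustration, little-endian): zero_extend_vector_inreg from
// v16i8 to v8i16 becomes ZIP1 of the source with a zero vector; interleaving
// the low eight source bytes with zero bytes is exactly the v8i16
// zero-extension once the result is bitcast back.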
14937
14938SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
14939 SelectionDAG &DAG) const {
14940 SDLoc DL(Op);
14941 EVT VT = Op.getValueType();
14942
14943 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
14944
14945 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14946 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
14947
14948 // Convert shuffles that are directly supported on NEON to target-specific
14949 // DAG nodes, instead of keeping them as shuffles and matching them again
14950 // during code selection. This is more efficient and avoids the possibility
14951 // of inconsistencies between legalization and selection.
14952 ArrayRef<int> ShuffleMask = SVN->getMask();
14953
14954 SDValue V1 = Op.getOperand(0);
14955 SDValue V2 = Op.getOperand(1);
14956
14957 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
14958 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
14959 "Unexpected VECTOR_SHUFFLE mask size!");
14960
14961 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
14962 return Res;
14963
14964 if (SVN->isSplat()) {
14965 int Lane = SVN->getSplatIndex();
14966 // If this is undef splat, generate it via "just" vdup, if possible.
14967 if (Lane == -1)
14968 Lane = 0;
14969
14970 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
14971 return DAG.getNode(AArch64ISD::DUP, DL, V1.getValueType(),
14972 V1.getOperand(0));
14973 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
14974 // constant. If so, we can just reference the lane's definition directly.
14975 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
14976 !isa<ConstantSDNode>(V1.getOperand(Lane)))
14977 return DAG.getNode(AArch64ISD::DUP, DL, VT, V1.getOperand(Lane));
14978
14979 // Otherwise, duplicate from the lane of the input vector.
14980 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
14981 return constructDup(V1, Lane, DL, VT, Opcode, DAG);
14982 }
14983
14984 // Check if the mask matches a DUP for a wider element
14985 for (unsigned LaneSize : {64U, 32U, 16U}) {
14986 unsigned Lane = 0;
14987 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
14988 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
14989 : LaneSize == 32 ? AArch64ISD::DUPLANE32
14990 : AArch64ISD::DUPLANE16;
14991 // Cast V1 to an integer vector with required lane size
14992 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
14993 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
14994 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
14995 V1 = DAG.getBitcast(NewVecTy, V1);
14996 // Construct the DUP instruction
14997 V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
14998 // Cast back to the original type
14999 return DAG.getBitcast(VT, V1);
15000 }
15001 }
15002
15003 unsigned NumElts = VT.getVectorNumElements();
15004 unsigned EltSize = VT.getScalarSizeInBits();
15005 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
15006 return DAG.getNode(AArch64ISD::REV64, DL, V1.getValueType(), V1);
15007 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
15008 return DAG.getNode(AArch64ISD::REV32, DL, V1.getValueType(), V1);
15009 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
15010 return DAG.getNode(AArch64ISD::REV16, DL, V1.getValueType(), V1);
15011
15012 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
15013 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
15014 SDValue Rev = DAG.getNode(AArch64ISD::REV64, DL, VT, V1);
15015 return DAG.getNode(AArch64ISD::EXT, DL, VT, Rev, Rev,
15016 DAG.getConstant(8, DL, MVT::i32));
15017 }
15018
15019 bool ReverseEXT = false;
15020 unsigned Imm;
15021 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
15022 if (ReverseEXT)
15023 std::swap(V1, V2);
15024 Imm *= getExtFactor(V1);
15025 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V2,
15026 DAG.getConstant(Imm, DL, MVT::i32));
15027 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
15028 Imm *= getExtFactor(V1);
15029 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V1,
15030 DAG.getConstant(Imm, DL, MVT::i32));
15031 }
15032
15033 unsigned WhichResult;
15034 unsigned OperandOrder;
15035 if (isZIPMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
15036 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
15037 return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
15038 OperandOrder == 0 ? V2 : V1);
15039 }
15040 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
15041 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
15042 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
15043 }
15044 if (isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
15045 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
15046 return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
15047 OperandOrder == 0 ? V2 : V1);
15048 }
15049
15050 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
15051 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
15052 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
15053 }
15054 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
15055 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
15056 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
15057 }
15058 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
15059 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
15060 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
15061 }
15062
15063 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
15064 return Concat;
15065
15066 bool DstIsLeft;
15067 int Anomaly;
15068 int NumInputElements = V1.getValueType().getVectorNumElements();
15069 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
15070 SDValue DstVec = DstIsLeft ? V1 : V2;
15071 SDValue DstLaneV = DAG.getConstant(Anomaly, DL, MVT::i64);
15072
15073 SDValue SrcVec = V1;
15074 int SrcLane = ShuffleMask[Anomaly];
15075 if (SrcLane >= NumInputElements) {
15076 SrcVec = V2;
15077 SrcLane -= NumElts;
15078 }
15079 SDValue SrcLaneV = DAG.getConstant(SrcLane, DL, MVT::i64);
15080
15081 EVT ScalarVT = VT.getVectorElementType();
15082
15083 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
15084 ScalarVT = MVT::i32;
15085
15086 return DAG.getNode(
15087 ISD::INSERT_VECTOR_ELT, DL, VT, DstVec,
15088 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SrcVec, SrcLaneV),
15089 DstLaneV);
15090 }
15091
15092 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
15093 return NewSD;
15094
15095 // If the shuffle is not directly supported and it has 4 elements, use
15096 // the PerfectShuffle-generated table to synthesize it from other shuffles.
15097 if (NumElts == 4) {
15098 unsigned PFIndexes[4];
15099 for (unsigned i = 0; i != 4; ++i) {
15100 if (ShuffleMask[i] < 0)
15101 PFIndexes[i] = 8;
15102 else
15103 PFIndexes[i] = ShuffleMask[i];
15104 }
15105
15106 // Compute the index in the perfect shuffle table.
15107 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
15108 PFIndexes[2] * 9 + PFIndexes[3];
15109 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
15110 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
15111 DL);
15112 }
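// Worked index example (for illustration): for mask <0, 1, 4, 5> the digits
// are 0, 1, 4 and 5 in base 9 (an undef lane is encoded as 8), so
// PFTableIndex = 0*729 + 1*81 + 4*9 + 5 = 122.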
15113
15114 // Check for a "select shuffle", generating a BSL to pick between lanes in
15115 // V1/V2.
15116 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
15117 assert(VT.getScalarSizeInBits() <= 32 &&
15118 "Expected larger vector element sizes to be handled already");
15119 SmallVector<SDValue> MaskElts;
15120 for (int M : ShuffleMask)
15121 MaskElts.push_back(DAG.getConstant(
15122 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, MVT::i32));
15123 EVT IVT = VT.changeVectorElementTypeToInteger();
15124 SDValue MaskConst = DAG.getBuildVector(IVT, DL, MaskElts);
15125 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, DL, IVT, MaskConst,
15126 DAG.getBitcast(IVT, V1),
15127 DAG.getBitcast(IVT, V2)));
15128 }
15129
15130 // Fall back to generating a TBL
15131 return GenerateTBL(Op, ShuffleMask, DAG);
15132}
15133
15134SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
15135 SelectionDAG &DAG) const {
15136 EVT VT = Op.getValueType();
15137
15138 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15139 return LowerToScalableOp(Op, DAG);
15140
15141 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
15142 "Unexpected vector type!");
15143
15144 // We can handle the constant cases during isel.
15145 if (isa<ConstantSDNode>(Op.getOperand(0)))
15146 return Op;
15147
15148 // There isn't a natural way to handle the general i1 case, so we use some
15149 // trickery with whilelo.
15150 SDLoc DL(Op);
15151 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
15152 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
15153 DAG.getValueType(MVT::i1));
15154 SDValue ID =
15155 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
15156 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
15157 if (VT == MVT::nxv1i1)
15158 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
15159 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
15160 Zero, SplatVal),
15161 Zero);
15162 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
15163}
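// Sketch of the whilelo trick above (for illustration): the i1 operand is
// sign-extended to an all-zeros or all-ones i64, so whilelo(0, SplatVal)
// produces either an all-false predicate (no index is unsigned-below 0) or an
// all-true one (every index is unsigned-below UINT64_MAX), i.e. a splat of
// the original bit.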
15164
15165SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
15166 SelectionDAG &DAG) const {
15167 SDLoc DL(Op);
15168
15169 EVT VT = Op.getValueType();
15170 if (!isTypeLegal(VT) || !VT.isScalableVector())
15171 return SDValue();
15172
15173 // Current lowering only supports the SVE-ACLE types.
15174 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
15175 return SDValue();
15176
15177 // The DUPQ operation is independent of element type so normalise to i64s.
15178 SDValue Idx128 = Op.getOperand(2);
15179
15180 // DUPQ can be used when idx is in range.
15181 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
15182 if (CIdx && (CIdx->getZExtValue() <= 3)) {
15183 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
15184 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
15185 }
15186
15187 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
15188
15189 // The ACLE says this must produce the same result as:
15190 // svtbl(data, svadd_x(svptrue_b64(),
15191 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
15192 // index * 2))
15193 SDValue One = DAG.getConstant(1, DL, MVT::i64);
15194 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
15195
15196 // create the vector 0,1,0,1,...
15197 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
15198 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
15199
15200 // create the vector idx64,idx64+1,idx64,idx64+1,...
15201 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
15202 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
15203 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
15204
15205 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
15206 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
15207 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
15208}
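// Worked example of the TBL fallback (for illustration): for a runtime index
// i, Idx64 = 2*i and the index vector is 2i, 2i+1, 2i, 2i+1, ..., so the TBL
// copies the i64 pair forming quadword i of the source into every 128-bit
// chunk of the result, matching the svdupq_lane ACLE semantics quoted above.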
15209
15210
15211static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
15212 APInt &UndefBits) {
15213 EVT VT = BVN->getValueType(0);
15214 APInt SplatBits, SplatUndef;
15215 unsigned SplatBitSize;
15216 bool HasAnyUndefs;
15217 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
15218 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
15219
15220 for (unsigned i = 0; i < NumSplats; ++i) {
15221 CnstBits <<= SplatBitSize;
15222 UndefBits <<= SplatBitSize;
15223 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
15224 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
15225 }
15226
15227 return true;
15228 }
15229
15230 return false;
15231}
15232
15233// Try 64-bit splatted SIMD immediate.
15234static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15235 const APInt &Bits) {
15236 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15237 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15238 EVT VT = Op.getValueType();
15239 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
15240
15240
15241 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
15242 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
15243
15244 SDLoc DL(Op);
15245 SDValue Mov =
15246 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15247 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15248 }
15249 }
15250
15251 return SDValue();
15252}
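// Example (for illustration): a v2i64 splat of 0x00FF00FF00FF00FF has every
// byte equal to 0x00 or 0xFF, so it satisfies the 64-bit byte-mask check
// above and can be materialised with a single MOVI rather than a
// constant-pool load.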
15253
15254// Try 32-bit splatted SIMD immediate.
15255static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15256 const APInt &Bits,
15257 const SDValue *LHS = nullptr) {
15258 EVT VT = Op.getValueType();
15259 if (VT.isFixedLengthVector() &&
15261 return SDValue();
15262
15263 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15264 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15265 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
15266 bool isAdvSIMDModImm = false;
15267 uint64_t Shift;
15268
15269 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
15270 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
15271 Shift = 0;
15272 }
15273 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
15274 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
15275 Shift = 8;
15276 }
15277 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
15278 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
15279 Shift = 16;
15280 }
15281 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
15282 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
15283 Shift = 24;
15284 }
15285
15286 if (isAdvSIMDModImm) {
15287 SDLoc DL(Op);
15288 SDValue Mov;
15289
15290 if (LHS)
15291 Mov = DAG.getNode(NewOp, DL, MovTy,
15292 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
15293 DAG.getConstant(Value, DL, MVT::i32),
15294 DAG.getConstant(Shift, DL, MVT::i32));
15295 else
15296 Mov =
15297 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
15298 DAG.getConstant(Shift, DL, MVT::i32));
15299
15300 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15301 }
15302 }
15303
15304 return SDValue();
15305}
15306
15307// Try 16-bit splatted SIMD immediate.
15308static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15309 const APInt &Bits,
15310 const SDValue *LHS = nullptr) {
15311 EVT VT = Op.getValueType();
15312 if (VT.isFixedLengthVector() &&
15314 return SDValue();
15315
15316 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15317 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15318 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
15319 bool isAdvSIMDModImm = false;
15320 uint64_t Shift;
15321
15322 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
15323 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
15324 Shift = 0;
15325 }
15326 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
15327 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
15328 Shift = 8;
15329 }
15330
15331 if (isAdvSIMDModImm) {
15332 SDLoc DL(Op);
15333 SDValue Mov;
15334
15335 if (LHS)
15336 Mov = DAG.getNode(NewOp, DL, MovTy,
15337 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
15338 DAG.getConstant(Value, DL, MVT::i32),
15339 DAG.getConstant(Shift, DL, MVT::i32));
15340 else
15341 Mov =
15342 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
15343 DAG.getConstant(Shift, DL, MVT::i32));
15344
15345 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15346 }
15347 }
15348
15349 return SDValue();
15350}
15351
15352// Try 32-bit splatted SIMD immediate with shifted ones.
15354 SelectionDAG &DAG, const APInt &Bits) {
15355 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15356 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15357 EVT VT = Op.getValueType();
15358 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
15359 bool isAdvSIMDModImm = false;
15360 uint64_t Shift;
15361
15362 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
15363 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
15364 Shift = 264;
15365 }
15366 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
15367 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
15368 Shift = 272;
15369 }
15370
15371 if (isAdvSIMDModImm) {
15372 SDLoc DL(Op);
15373 SDValue Mov =
15374 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
15375 DAG.getConstant(Shift, DL, MVT::i32));
15376 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15377 }
15378 }
15379
15380 return SDValue();
15381}
15382
15383// Try 8-bit splatted SIMD immediate.
15384static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15385 const APInt &Bits) {
15386 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15387 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15388 EVT VT = Op.getValueType();
15389 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
15390
15391 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
15392 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
15393
15394 SDLoc DL(Op);
15395 SDValue Mov =
15396 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15397 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15398 }
15399 }
15400
15401 return SDValue();
15402}
15403
15404// Try FP splatted SIMD immediate.
15405static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15406 const APInt &Bits) {
15407 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15408 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15409 EVT VT = Op.getValueType();
15410 bool isWide = (VT.getSizeInBits() == 128);
15411 MVT MovTy;
15412 bool isAdvSIMDModImm = false;
15413
15414 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
15415 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
15416 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
15417 }
15418 else if (isWide &&
15419 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
15420 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
15421 MovTy = MVT::v2f64;
15422 }
15423
15424 if (isAdvSIMDModImm) {
15425 SDLoc DL(Op);
15426 SDValue Mov =
15427 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15428 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15429 }
15430 }
15431
15432 return SDValue();
15433}
15434
15435// Specialized code to quickly find if PotentialBVec is a BuildVector that
15436// consists of only the same constant int value, returned in reference arg
15437// ConstVal
15438static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
15439 uint64_t &ConstVal) {
15440 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
15441 if (!Bvec)
15442 return false;
15443 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
15444 if (!FirstElt)
15445 return false;
15446 EVT VT = Bvec->getValueType(0);
15447 unsigned NumElts = VT.getVectorNumElements();
15448 for (unsigned i = 1; i < NumElts; ++i)
15449 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
15450 return false;
15451 ConstVal = FirstElt->getZExtValue();
15452 return true;
15453}
15454
15455static bool isAllInactivePredicate(SDValue N) {
15456 // Look through cast.
15457 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
15458 N = N.getOperand(0);
15459
15460 return ISD::isConstantSplatVectorAllZeros(N.getNode());
15461}
15462
15463static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
15464 unsigned NumElts = N.getValueType().getVectorMinNumElements();
15465
15466 // Look through cast.
15467 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
15468 N = N.getOperand(0);
15469 // When reinterpreting from a type with fewer elements the "new" elements
15470 // are not active, so bail if they're likely to be used.
15471 if (N.getValueType().getVectorMinNumElements() < NumElts)
15472 return false;
15473 }
15474
15475 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
15476 return true;
15477
15478 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
15479 // or smaller than the implicit element type represented by N.
15480 // NOTE: A larger element count implies a smaller element type.
15481 if (N.getOpcode() == AArch64ISD::PTRUE &&
15482 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
15483 return N.getValueType().getVectorMinNumElements() >= NumElts;
15484
15485 return false;
15486}
15487
15488// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
15489// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
15490// BUILD_VECTORs with constant element C1, C2 is a constant, and:
15491// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
15492// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
15493// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
15494static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
15495 EVT VT = N->getValueType(0);
15496
15497 if (!VT.isVector())
15498 return SDValue();
15499
15500 SDLoc DL(N);
15501
15502 SDValue And;
15503 SDValue Shift;
15504
15505 SDValue FirstOp = N->getOperand(0);
15506 unsigned FirstOpc = FirstOp.getOpcode();
15507 SDValue SecondOp = N->getOperand(1);
15508 unsigned SecondOpc = SecondOp.getOpcode();
15509
15510 // Is one of the operands an AND or a BICi? The AND may have been optimised to
15511 // a BICi in order to use an immediate instead of a register.
15512 // Is the other operand a shl or lshr? This will have been turned into:
15513 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
15514 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
15515 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
15516 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
15517 SecondOpc == AArch64ISD::SHL_PRED ||
15518 SecondOpc == AArch64ISD::SRL_PRED)) {
15519 And = FirstOp;
15520 Shift = SecondOp;
15521
15522 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
15523 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
15524 FirstOpc == AArch64ISD::SHL_PRED ||
15525 FirstOpc == AArch64ISD::SRL_PRED)) {
15526 And = SecondOp;
15527 Shift = FirstOp;
15528 } else
15529 return SDValue();
15530
15531 bool IsAnd = And.getOpcode() == ISD::AND;
15532 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
15533 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15534 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
15535 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15536
15537 // Is the shift amount constant and are all lanes active?
15538 uint64_t C2;
15539 if (ShiftHasPredOp) {
15540 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
15541 return SDValue();
15542 APInt C;
15543 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
15544 return SDValue();
15545 C2 = C.getZExtValue();
15546 } else if (ConstantSDNode *C2node =
15547 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
15548 C2 = C2node->getZExtValue();
15549 else
15550 return SDValue();
15551
15552 APInt C1AsAPInt;
15553 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
15554 if (IsAnd) {
15555 // Is the and mask vector all constant?
15556 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
15557 return SDValue();
15558 } else {
15559 // Reconstruct the corresponding AND immediate from the two BICi immediates.
15560 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
15561 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
15562 assert(C1nodeImm && C1nodeShift);
15563 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
15564 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
15565 }
15566
15567 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
15568 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
15569 // how much one can shift elements of a particular size?
15570 if (C2 > ElemSizeInBits)
15571 return SDValue();
15572
15573 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
15574 : APInt::getLowBitsSet(ElemSizeInBits, C2);
15575 if (C1AsAPInt != RequiredC1)
15576 return SDValue();
15577
15578 SDValue X = And.getOperand(0);
15579 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
15580 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
15581 : Shift.getOperand(1);
15582
15583 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
15584 return DAG.getNode(Inst, DL, VT, X, Y, Imm);
15585}
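// Worked example (for illustration): with 32-bit lanes,
//   (or (and X, splat(0x00FFFFFF)), (AArch64ISD::VSHL Y, #24))
// has C1 == ~(Ones << 24), so it becomes SLI .4s #24: the shifted-in bits of
// Y overwrite only the top byte of each lane while X supplies the low bits.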
15586
15587static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) {
15588 EVT VT = N->getValueType(0);
15589 assert(VT.isVector() && "Expected vector type in tryLowerToBSL\n");
15590 SDLoc DL(N);
15591 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15592
15593 if (VT.isScalableVector() && !Subtarget.hasSVE2())
15594 return SDValue();
15595
15596 SDValue N0 = N->getOperand(0);
15597 if (N0.getOpcode() != ISD::AND)
15598 return SDValue();
15599
15600 SDValue N1 = N->getOperand(1);
15601 if (N1.getOpcode() != ISD::AND)
15602 return SDValue();
15603
15604 // InstCombine does (not (neg a)) => (add a -1).
15605 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
15606 // Loop over all combinations of AND operands.
15607 for (int i = 1; i >= 0; --i) {
15608 for (int j = 1; j >= 0; --j) {
15609 SDValue O0 = N0->getOperand(i);
15610 SDValue O1 = N1->getOperand(j);
15611 SDValue Sub, Add, SubSibling, AddSibling;
15612
15613 // Find a SUB and an ADD operand, one from each AND.
15614 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
15615 Sub = O0;
15616 Add = O1;
15617 SubSibling = N0->getOperand(1 - i);
15618 AddSibling = N1->getOperand(1 - j);
15619 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
15620 Add = O0;
15621 Sub = O1;
15622 AddSibling = N0->getOperand(1 - i);
15623 SubSibling = N1->getOperand(1 - j);
15624 } else
15625 continue;
15626
15627 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
15628 continue;
15629
15630 // The all-ones constant (-1) is always the right-hand operand of the Add.
15631 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
15632 continue;
15633
15634 if (Sub.getOperand(1) != Add.getOperand(0))
15635 continue;
15636
15637 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
15638 }
15639 }
15640
15641 // (or (and a b) (and (not a) c)) => (bsl a b c)
15642 // We only have to look for constant vectors here since the general, variable
15643 // case can be handled in TableGen.
15644 unsigned Bits = VT.getScalarSizeInBits();
15645 for (int i = 1; i >= 0; --i)
15646 for (int j = 1; j >= 0; --j) {
15647 APInt Val1, Val2;
15648
15649 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
15650 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
15651 ~Val1.trunc(Bits) == Val2.trunc(Bits)) {
15652 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15653 N0->getOperand(1 - i), N1->getOperand(1 - j));
15654 }
15655 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
15656 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
15657 if (!BVN0 || !BVN1)
15658 continue;
15659
15660 bool FoundMatch = true;
15661 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
15662 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
15663 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
15664 if (!CN0 || !CN1 ||
15665 CN0->getAPIntValue().trunc(Bits) !=
15666 ~CN1->getAsAPIntVal().trunc(Bits)) {
15667 FoundMatch = false;
15668 break;
15669 }
15670 }
15671 if (FoundMatch)
15672 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15673 N0->getOperand(1 - i), N1->getOperand(1 - j));
15674 }
15675
15676 return SDValue();
15677}
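// Example of the constant-splat path (for illustration): with v4i32 operands,
//   (or (and A, splat(0x0000FFFF)), (and B, splat(0xFFFF0000)))
// has complementary masks, so it is emitted as a single BSP/BSL that selects
// the low halfword of each lane from A and the high halfword from B.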
15678
15679SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
15680 SelectionDAG &DAG) const {
15681 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15682 !Subtarget->isNeonAvailable()))
15683 return LowerToScalableOp(Op, DAG);
15684
15685 if (SDValue Res = tryLowerToBSL(Op, DAG))
15686 return Res;
15687
15688 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
15689 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
15690 return Res;
15691
15692 EVT VT = Op.getValueType();
15693 if (VT.isScalableVector())
15694 return Op;
15695
15696 SDValue LHS = Op.getOperand(0);
15697 BuildVectorSDNode *BVN =
15698 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
15699 if (!BVN) {
15700 // OR commutes, so try swapping the operands.
15701 LHS = Op.getOperand(1);
15702 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
15703 }
15704 if (!BVN)
15705 return Op;
15706
15707 APInt DefBits(VT.getSizeInBits(), 0);
15708 APInt UndefBits(VT.getSizeInBits(), 0);
15709 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15710 SDValue NewOp;
15711
15712 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15713 DefBits, &LHS)) ||
15714 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15715 DefBits, &LHS)))
15716 return NewOp;
15717
15718 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15719 UndefBits, &LHS)) ||
15720 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15721 UndefBits, &LHS)))
15722 return NewOp;
15723 }
15724
15725 // We can always fall back to a non-immediate OR.
15726 return Op;
15727}
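// Example (for illustration): (or X, splat(0x0000FF00)) over v4i32 matches a
// 32-bit modified immediate (0xFF shifted left by 8), so it lowers to
// "orr v0.4s, #0xff, lsl #8" applied directly to X instead of materialising
// the constant in a register first.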
15728
15729// Normalize the operands of BUILD_VECTOR. The value of constant operands will
15730// be truncated to fit element width.
15731static SDValue NormalizeBuildVector(SDValue Op,
15732 SelectionDAG &DAG) {
15733 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
15734 SDLoc DL(Op);
15735 EVT VT = Op.getValueType();
15736 EVT EltTy = VT.getVectorElementType();
15737
15738 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
15739 return Op;
15740
15741 SmallVector<SDValue, 16> Ops;
15742 for (SDValue Lane : Op->ops()) {
15743 // For integer vectors, type legalization would have promoted the
15744 // operands already. Otherwise, if Op is a floating-point splat
15745 // (with operands cast to integers), then the only possibilities
15746 // are constants and UNDEFs.
15747 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
15748 Lane = DAG.getConstant(
15749 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
15750 DL, MVT::i32);
15751 } else if (Lane.getNode()->isUndef()) {
15752 Lane = DAG.getUNDEF(MVT::i32);
15753 } else {
15754 assert(Lane.getValueType() == MVT::i32 &&
15755 "Unexpected BUILD_VECTOR operand type");
15756 }
15757 Ops.push_back(Lane);
15758 }
15759 return DAG.getBuildVector(VT, DL, Ops);
15760}
15761
15762static SDValue trySVESplat64(SDValue Op, SelectionDAG &DAG,
15763 const AArch64Subtarget *ST, APInt &DefBits) {
15764 EVT VT = Op.getValueType();
15765 // TODO: We should be able to support 64-bit destinations too
15766 if (!ST->hasSVE() || !VT.is128BitVector() ||
15767 DefBits.getHiBits(64) != DefBits.getLoBits(64))
15768 return SDValue();
15769
15770 // See if we can make use of the SVE dup instruction.
15771 APInt Val64 = DefBits.trunc(64);
15772 int32_t ImmVal, ShiftVal;
15773 uint64_t Encoding;
15774 if (!AArch64_AM::isSVECpyDupImm(64, Val64.getSExtValue(), ImmVal, ShiftVal) &&
15775 !AArch64_AM::isSVELogicalImm(64, Val64.getZExtValue(), Encoding))
15776 return SDValue();
15777
15778 SDLoc DL(Op);
15779 SDValue SplatVal = DAG.getSplatVector(MVT::nxv2i64, DL,
15780 DAG.getConstant(Val64, DL, MVT::i64));
15781 SDValue Res = convertFromScalableVector(DAG, MVT::v2i64, SplatVal);
15782 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Res);
15783}
15784
15785static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
15786 const AArch64Subtarget *ST) {
15787 EVT VT = Op.getValueType();
15788 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
15789 "Expected a legal NEON vector");
15790
15791 APInt DefBits(VT.getSizeInBits(), 0);
15792 APInt UndefBits(VT.getSizeInBits(), 0);
15793 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15794 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15795 auto TryMOVIWithBits = [&](APInt DefBits) {
15796 SDValue NewOp;
15797 if ((NewOp =
15798 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
15799 (NewOp =
15800 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15801 (NewOp =
15802 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
15803 (NewOp =
15804 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15805 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
15806 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
15807 return NewOp;
15808
15809 APInt NotDefBits = ~DefBits;
15810 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
15811 NotDefBits)) ||
15812 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
15813 NotDefBits)) ||
15814 (NewOp =
15815 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
15816 return NewOp;
15817 return SDValue();
15818 };
15819 if (SDValue R = TryMOVIWithBits(DefBits))
15820 return R;
15821 if (SDValue R = TryMOVIWithBits(UndefBits))
15822 return R;
15823
15824 // Try to materialise the constant using SVE when available.
15825 if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits))
15826 return R;
15827
15828 // See if a fneg of the constant can be materialized with a MOVI, etc
15829 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
15830 // FNegate each sub-element of the constant
15831 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
15832 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
15833 .zext(VT.getSizeInBits());
15834 APInt NegBits(VT.getSizeInBits(), 0);
15835 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
15836 for (unsigned i = 0; i < NumElts; i++)
15837 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
15838 NegBits = DefBits ^ NegBits;
15839
15840 // Try to create the new constants with MOVI, and if so generate a fneg
15841 // for it.
15842 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
15843 SDLoc DL(Op);
15844 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
15845 return DAG.getNode(
15846 AArch64ISD::NVCAST, DL, VT,
15847 DAG.getNode(ISD::FNEG, DL, VFVT,
15848 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
15849 }
15850 return SDValue();
15851 };
15852 SDValue R;
15853 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
15854 (R = TryWithFNeg(DefBits, MVT::f64)) ||
15855 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
15856 return R;
15857 }
15858
15859 return SDValue();
15860}
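// Example of the FNEG trick above (for illustration): a v4f32 splat of -0.0f
// (0x80000000 per lane) has no direct encoding, but flipping each lane's sign
// bit leaves all-zero bits, which MOVI can produce, so the constant is
// rebuilt as (fneg (movi #0)).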
15861
15862SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
15863 SDValue Op, SelectionDAG &DAG) const {
15864 EVT VT = Op.getValueType();
15865 SDLoc DL(Op);
15866 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
15867 auto *BVN = cast<BuildVectorSDNode>(Op);
15868
15869 if (auto SeqInfo = BVN->isConstantSequence()) {
15870 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
15871 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
15872 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
15873 return convertFromScalableVector(DAG, VT, Seq);
15874 }
15875
15876 unsigned NumElems = VT.getVectorNumElements();
15877 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
15878 NumElems <= 1 || BVN->isConstant())
15879 return SDValue();
15880
15881 auto IsExtractElt = [](SDValue Op) {
15882 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
15883 };
15884
15885 // For integer types that are not already in vectors limit to at most four
15886 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
15887 if (VT.getScalarType().isInteger() &&
15888 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
15889 return SDValue();
15890
15891 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
15892 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
15893 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
15894 Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
15895 return Op.isUndef() ? Undef
15896 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
15897 ContainerVT, Undef, Op, ZeroI64);
15898 });
15899
15900 ElementCount ZipEC = ContainerVT.getVectorElementCount();
15901 while (Intermediates.size() > 1) {
15902 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
15903
15904 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
15905 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
15906 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
15907 Intermediates[I / 2] =
15908 Op1.isUndef() ? Op0
15909 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
15910 }
15911
15912 Intermediates.resize(Intermediates.size() / 2);
15913 ZipEC = ZipEC.divideCoefficientBy(2);
15914 }
15915
15916 assert(Intermediates.size() == 1);
15917 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
15918 return convertFromScalableVector(DAG, VT, Vec);
15919}
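// Sketch of the ZIP1 tree above (for illustration): for a v4i32 build_vector
// a,b,c,d, each element is first inserted into lane 0 of its own vector;
// zip1(a,b) and zip1(c,d) then interleave pairs at 32 bits, and a final zip1
// at 64-bit granularity combines them, leaving a,b,c,d in lanes 0..3.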
15920
15921SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
15922 SelectionDAG &DAG) const {
15923 EVT VT = Op.getValueType();
15924
15925 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15926 cast<BuildVectorSDNode>(Op)->isConstantSequence();
15927 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
15928 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
15929
15930 // Try to build a simple constant vector.
15931 Op = NormalizeBuildVector(Op, DAG);
15932 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
15933 // abort.
15934 if (Op.getOpcode() != ISD::BUILD_VECTOR)
15935 return SDValue();
15936
15937 // Certain vector constants, used to express things like logical NOT and
15938 // arithmetic NEG, are passed through unmodified. This allows special
15939 // patterns for these operations to match, which will lower these constants
15940 // to whatever is proven necessary.
15941 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15942 if (BVN->isConstant()) {
15943 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
15944 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
15945 APInt Val(BitSize,
15946 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
15947 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
15948 return Op;
15949 }
15950 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
15951 if (Const->isZero() && !Const->isNegative())
15952 return Op;
15953 }
15954
15955 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
15956 return V;
15957
15958 // Scan through the operands to find some interesting properties we can
15959 // exploit:
15960 // 1) If only one value is used, we can use a DUP, or
15961 // 2) if only the low element is not undef, we can just insert that, or
15962 // 3) if only one constant value is used (w/ some non-constant lanes),
15963 // we can splat the constant value into the whole vector then fill
15964 // in the non-constant lanes.
15965 // 4) FIXME: If different constant values are used, but we can intelligently
15966 // select the values we'll be overwriting for the non-constant
15967 // lanes such that we can directly materialize the vector
15968 // some other way (MOVI, e.g.), we can be sneaky.
15969 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
15970 SDLoc DL(Op);
15971 unsigned NumElts = VT.getVectorNumElements();
15972 bool isOnlyLowElement = true;
15973 bool usesOnlyOneValue = true;
15974 bool usesOnlyOneConstantValue = true;
15975 bool isConstant = true;
15976 bool AllLanesExtractElt = true;
15977 unsigned NumConstantLanes = 0;
15978 unsigned NumDifferentLanes = 0;
15979 unsigned NumUndefLanes = 0;
15980 SDValue Value;
15981 SDValue ConstantValue;
15982 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
15983 unsigned ConsecutiveValCount = 0;
15984 SDValue PrevVal;
15985 for (unsigned i = 0; i < NumElts; ++i) {
15986 SDValue V = Op.getOperand(i);
15987 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15988 AllLanesExtractElt = false;
15989 if (V.isUndef()) {
15990 ++NumUndefLanes;
15991 continue;
15992 }
15993 if (i > 0)
15994 isOnlyLowElement = false;
15995 if (!isIntOrFPConstant(V))
15996 isConstant = false;
15997
15998 if (isIntOrFPConstant(V)) {
15999 ++NumConstantLanes;
16000 if (!ConstantValue.getNode())
16001 ConstantValue = V;
16002 else if (ConstantValue != V)
16003 usesOnlyOneConstantValue = false;
16004 }
16005
16006 if (!Value.getNode())
16007 Value = V;
16008 else if (V != Value) {
16009 usesOnlyOneValue = false;
16010 ++NumDifferentLanes;
16011 }
16012
16013 if (PrevVal != V) {
16014 ConsecutiveValCount = 0;
16015 PrevVal = V;
16016 }
16017
16018 // Keep each different value and its last consecutive count. For example,
16019 //
16020 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
16021 // t24, t24, t24, t24, t24, t24, t24, t24
16022 // t23 = consecutive count 8
16023 // t24 = consecutive count 8
16024 // ------------------------------------------------------------------
16025 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
16026 // t24, t24, t24, t24, t24, t24, t24, t24
16027 // t23 = consecutive count 5
16028 // t24 = consecutive count 9
16029 DifferentValueMap[V] = ++ConsecutiveValCount;
16030 }
16031
16032 if (!Value.getNode()) {
16033 LLVM_DEBUG(
16034 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
16035 return DAG.getUNDEF(VT);
16036 }
16037
16038 // Convert BUILD_VECTOR where all elements but the lowest are undef into
16039 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
16040 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
16041 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
16042 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
16043 "SCALAR_TO_VECTOR node\n");
16044 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
16045 }
16046
16047 if (AllLanesExtractElt) {
16048 SDNode *Vector = nullptr;
16049 bool Even = false;
16050 bool Odd = false;
16051 // Check whether the extract elements match the Even pattern <0,2,4,...> or
16052 // the Odd pattern <1,3,5,...>.
16053 for (unsigned i = 0; i < NumElts; ++i) {
16054 SDValue V = Op.getOperand(i);
16055 const SDNode *N = V.getNode();
16056 if (!isa<ConstantSDNode>(N->getOperand(1))) {
16057 Even = false;
16058 Odd = false;
16059 break;
16060 }
16061 SDValue N0 = N->getOperand(0);
16062
16063 // All elements are extracted from the same vector.
16064 if (!Vector) {
16065 Vector = N0.getNode();
16066 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
16067 // BUILD_VECTOR.
16068 if (VT.getVectorElementType() !=
16069 N0.getValueType().getVectorElementType())
16070 break;
16071 } else if (Vector != N0.getNode()) {
16072 Odd = false;
16073 Even = false;
16074 break;
16075 }
16076
16077 // Extracted values are either at Even indices <0,2,4,...> or at Odd
16078 // indices <1,3,5,...>.
16079 uint64_t Val = N->getConstantOperandVal(1);
16080 if (Val == 2 * i) {
16081 Even = true;
16082 continue;
16083 }
16084 if (Val - 1 == 2 * i) {
16085 Odd = true;
16086 continue;
16087 }
16088
16089 // Something does not match: abort.
16090 Odd = false;
16091 Even = false;
16092 break;
16093 }
16094 if (Even || Odd) {
16095 SDValue LHS =
16096 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
16097 DAG.getConstant(0, DL, MVT::i64));
16098 SDValue RHS =
16099 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
16100 DAG.getConstant(NumElts, DL, MVT::i64));
16101
16102 if (Even && !Odd)
16103 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LHS, RHS);
16104 if (Odd && !Even)
16105 return DAG.getNode(AArch64ISD::UZP2, DL, VT, LHS, RHS);
16106 }
16107 }
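// Example (for illustration): a v4i16 build_vector of extracts
// <v[0], v[2], v[4], v[6]> from one v8i16 source matches the Even pattern and
// becomes UZP1 of the two halves of v; <v[1], v[3], v[5], v[7]> likewise
// becomes UZP2.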
16108
16109 // Use DUP for non-constant splats. For f32 constant splats, reduce to
16110 // i32 and try again.
16111 if (usesOnlyOneValue) {
16112 if (!isConstant) {
16113 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16114 Value.getValueType() != VT) {
16115 LLVM_DEBUG(
16116 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
16117 return DAG.getNode(AArch64ISD::DUP, DL, VT, Value);
16118 }
16119
16120 // This is actually a DUPLANExx operation, which keeps everything vectory.
16121
16122 SDValue Lane = Value.getOperand(1);
16123 Value = Value.getOperand(0);
16124 if (Value.getValueSizeInBits() == 64) {
16125 LLVM_DEBUG(
16126 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
16127 "widening it\n");
16128 Value = WidenVector(Value, DAG);
16129 }
16130
16131 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
16132 return DAG.getNode(Opcode, DL, VT, Value, Lane);
16133 }
16134
16135 if (VT.isFloatingPoint()) {
16136 SmallVector<SDValue, 8> Ops;
16137 EVT EltTy = VT.getVectorElementType();
16138 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
16139 EltTy == MVT::f64) && "Unsupported floating-point vector type");
16140 LLVM_DEBUG(
16141 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
16142 "BITCASTS, and try again\n");
16143 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
16144 for (unsigned i = 0; i < NumElts; ++i)
16145 Ops.push_back(DAG.getNode(ISD::BITCAST, DL, NewType, Op.getOperand(i)));
16146 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
16147 SDValue Val = DAG.getBuildVector(VecVT, DL, Ops);
16148 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
16149 Val.dump(););
16150 Val = LowerBUILD_VECTOR(Val, DAG);
16151 if (Val.getNode())
16152 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
16153 }
16154 }
16155
16156 // If we need to insert a small number of different non-constant elements and
16157 // the vector width is sufficiently large, prefer using DUP with the common
16158 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
16159 // skip the constant lane handling below.
16160 bool PreferDUPAndInsert =
16161 !isConstant && NumDifferentLanes >= 1 &&
16162 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
16163 NumDifferentLanes >= NumConstantLanes;
16164
16165 // If there was only one constant value used and for more than one lane,
16166 // start by splatting that value, then replace the non-constant lanes. This
16167 // is better than the default, which will perform a separate initialization
16168 // for each lane.
16169 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
16170 // Firstly, try to materialize the splat constant.
16171 SDValue Val = DAG.getSplatBuildVector(VT, DL, ConstantValue);
16172 unsigned BitSize = VT.getScalarSizeInBits();
16173 APInt ConstantValueAPInt(1, 0);
16174 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
16175 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
16176 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
16177 !ConstantValueAPInt.isAllOnes()) {
16178 Val = ConstantBuildVector(Val, DAG, Subtarget);
16179 if (!Val)
16180 // Otherwise, materialize the constant and splat it.
16181 Val = DAG.getNode(AArch64ISD::DUP, DL, VT, ConstantValue);
16182 }
16183
16184 // Now insert the non-constant lanes.
16185 for (unsigned i = 0; i < NumElts; ++i) {
16186 SDValue V = Op.getOperand(i);
16187 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
16188 if (!isIntOrFPConstant(V) && !V.isUndef())
16189 // Note that type legalization likely mucked about with the VT of the
16190 // source operand, so we may have to convert it here before inserting.
16191 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Val, V, LaneIdx);
16192 }
16193 return Val;
16194 }
16195
16196 // This will generate a load from the constant pool.
16197 if (isConstant) {
16198 LLVM_DEBUG(
16199 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
16200 "expansion\n");
16201 return SDValue();
16202 }
16203
16204 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
16205 // v4i32s. This is really a truncate, which we can construct out of (legal)
16206 // concats and truncate nodes.
16207 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
16208 return M;
16209
16210 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
16211 if (NumElts >= 4) {
16212 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
16213 return Shuffle;
16214
16215 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
16216 return Shuffle;
16217 }
16218
16219 if (PreferDUPAndInsert) {
16220 // First, build a constant vector with the common element.
16221 SmallVector<SDValue, 8> Ops(NumElts, Value);
16222 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
16223 // Next, insert the elements that do not match the common value.
16224 for (unsigned I = 0; I < NumElts; ++I)
16225 if (Op.getOperand(I) != Value)
16226 NewVector =
16227 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
16228 Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
16229
16230 return NewVector;
16231 }
16232
16233 // If vector consists of two different values, try to generate two DUPs and
16234 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
16235 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
16236 SmallVector<SDValue, 2> Vals;
16237 // Check that the consecutive count of each value is half the number of vector
16238 // elements. In this case, we can use CONCAT_VECTORS. For example,
16239 //
16240 // canUseVECTOR_CONCAT = true;
16241 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
16242 // t24, t24, t24, t24, t24, t24, t24, t24
16243 //
16244 // canUseVECTOR_CONCAT = false;
16245 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
16246 // t24, t24, t24, t24, t24, t24, t24, t24
16247 bool canUseVECTOR_CONCAT = true;
16248 for (auto Pair : DifferentValueMap) {
16250 // Check that the different values have the same count, which is NumElts / 2.
16250 if (Pair.second != NumElts / 2)
16251 canUseVECTOR_CONCAT = false;
16252 Vals.push_back(Pair.first);
16253 }
16254
16255 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
16256 // CONCAT_VECTORs. For example,
16257 //
16258 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
16259 // t24, t24, t24, t24, t24, t24, t24, t24
16260 // ==>
16261 // t26: v8i8 = AArch64ISD::DUP t23
16262 // t28: v8i8 = AArch64ISD::DUP t24
16263 // t29: v16i8 = concat_vectors t26, t28
16264 if (canUseVECTOR_CONCAT) {
16265 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16266 if (isTypeLegal(SubVT) && SubVT.isVector() &&
16267 SubVT.getVectorNumElements() >= 2) {
16268 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
16269 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
16270 SDValue DUP1 =
16271 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops1), DAG);
16272 SDValue DUP2 =
16273 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops2), DAG);
16274 SDValue CONCAT_VECTORS =
16275 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, DUP1, DUP2);
16276 return CONCAT_VECTORS;
16277 }
16278 }
16279
16280 // Let's try to generate VECTOR_SHUFFLE. For example,
16281 //
16282 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
16283 // ==>
16284 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
16285 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
16286 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
16287 if (NumElts >= 8) {
16288 SmallVector<int, 16> MaskVec;
16289 // Build mask for VECTOR_SHUFFLE.
16290 SDValue FirstLaneVal = Op.getOperand(0);
16291 for (unsigned i = 0; i < NumElts; ++i) {
16292 SDValue Val = Op.getOperand(i);
16293 if (FirstLaneVal == Val)
16294 MaskVec.push_back(i);
16295 else
16296 MaskVec.push_back(i + NumElts);
16297 }
16298
16299 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
16300 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
16301 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops1);
16302 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops2);
16303 SDValue VECTOR_SHUFFLE =
16304 DAG.getVectorShuffle(VT, DL, VEC1, VEC2, MaskVec);
16305 return VECTOR_SHUFFLE;
16306 }
16307 }
16308
16309 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
16310 // know the default expansion would otherwise fall back on something even
16311 // worse. For a vector with one or two non-undef values, that's
16312 // scalar_to_vector for the elements followed by a shuffle (provided the
16313 // shuffle is valid for the target) and materialization element by element
16314 // on the stack followed by a load for everything else.
16315 if (!isConstant && !usesOnlyOneValue) {
16316 LLVM_DEBUG(
16317 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
16318 "of INSERT_VECTOR_ELT\n");
16319
16320 SDValue Vec = DAG.getUNDEF(VT);
16321 SDValue Op0 = Op.getOperand(0);
16322 unsigned i = 0;
16323
16324 // Use SCALAR_TO_VECTOR for lane zero to
16325 // a) Avoid a RMW dependency on the full vector register, and
16326 // b) Allow the register coalescer to fold away the copy if the
16327 // value is already in an S or D register, and we're forced to emit an
16328 // INSERT_SUBREG that we can't fold anywhere.
16329 //
16330 // We also allow types like i8 and i16 which are illegal scalar but legal
16331 // vector element types. After type-legalization the inserted value is
16332 // extended (i32) and it is safe to cast it to the vector type by ignoring
16333 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
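// For example, a v4i32 built from four scalar values becomes one
// SCALAR_TO_VECTOR for lane 0 followed by three INSERT_VECTOR_ELT nodes,
// instead of four inserts into an undef vector.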
16334 if (!Op0.isUndef()) {
16335 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
16336 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Op0);
16337 ++i;
16338 }
16339 LLVM_DEBUG({
16340 if (i < NumElts)
16341 dbgs() << "Creating nodes for the other vector elements:\n";
16342 });
16343 for (; i < NumElts; ++i) {
16344 SDValue V = Op.getOperand(i);
16345 if (V.isUndef())
16346 continue;
16347 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
16348 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
16349 }
16350 return Vec;
16351 }
16352
16353 LLVM_DEBUG(
16354 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
16355 "better alternative\n");
16356 return SDValue();
16357}
16358
16359SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
16360 SelectionDAG &DAG) const {
16361 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
16362 !Subtarget->isNeonAvailable()))
16363 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
16364
16365 assert(Op.getValueType().isScalableVector() &&
16366 isTypeLegal(Op.getValueType()) &&
16367 "Expected legal scalable vector type!");
16368
16369 if (isTypeLegal(Op.getOperand(0).getValueType())) {
16370 unsigned NumOperands = Op->getNumOperands();
16371 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
16372 "Unexpected number of operands in CONCAT_VECTORS");
16373
16374 if (NumOperands == 2)
16375 return Op;
16376
16377 // Concat each pair of subvectors and pack into the lower half of the array.
16378 SmallVector<SDValue> ConcatOps(Op->ops());
16379 while (ConcatOps.size() > 1) {
16380 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
16381 SDValue V1 = ConcatOps[I];
16382 SDValue V2 = ConcatOps[I + 1];
16383 EVT SubVT = V1.getValueType();
16384 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
16385 ConcatOps[I / 2] =
16386 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
16387 }
16388 ConcatOps.resize(ConcatOps.size() / 2);
16389 }
16390 return ConcatOps[0];
16391 }
16392
16393 return SDValue();
16394}
16395
16396SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
16397 SelectionDAG &DAG) const {
16398 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
16399
16400 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
16401 !Subtarget->isNeonAvailable()))
16402 return LowerFixedLengthInsertVectorElt(Op, DAG);
16403
16404 EVT VT = Op.getOperand(0).getValueType();
16405
16406 if (VT.getScalarType() == MVT::i1) {
16407 EVT VectorVT = getPromotedVTForPredicate(VT);
16408 SDLoc DL(Op);
16409 SDValue ExtendedVector =
16410 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
16411 SDValue ExtendedValue =
16412 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
16413 VectorVT.getScalarType().getSizeInBits() < 32
16414 ? MVT::i32
16415 : VectorVT.getScalarType());
16416 ExtendedVector =
16417 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
16418 ExtendedValue, Op.getOperand(2));
16419 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
16420 }
16421
16422 // Check for non-constant or out of range lane.
16423 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
16424 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
16425 return SDValue();
16426
16427 return Op;
16428}
16429
16430SDValue
16431AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
16432 SelectionDAG &DAG) const {
16433 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
16434 EVT VT = Op.getOperand(0).getValueType();
16435
16436 if (VT.getScalarType() == MVT::i1) {
16437 SDLoc DL(Op);
16438 // There are no operations to extend a nxv1i1 predicate to a nxv1i128 vector
16439 // An easy lowering is widening the input predicate to nxv2i1.
16440 if (VT == MVT::nxv1i1) {
16441 SDValue WidenedPred = DAG.getInsertSubvector(
16442 DL, DAG.getPOISON(MVT::nxv2i1), Op->getOperand(0), 0);
16443 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
16444 WidenedPred, Op.getOperand(1));
16445 }
16446 // We can't directly extract from an SVE predicate; extend it first.
16447 // (This isn't the only possible lowering, but it's straightforward.)
16448 EVT VectorVT = getPromotedVTForPredicate(VT);
16449 SDValue Extend =
16450 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
16451 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
16452 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
16453 Extend, Op.getOperand(1));
16454 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
16455 }
16456
16457 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16458 return LowerFixedLengthExtractVectorElt(Op, DAG);
16459
16460 // Check for non-constant or out of range lane.
16461 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16462 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
16463 return SDValue();
16464
16465 // Insertion/extraction are legal for V128 types.
16466 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
16467 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
16468 VT == MVT::v8f16 || VT == MVT::v8bf16)
16469 return Op;
16470
16471 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
16472 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
16473 VT != MVT::v4bf16)
16474 return SDValue();
16475
16476 // For V64 types, we perform extraction by expanding the value
16477 // to a V128 type and perform the extraction on that.
16478 SDLoc DL(Op);
16479 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
16480 EVT WideTy = WideVec.getValueType();
16481
16482 EVT ExtrTy = WideTy.getVectorElementType();
16483 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
16484 ExtrTy = MVT::i32;
16485
16486 // For extractions, we just return the result directly.
16487 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
16488 Op.getOperand(1));
16489}
16490
16491SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
16492 SelectionDAG &DAG) const {
16493 EVT VT = Op.getValueType();
16495 "Only cases that extract a fixed length vector are supported!");
16496 EVT InVT = Op.getOperand(0).getValueType();
16497
16498 // If we don't have legal types yet, do nothing
16499 if (!isTypeLegal(InVT))
16500 return SDValue();
16501
16502 if (InVT.is128BitVector()) {
16503 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
16504 unsigned Idx = Op.getConstantOperandVal(1);
16505
16506 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
16507 if (Idx == 0)
16508 return Op;
16509
16510 // If this is extracting the upper 64-bits of a 128-bit vector, we match
16511 // that directly.
16512 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
16513 return Op;
16514 }
16515
16516 if (InVT.isScalableVector() ||
16517 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
16518 SDLoc DL(Op);
16519 SDValue Vec = Op.getOperand(0);
16520 SDValue Idx = Op.getOperand(1);
16521
16522 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
16523 if (PackedVT != InVT) {
16524 // Pack input into the bottom part of an SVE register and try again.
16525 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
16526 DAG.getUNDEF(PackedVT), Vec,
16527 DAG.getVectorIdxConstant(0, DL));
16528 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
16529 }
16530
16531 // This will get matched by custom code during ISelDAGToDAG.
16532 if (isNullConstant(Idx))
16533 return Op;
16534
16535 assert(InVT.isScalableVector() && "Unexpected vector type!");
16536 // Move requested subvector to the start of the vector and try again.
16537 SDValue Splice =
16538 DAG.getNode(ISD::VECTOR_SPLICE_LEFT, DL, InVT, Vec, Vec, Idx);
16539 return convertFromScalableVector(DAG, VT, Splice);
16540 }
16541
16542 return SDValue();
16543}
16544
16545SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
16546 SelectionDAG &DAG) const {
16547 assert(Op.getValueType().isScalableVector() &&
16548 "Only expect to lower inserts into scalable vectors!");
16549
16550 EVT InVT = Op.getOperand(1).getValueType();
16551 unsigned Idx = Op.getConstantOperandVal(2);
16552
16553 SDValue Vec0 = Op.getOperand(0);
16554 SDValue Vec1 = Op.getOperand(1);
16555 SDLoc DL(Op);
16556 EVT VT = Op.getValueType();
16557
16558 if (InVT.isScalableVector()) {
16559 if (!isTypeLegal(VT))
16560 return SDValue();
16561
16562 // Break down insert_subvector into simpler parts.
16563 if (VT.getVectorElementType() == MVT::i1) {
16564 unsigned NumElts = VT.getVectorMinNumElements();
16565 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16566
16567 SDValue Lo, Hi;
16568 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16569 DAG.getVectorIdxConstant(0, DL));
16570 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16571 DAG.getVectorIdxConstant(NumElts / 2, DL));
16572 if (Idx < (NumElts / 2))
16573 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
16574 DAG.getVectorIdxConstant(Idx, DL));
16575 else
16576 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
16577 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
16578
16579 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16580 }
16581
16582 // We can select these directly.
16583 if (isTypeLegal(InVT) && Vec0.isUndef())
16584 return Op;
16585
16586 // Ensure the subvector is half the size of the main vector.
16587 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
16588 return SDValue();
16589
16590 // Here narrow and wide refer to the vector element types. After "casting",
16591 // both vectors must have the same bit length, and because the subvector
16592 // has fewer elements, those elements need to be bigger.
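// For example, when inserting an nxv2f32 subvector into an nxv4f32 vector,
// NarrowVT is nxv4i32 and WideVT is nxv2i64; both cover the same number of
// bits per SVE vector granule.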
16593 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
16594 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
16595
16596 // NOP cast operands to the largest legal vector of the same element count.
16597 if (VT.isFloatingPoint()) {
16598 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
16599 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
16600 } else {
16601 // Legal integer vectors are already their largest so Vec0 is fine as is.
16602 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
16603 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
16604 }
16605
16606 // To replace the top/bottom half of vector V with vector SubV we widen the
16607 // preserved half of V, concatenate this to SubV (the order depending on the
16608 // half being replaced) and then narrow the result.
16609 SDValue Narrow;
16610 if (Idx == 0) {
16611 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
16612 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
16613 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
16614 } else {
16615 assert(Idx == InVT.getVectorMinNumElements() &&
16616 "Invalid subvector index!");
16617 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
16618 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
16619 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
16620 }
16621
16622 return getSVESafeBitCast(VT, Narrow, DAG);
16623 }
16624
16625 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
16626 // This will be matched by custom code during ISelDAGToDAG.
16627 if (Vec0.isUndef())
16628 return Op;
16629
16630 std::optional<unsigned> PredPattern =
16631 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
16632 auto PredTy = VT.changeVectorElementType(*DAG.getContext(), MVT::i1);
16633 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
16634 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
16635 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
16636 }
16637
16638 return SDValue();
16639}
16640
16641static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
16642 if (Op.getOpcode() != AArch64ISD::DUP &&
16643 Op.getOpcode() != ISD::SPLAT_VECTOR &&
16644 Op.getOpcode() != ISD::BUILD_VECTOR)
16645 return false;
16646
16647 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
16648 !isAllConstantBuildVector(Op, SplatVal))
16649 return false;
16650
16651 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
16652 !isa<ConstantSDNode>(Op->getOperand(0)))
16653 return false;
16654
16655 SplatVal = Op->getConstantOperandVal(0);
16656 if (Op.getValueType().getVectorElementType() != MVT::i64)
16657 SplatVal = (int32_t)SplatVal;
16658
16659 Negated = false;
16660 if (isPowerOf2_64(SplatVal))
16661 return true;
16662
16663 Negated = true;
16664 if (isPowerOf2_64(-SplatVal)) {
16665 SplatVal = -SplatVal;
16666 return true;
16667 }
16668
16669 return false;
16670}
16671
16672SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
16673 EVT VT = Op.getValueType();
16674 SDLoc DL(Op);
16675
16676 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
16677 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
16678
16679 assert(VT.isScalableVector() && "Expected a scalable vector.");
16680
16681 bool Signed = Op.getOpcode() == ISD::SDIV;
16682 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
16683
16684 bool Negated;
16685 uint64_t SplatVal;
16686 // NOTE: ASRD cannot be used to represent sdiv-by-one.
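// For example, sdiv X, 8 becomes ASRD X, #3, and sdiv X, -8 becomes the
// same shift followed by a negation of the result.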
16687 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) &&
16688 SplatVal > 1) {
16689 SDValue Pg = getPredicateForScalableVector(DAG, DL, VT);
16690 SDValue Res =
16691 DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
16692 DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
16693 if (Negated)
16694 Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
16695
16696 return Res;
16697 }
16698
16699 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
16700 return LowerToPredicatedOp(Op, DAG, PredOpcode);
16701
16702 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
16703 // operations, and truncate the result.
16704 EVT WidenedVT;
16705 if (VT == MVT::nxv16i8)
16706 WidenedVT = MVT::nxv8i16;
16707 else if (VT == MVT::nxv8i16)
16708 WidenedVT = MVT::nxv4i32;
16709 else
16710 llvm_unreachable("Unexpected Custom DIV operation");
16711
16712 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
16713 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
16714 SDValue Op0Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(0));
16715 SDValue Op1Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(1));
16716 SDValue Op0Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(0));
16717 SDValue Op1Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(1));
16718 SDValue ResultLo = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Lo, Op1Lo);
16719 SDValue ResultHi = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Hi, Op1Hi);
16720 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultLo);
16721 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultHi);
16722 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ResultLoCast, ResultHiCast);
16723}
16724
16725bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
16726 EVT VT, unsigned DefinedValues) const {
16727 if (!Subtarget->isNeonAvailable())
16728 return false;
16729 return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
16730}
16731
16732 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
16733 // Currently no fixed length shuffles that require SVE are legal.
16734 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16735 return false;
16736
16737 if (VT.getVectorNumElements() == 4 &&
16738 (VT.is128BitVector() || VT.is64BitVector())) {
16739 unsigned Cost = getPerfectShuffleCost(M);
16740 if (Cost <= 1)
16741 return true;
16742 }
16743
16744 bool DummyBool;
16745 int DummyInt;
16746 unsigned DummyUnsigned;
16747
16748 unsigned EltSize = VT.getScalarSizeInBits();
16749 unsigned NumElts = VT.getVectorNumElements();
16750 return (ShuffleVectorSDNode::isSplatMask(M) ||
16751 isREVMask(M, EltSize, NumElts, 64) ||
16752 isREVMask(M, EltSize, NumElts, 32) ||
16753 isREVMask(M, EltSize, NumElts, 16) ||
16754 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
16755 isSingletonEXTMask(M, VT, DummyUnsigned) ||
16756 isTRNMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
16757 isUZPMask(M, NumElts, DummyUnsigned) ||
16758 isZIPMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
16759 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
16760 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
16761 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
16762 isINSMask(M, NumElts, DummyBool, DummyInt) ||
16763 isConcatMask(M, VT, VT.getSizeInBits() == 128));
16764}
16765
16766 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
16767 EVT VT) const {
16768 // Just delegate to the generic legality, clear masks aren't special.
16769 return isShuffleMaskLegal(M, VT);
16770}
16771
16772/// getVShiftImm - Check if this is a valid build_vector for the immediate
16773/// operand of a vector shift operation, where all the elements of the
16774/// build_vector must have the same constant integer value.
16775static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
16776 // Ignore bit_converts.
16777 while (Op.getOpcode() == ISD::BITCAST)
16778 Op = Op.getOperand(0);
16779 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
16780 APInt SplatBits, SplatUndef;
16781 unsigned SplatBitSize;
16782 bool HasAnyUndefs;
16783 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
16784 HasAnyUndefs, ElementBits) ||
16785 SplatBitSize > ElementBits)
16786 return false;
16787 Cnt = SplatBits.getSExtValue();
16788 return true;
16789}
16790
16791/// isVShiftLImm - Check if this is a valid build_vector for the immediate
16792/// operand of a vector shift left operation. That value must be in the range:
16793/// 0 <= Value < ElementBits for a left shift; or
16794/// 0 <= Value <= ElementBits for a long left shift.
16795static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
16796 assert(VT.isVector() && "vector shift count is not a vector type");
16797 int64_t ElementBits = VT.getScalarSizeInBits();
16798 if (!getVShiftImm(Op, ElementBits, Cnt))
16799 return false;
16800 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
16801}
16802
16803/// isVShiftRImm - Check if this is a valid build_vector for the immediate
16804/// operand of a vector shift right operation. The value must be in the range:
16805 /// 1 <= Value <= ElementBits for a right shift; or 1 <= Value <= ElementBits/2 for a narrow right shift.
16806static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
16807 assert(VT.isVector() && "vector shift count is not a vector type");
16808 int64_t ElementBits = VT.getScalarSizeInBits();
16809 if (!getVShiftImm(Op, ElementBits, Cnt))
16810 return false;
16811 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
16812}
16813
16814SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
16815 SelectionDAG &DAG) const {
16816 EVT VT = Op.getValueType();
16817
16818 if (VT.getScalarType() == MVT::i1) {
16819 // Lower i1 truncate to `(x & 1) != 0`.
16820 SDLoc DL(Op);
16821 EVT OpVT = Op.getOperand(0).getValueType();
16822 SDValue Zero = DAG.getConstant(0, DL, OpVT);
16823 SDValue One = DAG.getConstant(1, DL, OpVT);
16824 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Op.getOperand(0), One);
16825 return DAG.getSetCC(DL, VT, And, Zero, ISD::SETNE);
16826 }
16827
16828 if (!VT.isVector() || VT.isScalableVector())
16829 return SDValue();
16830
16831 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16832 !Subtarget->isNeonAvailable()))
16833 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
16834
16835 return SDValue();
16836}
16837
16838 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
16839 // possibly a truncated type; it tells how many bits of the value are to be
16840 // used.
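// For example, (srl (add X, 16), 5), of which only the truncated result is
// used, matches the rounding-shift form with shift 5, since 16 == 1 << (5 - 1),
// provided the add cannot overflow into the retained bits.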
16841 static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
16842 SelectionDAG &DAG,
16843 unsigned &ShiftValue,
16844 SDValue &RShOperand) {
16845 if (Shift->getOpcode() != ISD::SRL)
16846 return false;
16847
16848 EVT VT = Shift.getValueType();
16849 assert(VT.isScalableVT());
16850
16851 auto ShiftOp1 =
16852 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
16853 if (!ShiftOp1)
16854 return false;
16855
16856 ShiftValue = ShiftOp1->getZExtValue();
16857 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
16858 return false;
16859
16860 SDValue Add = Shift->getOperand(0);
16861 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
16862 return false;
16863
16865 "ResVT must be truncated or same type as the shift.");
16866 // Check if an overflow can lead to incorrect results.
16867 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
16868 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
16869 return false;
16870
16871 auto AddOp1 =
16872 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
16873 if (!AddOp1)
16874 return false;
16875 uint64_t AddValue = AddOp1->getZExtValue();
16876 if (AddValue != 1ULL << (ShiftValue - 1))
16877 return false;
16878
16879 RShOperand = Add->getOperand(0);
16880 return true;
16881}
16882
16883SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
16884 SelectionDAG &DAG) const {
16885 EVT VT = Op.getValueType();
16886 SDLoc DL(Op);
16887 int64_t Cnt;
16888
16889 if (!Op.getOperand(1).getValueType().isVector())
16890 return Op;
16891 unsigned EltSize = VT.getScalarSizeInBits();
16892
16893 switch (Op.getOpcode()) {
16894 case ISD::SHL:
16895 if (VT.isScalableVector() ||
16896 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16897 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
16898
16899 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
16900 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
16901 DAG.getTargetConstant(Cnt, DL, MVT::i32));
16902 return DAG.getNode(
16903 ISD::INTRINSIC_WO_CHAIN, DL, VT,
16904 DAG.getTargetConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32),
16905 Op.getOperand(0), Op.getOperand(1));
16906 case ISD::SRA:
16907 case ISD::SRL:
16908 if (VT.isScalableVector() &&
16909 (Subtarget->hasSVE2() ||
16910 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
16911 SDValue RShOperand;
16912 unsigned ShiftValue;
16913 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
16914 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
16915 getPredicateForVector(DAG, DL, VT), RShOperand,
16916 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
16917 }
16918
16919 if (VT.isScalableVector() ||
16920 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
16921 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
16922 : AArch64ISD::SRL_PRED;
16923 return LowerToPredicatedOp(Op, DAG, Opc);
16924 }
16925
16926 // Right shift immediate
16927 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
16928 unsigned Opc =
16929 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
16930 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
16931 DAG.getTargetConstant(Cnt, DL, MVT::i32),
16932 Op->getFlags());
16933 }
16934
16935 // Right shift register. Note, there is not a shift right register
16936 // instruction, but the shift left register instruction takes a signed
16937 // value, where negative numbers specify a right shift.
16938 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
16939 : Intrinsic::aarch64_neon_ushl;
16940 // negate the shift amount
16941 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
16942 Op.getOperand(1));
16943 SDValue NegShiftLeft =
16944 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
16945 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
16946 NegShift);
16947 return NegShiftLeft;
16948 }
16949
16950 llvm_unreachable("unexpected shift opcode");
16951}
16952
16953SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
16954 SelectionDAG &DAG) const {
16955 if (Op.getValueType().isScalableVector())
16956 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
16957
16958 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16959 !Subtarget->isNeonAvailable()))
16960 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
16961
16962 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16963 SDValue LHS = Op.getOperand(0);
16964 SDValue RHS = Op.getOperand(1);
16965 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
16966 SDLoc DL(Op);
16967
16968 if (LHS.getValueType().getVectorElementType().isInteger())
16969 return Op;
16970
16971 assert(((!Subtarget->hasFullFP16() &&
16972 LHS.getValueType().getVectorElementType() != MVT::f16) ||
16973 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
16974 LHS.getValueType().getVectorElementType() != MVT::f128) &&
16975 "Unexpected type!");
16976
16977 // Lower isnan(x) | isnan(never-nan) to x != x.
16978 // Lower !isnan(x) & !isnan(never-nan) to x == x.
16979 if (CC == ISD::SETUO || CC == ISD::SETO) {
16980 bool OneNaN = false;
16981 if (LHS == RHS) {
16982 OneNaN = true;
16983 } else if (DAG.isKnownNeverNaN(RHS)) {
16984 OneNaN = true;
16985 RHS = LHS;
16986 } else if (DAG.isKnownNeverNaN(LHS)) {
16987 OneNaN = true;
16988 LHS = RHS;
16989 }
16990 if (OneNaN) {
16991 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
16992 }
16993 }
16994
16995 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
16996 // clean. Some of them require two branches to implement.
16997 AArch64CC::CondCode CC1, CC2;
16998 bool ShouldInvert;
16999 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
17000
17001 bool NoNaNs =
17002 getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
17003 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, DL, DAG);
17004 if (!Cmp.getNode())
17005 return SDValue();
17006
17007 if (CC2 != AArch64CC::AL) {
17008 SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, DL, DAG);
17009 if (!Cmp2.getNode())
17010 return SDValue();
17011
17012 Cmp = DAG.getNode(ISD::OR, DL, CmpVT, Cmp, Cmp2);
17013 }
17014
17015 Cmp = DAG.getSExtOrTrunc(Cmp, DL, Op.getValueType());
17016
17017 if (ShouldInvert)
17018 Cmp = DAG.getNOT(DL, Cmp, Cmp.getValueType());
17019
17020 return Cmp;
17021}
17022
17023static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
17024 SelectionDAG &DAG) {
17025 SDValue VecOp = ScalarOp.getOperand(0);
17026 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
17027 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
17028 DAG.getConstant(0, DL, MVT::i64));
17029}
17030
17031static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
17032 SDLoc DL, SelectionDAG &DAG) {
17033 unsigned ScalarOpcode;
17034 switch (Opcode) {
17035 case ISD::VECREDUCE_AND:
17036 ScalarOpcode = ISD::AND;
17037 break;
17038 case ISD::VECREDUCE_OR:
17039 ScalarOpcode = ISD::OR;
17040 break;
17041 case ISD::VECREDUCE_XOR:
17042 ScalarOpcode = ISD::XOR;
17043 break;
17044 default:
17045 llvm_unreachable("Expected bitwise vector reduction");
17046 return SDValue();
17047 }
17048
17049 EVT VecVT = Vec.getValueType();
17050 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
17051 "Expected power-of-2 length vector");
17052
17053 EVT ElemVT = VecVT.getVectorElementType();
17054
17055 SDValue Result;
17056 unsigned NumElems = VecVT.getVectorNumElements();
17057
17058 // Special case for boolean reductions
17059 if (ElemVT == MVT::i1) {
17060 // Split large vectors into smaller ones
17061 if (NumElems > 16) {
17062 SDValue Lo, Hi;
17063 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
17064 EVT HalfVT = Lo.getValueType();
17065 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
17066 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
17067 }
17068
17069 // Results of setcc operations get widened to 128 bits if their input
17070 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
17071 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
17072 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
17073 // size leads to the best codegen, since e.g. setcc results might need to be
17074 // truncated otherwise.
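// For example, a v4i1 produced by a v4i32 setcc is sign extended to v4i32,
// whereas a v8i1 coming from 64-bit wide inputs is extended to v8i8.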
17075 unsigned ExtendedWidth = 64;
17076 if (Vec.getOpcode() == ISD::SETCC &&
17077 Vec.getOperand(0).getValueSizeInBits() >= 128) {
17078 ExtendedWidth = 128;
17079 }
17080 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
17081
17082 // any_ext doesn't work with umin/umax, so only use it for uadd.
17083 unsigned ExtendOp =
17084 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
17085 SDValue Extended = DAG.getNode(
17086 ExtendOp, DL,
17087 VecVT.changeVectorElementType(*DAG.getContext(), ExtendedVT), Vec);
17088 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
17089 // in that case we bitcast the sign extended values from v2i64 to v4i32
17090 // before reduction for optimal code generation.
17091 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
17092 NumElems == 2 && ExtendedWidth == 128) {
17093 Extended = DAG.getBitcast(MVT::v4i32, Extended);
17094 ExtendedVT = MVT::i32;
17095 }
17096 switch (ScalarOpcode) {
17097 case ISD::AND:
17098 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
17099 break;
17100 case ISD::OR:
17101 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
17102 break;
17103 case ISD::XOR:
17104 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
17105 break;
17106 default:
17107 llvm_unreachable("Unexpected Opcode");
17108 }
17109
17110 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
17111 } else {
17112 // Iteratively split the vector in half and combine using the bitwise
17113 // operation until it fits in a 64 bit register.
17114 while (VecVT.getSizeInBits() > 64) {
17115 SDValue Lo, Hi;
17116 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
17117 VecVT = Lo.getValueType();
17118 NumElems = VecVT.getVectorNumElements();
17119 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
17120 }
17121
17122 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
17123
17124 // Do the remaining work on a scalar since it allows the code generator to
17125 // combine the shift and bitwise operation into one instruction and since
17126 // integer instructions can have higher throughput than vector instructions.
17127 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
17128
17129 // Iteratively combine the lower and upper halves of the scalar using the
17130 // bitwise operation, halving the relevant region of the scalar in each
17131 // iteration, until the relevant region is just one element of the original
17132 // vector.
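// For example, for an original v8i8 vector the 64-bit scalar is combined
// with itself shifted right by 32, then 16, then 8 bits, leaving the
// reduction result in the low 8 bits.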
17133 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
17134 SDValue ShiftAmount =
17135 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
17136 SDValue Shifted =
17137 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
17138 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
17139 }
17140
17141 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
17142 }
17143
17144 return DAG.getAnyExtOrTrunc(Result, DL, VT);
17145}
17146
17147SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
17148 SelectionDAG &DAG) const {
17149 SDValue Src = Op.getOperand(0);
17150 EVT SrcVT = Src.getValueType();
17151
17152 // Scalarize v2f16 to turn it into a faddp. This will be more efficient than
17153 // widening by inserting zeroes.
17154 if (Subtarget->hasFullFP16() && Op.getOpcode() == ISD::VECREDUCE_FADD &&
17155 SrcVT == MVT::v2f16) {
17156 SDLoc DL(Op);
17157 return DAG.getNode(ISD::FADD, DL, MVT::f16,
17158 DAG.getExtractVectorElt(DL, MVT::f16, Src, 0),
17159 DAG.getExtractVectorElt(DL, MVT::f16, Src, 1));
17160 }
17161
17162 // Try to lower fixed length reductions to SVE.
17163 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
17164 Op.getOpcode() == ISD::VECREDUCE_AND ||
17165 Op.getOpcode() == ISD::VECREDUCE_OR ||
17166 Op.getOpcode() == ISD::VECREDUCE_XOR ||
17167 Op.getOpcode() == ISD::VECREDUCE_FADD ||
17168 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
17169 SrcVT.getVectorElementType() == MVT::i64);
17170 if (SrcVT.isScalableVector() ||
17171 useSVEForFixedLengthVectorVT(
17172 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
17173
17174 if (SrcVT.getVectorElementType() == MVT::i1)
17175 return LowerPredReductionToSVE(Op, DAG);
17176
17177 switch (Op.getOpcode()) {
17178 case ISD::VECREDUCE_ADD:
17179 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
17180 case ISD::VECREDUCE_AND:
17181 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
17182 case ISD::VECREDUCE_OR:
17183 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
17184 case ISD::VECREDUCE_SMAX:
17185 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
17186 case ISD::VECREDUCE_SMIN:
17187 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
17188 case ISD::VECREDUCE_UMAX:
17189 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
17190 case ISD::VECREDUCE_UMIN:
17191 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
17192 case ISD::VECREDUCE_XOR:
17193 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
17194 case ISD::VECREDUCE_FADD:
17195 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
17196 case ISD::VECREDUCE_FMAX:
17197 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
17198 case ISD::VECREDUCE_FMIN:
17199 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
17200 case ISD::VECREDUCE_FMAXIMUM:
17201 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
17202 case ISD::VECREDUCE_FMINIMUM:
17203 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
17204 default:
17205 llvm_unreachable("Unhandled fixed length reduction");
17206 }
17207 }
17208
17209 // Lower NEON reductions.
17210 SDLoc DL(Op);
17211 switch (Op.getOpcode()) {
17212 case ISD::VECREDUCE_AND:
17213 case ISD::VECREDUCE_OR:
17214 case ISD::VECREDUCE_XOR:
17215 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
17216 Op.getValueType(), DL, DAG);
17217 case ISD::VECREDUCE_ADD:
17218 return getReductionSDNode(AArch64ISD::UADDV, DL, Op, DAG);
17219 case ISD::VECREDUCE_SMAX:
17220 return getReductionSDNode(AArch64ISD::SMAXV, DL, Op, DAG);
17221 case ISD::VECREDUCE_SMIN:
17222 return getReductionSDNode(AArch64ISD::SMINV, DL, Op, DAG);
17223 case ISD::VECREDUCE_UMAX:
17224 return getReductionSDNode(AArch64ISD::UMAXV, DL, Op, DAG);
17225 case ISD::VECREDUCE_UMIN:
17226 return getReductionSDNode(AArch64ISD::UMINV, DL, Op, DAG);
17227 default:
17228 llvm_unreachable("Unhandled reduction");
17229 }
17230}
17231
17232SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op,
17233 SelectionDAG &DAG) const {
17234 SDLoc DL(Op);
17235 SDValue Src = Op.getOperand(0);
17236 EVT SrcVT = Src.getValueType();
17237 assert(SrcVT.isScalableVector() && "Unexpected operand type!");
17238
17239 SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT);
17240 unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
17241 SDValue Identity = DAG.getNeutralElement(BaseOpc, DL, SrcVT, Op->getFlags());
17242
17243 // Whilst we don't know the size of the vector we do know the maximum size, so
17244 // we can perform a tree reduction with an identity vector, which means once we
17245 // arrive at the result the remaining stages (when the vector is smaller than
17246 // the maximum) have no effect.
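// For example, with a 2048-bit architectural maximum, an nxv4i32 source uses
// log2(16 * 4) = 6 deinterleave-and-multiply stages, independent of the
// runtime vector length.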
17247
17248 unsigned Segments = AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
17249 unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());
17250
17251 for (unsigned I = 0; I < Stages; ++I) {
17252 Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity);
17253 Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1));
17254 }
17255
17256 return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0);
17257}
17258
17259SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
17260 SelectionDAG &DAG) const {
17261 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17262 // No point replacing if we don't have the relevant instruction/libcall anyway
17263 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
17264 return SDValue();
17265
17266 // LSE has an atomic load-clear instruction, but not a load-and.
17267 SDLoc DL(Op);
17268 MVT VT = Op.getSimpleValueType();
17269 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
17270 SDValue RHS = Op.getOperand(2);
17271 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
17272 RHS = DAG.getNode(ISD::XOR, DL, VT, DAG.getAllOnesConstant(DL, VT), RHS);
17273 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, DL, AN->getMemoryVT(),
17274 Op.getOperand(0), Op.getOperand(1), RHS,
17275 AN->getMemOperand());
17276}
17277
17278SDValue
17279AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
17280 SelectionDAG &DAG) const {
17281
17282 SDLoc DL(Op);
17283 // Get the inputs.
17284 SDNode *Node = Op.getNode();
17285 SDValue Chain = Op.getOperand(0);
17286 SDValue Size = Op.getOperand(1);
17287 MaybeAlign Align =
17288 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
17289 EVT VT = Node->getValueType(0);
17290
17292 "no-stack-arg-probe")) {
17293 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17294 Chain = SP.getValue(1);
17295 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17296 if (Align)
17297 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17298 DAG.getSignedConstant(-Align->value(), DL, VT));
17299 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
17300 SDValue Ops[2] = {SP, Chain};
17301 return DAG.getMergeValues(Ops, DL);
17302 }
17303
17304 RTLIB::LibcallImpl ChkStkImpl = getLibcallImpl(RTLIB::STACK_PROBE);
17305 if (ChkStkImpl == RTLIB::Unsupported)
17306 return SDValue();
17307
17308 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
17309
17310 EVT PtrVT = getPointerTy(DAG.getDataLayout());
17311 SDValue Callee = DAG.getTargetExternalSymbol(
17312 getLibcallImplName(ChkStkImpl).data(), PtrVT, 0);
17313
17314 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
17315 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
17316 if (Subtarget->hasCustomCallingConv())
17317 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
17318
17319 Size = DAG.getNode(ISD::SRL, DL, MVT::i64, Size,
17320 DAG.getConstant(4, DL, MVT::i64));
17321 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X15, Size, SDValue());
17322 Chain =
17323 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
17324 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
17325 DAG.getRegisterMask(Mask), Chain.getValue(1));
17326 // To match the actual intent better, we should read the output from X15 here
17327 // again (instead of potentially spilling it to the stack), but rereading Size
17328 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
17329 // here.
17330
17331 Size = DAG.getNode(ISD::SHL, DL, MVT::i64, Size,
17332 DAG.getConstant(4, DL, MVT::i64));
17333
17334 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17335 Chain = SP.getValue(1);
17336 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17337 if (Align)
17338 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17339 DAG.getSignedConstant(-Align->value(), DL, VT));
17340 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
17341
17342 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
17343
17344 SDValue Ops[2] = {SP, Chain};
17345 return DAG.getMergeValues(Ops, DL);
17346}
17347
17348SDValue
17349AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
17350 SelectionDAG &DAG) const {
17351 // Get the inputs.
17352 SDNode *Node = Op.getNode();
17353 SDValue Chain = Op.getOperand(0);
17354 SDValue Size = Op.getOperand(1);
17355
17356 MaybeAlign Align =
17357 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
17358 SDLoc DL(Op);
17359 EVT VT = Node->getValueType(0);
17360
17361 // Construct the new SP value in a GPR.
17362 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17363 Chain = SP.getValue(1);
17364 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17365 if (Align)
17366 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17367 DAG.getSignedConstant(-Align->value(), DL, VT));
17368
17369 // Set the real SP to the new value with a probing loop.
17370 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP);
17371 SDValue Ops[2] = {SP, Chain};
17372 return DAG.getMergeValues(Ops, DL);
17373}
17374
17375SDValue
17376AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
17377 SelectionDAG &DAG) const {
17378 MachineFunction &MF = DAG.getMachineFunction();
17379
17380 if (Subtarget->isTargetWindows())
17381 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
17382 else if (hasInlineStackProbe(MF))
17383 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
17384 else
17385 return SDValue();
17386}
17387
17388SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
17389 unsigned NewOp) const {
17390 if (Subtarget->hasSVE2())
17391 return LowerToPredicatedOp(Op, DAG, NewOp);
17392
17393 // Default to expand.
17394 return SDValue();
17395}
17396
17397SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
17398 SelectionDAG &DAG) const {
17399 EVT VT = Op.getValueType();
17400 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
17401
17402 SDLoc DL(Op);
17403 APInt MulImm = Op.getConstantOperandAPInt(0);
17404 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
17405 VT);
17406}
17407
17408/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
17409template <unsigned NumVecs>
17410static bool
17411 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
17412 AArch64TargetLowering::IntrinsicInfo &Info, const CallBase &CI) {
17413 Info.opc = ISD::INTRINSIC_VOID;
17414 // Retrieve EC from first vector argument.
17415 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
17416 ElementCount EC = VT.getVectorElementCount();
17417#ifndef NDEBUG
17418 // Check the assumption that all input vectors are the same type.
17419 for (unsigned I = 0; I < NumVecs; ++I)
17420 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
17421 "Invalid type.");
17422#endif
17423 // memVT is `NumVecs * VT`.
17424 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getVectorElementType(),
17425 EC * NumVecs);
17426 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
17427 Info.offset = 0;
17428 Info.align.reset();
17429 Info.flags = MachineMemOperand::MOStore;
17430 return true;
17431}
17432
17433/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
17434/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
17435/// specified in the intrinsic calls.
17436 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
17437 const CallBase &I,
17438 MachineFunction &MF,
17439 unsigned Intrinsic) const {
17440 auto &DL = I.getDataLayout();
17441 switch (Intrinsic) {
17442 case Intrinsic::aarch64_sve_st2:
17443 return setInfoSVEStN<2>(*this, DL, Info, I);
17444 case Intrinsic::aarch64_sve_st3:
17445 return setInfoSVEStN<3>(*this, DL, Info, I);
17446 case Intrinsic::aarch64_sve_st4:
17447 return setInfoSVEStN<4>(*this, DL, Info, I);
17448 case Intrinsic::aarch64_neon_ld2:
17449 case Intrinsic::aarch64_neon_ld3:
17450 case Intrinsic::aarch64_neon_ld4:
17451 case Intrinsic::aarch64_neon_ld1x2:
17452 case Intrinsic::aarch64_neon_ld1x3:
17453 case Intrinsic::aarch64_neon_ld1x4: {
17454 Info.opc = ISD::INTRINSIC_W_CHAIN;
17455 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
17456 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17457 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17458 Info.offset = 0;
17459 Info.align.reset();
17460 // volatile loads with NEON intrinsics not supported
17461 Info.flags = MachineMemOperand::MOLoad;
17462 return true;
17463 }
17464 case Intrinsic::aarch64_neon_ld2lane:
17465 case Intrinsic::aarch64_neon_ld3lane:
17466 case Intrinsic::aarch64_neon_ld4lane:
17467 case Intrinsic::aarch64_neon_ld2r:
17468 case Intrinsic::aarch64_neon_ld3r:
17469 case Intrinsic::aarch64_neon_ld4r: {
17470 Info.opc = ISD::INTRINSIC_W_CHAIN;
17472 // The ldN/ldNr/ldNlane intrinsics return a struct of vectors that all have the same type.
17472 Type *RetTy = I.getType();
17473 auto *StructTy = cast<StructType>(RetTy);
17474 unsigned NumElts = StructTy->getNumElements();
17475 Type *VecTy = StructTy->getElementType(0);
17476 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17477 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17478 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17479 Info.offset = 0;
17480 Info.align.reset();
17481 // volatile loads with NEON intrinsics not supported
17482 Info.flags = MachineMemOperand::MOLoad;
17483 return true;
17484 }
17485 case Intrinsic::aarch64_neon_st2:
17486 case Intrinsic::aarch64_neon_st3:
17487 case Intrinsic::aarch64_neon_st4:
17488 case Intrinsic::aarch64_neon_st1x2:
17489 case Intrinsic::aarch64_neon_st1x3:
17490 case Intrinsic::aarch64_neon_st1x4: {
17491 Info.opc = ISD::INTRINSIC_VOID;
17492 unsigned NumElts = 0;
17493 for (const Value *Arg : I.args()) {
17494 Type *ArgTy = Arg->getType();
17495 if (!ArgTy->isVectorTy())
17496 break;
17497 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
17498 }
17499 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17500 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17501 Info.offset = 0;
17502 Info.align.reset();
17503 // volatile stores with NEON intrinsics not supported
17504 Info.flags = MachineMemOperand::MOStore;
17505 return true;
17506 }
17507 case Intrinsic::aarch64_neon_st2lane:
17508 case Intrinsic::aarch64_neon_st3lane:
17509 case Intrinsic::aarch64_neon_st4lane: {
17510 Info.opc = ISD::INTRINSIC_VOID;
17511 unsigned NumElts = 0;
17512 // All the vector arguments have the same type.
17513 Type *VecTy = I.getArgOperand(0)->getType();
17514 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17515
17516 for (const Value *Arg : I.args()) {
17517 Type *ArgTy = Arg->getType();
17518 if (!ArgTy->isVectorTy())
17519 break;
17520 NumElts += 1;
17521 }
17522
17523 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17524 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17525 Info.offset = 0;
17526 Info.align.reset();
17527 // volatile stores with NEON intrinsics not supported
17528 Info.flags = MachineMemOperand::MOStore;
17529 return true;
17530 }
17531 case Intrinsic::aarch64_ldaxr:
17532 case Intrinsic::aarch64_ldxr: {
17533 Type *ValTy = I.getParamElementType(0);
17534 Info.opc = ISD::INTRINSIC_W_CHAIN;
17535 Info.memVT = MVT::getVT(ValTy);
17536 Info.ptrVal = I.getArgOperand(0);
17537 Info.offset = 0;
17538 Info.align = DL.getABITypeAlign(ValTy);
17539 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
17540 return true;
17541 }
17542 case Intrinsic::aarch64_stlxr:
17543 case Intrinsic::aarch64_stxr: {
17544 Type *ValTy = I.getParamElementType(1);
17545 Info.opc = ISD::INTRINSIC_W_CHAIN;
17546 Info.memVT = MVT::getVT(ValTy);
17547 Info.ptrVal = I.getArgOperand(1);
17548 Info.offset = 0;
17549 Info.align = DL.getABITypeAlign(ValTy);
17550 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
17551 return true;
17552 }
17553 case Intrinsic::aarch64_ldaxp:
17554 case Intrinsic::aarch64_ldxp:
17555 Info.opc = ISD::INTRINSIC_W_CHAIN;
17556 Info.memVT = MVT::i128;
17557 Info.ptrVal = I.getArgOperand(0);
17558 Info.offset = 0;
17559 Info.align = Align(16);
17560 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
17561 return true;
17562 case Intrinsic::aarch64_stlxp:
17563 case Intrinsic::aarch64_stxp:
17564 Info.opc = ISD::INTRINSIC_W_CHAIN;
17565 Info.memVT = MVT::i128;
17566 Info.ptrVal = I.getArgOperand(2);
17567 Info.offset = 0;
17568 Info.align = Align(16);
17569 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
17570 return true;
17571 case Intrinsic::aarch64_sve_ldnt1: {
17572 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
17573 Info.opc = ISD::INTRINSIC_W_CHAIN;
17574 Info.memVT = MVT::getVT(I.getType());
17575 Info.ptrVal = I.getArgOperand(1);
17576 Info.offset = 0;
17577 Info.align = DL.getABITypeAlign(ElTy);
17578 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
17579 return true;
17580 }
17581 case Intrinsic::aarch64_sve_stnt1: {
17582 Type *ElTy =
17583 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
17584 Info.opc = ISD::INTRINSIC_W_CHAIN;
17585 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
17586 Info.ptrVal = I.getArgOperand(2);
17587 Info.offset = 0;
17588 Info.align = DL.getABITypeAlign(ElTy);
17589 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
17590 return true;
17591 }
17592 case Intrinsic::aarch64_mops_memset_tag: {
17593 Value *Dst = I.getArgOperand(0);
17594 Value *Val = I.getArgOperand(1);
17595 Info.opc = ISD::INTRINSIC_W_CHAIN;
17596 Info.memVT = MVT::getVT(Val->getType());
17597 Info.ptrVal = Dst;
17598 Info.offset = 0;
17599 Info.align = I.getParamAlign(0).valueOrOne();
17600 Info.flags = MachineMemOperand::MOStore;
17601 // The size of the memory being operated on is unknown at this point
17602 Info.size = MemoryLocation::UnknownSize;
17603 return true;
17604 }
17605 default:
17606 break;
17607 }
17608
17609 return false;
17610}
17611
17612 bool AArch64TargetLowering::shouldReduceLoadWidth(
17613 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
17614 std::optional<unsigned> ByteOffset) const {
17615 // TODO: This may be worth removing. Check regression tests for diffs.
17616 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
17617 ByteOffset))
17618 return false;
17619
17620 // If we're reducing the load width in order to avoid having to use an extra
17621 // instruction to do extension then it's probably a good idea.
17622 if (ExtTy != ISD::NON_EXTLOAD)
17623 return true;
17624 // Don't reduce load width if it would prevent us from combining a shift into
17625 // the offset.
17626 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
17627 assert(Mem);
17628 const SDValue &Base = Mem->getBasePtr();
17629 if (Base.getOpcode() == ISD::ADD &&
17630 Base.getOperand(1).getOpcode() == ISD::SHL &&
17631 Base.getOperand(1).hasOneUse() &&
17632 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
17633 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
17634 if (Mem->getMemoryVT().isScalableVector())
17635 return false;
17636 // The shift can be combined if it matches the size of the value being
17637 // loaded (and so reducing the width would make it not match).
17638 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
17639 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
17640 if (ShiftAmount == Log2_32(LoadBytes))
17641 return false;
17642 }
17643 // We have no reason to disallow reducing the load width, so allow it.
17644 return true;
17645}
17646
17647// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
17648 bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
17649 EVT VT = Extend.getValueType();
17650 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
17651 SDValue Extract = Extend.getOperand(0);
17652 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
17653 Extract = Extract.getOperand(0);
17654 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
17655 EVT VecVT = Extract.getOperand(0).getValueType();
17656 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
17657 return false;
17658 }
17659 }
17660 return true;
17661}
17662
17663 // Truncations from 64-bit GPR to 32-bit GPR are free.
17664 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
17665 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17666 return false;
17667 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
17668 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
17669 return NumBits1 > NumBits2;
17670}
17671 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
17672 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17673 return false;
17674 uint64_t NumBits1 = VT1.getFixedSizeInBits();
17675 uint64_t NumBits2 = VT2.getFixedSizeInBits();
17676 return NumBits1 > NumBits2;
17677}
17678
17679/// Check if it is profitable to hoist instruction in then/else to if.
17680 /// Not profitable if I and its user can form an FMA instruction
17681 /// because we prefer FMSUB/FMADD.
17682 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
17683 if (I->getOpcode() != Instruction::FMul)
17684 return true;
17685
17686 if (!I->hasOneUse())
17687 return true;
17688
17689 Instruction *User = I->user_back();
17690
17691 if (!(User->getOpcode() == Instruction::FSub ||
17692 User->getOpcode() == Instruction::FAdd))
17693 return true;
17694
17695 const TargetOptions &Options = getTargetMachine().Options;
17696 const Function *F = I->getFunction();
17697 const DataLayout &DL = F->getDataLayout();
17698 Type *Ty = User->getOperand(0)->getType();
17699
17700 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
17701 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
17702 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17703 I->getFastMathFlags().allowContract()));
17704}
17705
17706// All 32-bit GPR operations implicitly zero the high-half of the corresponding
17707// 64-bit GPR.
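// For example, an i32 'add w0, w0, w1' leaves bits [63:32] of x0 cleared, so
// a subsequent zero extension from i32 to i64 needs no extra instruction.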
17708 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
17709 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17710 return false;
17711 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
17712 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
17713 return NumBits1 == 32 && NumBits2 == 64;
17714}
17715 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
17716 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17717 return false;
17718 unsigned NumBits1 = VT1.getSizeInBits();
17719 unsigned NumBits2 = VT2.getSizeInBits();
17720 return NumBits1 == 32 && NumBits2 == 64;
17721}
17722
17723 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
17724 EVT VT1 = Val.getValueType();
17725 if (isZExtFree(VT1, VT2)) {
17726 return true;
17727 }
17728
17729 if (Val.getOpcode() != ISD::LOAD)
17730 return false;
17731
17732 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
17733 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
17734 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
17735 VT1.getSizeInBits() <= 32);
17736}
17737
17738bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
17739 if (isa<FPExtInst>(Ext))
17740 return false;
17741
17742 // Vector types are not free.
17743 if (Ext->getType()->isVectorTy())
17744 return false;
17745
17746 for (const Use &U : Ext->uses()) {
17747 // The extension is free if we can fold it with a left shift in an
17748 // addressing mode or an arithmetic operation: add, sub, and cmp.
17749
17750 // Is there a shift?
17751 const Instruction *Instr = cast<Instruction>(U.getUser());
17752
17753 // Is this a constant shift?
17754 switch (Instr->getOpcode()) {
17755 case Instruction::Shl:
17756 if (!isa<ConstantInt>(Instr->getOperand(1)))
17757 return false;
17758 break;
17759 case Instruction::GetElementPtr: {
17760 gep_type_iterator GTI = gep_type_begin(Instr);
17761 auto &DL = Ext->getDataLayout();
17762 std::advance(GTI, U.getOperandNo()-1);
17763 Type *IdxTy = GTI.getIndexedType();
17764 // This extension will end up with a shift because of the scaling factor.
17765 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
17766 // Get the shift amount based on the scaling factor:
17767 // log2(sizeof(IdxTy)) - log2(8).
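    // For instance, an i32 index type gives countr_zero(32) - 3 = 2, which the
    // addressing mode can fold; an i8 index (shift 0) or a 32-byte element
    // (shift 5) cannot, so the extension is not free for those uses.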
17768 if (IdxTy->isScalableTy())
17769 return false;
17770 uint64_t ShiftAmt =
17771 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
17772 3;
17773 // Is the constant foldable in the shift of the addressing mode?
17774 // I.e., shift amount is between 1 and 4 inclusive.
17775 if (ShiftAmt == 0 || ShiftAmt > 4)
17776 return false;
17777 break;
17778 }
17779 case Instruction::Trunc:
17780 // Check if this is a noop.
17781 // trunc(sext ty1 to ty2) to ty1.
17782 if (Instr->getType() == Ext->getOperand(0)->getType())
17783 continue;
17784 [[fallthrough]];
17785 default:
17786 return false;
17787 }
17788
17789 // At this point we can use the bfm family, so this extension is free
17790 // for that use.
17791 }
17792 return true;
17793}
17794
17795static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
17796 unsigned NumElts, bool IsLittleEndian,
17797 SmallVectorImpl<int> &Mask) {
17798 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
17799 return false;
17800
17801 assert(DstWidth % SrcWidth == 0 &&
17802 "TBL lowering is not supported for a conversion instruction with this "
17803 "source and destination element type.");
17804
17805 unsigned Factor = DstWidth / SrcWidth;
17806 unsigned MaskLen = NumElts * Factor;
17807
17808 Mask.clear();
17809 Mask.resize(MaskLen, NumElts);
17810
17811 unsigned SrcIndex = 0;
17812 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
17813 Mask[I] = SrcIndex++;
17814
17815 return true;
17816}
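// For example, createTblShuffleMask(/*SrcWidth=*/8, /*DstWidth=*/32,
// /*NumElts=*/8, /*IsLittleEndian=*/true, Mask) sets Factor = 4 and produces
//   {0, 8, 8, 8, 1, 8, 8, 8, ..., 7, 8, 8, 8}
// where index 8 (== NumElts) selects the known-zero lane of the second shuffle
// operand, so the bitcast of the shuffled bytes already forms the extension.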
17817
17818static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
17819 FixedVectorType *ZExtTy,
17820 FixedVectorType *DstTy,
17821 bool IsLittleEndian) {
17822 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17823 unsigned NumElts = SrcTy->getNumElements();
17824 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17825 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17826
17827 SmallVector<int> Mask;
17828 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
17829 return nullptr;
17830
17831 auto *FirstEltZero = Builder.CreateInsertElement(
17832 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17833 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17834 Result = Builder.CreateBitCast(Result, DstTy);
17835 if (DstTy != ZExtTy)
17836 Result = Builder.CreateZExt(Result, ZExtTy);
17837 return Result;
17838}
17839
17840static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
17841 FixedVectorType *DstTy,
17842 bool IsLittleEndian) {
17843 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17844 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17845 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17846
17847 SmallVector<int> Mask;
17848 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
17849 !IsLittleEndian, Mask))
17850 return nullptr;
17851
17852 auto *FirstEltZero = Builder.CreateInsertElement(
17853 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17854
17855 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17856}
17857
17858static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
17859 IRBuilder<> Builder(TI);
17860 SmallVector<Value *> Parts;
17861 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
17862 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
17863 auto *DstTy = cast<FixedVectorType>(TI->getType());
17864 assert(SrcTy->getElementType()->isIntegerTy() &&
17865 "Non-integer type source vector element is not supported");
17866 assert(DstTy->getElementType()->isIntegerTy(8) &&
17867 "Unsupported destination vector element type");
17868 unsigned SrcElemTySz =
17869 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17870 unsigned DstElemTySz =
17871 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17872 assert((SrcElemTySz % DstElemTySz == 0) &&
17873 "Cannot lower truncate to tbl instructions for a source element size "
17874 "that is not divisible by the destination element size");
17875 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
17876 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
17877 "Unsupported source vector element type size");
17878 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
17879
17880 // Create a mask to choose every nth byte from the source vector table of
17881 // bytes to create the truncated destination vector, where 'n' is the truncate
17882 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
17883 // 0,8,16,..Y*8th bytes for the little-endian format
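  // For instance, truncating <8 x i64> to <8 x i8> has TruncFactor = 8, so the
  // little-endian mask is {0, 8, 16, ..., 56, 255, 255, ...}; the out-of-range
  // 255 indices make TBL produce zero in the unused lanes.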
17884 SmallVector<Constant *, 16> MaskConst;
17885 for (int Itr = 0; Itr < 16; Itr++) {
17886 if (Itr < NumElements)
17887 MaskConst.push_back(Builder.getInt8(
17888 IsLittleEndian ? Itr * TruncFactor
17889 : Itr * TruncFactor + (TruncFactor - 1)));
17890 else
17891 MaskConst.push_back(Builder.getInt8(255));
17892 }
17893
17894 int MaxTblSz = 128 * 4;
17895 int MaxSrcSz = SrcElemTySz * NumElements;
17896 int ElemsPerTbl =
17897 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
17898 assert(ElemsPerTbl <= 16 &&
17899 "Maximum elements selected using TBL instruction cannot exceed 16!");
17900
17901 int ShuffleCount = 128 / SrcElemTySz;
17902 SmallVector<int> ShuffleLanes;
17903 for (int i = 0; i < ShuffleCount; ++i)
17904 ShuffleLanes.push_back(i);
17905
17906 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
17907 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
17908 // call TBL & save the result in a vector of TBL results for combining later.
17909 SmallVector<Value *> Results;
17910 while (ShuffleLanes.back() < NumElements) {
17911 Parts.push_back(Builder.CreateBitCast(
17912 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
17913
17914 if (Parts.size() == 4) {
17915 Parts.push_back(ConstantVector::get(MaskConst));
17916 Results.push_back(
17917 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
17918 Parts.clear();
17919 }
17920
17921 for (int i = 0; i < ShuffleCount; ++i)
17922 ShuffleLanes[i] += ShuffleCount;
17923 }
17924
17925 assert((Parts.empty() || Results.empty()) &&
17926 "Lowering trunc for vectors requiring different TBL instructions is "
17927 "not supported!");
17928 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
17929 // registers
17930 if (!Parts.empty()) {
17931 Intrinsic::ID TblID;
17932 switch (Parts.size()) {
17933 case 1:
17934 TblID = Intrinsic::aarch64_neon_tbl1;
17935 break;
17936 case 2:
17937 TblID = Intrinsic::aarch64_neon_tbl2;
17938 break;
17939 case 3:
17940 TblID = Intrinsic::aarch64_neon_tbl3;
17941 break;
17942 }
17943
17944 Parts.push_back(ConstantVector::get(MaskConst));
17945 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
17946 }
17947
17948 // Extract the destination vector from TBL result(s) after combining them
17949 // where applicable. Currently, at most two TBLs are supported.
17950 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
17951 "more than 2 tbl instructions!");
17952 Value *FinalResult = Results[0];
17953 if (Results.size() == 1) {
17954 if (ElemsPerTbl < 16) {
17955 SmallVector<int> FinalMask(ElemsPerTbl);
17956 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17957 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
17958 }
17959 } else {
17960 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
17961 if (ElemsPerTbl < 16) {
17962 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
17963 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
17964 } else {
17965 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17966 }
17967 FinalResult =
17968 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
17969 }
17970
17971 TI->replaceAllUsesWith(FinalResult);
17972 TI->eraseFromParent();
17973}
17974
17975bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
17976 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
17977 // shuffle_vector instructions are serialized when targeting SVE,
17978 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
17979 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
17980 return false;
17981
17982 // Try to optimize conversions using tbl. This requires materializing constant
17983 // index vectors, which can increase code size and add loads. Skip the
17984 // transform unless the conversion is in a loop block guaranteed to execute
17985 // and we are not optimizing for size.
17986 Function *F = I->getParent()->getParent();
17987 if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
17988 return false;
17989
17990 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
17991 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
17992 if (!SrcTy || !DstTy)
17993 return false;
17994
17995 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
17996 // lowered to tbl instructions to insert the original i8 elements
17997 // into i8x lanes. This is enabled for cases where it is beneficial.
17998 auto *ZExt = dyn_cast<ZExtInst>(I);
17999 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
18000 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
18001 if (DstWidth % 8 != 0)
18002 return false;
18003
18004 auto *TruncDstType =
18005 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
18006 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
18007 // the remaining ZExt folded into the user, don't use tbl lowering.
18008 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
18009 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
18010 TargetTransformInfo::getCastContextHint(I),
18011 TTI::TCK_SizeAndLatency, I) == TTI::TCC_Basic) {
18012 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
18013 return false;
18014
18015 DstTy = TruncDstType;
18016 }
18017
18018 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
18019 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
18020 // most one extra extend step is needed and using tbl is not profitable.
18021 // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
18022 // udot instruction.
18023 if (SrcWidth * 4 <= DstWidth) {
18024 if (all_of(I->users(), [&](auto *U) {
18025 using namespace llvm::PatternMatch;
18026 auto *SingleUser = cast<Instruction>(&*U);
18027 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
18028 return true;
18029 if (match(SingleUser,
18030 m_Intrinsic<Intrinsic::vector_partial_reduce_add>(
18031 m_Value(), m_Specific(I))))
18032 return true;
18033 return false;
18034 }))
18035 return false;
18036 }
18037
18038 if (DstTy->getScalarSizeInBits() >= 64)
18039 return false;
18040
18041 IRBuilder<> Builder(ZExt);
18042 Value *Result = createTblShuffleForZExt(
18043 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
18044 DstTy, Subtarget->isLittleEndian());
18045 if (!Result)
18046 return false;
18047 ZExt->replaceAllUsesWith(Result);
18048 ZExt->eraseFromParent();
18049 return true;
18050 }
18051
18052 auto *UIToFP = dyn_cast<UIToFPInst>(I);
18053 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
18054 DstTy->getElementType()->isFloatTy()) ||
18055 (SrcTy->getElementType()->isIntegerTy(16) &&
18056 DstTy->getElementType()->isDoubleTy()))) {
18057 IRBuilder<> Builder(I);
18058 auto *ZExt = createTblShuffleForZExt(
18059 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
18060 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
18061 assert(ZExt && "Cannot fail for the i8 to float conversion");
18062 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
18063 I->replaceAllUsesWith(UI);
18064 I->eraseFromParent();
18065 return true;
18066 }
18067
18068 auto *SIToFP = dyn_cast<SIToFPInst>(I);
18069 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
18070 DstTy->getElementType()->isFloatTy()) {
18071 IRBuilder<> Builder(I);
18072 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
18073 FixedVectorType::getInteger(DstTy),
18074 Subtarget->isLittleEndian());
18075 assert(Shuffle && "Cannot fail for the i8 to float conversion");
18076 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
18077 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
18078 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
18079 I->replaceAllUsesWith(SI);
18080 I->eraseFromParent();
18081 return true;
18082 }
18083
18084 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
18085 // followed by a truncate lowered to using tbl.4.
18086 auto *FPToUI = dyn_cast<FPToUIInst>(I);
18087 if (FPToUI &&
18088 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
18089 SrcTy->getElementType()->isFloatTy() &&
18090 DstTy->getElementType()->isIntegerTy(8)) {
18091 IRBuilder<> Builder(I);
18092 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
18093 VectorType::getInteger(SrcTy));
18094 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
18095 I->replaceAllUsesWith(TruncI);
18096 I->eraseFromParent();
18097 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
18098 return true;
18099 }
18100
18101 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
18102 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
18103 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
18104 // registers
18105 auto *TI = dyn_cast<TruncInst>(I);
18106 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
18107 ((SrcTy->getElementType()->isIntegerTy(32) ||
18108 SrcTy->getElementType()->isIntegerTy(64)) &&
18109 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
18110 createTblForTrunc(TI, Subtarget->isLittleEndian());
18111 return true;
18112 }
18113
18114 return false;
18115}
18116
18117bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
18118 Align &RequiredAlignment) const {
18119 if (!LoadedType.isSimple() ||
18120 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
18121 return false;
18122 // Cyclone supports unaligned accesses.
18123 RequiredAlignment = Align(1);
18124 unsigned NumBits = LoadedType.getSizeInBits();
18125 return NumBits == 32 || NumBits == 64;
18126}
18127
18128/// A helper function for determining the number of interleaved accesses we
18129/// will generate when lowering accesses of the given type.
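/// For example, a fixed <16 x i32> vector against 128-bit registers needs
/// (16 * 32 + 127) / 128 = 4 interleaved accesses.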
18130unsigned AArch64TargetLowering::getNumInterleavedAccesses(
18131 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
18132 unsigned VecSize = 128;
18133 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
18134 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
18135 if (UseScalable && isa<FixedVectorType>(VecTy))
18136 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
18137 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
18138}
18139
18140MachineMemOperand::Flags
18141AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
18142 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
18143 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
18144 return MOStridedAccess;
18145 return MachineMemOperand::MONone;
18146}
18147
18148bool AArch64TargetLowering::isLegalInterleavedAccessType(
18149 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
18150 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
18151 auto EC = VecTy->getElementCount();
18152 unsigned MinElts = EC.getKnownMinValue();
18153
18154 UseScalable = false;
18155
18156 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
18157 (!Subtarget->useSVEForFixedLengthVectors() ||
18158 !getSVEPredPatternFromNumElements(MinElts)))
18159 return false;
18160
18161 if (isa<ScalableVectorType>(VecTy) &&
18162 !Subtarget->isSVEorStreamingSVEAvailable())
18163 return false;
18164
18165 // Ensure the number of vector elements is greater than 1.
18166 if (MinElts < 2)
18167 return false;
18168
18169 // Ensure the element type is legal.
18170 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
18171 return false;
18172
18173 if (EC.isScalable()) {
18174 UseScalable = true;
18175 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
18176 }
18177
18178 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
18179 if (Subtarget->useSVEForFixedLengthVectors()) {
18180 unsigned MinSVEVectorSize =
18181 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
18182 if (VecSize % MinSVEVectorSize == 0 ||
18183 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
18184 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
18185 UseScalable = true;
18186 return true;
18187 }
18188 }
18189
18190 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
18191 // 128 will be split into multiple interleaved accesses.
18192 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
18193}
18194
18195static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
18196 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
18197 return ScalableVectorType::get(VTy->getElementType(), 2);
18198
18199 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
18200 return ScalableVectorType::get(VTy->getElementType(), 4);
18201
18202 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
18203 return ScalableVectorType::get(VTy->getElementType(), 8);
18204
18205 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
18206 return ScalableVectorType::get(VTy->getElementType(), 8);
18207
18208 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
18209 return ScalableVectorType::get(VTy->getElementType(), 2);
18210
18211 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
18212 return ScalableVectorType::get(VTy->getElementType(), 4);
18213
18214 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
18215 return ScalableVectorType::get(VTy->getElementType(), 8);
18216
18217 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
18218 return ScalableVectorType::get(VTy->getElementType(), 16);
18219
18220 llvm_unreachable("Cannot handle input vector type");
18221}
18222
18223static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
18224 bool Scalable, Type *LDVTy,
18225 Type *PtrTy) {
18226 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
18227 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
18228 Intrinsic::aarch64_sve_ld3_sret,
18229 Intrinsic::aarch64_sve_ld4_sret};
18230 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
18231 Intrinsic::aarch64_neon_ld3,
18232 Intrinsic::aarch64_neon_ld4};
18233 if (Scalable)
18234 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
18235
18236 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
18237 {LDVTy, PtrTy});
18238}
18239
18240static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
18241 bool Scalable, Type *STVTy,
18242 Type *PtrTy) {
18243 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
18244 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
18245 Intrinsic::aarch64_sve_st3,
18246 Intrinsic::aarch64_sve_st4};
18247 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
18248 Intrinsic::aarch64_neon_st3,
18249 Intrinsic::aarch64_neon_st4};
18250 if (Scalable)
18251 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
18252
18253 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
18254 {STVTy, PtrTy});
18255}
18256
18257/// Lower an interleaved load into a ldN intrinsic.
18258///
18259/// E.g. Lower an interleaved load (Factor = 2):
18260/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
18261/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
18262/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
18263///
18264/// Into:
18265/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
18266/// %vec0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 0
18267/// %vec1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 1
18268bool AArch64TargetLowering::lowerInterleavedLoad(
18269 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
18270 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
18271 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18272 "Invalid interleave factor");
18273 assert(!Shuffles.empty() && "Empty shufflevector input");
18274 assert(Shuffles.size() == Indices.size() &&
18275 "Unmatched number of shufflevectors and indices");
18276
18277 auto *LI = dyn_cast<LoadInst>(Load);
18278 if (!LI)
18279 return false;
18280 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
18281
18282 const DataLayout &DL = LI->getDataLayout();
18283
18284 VectorType *VTy = Shuffles[0]->getType();
18285
18286 // Skip if we do not have NEON and skip illegal vector types. We can
18287 // "legalize" wide vector types into multiple interleaved accesses as long as
18288 // the vector types are divisible by 128.
18289 bool UseScalable;
18290 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18291 return false;
18292
18293 // Check if the interleave is a zext(shuffle), that can be better optimized
18294 // into shift / and masks. For the moment we do this just for uitofp (not
18295 // zext) to avoid issues with widening instructions.
18296 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
18297 using namespace llvm::PatternMatch;
18298 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
18299 SI->getType()->getScalarSizeInBits() * 4 ==
18300 SI->user_back()->getType()->getScalarSizeInBits();
18301 }))
18302 return false;
18303
18304 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18305
18306 auto *FVTy = cast<FixedVectorType>(VTy);
18307
18308 // A pointer vector can not be the return type of the ldN intrinsics. Need to
18309 // load integer vectors first and then convert to pointer vectors.
18310 Type *EltTy = FVTy->getElementType();
18311 if (EltTy->isPointerTy())
18312 FVTy =
18313 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
18314
18315 // If we're going to generate more than one load, reset the sub-vector type
18316 // to something legal.
18317 FVTy = FixedVectorType::get(FVTy->getElementType(),
18318 FVTy->getNumElements() / NumLoads);
18319
18320 auto *LDVTy =
18321 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
18322
18323 IRBuilder<> Builder(LI);
18324
18325 // The base address of the load.
18326 Value *BaseAddr = LI->getPointerOperand();
18327
18328 Type *PtrTy = LI->getPointerOperandType();
18329 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
18330 LDVTy->getElementCount());
18331
18332 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18333 UseScalable, LDVTy, PtrTy);
18334
18335 // Holds sub-vectors extracted from the load intrinsic return values. The
18336 // sub-vectors are associated with the shufflevector instructions they will
18337 // replace.
18338 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
18339
18340 Value *PTrue = nullptr;
18341 if (UseScalable) {
18342 std::optional<unsigned> PgPattern =
18343 getSVEPredPatternFromNumElements(FVTy->getNumElements());
18344 if (Subtarget->getMinSVEVectorSizeInBits() ==
18345 Subtarget->getMaxSVEVectorSizeInBits() &&
18346 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
18347 PgPattern = AArch64SVEPredPattern::all;
18348
18349 auto *PTruePat =
18350 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
18351 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18352 {PTruePat});
18353 }
18354
18355 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
18356
18357 // If we're generating more than one load, compute the base address of
18358 // subsequent loads as an offset from the previous.
18359 if (LoadCount > 0)
18360 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
18361 FVTy->getNumElements() * Factor);
18362
18363 CallInst *LdN;
18364 if (UseScalable)
18365 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
18366 else
18367 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18368
18369 // Extract and store the sub-vectors returned by the load intrinsic.
18370 for (unsigned i = 0; i < Shuffles.size(); i++) {
18371 ShuffleVectorInst *SVI = Shuffles[i];
18372 unsigned Index = Indices[i];
18373
18374 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
18375
18376 if (UseScalable)
18377 SubVec = Builder.CreateExtractVector(FVTy, SubVec, uint64_t(0));
18378
18379 // Convert the integer vector to pointer vector if the element is pointer.
18380 if (EltTy->isPointerTy())
18381 SubVec = Builder.CreateIntToPtr(
18382 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
18383 FVTy->getNumElements()));
18384
18385 SubVecs[SVI].push_back(SubVec);
18386 }
18387 }
18388
18389 // Replace uses of the shufflevector instructions with the sub-vectors
18390 // returned by the load intrinsic. If a shufflevector instruction is
18391 // associated with more than one sub-vector, those sub-vectors will be
18392 // concatenated into a single wide vector.
18393 for (ShuffleVectorInst *SVI : Shuffles) {
18394 auto &SubVec = SubVecs[SVI];
18395 auto *WideVec =
18396 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
18397 SVI->replaceAllUsesWith(WideVec);
18398 }
18399
18400 return true;
18401}
18402
18403template <typename Iter>
18404bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
18405 int MaxLookupDist = 20;
18406 unsigned IdxWidth = DL.getIndexSizeInBits(0);
18407 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
18408 const Value *PtrA1 =
18409 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
18410
18411 while (++It != End) {
18412 if (It->isDebugOrPseudoInst())
18413 continue;
18414 if (MaxLookupDist-- == 0)
18415 break;
18416 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
18417 const Value *PtrB1 =
18418 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
18419 DL, OffsetB);
18420 if (PtrA1 == PtrB1 &&
18421 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
18422 .abs() == 16)
18423 return true;
18424 }
18425 }
18426
18427 return false;
18428}
18429
18430/// Lower an interleaved store into a stN intrinsic.
18431///
18432/// E.g. Lower an interleaved store (Factor = 3):
18433/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
18434/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
18435/// store <12 x i32> %i.vec, <12 x i32>* %ptr
18436///
18437/// Into:
18438/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
18439/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
18440/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
18441/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
18442///
18443/// Note that the new shufflevectors will be removed and we'll only generate one
18444/// st3 instruction in CodeGen.
18445///
18446/// Example for a more general valid mask (Factor 3). Lower:
18447/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
18448/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
18449/// store <12 x i32> %i.vec, <12 x i32>* %ptr
18450///
18451/// Into:
18452/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
18453/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
18454/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
18455/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
18456bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
18457 Value *LaneMask,
18458 ShuffleVectorInst *SVI,
18459 unsigned Factor,
18460 const APInt &GapMask) const {
18461
18462 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18463 "Invalid interleave factor");
18464 auto *SI = dyn_cast<StoreInst>(Store);
18465 if (!SI)
18466 return false;
18467 assert(!LaneMask && GapMask.popcount() == Factor &&
18468 "Unexpected mask on store");
18469
18470 auto *VecTy = cast<FixedVectorType>(SVI->getType());
18471 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
18472
18473 unsigned LaneLen = VecTy->getNumElements() / Factor;
18474 Type *EltTy = VecTy->getElementType();
18475 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
18476
18477 const DataLayout &DL = SI->getDataLayout();
18478 bool UseScalable;
18479
18480 // Skip if we do not have NEON and skip illegal vector types. We can
18481 // "legalize" wide vector types into multiple interleaved accesses as long as
18482 // the vector types are divisible by 128.
18483 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
18484 return false;
18485
18486 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
18487
18488 Value *Op0 = SVI->getOperand(0);
18489 Value *Op1 = SVI->getOperand(1);
18490 IRBuilder<> Builder(SI);
18491
18492 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
18493 // vectors to integer vectors.
18494 if (EltTy->isPointerTy()) {
18495 Type *IntTy = DL.getIntPtrType(EltTy);
18496 unsigned NumOpElts =
18497 cast<FixedVectorType>(Op0->getType())->getNumElements();
18498
18499 // Convert to the corresponding integer vector.
18500 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
18501 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
18502 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
18503
18504 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
18505 }
18506
18507 // If we're going to generate more than one store, reset the lane length
18508 // and sub-vector type to something legal.
18509 LaneLen /= NumStores;
18510 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
18511
18512 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
18513 : SubVecTy;
18514
18515 // The base address of the store.
18516 Value *BaseAddr = SI->getPointerOperand();
18517
18518 auto Mask = SVI->getShuffleMask();
18519
18520 // Sanity check if all the indices are NOT in range.
18521 // If mask is `poison`, `Mask` may be a vector of -1s.
18522 // If all of them are `poison`, OOB read will happen later.
18523 if (llvm::all_of(Mask, equal_to(PoisonMaskElem))) {
18524 return false;
18525 }
18526 // A 64bit st2 which does not start at element 0 will involve adding extra
18527 // ext elements making the st2 unprofitable, and if there is a nearby store
18528 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
18529 // zip;ldp pair which has higher throughput.
18530 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
18531 (Mask[0] != 0 ||
18532 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
18533 DL) ||
18534 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
18535 BaseAddr, DL)))
18536 return false;
18537
18538 Type *PtrTy = SI->getPointerOperandType();
18539 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
18540 STVTy->getElementCount());
18541
18542 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18543 UseScalable, STVTy, PtrTy);
18544
18545 Value *PTrue = nullptr;
18546 if (UseScalable) {
18547 std::optional<unsigned> PgPattern =
18548 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
18549 if (Subtarget->getMinSVEVectorSizeInBits() ==
18550 Subtarget->getMaxSVEVectorSizeInBits() &&
18551 Subtarget->getMinSVEVectorSizeInBits() ==
18552 DL.getTypeSizeInBits(SubVecTy))
18553 PgPattern = AArch64SVEPredPattern::all;
18554
18555 auto *PTruePat =
18556 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
18557 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18558 {PTruePat});
18559 }
18560
18561 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
18562
18563 SmallVector<Value *, 5> Ops;
18564
18565 // Split the shufflevector operands into sub vectors for the new stN call.
18566 for (unsigned i = 0; i < Factor; i++) {
18567 Value *Shuffle;
18568 unsigned IdxI = StoreCount * LaneLen * Factor + i;
18569 if (Mask[IdxI] >= 0) {
18570 Shuffle = Builder.CreateShuffleVector(
18571 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
18572 } else {
18573 unsigned StartMask = 0;
18574 for (unsigned j = 1; j < LaneLen; j++) {
18575 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
18576 if (Mask[IdxJ] >= 0) {
18577 StartMask = Mask[IdxJ] - j;
18578 break;
18579 }
18580 }
18581 // Note: Filling undef gaps with random elements is ok, since
18582 // those elements were being written anyway (with undefs).
18583 // In the case of all undefs we're defaulting to using elems from 0
18584 // Note: StartMask cannot be negative, it's checked in
18585 // isReInterleaveMask
18586 Shuffle = Builder.CreateShuffleVector(
18587 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
18588 }
18589
18590 if (UseScalable)
18591 Shuffle = Builder.CreateInsertVector(STVTy, PoisonValue::get(STVTy),
18592 Shuffle, uint64_t(0));
18593
18594 Ops.push_back(Shuffle);
18595 }
18596
18597 if (UseScalable)
18598 Ops.push_back(PTrue);
18599
18600 // If we're generating more than one store, compute the base address of
18601 // subsequent stores as an offset from the previous.
18602 if (StoreCount > 0)
18603 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
18604 BaseAddr, LaneLen * Factor);
18605
18606 Ops.push_back(BaseAddr);
18607 Builder.CreateCall(StNFunc, Ops);
18608 }
18609 return true;
18610}
18611
18612bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
18613 Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
18614 const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
18615 if (Factor != 2 && Factor != 3 && Factor != 4) {
18616 LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
18617 return false;
18618 }
18619 auto *LI = dyn_cast<LoadInst>(Load);
18620 if (!LI)
18621 return false;
18622 assert(!Mask && "Unexpected mask on a load\n");
18623
18624 VectorType *VTy = getDeinterleavedVectorType(DI);
18625
18626 const DataLayout &DL = LI->getModule()->getDataLayout();
18627 bool UseScalable;
18628 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18629 return false;
18630
18631 // TODO: Add support for using SVE instructions with fixed types later, using
18632 // the code from lowerInterleavedLoad to obtain the correct container type.
18633 if (UseScalable && !VTy->isScalableTy())
18634 return false;
18635
18636 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18637 VectorType *LdTy =
18638 VectorType::get(VTy->getElementType(),
18639 VTy->getElementCount().divideCoefficientBy(NumLoads));
18640
18641 Type *PtrTy = LI->getPointerOperandType();
18642 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18643 UseScalable, LdTy, PtrTy);
18644
18645 IRBuilder<> Builder(LI);
18646 Value *Pred = nullptr;
18647 if (UseScalable)
18648 Pred =
18649 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
18650
18651 Value *BaseAddr = LI->getPointerOperand();
18652 Value *Result = nullptr;
18653 if (NumLoads > 1) {
18654 // Create multiple legal small ldN.
18655 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
18656 for (unsigned I = 0; I < NumLoads; ++I) {
18657 Value *Offset = Builder.getInt64(I * Factor);
18658
18659 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
18660 Value *LdN = nullptr;
18661 if (UseScalable)
18662 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
18663 else
18664 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
18665 Value *Idx =
18666 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
18667 for (unsigned J = 0; J < Factor; ++J) {
18668 ExtractedLdValues[J] = Builder.CreateInsertVector(
18669 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
18670 }
18671 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
18672 }
18673
18674 // Merge the values from different factors.
18675 Result = PoisonValue::get(DI->getType());
18676 for (unsigned J = 0; J < Factor; ++J)
18677 Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
18678 } else {
18679 if (UseScalable)
18680 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
18681 else
18682 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18683 }
18684
18685 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
18686 DI->replaceAllUsesWith(Result);
18687 return true;
18688}
18689
18690bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
18691 Instruction *Store, Value *Mask,
18692 ArrayRef<Value *> InterleavedValues) const {
18693 unsigned Factor = InterleavedValues.size();
18694 if (Factor != 2 && Factor != 3 && Factor != 4) {
18695 LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n");
18696 return false;
18697 }
18698 auto *SI = dyn_cast<StoreInst>(Store);
18699 if (!SI)
18700 return false;
18701 assert(!Mask && "Unexpected mask on plain store");
18702
18703 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
18704 const DataLayout &DL = SI->getModule()->getDataLayout();
18705
18706 bool UseScalable;
18707 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18708 return false;
18709
18710 // TODO: Add support for using SVE instructions with fixed types later, using
18711 // the code from lowerInterleavedStore to obtain the correct container type.
18712 if (UseScalable && !VTy->isScalableTy())
18713 return false;
18714
18715 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
18716
18717 VectorType *StTy =
18718 VectorType::get(VTy->getElementType(),
18719 VTy->getElementCount().divideCoefficientBy(NumStores));
18720
18721 Type *PtrTy = SI->getPointerOperandType();
18722 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18723 UseScalable, StTy, PtrTy);
18724
18725 IRBuilder<> Builder(SI);
18726
18727 Value *BaseAddr = SI->getPointerOperand();
18728 Value *Pred = nullptr;
18729
18730 if (UseScalable)
18731 Pred =
18732 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
18733
18734 auto ExtractedValues = InterleavedValues;
18735 SmallVector<Value *, 4> StoreOperands(InterleavedValues);
18736 if (UseScalable)
18737 StoreOperands.push_back(Pred);
18738 StoreOperands.push_back(BaseAddr);
18739 for (unsigned I = 0; I < NumStores; ++I) {
18740 Value *Address = BaseAddr;
18741 if (NumStores > 1) {
18742 Value *Offset = Builder.getInt64(I * Factor);
18743 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
18744 Value *Idx =
18745 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
18746 for (unsigned J = 0; J < Factor; J++) {
18747 StoreOperands[J] =
18748 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
18749 }
18750 // update the address
18751 StoreOperands[StoreOperands.size() - 1] = Address;
18752 }
18753 Builder.CreateCall(StNFunc, StoreOperands);
18754 }
18755 return true;
18756}
18757
18758EVT AArch64TargetLowering::getOptimalMemOpType(
18759 LLVMContext &Context, const MemOp &Op,
18760 const AttributeList &FuncAttributes) const {
18761 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18762 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18763 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18764 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
18765 // taken one instruction to materialize the v2i64 zero and one store (with
18766 // restrictive addressing mode). Just do i64 stores.
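  // For example, an unaligned 16-byte memset counts as a small memset, so this
  // hook typically returns i64 (two stores) rather than a single 128-bit store.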
18767 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18768 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18769 if (Op.isAligned(AlignCheck))
18770 return true;
18771 unsigned Fast;
18772 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18773 MachineMemOperand::MONone, &Fast) &&
18774 Fast;
18775 };
18776
18777 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18778 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
18779 return MVT::v16i8;
18780 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18781 return MVT::f128;
18782 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18783 return MVT::i64;
18784 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18785 return MVT::i32;
18786 return MVT::Other;
18787}
18788
18789LLT AArch64TargetLowering::getOptimalMemOpLLT(
18790 const MemOp &Op, const AttributeList &FuncAttributes) const {
18791 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18792 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18793 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18794 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
18795 // taken one instruction to materialize the v2i64 zero and one store (with
18796 // restrictive addressing mode). Just do i64 stores.
18797 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18798 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18799 if (Op.isAligned(AlignCheck))
18800 return true;
18801 unsigned Fast;
18802 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18803 MachineMemOperand::MONone, &Fast) &&
18804 Fast;
18805 };
18806
18807 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18808 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
18809 return LLT::fixed_vector(2, 64);
18810 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18811 return LLT::scalar(128);
18812 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18813 return LLT::scalar(64);
18814 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18815 return LLT::scalar(32);
18816 return LLT();
18817}
18818
18819// 12-bit optionally shifted immediates are legal for adds.
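// For example, add x0, x1, #4095 and add x0, x1, #4095, lsl #12 are both
// single instructions, whereas an immediate such as 4097 has no add/sub
// encoding and is rejected here.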
18820bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
18821 if (Immed == std::numeric_limits<int64_t>::min()) {
18822 return false;
18823 }
18824 // Same encoding for add/sub, just flip the sign.
18825 return isLegalArithImmed((uint64_t)std::abs(Immed));
18826}
18827
18828bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
18829 // We will only emit addvl/inc* instructions for SVE2
18830 if (!Subtarget->hasSVE2())
18831 return false;
18832
18833 // addvl's immediates are in terms of the number of bytes in a register.
18834 // Since there are 16 in the base supported size (128bits), we need to
18835 // divide the immediate by that much to give us a useful immediate to
18836 // multiply by vscale. We can't have a remainder as a result of this.
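  // For example, Imm == 64 maps to addvl #4 (64 / 16), whereas Imm == 24 has a
  // remainder and is instead handled by the inc/dec checks below.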
18837 if (Imm % 16 == 0)
18838 return isInt<6>(Imm / 16);
18839
18840 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
18841 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
18842 // of addvl as a result, so only take h|w|d into account.
18843 // Dec[h|w|d] will cover subtractions.
18844 // Immediates are in the range [1,16], so we can't do a 2's complement check.
18845 // FIXME: Can we make use of other patterns to cover other immediates?
18846
18847 // inch|dech
18848 if (Imm % 8 == 0)
18849 return std::abs(Imm / 8) <= 16;
18850 // incw|decw
18851 if (Imm % 4 == 0)
18852 return std::abs(Imm / 4) <= 16;
18853 // incd|decd
18854 if (Imm % 2 == 0)
18855 return std::abs(Imm / 2) <= 16;
18856
18857 return false;
18858}
18859
18860// Return false to prevent folding
18861// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
18862// if the folding leads to worse code.
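// For example, folding (mul (add x, 1), 0x123456789) would trade a legal add
// immediate for a product that needs a MOVZ plus two MOVKs to materialise, so
// we return false in that case.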
18863bool AArch64TargetLowering::isMulAddWithConstProfitable(
18864 SDValue AddNode, SDValue ConstNode) const {
18865 // Let the DAGCombiner decide for vector types and large types.
18866 const EVT VT = AddNode.getValueType();
18867 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
18868 return true;
18869
18870 // It is worse if c1 is a legal add immediate while c1*c2 is not, and has to
18871 // be composed of at least two instructions.
18872 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
18873 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
18874 const int64_t C1 = C1Node->getSExtValue();
18875 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
18876 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
18877 return true;
18878 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
18879 // Adapt to the width of a register.
18880 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
18881 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
18882 if (Insn.size() > 1)
18883 return false;
18884
18885 // Default to true and let the DAGCombiner decide.
18886 return true;
18887}
18888
18889// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
18890// immediates is the same as for an add or a sub.
18891bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
18892 return isLegalAddImmediate(Immed);
18893}
18894
18895/// isLegalAddressingMode - Return true if the addressing mode represented
18896/// by AM is legal for this target, for a load/store of the specified type.
18897bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
18898 const AddrMode &AMode, Type *Ty,
18899 unsigned AS, Instruction *I) const {
18900 // AArch64 has five basic addressing modes:
18901 // reg
18902 // reg + 9-bit signed offset
18903 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
18904 // reg1 + reg2
18905 // reg + SIZE_IN_BYTES * reg
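  // For an i64 load these correspond to, e.g., ldr x0, [x1],
  // ldur x0, [x1, #-8], ldr x0, [x1, #32760], ldr x0, [x1, x2] and
  // ldr x0, [x1, x2, lsl #3] respectively.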
18906
18907 // No global is ever allowed as a base.
18908 if (AMode.BaseGV)
18909 return false;
18910
18911 // No reg+reg+imm addressing.
18912 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
18913 return false;
18914
18915 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
18916 // `2*ScaledReg` into `BaseReg + ScaledReg`
18917 AddrMode AM = AMode;
18918 if (AM.Scale && !AM.HasBaseReg) {
18919 if (AM.Scale == 1) {
18920 AM.HasBaseReg = true;
18921 AM.Scale = 0;
18922 } else if (AM.Scale == 2) {
18923 AM.HasBaseReg = true;
18924 AM.Scale = 1;
18925 } else {
18926 return false;
18927 }
18928 }
18929
18930 // A base register is required in all addressing modes.
18931 if (!AM.HasBaseReg)
18932 return false;
18933
18934 if (Ty->isScalableTy()) {
18935 if (isa<ScalableVectorType>(Ty)) {
18936 // See if we have a foldable vscale-based offset, for vector types which
18937 // are either legal or smaller than the minimum; more work will be
18938 // required if we need to consider addressing for types which need
18939 // legalization by splitting.
18940 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
18941 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
18942 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
18943 isPowerOf2_64(VecNumBytes))
18944 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
18945
18946 uint64_t VecElemNumBytes =
18947 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
18948 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
18949 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
18950 }
18951
18952 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
18953 }
18954
18955 // No scalable offsets allowed for non-scalable types.
18956 if (AM.ScalableOffset)
18957 return false;
18958
18959 // check reg + imm case:
18960 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
18961 uint64_t NumBytes = 0;
18962 if (Ty->isSized()) {
18963 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
18964 NumBytes = NumBits / 8;
18965 if (!isPowerOf2_64(NumBits))
18966 NumBytes = 0;
18967 }
18968
18969 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
18970 AM.Scale);
18971}
18972
18973// Check whether the 2 offsets belong to the same imm24 range, and their high
18974// 12bits are same, then their high part can be decoded with the offset of add.
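// For example, offsets 4100 and 4160 share the high part 4096, so 4096 is
// returned as the common base and the residuals 4 and 64 still fit in the
// imm12 field of the load/store.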
18975int64_t
18976AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
18977 int64_t MaxOffset) const {
18978 int64_t HighPart = MinOffset & ~0xfffULL;
18979 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
18980 // Rebase the value to an integer multiple of imm12.
18981 return HighPart;
18982 }
18983
18984 return 0;
18985}
18986
18987bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
18988 // Consider splitting large offset of struct or array.
18989 return true;
18990}
18991
18992bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
18993 const MachineFunction &MF, EVT VT) const {
18994 EVT ScalarVT = VT.getScalarType();
18995
18996 if (!ScalarVT.isSimple())
18997 return false;
18998
18999 switch (ScalarVT.getSimpleVT().SimpleTy) {
19000 case MVT::f16:
19001 return Subtarget->hasFullFP16();
19002 case MVT::f32:
19003 case MVT::f64:
19004 return true;
19005 case MVT::bf16:
19006 return VT.isScalableVector() && Subtarget->hasBF16() &&
19007 Subtarget->isNonStreamingSVEorSME2Available();
19008 default:
19009 break;
19010 }
19011
19012 return false;
19013}
19014
19015bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
19016 Type *Ty) const {
19017 switch (Ty->getScalarType()->getTypeID()) {
19018 case Type::FloatTyID:
19019 case Type::DoubleTyID:
19020 return true;
19021 default:
19022 return false;
19023 }
19024}
19025
19026bool AArch64TargetLowering::generateFMAsInMachineCombiner(
19027 EVT VT, CodeGenOptLevel OptLevel) const {
19028 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
19029 !useSVEForFixedLengthVectorVT(VT);
19030}
19031
19032const MCPhysReg *
19033AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
19034 // LR is a callee-save register, but we must treat it as clobbered by any call
19035 // site. Hence we include LR in the scratch registers, which are in turn added
19036 // as implicit-defs for stackmaps and patchpoints.
19037 static const MCPhysReg ScratchRegs[] = {
19038 AArch64::X16, AArch64::X17, AArch64::LR, 0
19039 };
19040 return ScratchRegs;
19041}
19042
19043ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
19044 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
19045 return RCRegs;
19046}
19047
19048bool
19049AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
19050 CombineLevel Level) const {
19051 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
19052 N->getOpcode() == ISD::SRL) &&
19053 "Expected shift op");
19054
19055 SDValue ShiftLHS = N->getOperand(0);
19056 EVT VT = N->getValueType(0);
19057
19058 if (!ShiftLHS->hasOneUse())
19059 return false;
19060
19061 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
19062 !ShiftLHS.getOperand(0)->hasOneUse())
19063 return false;
19064
19065 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
19066 // combine it with shift 'N' to let it be lowered to UBFX except:
19067 // ((x >> C) & mask) << C.
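  // For example, ((x >> 3) & 0xff) << 3 may still be combined here, while
  // ((x >> 3) & 0xff) << 2 is left alone so the srl+and can become a UBFX.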
19068 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
19069 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
19070 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
19071 if (isMask_64(TruncMask)) {
19072 SDValue AndLHS = ShiftLHS.getOperand(0);
19073 if (AndLHS.getOpcode() == ISD::SRL) {
19074 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
19075 if (N->getOpcode() == ISD::SHL)
19076 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
19077 return SRLC->getZExtValue() == SHLC->getZExtValue();
19078 return false;
19079 }
19080 }
19081 }
19082 }
19083 return true;
19084}
19085
19086bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
19087 const SDNode *N) const {
19088 assert(N->getOpcode() == ISD::XOR &&
19089 (N->getOperand(0).getOpcode() == ISD::SHL ||
19090 N->getOperand(0).getOpcode() == ISD::SRL) &&
19091 "Expected XOR(SHIFT) pattern");
19092
19093 // Only commute if the entire NOT mask is a hidden shifted mask.
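  // For example, for i32 (xor (shl X, 24), 0xff000000) the xor constant covers
  // exactly the bits produced by the shift, so the commute is allowed; a
  // constant such as 0x00ff0000 would not qualify.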
19094 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
19095 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
19096 if (XorC && ShiftC) {
19097 unsigned MaskIdx, MaskLen;
19098 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
19099 unsigned ShiftAmt = ShiftC->getZExtValue();
19100 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
19101 if (N->getOperand(0).getOpcode() == ISD::SHL)
19102 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
19103 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
19104 }
19105 }
19106
19107 return false;
19108}
19109
19110bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
19111 const SDNode *N) const {
19112 assert(((N->getOpcode() == ISD::SHL &&
19113 N->getOperand(0).getOpcode() == ISD::SRL) ||
19114 (N->getOpcode() == ISD::SRL &&
19115 N->getOperand(0).getOpcode() == ISD::SHL)) &&
19116 "Expected shift-shift mask");
19117 // Don't allow multiuse shift folding with the same shift amount.
19118 if (!N->getOperand(0)->hasOneUse())
19119 return false;
19120
19121 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
19122 EVT VT = N->getValueType(0);
19123 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
19124 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
19125 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
19126 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
19127 }
19128
19129 // We do not need to fold when this shift is used in a specific load case:
19130 // (ldr x, (add x, (shl (srl x, c1) 2)))
19131 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
19132 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
19133 unsigned ShlAmt = C2->getZExtValue();
19134 if (auto ShouldADD = *N->user_begin();
19135 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
19136 if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
19137 EVT MemVT = Load->getMemoryVT();
19138
19139 if (Load->getValueType(0).isScalableVector())
19140 return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
19141
19142 if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
19143 return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
19144 }
19145 }
19146 }
19147 }
19148
19149 return true;
19150}
19151
19152bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
19153 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
19154 SDValue Y) const {
19155 return VT.isScalableVector() && isTypeLegal(VT) &&
19156 SelectOpcode == ISD::VSELECT;
19157}
19158
19159bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
19160 Type *Ty) const {
19161 assert(Ty->isIntegerTy());
19162
19163 unsigned BitSize = Ty->getPrimitiveSizeInBits();
19164 if (BitSize == 0)
19165 return false;
19166
19167 int64_t Val = Imm.getSExtValue();
19168 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
19169 return true;
19170
19171 if (Val < 0)
19172 Val = ~Val;
19173 if (BitSize == 32)
19174 Val &= (1LL << 32) - 1;
19175
19176 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
19177 // MOVZ is free so return true for one or fewer MOVK.
19178 return Shift < 3;
19179}
19180
19181bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
19182 unsigned Index) const {
19183 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
19184 return false;
19185
19186 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
19187}
19188
19190 LLVMContext &Context, EVT VT) const {
19191 if (getTypeAction(Context, VT) != TypeExpandInteger)
19192 return false;
19193
19194 EVT LegalTy = EVT::getIntegerVT(Context, VT.getSizeInBits() / 2);
19195 return getTypeAction(Context, LegalTy) == TargetLowering::TypeLegal;
19196}
19197
19198/// Turn vector tests of the signbit in the form of:
19199/// xor (sra X, elt_size(X)-1), -1
19200/// into:
19201/// cmge X, X, #0
19202static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
19203 const AArch64Subtarget *Subtarget) {
19204 EVT VT = N->getValueType(0);
19205 if (!Subtarget->hasNEON() || !VT.isVector())
19206 return SDValue();
19207
19208 // There must be a shift right algebraic before the xor, and the xor must be a
19209 // 'not' operation.
19210 SDValue Shift = N->getOperand(0);
19211 SDValue Ones = N->getOperand(1);
19212 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
19213 !ISD::isBuildVectorAllOnes(Ones.getNode()))
19214 return SDValue();
19215
19216 // The shift should be smearing the sign bit across each vector element.
19217 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
19218 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
19219 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
19220 return SDValue();
19221
19222 SDLoc DL(N);
19223 SDValue Zero = DAG.getConstant(0, DL, Shift.getValueType());
19224 return DAG.getSetCC(DL, VT, Shift.getOperand(0), Zero, ISD::SETGE);
19225}
19226
19227// Given a vecreduce_add node, detect the below pattern and convert it to the
19228// node sequence with UABDL, [S|U]ABD and UADDLP.
19229//
19230// i32 vecreduce_add(
19231// v16i32 abs(
19232// v16i32 sub(
19233// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
19234//
19235// or
19236//
19237// i32 vecreduce_add(
19238// v16i32 zext(
19239// v16i16 abs(
19240// v16i16 sub(
19241// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
19242//
19243// =================>
19244// i32 vecreduce_add(
19245// v4i32 UADDLP(
19246// v8i16 add(
19247// v8i16 zext(
19248// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
19249// v8i16 zext(
19250// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
19251static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
19252 SelectionDAG &DAG) {
19253 // Assumed i32 vecreduce_add
19254 if (N->getValueType(0) != MVT::i32)
19255 return SDValue();
19256
19257 SDValue VecReduceOp0 = N->getOperand(0);
19258 bool SawTrailingZext = false;
19259 // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
19260 if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
19261 VecReduceOp0->getValueType(0) == MVT::v16i32 &&
19262 VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
19263 VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
19264 SawTrailingZext = true;
19265 VecReduceOp0 = VecReduceOp0.getOperand(0);
19266 }
19267
19268 // The abs input is v16i16 if we looked through the trailing zext, else v16i32.
19269 MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
19270 // Assumed v16i16 or v16i32 abs input
19271 unsigned Opcode = VecReduceOp0.getOpcode();
19272 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
19273 return SDValue();
19274
19275 SDValue ABS = VecReduceOp0;
19276 // Assumed v16i16 or v16i32 sub
19277 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
19278 ABS->getOperand(0)->getValueType(0) != AbsInputVT)
19279 return SDValue();
19280
19281 SDValue SUB = ABS->getOperand(0);
19282 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
19283 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
19284 // Assumed v16i16 or v16i32 type
19285 if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
19286 SUB->getOperand(1)->getValueType(0) != AbsInputVT)
19287 return SDValue();
19288
19289 // Assumed zext or sext
19290 bool IsZExt = false;
19291 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
19292 IsZExt = true;
19293 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
19294 IsZExt = false;
19295 } else
19296 return SDValue();
19297
19298 SDValue EXT0 = SUB->getOperand(0);
19299 SDValue EXT1 = SUB->getOperand(1);
19300 // Assumed zext's operand has v16i8 type
19301 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
19302 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
19303 return SDValue();
19304
19305 // The pattern is detected. Convert it to a sequence of nodes.
19306 SDLoc DL(N);
19307
19308 // First, create the node pattern of UABD/SABD.
19309 SDValue UABDHigh8Op0 =
19310 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
19311 DAG.getConstant(8, DL, MVT::i64));
19312 SDValue UABDHigh8Op1 =
19313 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
19314 DAG.getConstant(8, DL, MVT::i64));
19315 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
19316 UABDHigh8Op0, UABDHigh8Op1);
19317 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
19318
19319 // Second, create the node pattern of UABAL.
19320 SDValue UABDLo8Op0 =
19321 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
19322 DAG.getConstant(0, DL, MVT::i64));
19323 SDValue UABDLo8Op1 =
19324 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
19325 DAG.getConstant(0, DL, MVT::i64));
19326 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
19327 UABDLo8Op0, UABDLo8Op1);
19328 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
19329 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
19330
19331 // Third, create the node of UADDLP.
19332 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
19333
19334 // Fourth, create the node of VECREDUCE_ADD.
19335 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
19336}
19337
19338static SDValue
19340 const AArch64Subtarget *ST) {
19341 if (DCI.isBeforeLegalize())
19342 return SDValue();
19343
19344 if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false,
19345 /*IsEqual=*/false))
19346 return While;
19347
19348 if (!N->getValueType(0).isScalableVector() ||
19349 (!ST->hasSVE2p1() && !(ST->hasSME2() && ST->isStreaming())))
19350 return SDValue();
19351
19352 // Count the number of users which are extract_subvectors.
19353 unsigned NumExts = count_if(N->users(), [](SDNode *Use) {
19354 return Use->getOpcode() == ISD::EXTRACT_SUBVECTOR;
19355 });
19356
19357 auto MaskEC = N->getValueType(0).getVectorElementCount();
19358 if (!MaskEC.isKnownMultipleOf(NumExts))
19359 return SDValue();
19360
19361 ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumExts);
19362 if (ExtMinEC.getKnownMinValue() < 2)
19363 return SDValue();
19364
19365 SmallVector<SDNode *> Extracts(NumExts, nullptr);
19366 for (SDNode *Use : N->users()) {
19367 if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR)
19368 continue;
19369
19370 // Ensure the extract type is correct (e.g. if NumExts is 4 and
19371 // the mask return type is nxv8i1, each extract should be nxv2i1).
19372 if (Use->getValueType(0).getVectorElementCount() != ExtMinEC)
19373 return SDValue();
19374
19375 // There should be exactly one extract for each part of the mask.
19376 unsigned Offset = Use->getConstantOperandVal(1);
19377 unsigned Part = Offset / ExtMinEC.getKnownMinValue();
19378 if (Extracts[Part] != nullptr)
19379 return SDValue();
19380
19381 Extracts[Part] = Use;
19382 }
19383
19384 SelectionDAG &DAG = DCI.DAG;
19385 SDLoc DL(N);
19386 SDValue ID =
19387 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
19388
19389 SDValue Idx = N->getOperand(0);
19390 SDValue TC = N->getOperand(1);
19391 if (Idx.getValueType() != MVT::i64) {
19392 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
19393 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
19394 }
19395
19396 // Create the whilelo_x2 intrinsics from each pair of extracts
19397 EVT ExtVT = Extracts[0]->getValueType(0);
19398 EVT DoubleExtVT = ExtVT.getDoubleNumVectorElementsVT(*DAG.getContext());
19399 auto R =
19400 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
19401 DCI.CombineTo(Extracts[0], R.getValue(0));
19402 DCI.CombineTo(Extracts[1], R.getValue(1));
19403 SmallVector<SDValue> Concats = {DAG.getNode(
19404 ISD::CONCAT_VECTORS, DL, DoubleExtVT, R.getValue(0), R.getValue(1))};
19405
19406 if (NumExts == 2) {
19407 assert(N->getValueType(0) == DoubleExtVT);
19408 return Concats[0];
19409 }
19410
19411 auto Elts =
19412 DAG.getElementCount(DL, MVT::i64, ExtVT.getVectorElementCount() * 2);
19413 for (unsigned I = 2; I < NumExts; I += 2) {
19414 // After the first whilelo_x2, we need to increment the starting value.
19415 Idx = DAG.getNode(ISD::UADDSAT, DL, MVT::i64, Idx, Elts);
19416 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
19417 DCI.CombineTo(Extracts[I], R.getValue(0));
19418 DCI.CombineTo(Extracts[I + 1], R.getValue(1));
19419 Concats.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, DoubleExtVT,
19420 R.getValue(0), R.getValue(1)));
19421 }
19422
19423 return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), Concats);
19424}
19425
19426 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and a vecreduce:
19427 // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
19428 // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
19429 // If we have vectors larger than v16i8, we extract v16i8 subvectors,
19430 // follow the same steps above to get a DOT instruction for each of them,
19431 // concatenate the DOTs and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
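// For example (illustrative), with A : v16i8
//   i32 vecreduce.add(v16i32 zext(A))
// becomes roughly
//   i32 vecreduce.add(v4i32 UDOT(splat(0), A, splat(1))),
// since UDOT multiplies each i8 lane by 1 and accumulates each group of four
// lanes into one i32 lane.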
19433 const AArch64Subtarget *ST) {
19434 if (!ST->isNeonAvailable())
19435 return SDValue();
19436
19437 if (!ST->hasDotProd())
19439
19440 SDValue Op0 = N->getOperand(0);
19441 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
19442 Op0.getValueType().getVectorElementType() != MVT::i32)
19443 return SDValue();
19444
19445 unsigned ExtOpcode = Op0.getOpcode();
19446 SDValue A = Op0;
19447 SDValue B;
19448 unsigned DotOpcode;
19449 if (ExtOpcode == ISD::MUL) {
19450 A = Op0.getOperand(0);
19451 B = Op0.getOperand(1);
19452 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
19453 return SDValue();
19454 auto OpCodeA = A.getOpcode();
19455 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
19456 return SDValue();
19457
19458 auto OpCodeB = B.getOpcode();
19459 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
19460 return SDValue();
19461
19462 if (OpCodeA == OpCodeB) {
19463 DotOpcode =
19464 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
19465 } else {
19466 // Check USDOT support.
19467 if (!ST->hasMatMulInt8())
19468 return SDValue();
19469 DotOpcode = AArch64ISD::USDOT;
19470 if (OpCodeA == ISD::SIGN_EXTEND)
19471 std::swap(A, B);
19472 }
19473 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
19474 DotOpcode = AArch64ISD::UDOT;
19475 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
19476 DotOpcode = AArch64ISD::SDOT;
19477 } else {
19478 return SDValue();
19479 }
19480
19481 EVT Op0VT = A.getOperand(0).getValueType();
19482 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
19483 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
19484 if (!IsValidElementCount || !IsValidSize)
19485 return SDValue();
19486
19487 SDLoc DL(Op0);
19488 // For non-mla reductions B can be set to 1. For MLA we take the operand of
19489 // the extend B.
19490 if (!B)
19491 B = DAG.getConstant(1, DL, Op0VT);
19492 else
19493 B = B.getOperand(0);
19494
19495 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
19496 unsigned NumOfVecReduce;
19497 EVT TargetType;
19498 if (IsMultipleOf16) {
19499 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
19500 TargetType = MVT::v4i32;
19501 } else {
19502 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
19503 TargetType = MVT::v2i32;
19504 }
19505 // Handle the case where we need to generate only one Dot operation.
19506 if (NumOfVecReduce == 1) {
19507 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
19508 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
19509 A.getOperand(0), B);
19510 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19511 }
19512 // Generate Dot instructions that are multiple of 16.
19513 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
19514 SmallVector<SDValue, 4> SDotVec16;
19515 unsigned I = 0;
19516 for (; I < VecReduce16Num; I += 1) {
19517 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
19518 SDValue Op0 =
19519 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
19520 DAG.getConstant(I * 16, DL, MVT::i64));
19521 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
19522 DAG.getConstant(I * 16, DL, MVT::i64));
19523 SDValue Dot =
19524 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
19525 SDotVec16.push_back(Dot);
19526 }
19527 // Concatenate dot operations.
19528 EVT SDot16EVT =
19529 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
19530 SDValue ConcatSDot16 =
19531 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
19532 SDValue VecReduceAdd16 =
19533 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
19534 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
19535 if (VecReduce8Num == 0)
19536 return VecReduceAdd16;
19537
19538 // Generate the remainder Dot operation that is multiple of 8.
19539 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
19540 SDValue Vec8Op0 =
19541 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
19542 DAG.getConstant(I * 16, DL, MVT::i64));
19543 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
19544 DAG.getConstant(I * 16, DL, MVT::i64));
19545 SDValue Dot =
19546 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
19547 SDValue VecReduceAdd8 =
19548 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19549 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
19550 VecReduceAdd8);
19551}
19552
19553// Given an (integer) vecreduce, we know the order of the inputs does not
19554// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
19555// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
19556// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
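// For example (illustrative), with x : v16i8
//   UADDV(add(zext(extract_lo(x)) : v8i16, zext(extract_hi(x)) : v8i16))
// sums the same lanes as
//   UADDV(UADDLP(x) : v8i16),
// because UADDLP widens and adds adjacent pairs in a single step.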
19558 auto DetectAddExtract = [&](SDValue A) {
19559 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
19560 // UADDLP(x) if found.
19561 assert(A.getOpcode() == ISD::ADD);
19562 EVT VT = A.getValueType();
19563 SDValue Op0 = A.getOperand(0);
19564 SDValue Op1 = A.getOperand(1);
19565 if (Op0.getOpcode() != Op1.getOpcode() ||
19566 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
19567 Op0.getOpcode() != ISD::SIGN_EXTEND))
19568 return SDValue();
19569 SDValue Ext0 = Op0.getOperand(0);
19570 SDValue Ext1 = Op1.getOperand(0);
19571 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19573 Ext0.getOperand(0) != Ext1.getOperand(0) ||
19575 return SDValue();
19576 // Check that the extract source has twice as many elements as the add type,
19577 // and that the extracts are the upper/lower halves of the same source.
19579 VT.getVectorNumElements() * 2)
19580 return SDValue();
19581 if ((Ext0.getConstantOperandVal(1) != 0 ||
19583 (Ext1.getConstantOperandVal(1) != 0 ||
19585 return SDValue();
19586 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
19587 : AArch64ISD::SADDLP;
19588 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
19589 };
19590
19591 if (SDValue R = DetectAddExtract(A))
19592 return R;
19593
19594 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
19595 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
19596 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19597 A.getOperand(1));
19598 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
19599 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
19600 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19601 A.getOperand(0));
19602 return SDValue();
19603}
19604
19605// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
19606// UADDLV(concat), where the concat represents the 64-bit zext sources.
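// For example (illustrative), with a, b : v4i16 the inner
//   add(zext(a) : v4i32, zext(b) : v4i32)
// can be replaced by
//   UADDLV(concat(a, b) : v8i16),
// which widens and sums all eight lanes into lane 0 in one instruction.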
19608 // Look for add(zext(64-bit source), zext(64-bit source)), returning
19609 // UADDLV(concat(zext, zext)) if found.
19610 assert(A.getOpcode() == ISD::ADD);
19611 EVT VT = A.getValueType();
19612 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19613 return SDValue();
19614 SDValue Op0 = A.getOperand(0);
19615 SDValue Op1 = A.getOperand(1);
19616 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
19617 return SDValue();
19618 SDValue Ext0 = Op0.getOperand(0);
19619 SDValue Ext1 = Op1.getOperand(0);
19620 EVT ExtVT0 = Ext0.getValueType();
19621 EVT ExtVT1 = Ext1.getValueType();
19622 // Check zext VTs are the same and 64-bit length.
19623 if (ExtVT0 != ExtVT1 ||
19624 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
19625 return SDValue();
19626 // Get VT for concat of zext sources.
19627 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
19628 SDValue Concat =
19629 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
19630
19631 switch (VT.getSimpleVT().SimpleTy) {
19632 case MVT::v2i64:
19633 case MVT::v4i32:
19634 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
19635 case MVT::v8i16: {
19636 SDValue Uaddlv =
19637 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
19638 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
19639 }
19640 default:
19641 llvm_unreachable("Unhandled vector type");
19642 }
19643}
19644
19646 SDValue A = N->getOperand(0);
19647 if (A.getOpcode() == ISD::ADD) {
19648 if (SDValue R = performUADDVAddCombine(A, DAG))
19649 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
19650 else if (SDValue R = performUADDVZextCombine(A, DAG))
19651 return R;
19652 }
19653
19654 // uaddv(A) --> A if all lanes of A are known to be zeros except the 0th lane.
19655 MVT OpVT = A.getSimpleValueType();
19656 assert(N->getSimpleValueType(0) == OpVT &&
19657 "The operand type should be consistent with the result type of UADDV");
19659 Mask.clearBit(0);
19660 KnownBits KnownLeadingLanes = DAG.computeKnownBits(A, Mask);
19661 if (KnownLeadingLanes.isZero())
19662 return A;
19663
19664 return SDValue();
19665}
19666
19670 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
19671 APInt DemandedElts =
19672 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
19673
19675 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
19676 return SDValue(N, 0);
19677 return SDValue();
19678}
19679
19682 const AArch64Subtarget *Subtarget) {
19683 if (DCI.isBeforeLegalizeOps())
19684 return SDValue();
19685
19686 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
19687}
19688
19689SDValue
19690AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
19691 SelectionDAG &DAG,
19692 SmallVectorImpl<SDNode *> &Created) const {
19693 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19694 if (isIntDivCheap(N->getValueType(0), Attr))
19695 return SDValue(N, 0); // Lower SDIV as SDIV
19696
19697 EVT VT = N->getValueType(0);
19698
19699 // If SVE is available, we can generate
19700 // sdiv(x,y) -> ptrue + asrd , where 'y' is positive pow-2 divisor.
19701 // sdiv(x,y) -> ptrue + asrd + subr , where 'y' is negative pow-2 divisor.
19702 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
19703 return SDValue(N, 0);
19704
19705 // fold (sdiv X, pow2)
19706 if ((VT != MVT::i32 && VT != MVT::i64) ||
19707 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19708 return SDValue();
19709
19710 // If the divisor is 2 or -2, the default expansion is better. It adds the
19711 // sign bit of the dividend, (X >> (BitWidth - 1)), to X before shifting right.
19712 if (Divisor == 2 ||
19713 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
19714 return SDValue();
19715
19716 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
19717}
19718
19719SDValue
19720AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
19721 SelectionDAG &DAG,
19722 SmallVectorImpl<SDNode *> &Created) const {
19723 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19724 if (isIntDivCheap(N->getValueType(0), Attr))
19725 return SDValue(N, 0); // Lower SREM as SREM
19726
19727 EVT VT = N->getValueType(0);
19728
19729 // For scalable and fixed types, mark them as cheap so we can handle them much
19730 // later. This allows us to handle larger-than-legal types.
19731 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
19732 return SDValue(N, 0);
19733
19734 // fold (srem X, pow2)
19735 if ((VT != MVT::i32 && VT != MVT::i64) ||
19736 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19737 return SDValue();
19738
19739 unsigned Lg2 = Divisor.countr_zero();
19740 if (Lg2 == 0)
19741 return SDValue();
19742
19743 SDLoc DL(N);
19744 SDValue N0 = N->getOperand(0);
19745 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
19746 SDValue Zero = DAG.getConstant(0, DL, VT);
19747 SDValue CCVal, CSNeg;
19748 if (Lg2 == 1) {
19749 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
19750 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19751 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
19752
19753 Created.push_back(Cmp.getNode());
19754 Created.push_back(And.getNode());
19755 } else {
19756 SDValue CCVal = getCondCode(DAG, AArch64CC::MI);
19757 SDVTList VTs = DAG.getVTList(VT, FlagsVT);
19758
19759 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
19760 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19761 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
19762 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
19763 Negs.getValue(1));
19764
19765 Created.push_back(Negs.getNode());
19766 Created.push_back(AndPos.getNode());
19767 Created.push_back(AndNeg.getNode());
19768 }
19769
19770 return CSNeg;
19771}
19772
19774 switch(getIntrinsicID(S.getNode())) {
19775 default:
19776 break;
19777 case Intrinsic::aarch64_sve_cntb:
19778 case Intrinsic::aarch64_sve_cnth:
19779 case Intrinsic::aarch64_sve_cntw:
19780 case Intrinsic::aarch64_sve_cntd:
19781 return true;
19782 }
19783 return false;
19784}
19785
19786// Returns the maximum (scalable) value that can be returned by an SVE count
19787// intrinsic. Returns std::nullopt if \p Op is not aarch64_sve_cnt*.
19788static std::optional<ElementCount> getMaxValueForSVECntIntrinsic(SDValue Op) {
19789 Intrinsic::ID IID = getIntrinsicID(Op.getNode());
19790 if (IID == Intrinsic::aarch64_sve_cntp)
19791 return Op.getOperand(1).getValueType().getVectorElementCount();
19792 switch (IID) {
19793 case Intrinsic::aarch64_sve_cntd:
19794 return ElementCount::getScalable(2);
19795 case Intrinsic::aarch64_sve_cntw:
19796 return ElementCount::getScalable(4);
19797 case Intrinsic::aarch64_sve_cnth:
19798 return ElementCount::getScalable(8);
19799 case Intrinsic::aarch64_sve_cntb:
19800 return ElementCount::getScalable(16);
19801 default:
19802 return std::nullopt;
19803 }
19804}
19805
19806/// Calculates what the pre-extend type is, based on the extension
19807/// operation node provided by \p Extend.
19808///
19809/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
19810/// pre-extend type is pulled directly from the operand, while other extend
19811/// operations need a bit more inspection to get this information.
19812///
19813/// \param Extend The SDNode from the DAG that represents the extend operation
19814///
19815/// \returns The type representing the \p Extend source type, or \p MVT::Other
19816/// if no valid type can be determined
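// For example (illustrative): for (and x, 0xFFFF) the pre-extend type is i16,
// for (AssertZext x, i8) it is i8, and for a plain sign/zero/any_extend it is
// simply the operand's type; anything else yields MVT::Other.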
19818 switch (Extend.getOpcode()) {
19819 case ISD::SIGN_EXTEND:
19820 case ISD::ZERO_EXTEND:
19821 case ISD::ANY_EXTEND:
19822 return Extend.getOperand(0).getValueType();
19823 case ISD::AssertSext:
19824 case ISD::AssertZext:
19826 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
19827 if (!TypeNode)
19828 return MVT::Other;
19829 return TypeNode->getVT();
19830 }
19831 case ISD::AND: {
19834 if (!Constant)
19835 return MVT::Other;
19836
19837 uint32_t Mask = Constant->getZExtValue();
19838
19839 if (Mask == UCHAR_MAX)
19840 return MVT::i8;
19841 else if (Mask == USHRT_MAX)
19842 return MVT::i16;
19843 else if (Mask == UINT_MAX)
19844 return MVT::i32;
19845
19846 return MVT::Other;
19847 }
19848 default:
19849 return MVT::Other;
19850 }
19851}
19852
19853/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
19854/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
19855/// SExt/ZExt rather than the scalar SExt/ZExt
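// For example (illustrative),
//   v4i32 build_vector(zext(a:i16), zext(b:i16), zext(c:i16), zext(d:i16))
// becomes
//   v4i32 zero_extend(v4i16 build_vector(a, b, c, d)),
// so one vector extend replaces the per-lane scalar extends.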
19857 EVT VT = BV.getValueType();
19858 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
19860 return SDValue();
19861
19862 // Use the first item in the buildvector/shuffle to get the size of the
19863 // extend, and make sure it looks valid.
19864 SDValue Extend = BV->getOperand(0);
19865 unsigned ExtendOpcode = Extend.getOpcode();
19866 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
19867 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
19868 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
19869 ExtendOpcode == ISD::AssertSext;
19870 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
19871 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
19872 return SDValue();
19873 // Shuffle inputs are vector, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
19874 // ensure calculatePreExtendType will work without issue.
19875 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
19876 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
19877 return SDValue();
19878
19879 // Restrict valid pre-extend data type
19880 EVT PreExtendType = calculatePreExtendType(Extend);
19881 if (PreExtendType == MVT::Other ||
19882 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
19883 return SDValue();
19884
19885 // Make sure all other operands are equally extended.
19886 bool SeenZExtOrSExt = !IsAnyExt;
19887 for (SDValue Op : drop_begin(BV->ops())) {
19888 if (Op.isUndef())
19889 continue;
19890
19891 if (calculatePreExtendType(Op) != PreExtendType)
19892 return SDValue();
19893
19894 unsigned Opc = Op.getOpcode();
19895 if (Opc == ISD::ANY_EXTEND)
19896 continue;
19897
19898 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
19900
19901 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
19902 return SDValue();
19903
19904 IsSExt = OpcIsSExt;
19905 SeenZExtOrSExt = true;
19906 }
19907
19908 SDValue NBV;
19909 SDLoc DL(BV);
19910 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
19911 EVT PreExtendVT =
19912 VT.changeVectorElementType(*DAG.getContext(), PreExtendType);
19913 EVT PreExtendLegalType =
19914 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
19916 for (SDValue Op : BV->ops())
19917 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
19918 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
19919 PreExtendLegalType));
19920 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
19921 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
19922 EVT PreExtendVT = VT.changeVectorElementType(*DAG.getContext(),
19923 PreExtendType.getScalarType());
19924 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
19925 BV.getOperand(1).isUndef()
19926 ? DAG.getUNDEF(PreExtendVT)
19927 : BV.getOperand(1).getOperand(0),
19928 cast<ShuffleVectorSDNode>(BV)->getMask());
19929 }
19930 unsigned ExtOpc = !SeenZExtOrSExt
19932 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
19933 return DAG.getNode(ExtOpc, DL, VT, NBV);
19934}
19935
19936/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
19937/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
19939 // If the value type isn't a vector, none of the operands are going to be dups
19940 EVT VT = Mul->getValueType(0);
19941 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19942 return SDValue();
19943
19944 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
19945 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
19946
19947 // Neither operand has been changed; don't make any further changes.
19948 if (!Op0 && !Op1)
19949 return SDValue();
19950
19951 SDLoc DL(Mul);
19952 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
19953 Op1 ? Op1 : Mul->getOperand(1));
19954}
19955
19956// Multiplying an RDSVL value by a constant can sometimes be done cheaper by
19957// folding a power-of-two factor of the constant into the RDSVL immediate and
19958// compensating with an extra shift.
19959//
19960// We rewrite:
19961// (mul (srl (rdsvl 1), w), x)
19962// to one of:
19963// (shl (rdsvl y), z) if z > 0
19964// (srl (rdsvl y), abs(z)) if z < 0
19965// where integers y, z satisfy x = y * 2^(w + z) and y ∈ [-32, 31].
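// Worked example (illustrative): with w = 1 and x = 256 we have
// 256 = 16 * 2^(1 + 3), so y = 16 and z = 3, giving
//   (mul (srl (rdsvl 1), 1), 256)  ==>  (shl (rdsvl 16), 3).
// With w = 1 and x = 48 the z = 0 solution 48 = 24 * 2^1 is preferred, giving
// (rdsvl 24) with no extra shift.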
19967 SDLoc DL(Mul);
19968 EVT VT = Mul->getValueType(0);
19969 SDValue MulOp0 = Mul->getOperand(0);
19970 int ConstMultiplier =
19971 cast<ConstantSDNode>(Mul->getOperand(1))->getSExtValue();
19972 if ((MulOp0->getOpcode() != ISD::SRL) ||
19973 (MulOp0->getOperand(0).getOpcode() != AArch64ISD::RDSVL))
19974 return SDValue();
19975
19976 unsigned AbsConstValue = abs(ConstMultiplier);
19977 unsigned OperandShift =
19978 cast<ConstantSDNode>(MulOp0->getOperand(1))->getZExtValue();
19979
19980 // z ≤ ctz(|x|) - w (largest extra shift we can take while keeping y
19981 // integral)
19982 int UpperBound = llvm::countr_zero(AbsConstValue) - OperandShift;
19983
19984 // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need:
19985 // 2^(w + z) ≥ ceil(|x| / B) ⇒ z ≥ ceil_log2(ceil(|x| / B)) - w (LowerBound).
19986 unsigned B = ConstMultiplier < 0 ? 32 : 31;
19987 unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B)
19988 int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - OperandShift;
19989
19990 // No valid solution found.
19991 if (LowerBound > UpperBound)
19992 return SDValue();
19993
19994 // Any value of z in [LowerBound, UpperBound] is valid. Prefer no extra
19995 // shift if possible.
19996 int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound);
19997
19998 // y = x / 2^(w + z)
19999 int32_t RdsvlMul = (AbsConstValue >> (OperandShift + Shift)) *
20000 (ConstMultiplier < 0 ? -1 : 1);
20001 auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
20002 DAG.getSignedConstant(RdsvlMul, DL, MVT::i32));
20003
20004 if (Shift == 0)
20005 return Rdsvl;
20006 return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl,
20007 DAG.getConstant(abs(Shift), DL, MVT::i32),
20009}
20010
20011// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
20012// Same for other types with equivalent constants.
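// Rationale (illustrative): srl(X, 15) followed by and(..., 0x10001) isolates
// the sign bit of each 16-bit half of a v4i32 lane, and multiplying by 0xffff
// smears that bit across the half. Each half is therefore 0xffff exactly when
// it is negative, which is what the setgt(0, X) below computes on the value
// reinterpreted as v8i16 (i.e. CMLT ..., #0).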
20014 EVT VT = N->getValueType(0);
20015 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
20016 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
20017 return SDValue();
20018 if (N->getOperand(0).getOpcode() != ISD::AND ||
20019 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
20020 return SDValue();
20021
20022 SDValue And = N->getOperand(0);
20023 SDValue Srl = And.getOperand(0);
20024
20025 APInt V1, V2, V3;
20026 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
20027 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
20029 return SDValue();
20030
20031 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
20032 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
20033 V3 != (HalfSize - 1))
20034 return SDValue();
20035
20036 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
20037 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
20038 VT.getVectorElementCount() * 2);
20039
20040 SDLoc DL(N);
20041 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
20042 SDValue Zero = DAG.getConstant(0, DL, In.getValueType());
20043 SDValue CM = DAG.getSetCC(DL, HalfVT, Zero, In, ISD::SETGT);
20044 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
20045}
20046
20047// Transform vector add(zext i8 to i32, zext i8 to i32)
20048// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
20049// This allows extra uses of saddl/uaddl at the lower vector widths, and less
20050// extends.
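// For example (illustrative), with a, b : v16i8
//   v16i32 add(zext(a), zext(b))
// becomes
//   v16i32 sext(v16i16 add(zext(a), zext(b))),
// which is safe because the 16-bit sums cannot overflow, and lets the add use
// the narrower uaddl forms with fewer extends overall.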
20052 EVT VT = N->getValueType(0);
20053 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
20054 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
20055 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
20056 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
20057 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
20058 N->getOperand(0).getOperand(0).getValueType() !=
20059 N->getOperand(1).getOperand(0).getValueType())
20060 return SDValue();
20061
20062 if (N->getOpcode() == ISD::MUL &&
20063 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
20064 return SDValue();
20065
20066 SDValue N0 = N->getOperand(0).getOperand(0);
20067 SDValue N1 = N->getOperand(1).getOperand(0);
20068 EVT InVT = N0.getValueType();
20069
20070 EVT S1 = InVT.getScalarType();
20071 EVT S2 = VT.getScalarType();
20072 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
20073 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
20074 SDLoc DL(N);
20075 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
20078 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
20079 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
20080 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
20081 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
20082 : (unsigned)ISD::SIGN_EXTEND,
20083 DL, VT, NewOp);
20084 }
20085 return SDValue();
20086}
20087
20090 const AArch64Subtarget *Subtarget) {
20091
20092 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
20093 return Ext;
20095 return Ext;
20096 if (SDValue Ext = performVectorExtCombine(N, DAG))
20097 return Ext;
20098
20099 if (DCI.isBeforeLegalizeOps())
20100 return SDValue();
20101
20102 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
20103 // and in MachineCombiner pass, add+mul will be combined into madd.
20104 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
20105 SDLoc DL(N);
20106 EVT VT = N->getValueType(0);
20107 SDValue N0 = N->getOperand(0);
20108 SDValue N1 = N->getOperand(1);
20109 SDValue MulOper;
20110 unsigned AddSubOpc;
20111
20112 auto IsAddSubWith1 = [&](SDValue V) -> bool {
20113 AddSubOpc = V->getOpcode();
20114 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
20115 SDValue Opnd = V->getOperand(1);
20116 MulOper = V->getOperand(0);
20117 if (AddSubOpc == ISD::SUB)
20118 std::swap(Opnd, MulOper);
20119 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
20120 return C->isOne();
20121 }
20122 return false;
20123 };
20124
20125 if (IsAddSubWith1(N0)) {
20126 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
20127 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
20128 }
20129
20130 if (IsAddSubWith1(N1)) {
20131 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
20132 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
20133 }
20134
20135 // The below optimizations require a constant RHS.
20136 if (!isa<ConstantSDNode>(N1))
20137 return SDValue();
20138
20139 if (SDValue Ext = performMulRdsvlCombine(N, DAG))
20140 return Ext;
20141
20143 const APInt &ConstValue = C->getAPIntValue();
20144
20145 // Allow the scaling to be folded into the `cnt` instruction by preventing
20146 // the scaling from being obscured here. This makes it easier to pattern match.
20147 if (IsSVECntIntrinsic(N0) ||
20148 (N0->getOpcode() == ISD::TRUNCATE &&
20149 (IsSVECntIntrinsic(N0->getOperand(0)))))
20150 if (ConstValue.sge(1) && ConstValue.sle(16))
20151 return SDValue();
20152
20153 // Multiplication of a power of two plus/minus one can be done more
20154 // cheaply as shift+add/sub. For now, this is true unilaterally. If
20155 // future CPUs have a cheaper MADD instruction, this may need to be
20156 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
20157 // 64-bit is 5 cycles, so this is always a win.
20158 // More aggressively, some multiplications N0 * C can be lowered to
20159 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
20160 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
20161 // TODO: lower more cases.
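// Worked example (illustrative): for (mul x, 6), TrailingZeroes = 1 and
// ShiftedConstValue = 3, so the code below produces
//   (shl (add (shl x, 1), x), 1),
// i.e. a shifted add plus a shift instead of a multiply.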
20162
20163 // TrailingZeroes is used to test if the mul can be lowered to
20164 // shift+add+shift.
20165 unsigned TrailingZeroes = ConstValue.countr_zero();
20166 if (TrailingZeroes) {
20167 // Conservatively do not lower to shift+add+shift if the mul might be
20168 // folded into smul or umul.
20169 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
20170 isZeroExtended(N0, DAG)))
20171 return SDValue();
20172 // Conservatively do not lower to shift+add+shift if the mul might be
20173 // folded into madd or msub.
20174 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
20175 N->user_begin()->getOpcode() == ISD::SUB))
20176 return SDValue();
20177 }
20178 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
20179 // and shift+add+shift.
20180 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
20181 unsigned ShiftAmt;
20182
20183 auto Shl = [&](SDValue N0, unsigned N1) {
20184 if (!N0.getNode())
20185 return SDValue();
20186 // If shift causes overflow, ignore this combine.
20187 if (N1 >= N0.getValueSizeInBits())
20188 return SDValue();
20189 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
20190 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
20191 };
20192 auto Add = [&](SDValue N0, SDValue N1) {
20193 if (!N0.getNode() || !N1.getNode())
20194 return SDValue();
20195 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
20196 };
20197 auto Sub = [&](SDValue N0, SDValue N1) {
20198 if (!N0.getNode() || !N1.getNode())
20199 return SDValue();
20200 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
20201 };
20202 auto Negate = [&](SDValue N) {
20203 if (!N0.getNode())
20204 return SDValue();
20205 SDValue Zero = DAG.getConstant(0, DL, VT);
20206 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
20207 };
20208
20209 // Can the constant C be decomposed into (1 + 2^M1) * (1 + 2^N1), e.g.:
20210 // C = 45 is equal to (1+4)*(1+8). We don't decompose it into (1+2)*(16-1),
20211 // as the (2^N - 1) factor can't be executed with a single instruction.
20212 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
20213 unsigned BitWidth = C.getBitWidth();
20214 for (unsigned i = 1; i < BitWidth / 2; i++) {
20215 APInt Rem;
20216 APInt X(BitWidth, (1 << i) + 1);
20217 APInt::sdivrem(C, X, N, Rem);
20218 APInt NVMinus1 = N - 1;
20219 if (Rem == 0 && NVMinus1.isPowerOf2()) {
20220 M = X;
20221 return true;
20222 }
20223 }
20224 return false;
20225 };
20226
20227 // Can the constant C be decomposed into ((2^M + 1) * 2^N + 1), e.g.:
20228 // C = 11 is equal to (1+4)*2+1. We don't decompose it into (1+2)*4-1,
20229 // as the (2^N - 1) factor can't be executed with a single instruction.
20230 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
20231 APInt CVMinus1 = C - 1;
20232 if (CVMinus1.isNegative())
20233 return false;
20234 unsigned TrailingZeroes = CVMinus1.countr_zero();
20235 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
20236 if (SCVMinus1.isPowerOf2()) {
20237 unsigned BitWidth = SCVMinus1.getBitWidth();
20238 M = APInt(BitWidth, SCVMinus1.logBase2());
20239 N = APInt(BitWidth, TrailingZeroes);
20240 return true;
20241 }
20242 return false;
20243 };
20244
20245 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
20246 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
20247 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
20248 APInt CVMinus1 = C - 1;
20249 if (CVMinus1.isNegative())
20250 return false;
20251 unsigned TrailingZeroes = CVMinus1.countr_zero();
20252 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
20253 if (CVPlus1.isPowerOf2()) {
20254 unsigned BitWidth = CVPlus1.getBitWidth();
20255 M = APInt(BitWidth, CVPlus1.logBase2());
20256 N = APInt(BitWidth, TrailingZeroes);
20257 return true;
20258 }
20259 return false;
20260 };
20261
20262 if (ConstValue.isNonNegative()) {
20263 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
20264 // (mul x, 2^N - 1) => (sub (shl x, N), x)
20265 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
20266 // (mul x, (2^M + 1) * (2^N + 1))
20267 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
20268 // (mul x, (2^M + 1) * 2^N + 1)
20269 // => MV = (add (shl x, M), x); (add (shl MV, N), x)
20270 // (mul x, 1 - (1 - 2^M) * 2^N)
20271 // => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
20272 APInt SCVMinus1 = ShiftedConstValue - 1;
20273 APInt SCVPlus1 = ShiftedConstValue + 1;
20274 APInt CVPlus1 = ConstValue + 1;
20275 APInt CVM, CVN;
20276 if (SCVMinus1.isPowerOf2()) {
20277 ShiftAmt = SCVMinus1.logBase2();
20278 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
20279 } else if (CVPlus1.isPowerOf2()) {
20280 ShiftAmt = CVPlus1.logBase2();
20281 return Sub(Shl(N0, ShiftAmt), N0);
20282 } else if (SCVPlus1.isPowerOf2()) {
20283 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
20284 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
20285 }
20286 if (Subtarget->hasALULSLFast() &&
20287 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
20288 APInt CVMMinus1 = CVM - 1;
20289 APInt CVNMinus1 = CVN - 1;
20290 unsigned ShiftM1 = CVMMinus1.logBase2();
20291 unsigned ShiftN1 = CVNMinus1.logBase2();
20292 // ALULSLFast implies that shifts by up to 4 places are fast.
20293 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
20294 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
20295 return Add(Shl(MVal, ShiftN1), MVal);
20296 }
20297 }
20298 if (Subtarget->hasALULSLFast() &&
20299 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
20300 unsigned ShiftM = CVM.getZExtValue();
20301 unsigned ShiftN = CVN.getZExtValue();
20302 // ALULSLFast implies that shifts by up to 4 places are fast.
20303 if (ShiftM <= 4 && ShiftN <= 4) {
20304 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
20305 return Add(Shl(MVal, CVN.getZExtValue()), N0);
20306 }
20307 }
20308
20309 if (Subtarget->hasALULSLFast() &&
20310 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
20311 unsigned ShiftM = CVM.getZExtValue();
20312 unsigned ShiftN = CVN.getZExtValue();
20313 // ALULSLFast implies that shifts by up to 4 places are fast.
20314 if (ShiftM <= 4 && ShiftN <= 4) {
20315 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
20316 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
20317 }
20318 }
20319 } else {
20320 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
20321 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
20322 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
20323 APInt SCVPlus1 = -ShiftedConstValue + 1;
20324 APInt CVNegPlus1 = -ConstValue + 1;
20325 APInt CVNegMinus1 = -ConstValue - 1;
20326 if (CVNegPlus1.isPowerOf2()) {
20327 ShiftAmt = CVNegPlus1.logBase2();
20328 return Sub(N0, Shl(N0, ShiftAmt));
20329 } else if (CVNegMinus1.isPowerOf2()) {
20330 ShiftAmt = CVNegMinus1.logBase2();
20331 return Negate(Add(Shl(N0, ShiftAmt), N0));
20332 } else if (SCVPlus1.isPowerOf2()) {
20333 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
20334 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
20335 }
20336 }
20337
20338 return SDValue();
20339}
20340
20342 SelectionDAG &DAG) {
20343 // Take advantage of vector comparisons producing 0 or -1 in each lane to
20344 // optimize away the unary operation when its operand is an AND with a constant.
20345 //
20346 // The general transformation is:
20347 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
20348 // AND(VECTOR_CMP(x,y), constant2)
20349 // constant2 = UNARYOP(constant)
20350
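// For example (illustrative), with cmp = setcc(x, y) : v4i32 (each lane 0 or -1),
//   uint_to_fp(and(cmp, splat(1))) : v4f32
// can be rewritten as
//   bitcast(and(cmp, bitcast(uint_to_fp(splat(1))))),
// because every lane is either zero or the converted constant.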
20351 // Early exit if this isn't a vector operation, the operand of the
20352 // unary operation isn't a bitwise AND, or if the sizes of the operations
20353 // aren't the same.
20354 EVT VT = N->getValueType(0);
20355 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
20356 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
20357 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
20358 return SDValue();
20359
20360 // Now check that the other operand of the AND is a constant. We could
20361 // make the transformation for non-constant splats as well, but it's unclear
20362 // that would be a benefit as it would not eliminate any operations, just
20363 // perform one more step in scalar code before moving to the vector unit.
20364 if (BuildVectorSDNode *BV =
20365 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
20366 // Bail out if the vector isn't a constant.
20367 if (!BV->isConstant())
20368 return SDValue();
20369
20370 // Everything checks out. Build up the new and improved node.
20371 SDLoc DL(N);
20372 EVT IntVT = BV->getValueType(0);
20373 // Create a new constant of the appropriate type for the transformed
20374 // DAG.
20375 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
20376 // The AND node needs bitcasts to/from an integer vector type around it.
20377 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
20378 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
20379 N->getOperand(0)->getOperand(0), MaskConst);
20380 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
20381 return Res;
20382 }
20383
20384 return SDValue();
20385}
20386
20387/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
20388 /// functions; this can help reduce the number of fmovs to/from GPRs.
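// For example (illustrative), in a streaming function an i32 -> f32 sitofp can
// be done by inserting the i32 into lane 0 of an nxv4i32, converting that
// vector to nxv4f32 and extracting lane 0, keeping the value in an SVE
// register instead of bouncing through a GPR-to-FPR fmov.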
20389static SDValue
20392 const AArch64Subtarget *Subtarget) {
20393 if (N->isStrictFPOpcode())
20394 return SDValue();
20395
20396 if (DCI.isBeforeLegalizeOps())
20397 return SDValue();
20398
20399 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
20400 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
20401 return SDValue();
20402
20403 auto isSupportedType = [](EVT VT) {
20404 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
20405 };
20406
20407 SDValue SrcVal = N->getOperand(0);
20408 EVT SrcTy = SrcVal.getValueType();
20409 EVT DestTy = N->getValueType(0);
20410
20411 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
20412 return SDValue();
20413
20414 EVT SrcVecTy;
20415 EVT DestVecTy;
20416 if (DestTy.bitsGT(SrcTy)) {
20417 DestVecTy = getPackedSVEVectorVT(DestTy);
20418 SrcVecTy = DestVecTy.changeVectorElementType(*DAG.getContext(), SrcTy);
20419 } else {
20420 SrcVecTy = getPackedSVEVectorVT(SrcTy);
20421 DestVecTy = SrcVecTy.changeVectorElementType(*DAG.getContext(), DestTy);
20422 }
20423
20424 // Ensure the resulting src/dest vector type is legal.
20425 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
20426 return SDValue();
20427
20428 SDLoc DL(N);
20429 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
20430 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
20431 DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
20432 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
20433 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
20434}
20435
20438 const AArch64Subtarget *Subtarget) {
20439 // First try to optimize away the conversion when it's conditionally from
20440 // a constant. Vectors only.
20442 return Res;
20443
20444 if (SDValue Res =
20445 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
20446 return Res;
20447
20448 EVT VT = N->getValueType(0);
20449 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64)
20450 return SDValue();
20451 if (VT == MVT::f16 && !Subtarget->hasFullFP16())
20452 return SDValue();
20453
20454 // Only optimize when the source and destination types have the same width.
20455 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
20456 return SDValue();
20457
20458 // If the result of an integer load is only used by an integer-to-float
20459 // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
20460 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
20461 SDValue N0 = N->getOperand(0);
20462 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
20463 N0.hasOneUse() &&
20464 // Do not change the width of a volatile load.
20465 !cast<LoadSDNode>(N0)->isVolatile()) {
20466 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
20467 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
20468 LN0->getPointerInfo(), LN0->getAlign(),
20469 LN0->getMemOperand()->getFlags());
20470
20471 // Make sure successors of the original load stay after it by updating them
20472 // to use the new Chain.
20473 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
20474
20475 unsigned Opcode =
20476 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
20477 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
20478 }
20479
20480 return SDValue();
20481}
20482
20483/// Fold a floating-point multiply by power of two into floating-point to
20484/// fixed-point conversion.
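// For example (illustrative), with x : v4f32
//   fptosi(fmul x, splat(4.0)) : v4i32
// is the same as converting x with two fractional bits, so it can be emitted
// via @llvm.aarch64.neon.vcvtfp2fxs(x, 2) (typically an fcvtzs ..., #2).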
20487 const AArch64Subtarget *Subtarget) {
20488 if (SDValue Res =
20489 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
20490 return Res;
20491
20492 if (!Subtarget->isNeonAvailable())
20493 return SDValue();
20494
20495 if (!N->getValueType(0).isSimple())
20496 return SDValue();
20497
20498 SDValue Op = N->getOperand(0);
20499 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
20500 return SDValue();
20501
20502 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
20503 return SDValue();
20504
20505 SDValue ConstVec = Op->getOperand(1);
20506 if (!isa<BuildVectorSDNode>(ConstVec))
20507 return SDValue();
20508
20509 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
20510 uint32_t FloatBits = FloatTy.getSizeInBits();
20511 if (FloatBits != 32 && FloatBits != 64 &&
20512 (FloatBits != 16 || !Subtarget->hasFullFP16()))
20513 return SDValue();
20514
20515 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
20516 uint32_t IntBits = IntTy.getSizeInBits();
20517 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
20518 return SDValue();
20519
20520 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
20521 if (IntBits > FloatBits)
20522 return SDValue();
20523
20524 BitVector UndefElements;
20526 int32_t Bits = IntBits == 64 ? 64 : 32;
20527 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
20528 if (C == -1 || C == 0 || C > Bits)
20529 return SDValue();
20530
20531 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
20532 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
20533 return SDValue();
20534
20535 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
20536 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
20537 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
20538 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
20539 return SDValue();
20540 }
20541
20542 SDLoc DL(N);
20543 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
20544 N->getOpcode() == ISD::FP_TO_SINT_SAT);
20545 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
20546 : Intrinsic::aarch64_neon_vcvtfp2fxu;
20547 SDValue FixConv =
20549 DAG.getTargetConstant(IntrinsicOpcode, DL, MVT::i32),
20550 Op->getOperand(0), DAG.getTargetConstant(C, DL, MVT::i32));
20551 // We can handle smaller integers by generating an extra trunc.
20552 if (IntBits < FloatBits)
20553 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
20554
20555 return FixConv;
20556}
20557
20558// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
20559// convert to csel(ccmp(.., cc0)), depending on cc1:
20560
20561// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
20562// =>
20563// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
20564//
20565// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
20566// =>
20567// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
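// For example (illustrative), for (a == 0) && (b == 0)
//   (AND (CSET eq (SUBS a, 0)) (CSET eq (SUBS b, 0)))
// becomes
//   (CSET eq (CCMP b, 0, <NZCV for ne>, eq, (SUBS a, 0))),
// so the second compare only influences the result when the first condition
// held; otherwise the fixed NZCV value makes the final CSET evaluate false.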
20569 EVT VT = N->getValueType(0);
20570 SDValue CSel0 = N->getOperand(0);
20571 SDValue CSel1 = N->getOperand(1);
20572
20573 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
20574 CSel1.getOpcode() != AArch64ISD::CSEL)
20575 return SDValue();
20576
20577 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
20578 return SDValue();
20579
20580 if (!isNullConstant(CSel0.getOperand(0)) ||
20581 !isOneConstant(CSel0.getOperand(1)) ||
20582 !isNullConstant(CSel1.getOperand(0)) ||
20583 !isOneConstant(CSel1.getOperand(1)))
20584 return SDValue();
20585
20586 SDValue Cmp0 = CSel0.getOperand(3);
20587 SDValue Cmp1 = CSel1.getOperand(3);
20590 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
20591 return SDValue();
20592 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
20593 Cmp0.getOpcode() == AArch64ISD::SUBS) {
20594 std::swap(Cmp0, Cmp1);
20595 std::swap(CC0, CC1);
20596 }
20597
20598 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
20599 return SDValue();
20600
20601 SDLoc DL(N);
20602 SDValue CCmp, Condition;
20603 unsigned NZCV;
20604
20605 if (N->getOpcode() == ISD::AND || N->getOpcode() == AArch64ISD::ANDS) {
20607 Condition = getCondCode(DAG, InvCC0);
20609 } else {
20611 Condition = getCondCode(DAG, CC0);
20613 }
20614
20615 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
20616
20617 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
20618 if (Op1 && Op1->getAPIntValue().isNegative() &&
20619 Op1->getAPIntValue().sgt(-32)) {
20620 // CCMP accepts an immediate in the range [0, 31].
20621 // If Op1 is a constant in the range [-31, -1], we
20622 // can select CCMN instead to avoid the extra mov.
20623 SDValue AbsOp1 =
20624 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
20625 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0),
20626 AbsOp1, NZCVOp, Condition, Cmp0);
20627 } else {
20628 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0),
20629 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
20630 }
20631 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
20632 CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
20633}
20634
20636 const AArch64Subtarget *Subtarget,
20637 const AArch64TargetLowering &TLI) {
20638 SelectionDAG &DAG = DCI.DAG;
20639
20640 if (SDValue R = performANDORCSELCombine(N, DAG))
20641 return R;
20642
20643 return SDValue();
20644}
20645
20647 if (!MemVT.getVectorElementType().isSimple())
20648 return false;
20649
20650 uint64_t MaskForTy = 0ull;
20651 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
20652 case MVT::i8:
20653 MaskForTy = 0xffull;
20654 break;
20655 case MVT::i16:
20656 MaskForTy = 0xffffull;
20657 break;
20658 case MVT::i32:
20659 MaskForTy = 0xffffffffull;
20660 break;
20661 default:
20662 return false;
20663 break;
20664 }
20665
20666 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
20667 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
20668 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
20669
20670 return false;
20671}
20672
20674 SDValue LeafOp = SDValue(N, 0);
20675 SDValue Op = N->getOperand(0);
20676 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
20677 LeafOp.getValueType() != Op.getValueType())
20678 Op = Op->getOperand(0);
20679 if (LeafOp.getValueType() == Op.getValueType())
20680 return Op;
20681 return SDValue();
20682}
20683
20686 SelectionDAG &DAG = DCI.DAG;
20687 SDValue Src = N->getOperand(0);
20688 unsigned Opc = Src->getOpcode();
20689
20690 // Zero/any extend of an unsigned unpack
20691 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
20692 SDValue UnpkOp = Src->getOperand(0);
20693 SDValue Dup = N->getOperand(1);
20694
20695 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
20696 return SDValue();
20697
20698 SDLoc DL(N);
20700 if (!C)
20701 return SDValue();
20702
20703 uint64_t ExtVal = C->getZExtValue();
20704
20705 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
20706 return ((ExtVal == 0xFF && VT == MVT::i8) ||
20707 (ExtVal == 0xFFFF && VT == MVT::i16) ||
20708 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
20709 };
20710
20711 // If the mask is fully covered by the unpack, we don't need to push
20712 // a new AND onto the operand
20713 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
20714 if (MaskAndTypeMatch(EltTy))
20715 return Src;
20716
20717 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
20718 // to see if the mask is all-ones of size MemTy.
20719 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
20720 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
20721 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
20722 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
20723 if (MaskAndTypeMatch(EltTy))
20724 return Src;
20725 }
20726
20727 // Truncate to prevent a DUP with an overly wide constant
20728 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
20729
20730 // Otherwise, make sure we propagate the AND to the operand
20731 // of the unpack
20732 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
20733 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
20734
20735 SDValue And = DAG.getNode(ISD::AND, DL,
20736 UnpkOp->getValueType(0), UnpkOp, Dup);
20737
20738 return DAG.getNode(Opc, DL, N->getValueType(0), And);
20739 }
20740
20741 if (DCI.isBeforeLegalizeOps())
20742 return SDValue();
20743
20744 // If either side of the AND is an all-active predicate, the AND is a no-op
20745 // and we can simply return the other operand.
20746 if (isAllActivePredicate(DAG, N->getOperand(0)))
20747 return N->getOperand(1);
20748 if (isAllActivePredicate(DAG, N->getOperand(1)))
20749 return N->getOperand(0);
20750
20752 return SDValue();
20753
20754 SDValue Mask = N->getOperand(1);
20755
20756 if (!Src.hasOneUse())
20757 return SDValue();
20758
20759 EVT MemVT;
20760
20761 // SVE load instructions perform an implicit zero-extend, which makes them
20762 // perfect candidates for combining.
20763 switch (Opc) {
20764 case AArch64ISD::LD1_MERGE_ZERO:
20765 case AArch64ISD::LDNF1_MERGE_ZERO:
20766 case AArch64ISD::LDFF1_MERGE_ZERO:
20767 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
20768 break;
20769 case AArch64ISD::GLD1_MERGE_ZERO:
20770 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
20771 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
20772 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
20773 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
20774 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
20775 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
20776 case AArch64ISD::GLDFF1_MERGE_ZERO:
20777 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
20778 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
20779 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
20780 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
20781 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
20782 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
20783 case AArch64ISD::GLDNT1_MERGE_ZERO:
20784 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
20785 break;
20786 default:
20787 return SDValue();
20788 }
20789
20790 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
20791 return Src;
20792
20793 return SDValue();
20794}
20795
20796// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
20799
20800 // This function performs an optimization on a specific pattern involving
20801 // an AND operation and SETCC (Set Condition Code) node.
20802
20803 SDValue SetCC = N->getOperand(0);
20804 EVT VT = N->getValueType(0);
20805 SelectionDAG &DAG = DCI.DAG;
20806
20807 // If the current node (N) is used by any SELECT instruction, return an empty
20808 // SDValue to avoid applying the optimization, since doing so could produce
20809 // incorrect results.
20810 for (auto U : N->users())
20811 if (U->getOpcode() == ISD::SELECT)
20812 return SDValue();
20813
20814 // Check if the operand is a SETCC node with floating-point comparison
20815 if (SetCC.getOpcode() == ISD::SETCC &&
20816 SetCC.getOperand(0).getValueType() == MVT::f32) {
20817
20818 SDValue Cmp;
20820
20821 // Check if the DAG is after legalization and if we can emit the conjunction
20822 if (!DCI.isBeforeLegalize() &&
20823 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
20824
20826
20827 SDLoc DL(N);
20828 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
20829 DAG.getConstant(0, DL, VT),
20830 getCondCode(DAG, InvertedCC), Cmp);
20831 }
20832 }
20833 return SDValue();
20834}
20835
20838 SelectionDAG &DAG = DCI.DAG;
20839 SDValue LHS = N->getOperand(0);
20840 SDValue RHS = N->getOperand(1);
20841 EVT VT = N->getValueType(0);
20842
20843 if (SDValue R = performANDORCSELCombine(N, DAG))
20844 return R;
20845
20846 if (SDValue R = performANDSETCCCombine(N,DCI))
20847 return R;
20848
20849 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
20850 return SDValue();
20851
20852 if (VT.isScalableVector())
20853 return performSVEAndCombine(N, DCI);
20854
20855 // The combining code below works only for NEON vectors. In particular, it
20856 // does not work for SVE when dealing with vectors wider than 128 bits.
20857 if (!VT.is64BitVector() && !VT.is128BitVector())
20858 return SDValue();
20859
20861 if (!BVN)
20862 return SDValue();
20863
20864 // AND does not accept an immediate, so check if we can use a BIC immediate
20865 // instruction instead. We do this here instead of using a (and x, (mvni imm))
20866 // pattern in isel, because some immediates may be lowered to the preferred
20867 // (and x, (movi imm)) form, even though an mvni representation also exists.
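// For example (illustrative), clearing the low byte of each 32-bit lane:
//   and v0.16b, v0.16b, v1.16b   ; v1 = splat(0xffffff00)
// can instead use the immediate form
//   bic v0.4s, #0xff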
20868 APInt DefBits(VT.getSizeInBits(), 0);
20869 APInt UndefBits(VT.getSizeInBits(), 0);
20870 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
20871 SDValue NewOp;
20872
20873 // Any bits known to already be 0 need not be cleared again, which can help
20874 // reduce the size of the immediate to one supported by the instruction.
20875 KnownBits Known = DAG.computeKnownBits(LHS);
20876 APInt ZeroSplat(VT.getSizeInBits(), 0);
20877 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
20878 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
20879 << (Known.Zero.getBitWidth() * I);
20880
20881 DefBits = ~(DefBits | ZeroSplat);
20882 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20883 DefBits, &LHS)) ||
20884 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20885 DefBits, &LHS)))
20886 return NewOp;
20887
20888 UndefBits = ~(UndefBits | ZeroSplat);
20889 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20890 UndefBits, &LHS)) ||
20891 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20892 UndefBits, &LHS)))
20893 return NewOp;
20894 }
20895
20896 return SDValue();
20897}
20898
20901 SelectionDAG &DAG = DCI.DAG;
20902 SDValue LHS = N->getOperand(0);
20903 SDValue RHS = N->getOperand(1);
20904 EVT VT = N->getValueType(0);
20905 SDLoc DL(N);
20906
20907 if (!N->getFlags().hasAllowReassociation())
20908 return SDValue();
20909
20910 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
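// vcmla(acc, c, d) accumulates a partial complex product onto acc, so
//   a + vcmla(b, c, d) == vcmla(a + b, c, d)
// whenever reassociation is allowed (checked above).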
20911 auto ReassocComplex = [&](SDValue A, SDValue B) {
20912 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
20913 return SDValue();
20914 unsigned Opc = A.getConstantOperandVal(0);
20915 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
20916 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
20917 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
20918 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
20919 return SDValue();
20920 SDValue VCMLA = DAG.getNode(
20921 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
20922 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
20923 A.getOperand(2), A.getOperand(3));
20924 VCMLA->setFlags(A->getFlags());
20925 return VCMLA;
20926 };
20927 if (SDValue R = ReassocComplex(LHS, RHS))
20928 return R;
20929 if (SDValue R = ReassocComplex(RHS, LHS))
20930 return R;
20931
20932 return SDValue();
20933}
20934
20935static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
20936 switch (Opcode) {
20937 case ISD::STRICT_FADD:
20938 case ISD::FADD:
20939 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
20940 case ISD::ADD:
20941 return VT == MVT::i64;
20942 default:
20943 return false;
20944 }
20945}
20946
20947static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20949
20951 if ((N.getOpcode() == ISD::SETCC) ||
20952 // get_active_lane_mask is lowered to a whilelo instruction.
20953 (N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) ||
20954 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
20955 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
20956 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege_x2 ||
20957 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
20958 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt_x2 ||
20959 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
20960 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi_x2 ||
20961 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
20962 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs_x2 ||
20963 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
20964 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele_x2 ||
20965 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
20966 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo_x2 ||
20967 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
20968 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels_x2 ||
20969 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
20970 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt_x2)))
20971 return true;
20972
20973 return false;
20974}
20975
20976// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
20977// ... into: "ptrue p, all" + PTEST
20978static SDValue
20981 const AArch64Subtarget *Subtarget) {
20982 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20983 // Make sure PTEST can be legalised with illegal types.
20984 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20985 return SDValue();
20986
20987 SDValue N0 = N->getOperand(0);
20988 EVT VT = N0.getValueType();
20989
20990 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
20991 !isNullConstant(N->getOperand(1)))
20992 return SDValue();
20993
20994 // Restrict the DAG combine to only cases where we're extracting from a
20995 // flag-setting operation.
20996 if (!isPredicateCCSettingOp(N0) || N0.getResNo() != 0)
20997 return SDValue();
20998
20999 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
21000 SelectionDAG &DAG = DCI.DAG;
21001 SDValue Pg = DAG.getConstant(1, SDLoc(N), VT);
21002 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
21003}
21004
21005// Materialize : Idx = (add (mul vscale, NumEls), -1)
21006// i1 = extract_vector_elt t37, Constant:i64<Idx>
21007// ... into: "ptrue p, all" + PTEST
21008static SDValue
21011 const AArch64Subtarget *Subtarget) {
21012 using namespace llvm::SDPatternMatch;
21013 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
21014 // Make sure PTEST is legal for these types.
21015 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
21016 return SDValue();
21017
21018 SDValue N0 = N->getOperand(0);
21019 EVT OpVT = N0.getValueType();
21020
21021 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
21022 return SDValue();
21023
21024 SDValue Idx = N->getOperand(1);
21025 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
21026 if (!sd_match(Idx, m_ZExtOrSelf(
21027 m_Add(m_VScale(m_SpecificInt(NumEls)), m_AllOnes()))))
21028 return SDValue();
21029
21030 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
21031 SelectionDAG &DAG = DCI.DAG;
21032 SDValue Pg = DAG.getConstant(1, SDLoc(N), OpVT);
21033 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
21034}
21035
21036static SDValue
21038 const AArch64Subtarget *Subtarget) {
21039 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
21040 SelectionDAG &DAG = DCI.DAG;
21041 SDValue Vec = N->getOperand(0);
21042 SDValue Idx = N->getOperand(1);
21043
21045 return SDValue();
21046
21047 // Only legal for 8, 16, 32, and 64 bit element types.
21048 EVT EltVT = Vec.getValueType().getVectorElementType();
21049 if (!is_contained(ArrayRef({MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f16,
21050 MVT::bf16, MVT::f32, MVT::f64}),
21051 EltVT.getSimpleVT().SimpleTy))
21052 return SDValue();
21053
21054 SDValue Mask = Idx.getOperand(0);
21055 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21056 if (!TLI.isOperationLegal(ISD::VECTOR_FIND_LAST_ACTIVE, Mask.getValueType()))
21057 return SDValue();
21058
21059 return DAG.getNode(AArch64ISD::LASTB, SDLoc(N), N->getValueType(0), Mask,
21060 Vec);
21061}
21062
21063static SDValue
21065 const AArch64Subtarget *Subtarget) {
21066 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
21067 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
21068 return Res;
21069 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
21070 return Res;
21071 if (SDValue Res = performExtractLastActiveCombine(N, DCI, Subtarget))
21072 return Res;
21073
21074 SelectionDAG &DAG = DCI.DAG;
21075 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
21076
21077 EVT VT = N->getValueType(0);
21078 const bool FullFP16 = Subtarget->hasFullFP16();
21079 bool IsStrict = N0->isStrictFPOpcode();
21080
21081 // extract(dup x) -> x
21082 if (N0.getOpcode() == AArch64ISD::DUP)
21083 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
21084 : N0.getOperand(0);
21085
21086 // Rewrite for pairwise fadd pattern
21087 // (f32 (extract_vector_elt
21088 // (fadd (vXf32 Other)
21089 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
21090 // ->
21091 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
21092 // (extract_vector_elt (vXf32 Other) 1))
21093 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
21094 // we can only do this when it's used only by the extract_vector_elt.
21095 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
21096 (!IsStrict || N0.hasOneUse())) {
21097 SDLoc DL(N0);
21098 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
21099 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
21100
21102 SDValue Other = N00;
21103
21104 // And handle the commutative case.
21105 if (!Shuffle) {
21106 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
21107 Other = N01;
21108 }
21109
21110 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
21111 Other == Shuffle->getOperand(0)) {
21112 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
21113 DAG.getConstant(0, DL, MVT::i64));
21114 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
21115 DAG.getConstant(1, DL, MVT::i64));
21116 if (!IsStrict)
21117 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
21118
21119 // For strict_fadd we need uses of the final extract_vector to be replaced
21120 // with the strict_fadd, but we also need uses of the chain output of the
21121 // original strict_fadd to use the chain output of the new strict_fadd as
21122 // otherwise it may not be deleted.
21123 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
21124 {VT, MVT::Other},
21125 {N0->getOperand(0), Extract1, Extract2});
21126 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
21127 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
21128 return SDValue(N, 0);
21129 }
21130 }
21131
21132 // Given an extract(load) or extract(extend(load)), produce a scalar load
21133 // instead to avoid the cross-register-bank copies.
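// For example (illustrative), instead of
//   ldr  q0, [x0]
//   umov w0, v0.h[2]
// the element can be loaded directly as
//   ldrh w0, [x0, #4]
// provided every use of the vector value is such an extract.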
21134 if (DCI.isAfterLegalizeDAG() && Subtarget->isLittleEndian() &&
21135 VT.isInteger() && isa<ConstantSDNode>(N1)) {
21136 SDValue LoadN0 = N0;
21137 // Look through sext/zext and extract_subvector / insert_subvector if
21138 // required.
21139 if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
21140 N0.getOpcode() == ISD::SIGN_EXTEND ||
21141 N0.getOpcode() == ISD::ANY_EXTEND) &&
21142 N0.getOperand(0).hasOneUse())
21143 LoadN0 = N0.getOperand(0);
21144 unsigned OffsetElts = 0;
21145 if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
21146 OffsetElts = LoadN0.getConstantOperandVal(1);
21147 LoadN0 = LoadN0.getOperand(0);
21148 }
21149 if (LoadN0.getOpcode() == ISD::INSERT_SUBVECTOR &&
21150 LoadN0.getOperand(0).isUndef() &&
21151 isNullConstant(LoadN0.getOperand(2)) &&
21152 LoadN0.getOperand(1).hasOneUse())
21153 LoadN0 = LoadN0.getOperand(1);
21154
21155 // Check all the uses are valid and can be scalarized. We check that all the
21156 // uses are extracts and those extracts are not re-inserted into an
21157 // operation best treated as a vector register.
21158 auto Load = dyn_cast<LoadSDNode>(LoadN0);
21159 if (Load && Load->isSimple() && ISD::isNormalLoad(Load) &&
21160 Load->getMemoryVT().isByteSized() &&
21161 all_of(N0->uses(), [&](const SDUse &U) {
21162 return U.getResNo() != N0.getResNo() ||
21163 (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21164 !any_of(U.getUser()->uses(), [](const SDUse &U2) {
21165 return U2.getUser()->getOpcode() ==
21166 ISD::INSERT_VECTOR_ELT ||
21167 U2.getUser()->getOpcode() == ISD::BUILD_VECTOR ||
21168 U2.getUser()->getOpcode() == ISD::SCALAR_TO_VECTOR;
21169 }));
21170 })) {
21171
21172 SDLoc DL(Load);
21173
21174 // Generate a new scalar load.
21175 unsigned Offset = (OffsetElts + N->getConstantOperandVal(1)) *
21176 Load->getValueType(0).getScalarSizeInBits() / 8;
21177 SDValue BasePtr = DAG.getObjectPtrOffset(
21178 DL, Load->getBasePtr(), DAG.getConstant(Offset, DL, MVT::i64));
21179 ISD::LoadExtType ExtType =
21183 : ISD::EXTLOAD);
21184 SDValue ScalarLoad =
21185 DAG.getExtLoad(ExtType, DL, VT, Load->getChain(), BasePtr,
21186 Load->getPointerInfo().getWithOffset(Offset),
21187 Load->getValueType(0).getScalarType(),
21188 commonAlignment(Load->getAlign(), Offset),
21189 Load->getMemOperand()->getFlags(), Load->getAAInfo());
21190 DAG.makeEquivalentMemoryOrdering(Load, ScalarLoad);
21191 return ScalarLoad;
21192 }
21193 }
21194
21195 return SDValue();
21196}
21197
21200 SelectionDAG &DAG) {
21201 SDLoc DL(N);
21202 EVT VT = N->getValueType(0);
21203 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
21204 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
21205
21206 // For unpacked types:
21207 // concat(zip1(a, b), zip2(a, b)) => trn1(a, b)
21208 if (DCI.isAfterLegalizeDAG() && isUnpackedType(N0.getValueType(), DAG) &&
21209 N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
21210 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
21211 N0.getOperand(1) == N1.getOperand(1)) {
21212 // If the type is unpacked, then each element is separated by a gap at least
21213 // as big as the element size. It is therefore safe to re-interpret the
21214 // inputs with double the elements and ignore odd elements (hence TRN1).
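// For example (a sketch with nxv2i32): with a = [a0 a1 ...] and b = [b0 b1 ...],
// concat(zip1(a, b), zip2(a, b)) is [a0 b0 a1 b1 ...]. Reinterpreted as
// nxv4i32, a is [a0 x a1 x ...] and b is [b0 x b1 x ...] (x = unused half),
// and trn1 of the two picks the even lanes of each in turn, giving the same
// [a0 b0 a1 b1 ...].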
21215 SDValue Op0MoreElems =
21216 DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, N0.getOperand(0));
21217 SDValue Op1MoreElems =
21218 DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, N0.getOperand(1));
21219 return DAG.getNode(AArch64ISD::TRN1, DL, VT, Op0MoreElems, Op1MoreElems);
21220 }
21221
21222 if (VT.isScalableVector())
21223 return SDValue();
21224
21225 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
21226 N1Opc == ISD::TRUNCATE) {
21227 SDValue N00 = N0->getOperand(0);
21228 SDValue N10 = N1->getOperand(0);
21229 EVT N00VT = N00.getValueType();
21230 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
21231
21232 // Optimize concat_vectors of truncated vectors, where the intermediate
21233 // type is illegal, to avoid said illegality, e.g.,
21234 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
21235 // (v2i16 (truncate (v2i64)))))
21236 // ->
21237 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
21238 // (v4i32 (bitcast (v2i64))),
21239 // <0, 2, 4, 6>)))
21240 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
21241 // on both input and result type, so we might generate worse code.
21242 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
21243 if (N00VT == N10.getValueType() &&
21244 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
21245 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
21246 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
21248 for (size_t i = 0; i < Mask.size(); ++i)
21249 Mask[i] = i * 2;
21250 return DAG.getNode(ISD::TRUNCATE, DL, VT,
21251 DAG.getVectorShuffle(
21252 MidVT, DL,
21253 DAG.getNode(ISD::BITCAST, DL, MidVT, N00),
21254 DAG.getNode(ISD::BITCAST, DL, MidVT, N10), Mask));
21255 }
21256
21257 // Optimize two shifts by a large amount plus a combine (uzp1) into a single
21258 // combine (uzp2) and a smaller shift. For AArch64, sequences like the following:
21259 //
21260 // ushr v0.4s, v0.4s, #20
21261 // ushr v1.4s, v1.4s, #20
21262 // uzp1 v0.8h, v0.8h, v1.8h
21263 //
21264 // Can be optimized to:
21265 //
21266 // uzp2 v0.8h, v0.8h, v1.8h
21267 // ushr v0.8h, v0.8h, #4
21268 //
21269 // This optimization reduces instruction count.
21270 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
21271 N00->getOperand(1) == N10->getOperand(1)) {
21272 SDValue N000 = N00->getOperand(0);
21273 SDValue N100 = N10->getOperand(0);
21274 uint64_t N001ConstVal = N00->getConstantOperandVal(1),
21275 N101ConstVal = N10->getConstantOperandVal(1),
21276 NScalarSize = N->getValueType(0).getScalarSizeInBits();
21277
21278 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
21279 N000 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N000);
21280 N100 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N100);
21281 SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, DL, VT, N000, N100);
21282 SDValue NewShiftConstant =
21283 DAG.getTargetConstant(N001ConstVal - NScalarSize, DL, MVT::i32);
21284
21285 return DAG.getNode(AArch64ISD::VLSHR, DL, VT, Uzp, NewShiftConstant);
21286 }
21287 }
21288 }
21289
21290 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
21291 N->getOperand(0).getValueType() == MVT::v2i16 ||
21292 N->getOperand(0).getValueType() == MVT::v2i8) {
21293 EVT SrcVT = N->getOperand(0).getValueType();
21294 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
21295 // loads to prevent having to go through the v4i8 load legalization that
21296 // needs to extend each element into a larger type.
21297 if (N->getNumOperands() % 2 == 0 &&
21298 all_of(N->op_values(), [SrcVT](SDValue V) {
21299 if (V.getValueType() != SrcVT)
21300 return false;
21301 if (V.isUndef())
21302 return true;
21303 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
21304 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
21305 LD->getExtensionType() == ISD::NON_EXTLOAD;
21306 })) {
21307 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
21308 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
21310
21311 for (unsigned i = 0; i < N->getNumOperands(); i++) {
21312 SDValue V = N->getOperand(i);
21313 if (V.isUndef())
21314 Ops.push_back(DAG.getUNDEF(FVT));
21315 else {
21317 SDValue NewLoad = DAG.getLoad(FVT, DL, LD->getChain(),
21318 LD->getBasePtr(), LD->getMemOperand());
21319 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
21320 Ops.push_back(NewLoad);
21321 }
21322 }
21323 return DAG.getBitcast(N->getValueType(0),
21324 DAG.getBuildVector(NVT, DL, Ops));
21325 }
21326 }
21327
21328 // Canonicalise concat_vectors to replace concatenations of truncated nots
21329 // with nots of concatenated truncates. This in some cases allows for multiple
21330 // redundant negations to be eliminated.
21331 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
21332 // (v4i16 (truncate (not (v4i32)))))
21333 // ->
21334 // (not (concat_vectors (v4i16 (truncate (v4i32))),
21335 // (v4i16 (truncate (v4i32)))))
21336 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
21337 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
21338 N->isOnlyUserOf(N1.getNode())) {
21339 auto isBitwiseVectorNegate = [](SDValue V) {
21340 return V->getOpcode() == ISD::XOR &&
21341 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
21342 };
21343 SDValue N00 = N0->getOperand(0);
21344 SDValue N10 = N1->getOperand(0);
21345 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
21346 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
21347 return DAG.getNOT(
21348 DL,
21351 N00->getOperand(0)),
21353 N10->getOperand(0))),
21354 VT);
21355 }
21356 }
21357
21358 // Wait till after everything is legalized to try this. That way we have
21359 // legal vector types and such.
21360 if (DCI.isBeforeLegalizeOps())
21361 return SDValue();
21362
21363 // Optimise concat_vectors of two identical binops with a 128-bit destination
21364 // size, combining into a binop of two concats of the source vectors, e.g.:
21365 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
21366 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
21367 (DAG.getTargetLoweringInfo().isBinOp(N0Opc) ||
21368 isVectorizedBinOp(N0Opc)) &&
21369 N0->hasOneUse() && N1->hasOneUse()) {
21370 SDValue N00 = N0->getOperand(0);
21371 SDValue N01 = N0->getOperand(1);
21372 SDValue N10 = N1->getOperand(0);
21373 SDValue N11 = N1->getOperand(1);
21374
21375 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
21376 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N00, N10);
21377 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N01, N11);
21378 return DAG.getNode(N0Opc, DL, VT, Concat0, Concat1);
21379 }
21380 }
21381
21382 auto IsRSHRN = [](SDValue Shr) {
21383 if (Shr.getOpcode() != AArch64ISD::VLSHR)
21384 return false;
21385 SDValue Op = Shr.getOperand(0);
21386 EVT VT = Op.getValueType();
21387 unsigned ShtAmt = Shr.getConstantOperandVal(1);
21388 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
21389 return false;
21390
21391 APInt Imm;
21392 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
21393 Imm = APInt(VT.getScalarSizeInBits(),
21394 Op.getOperand(1).getConstantOperandVal(0)
21395 << Op.getOperand(1).getConstantOperandVal(1));
21396 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
21397 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
21398 Imm = APInt(VT.getScalarSizeInBits(),
21399 Op.getOperand(1).getConstantOperandVal(0));
21400 else
21401 return false;
21402
21403 if (Imm != 1ULL << (ShtAmt - 1))
21404 return false;
21405 return true;
21406 };
21407
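// IsRSHRN above recognises the open-coded form of a rounding shift right:
// (x + (1 << (sh - 1))) >> sh, where the explicit add supplies the rounding
// bit that a rounding shift applies implicitly.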
21408 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
21409 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
21410 ((IsRSHRN(N1) &&
21412 N1.isUndef())) {
21413 SDValue X = N0.getOperand(0).getOperand(0);
21414 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
21415 : N1.getOperand(0).getOperand(0);
21416 EVT BVT =
21417 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
21418 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, DL, BVT, X, Y);
21419 SDValue Add = DAG.getNode(
21420 ISD::ADD, DL, BVT, CC,
21421 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), DL, BVT));
21422 SDValue Shr =
21423 DAG.getNode(AArch64ISD::VLSHR, DL, BVT, Add, N0.getOperand(1));
21424 return Shr;
21425 }
21426
21427 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
21428 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
21429 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
21430 N0.getOperand(1) == N1.getOperand(1)) {
21431 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
21432 DAG.getUNDEF(N0.getValueType()));
21433 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(1),
21434 DAG.getUNDEF(N0.getValueType()));
21435 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, E0, E1);
21436 }
21437
21438 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
21439 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
21440 // canonicalise to that.
21441 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
21442 assert(VT.getScalarSizeInBits() == 64);
21443 return DAG.getNode(AArch64ISD::DUPLANE64, DL, VT, WidenVector(N0, DAG),
21444 DAG.getConstant(0, DL, MVT::i64));
21445 }
21446
21447 // Canonicalise concat_vectors so that the right-hand vector has as few
21448 // bit-casts as possible before its real operation. The primary matching
21449 // destination for these operations will be the narrowing "2" instructions,
21450 // which depend on the operation being performed on this right-hand vector.
21451 // For example,
21452 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
21453 // becomes
21454 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
21455
21456 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
21457 return SDValue();
21458 SDValue RHS = N1->getOperand(0);
21459 MVT RHSTy = RHS.getValueType().getSimpleVT();
21460 // If the RHS is not a vector, this is not the pattern we're looking for.
21461 if (!RHSTy.isVector())
21462 return SDValue();
21463
21464 LLVM_DEBUG(
21465 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
21466
21467 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
21468 RHSTy.getVectorNumElements() * 2);
21469 return DAG.getNode(ISD::BITCAST, DL, VT,
21470 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatTy,
21471 DAG.getNode(ISD::BITCAST, DL, RHSTy, N0),
21472 RHS));
21473}
21474
21475static SDValue
21477 SelectionDAG &DAG) {
21478 if (DCI.isBeforeLegalizeOps())
21479 return SDValue();
21480
21481 EVT VT = N->getValueType(0);
21482 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
21483 return SDValue();
21484
21485 SDValue V = N->getOperand(0);
21486
21487 // NOTE: This combine exists in DAGCombiner, but that version's legality check
21488 // blocks this combine because the non-const case requires custom lowering.
21489 //
21490 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
21491 if (V.getOpcode() == ISD::SPLAT_VECTOR)
21492 if (isa<ConstantSDNode>(V.getOperand(0)))
21493 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
21494
21495 return SDValue();
21496}
21497
21498static SDValue
21500 SelectionDAG &DAG) {
21501 SDLoc DL(N);
21502 SDValue Vec = N->getOperand(0);
21503 SDValue SubVec = N->getOperand(1);
21504 uint64_t IdxVal = N->getConstantOperandVal(2);
21505 EVT VecVT = Vec.getValueType();
21506 EVT SubVT = SubVec.getValueType();
21507
21508 // Promote fixed length vector zeros.
21509 if (VecVT.isScalableVector() && SubVT.isFixedLengthVector() &&
21510 Vec.isUndef() && isZerosVector(SubVec.getNode()))
21511 return VecVT.isInteger() ? DAG.getConstant(0, DL, VecVT)
21512 : DAG.getConstantFP(0, DL, VecVT);
21513
21514 // Only do this for legal fixed vector types.
21515 if (!VecVT.isFixedLengthVector() ||
21516 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
21517 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
21518 return SDValue();
21519
21520 // Ignore widening patterns.
21521 if (IdxVal == 0 && Vec.isUndef())
21522 return SDValue();
21523
21524 // Subvector must be half the width and an "aligned" insertion.
21525 unsigned NumSubElts = SubVT.getVectorNumElements();
21526 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
21527 (IdxVal != 0 && IdxVal != NumSubElts))
21528 return SDValue();
21529
21530 // Fold insert_subvector -> concat_vectors
21531 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
21532 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
21533 SDValue Lo, Hi;
21534 if (IdxVal == 0) {
21535 Lo = SubVec;
21536 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
21537 DAG.getVectorIdxConstant(NumSubElts, DL));
21538 } else {
21539 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
21540 DAG.getVectorIdxConstant(0, DL));
21541 Hi = SubVec;
21542 }
21543 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
21544}
21545
21548 SelectionDAG &DAG) {
21549 // Wait until after everything is legalized to try this. That way we have
21550 // legal vector types and such.
21551 if (DCI.isBeforeLegalizeOps())
21552 return SDValue();
21553 // Transform a scalar conversion of a value from a lane extract into a
21554 // lane extract of a vector conversion. E.g., from foo1 to foo2:
21555 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
21556 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
21557 //
21558 // The second form interacts better with instruction selection and the
21559 // register allocator to avoid cross-class register copies that aren't
21560 // coalescable due to a lane reference.
21561
21562 // Check the operand and see if it originates from a lane extract.
21563 SDValue Op1 = N->getOperand(1);
21565 return SDValue();
21566
21567 // Yep, no additional predication needed. Perform the transform.
21568 SDValue IID = N->getOperand(0);
21569 SDValue Shift = N->getOperand(2);
21570 SDValue Vec = Op1.getOperand(0);
21571 SDValue Lane = Op1.getOperand(1);
21572 EVT ResTy = N->getValueType(0);
21573 EVT VecResTy;
21574 SDLoc DL(N);
21575
21576 // The vector width should be 128 bits by the time we get here, even
21577 // if it started as 64 bits (the extract_vector handling will have
21578 // done so). Bail if it is not.
21579 if (Vec.getValueSizeInBits() != 128)
21580 return SDValue();
21581
21582 if (Vec.getValueType() == MVT::v4i32)
21583 VecResTy = MVT::v4f32;
21584 else if (Vec.getValueType() == MVT::v2i64)
21585 VecResTy = MVT::v2f64;
21586 else
21587 return SDValue();
21588
21589 SDValue Convert =
21590 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
21591 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
21592}
21593
21594// AArch64 high-vector "long" operations are formed by performing the non-high
21595// version on an extract_subvector of each operand which gets the high half:
21596//
21597// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
21598//
21599// However, there are cases which don't have an extract_high explicitly, but
21600// have another operation that can be made compatible with one for free. For
21601// example:
21602//
21603// (dupv64 scalar) --> (extract_high (dup128 scalar))
21604//
21605// This routine does the actual conversion of such DUPs, once outer routines
21606// have determined that everything else is in order.
21607// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
21608// similarly here.
21610 MVT VT = N.getSimpleValueType();
21611 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
21612 N.getConstantOperandVal(1) == 0)
21613 N = N.getOperand(0);
21614
21615 switch (N.getOpcode()) {
21616 case AArch64ISD::DUP:
21617 case AArch64ISD::DUPLANE8:
21618 case AArch64ISD::DUPLANE16:
21619 case AArch64ISD::DUPLANE32:
21620 case AArch64ISD::DUPLANE64:
21621 case AArch64ISD::MOVI:
21622 case AArch64ISD::MOVIshift:
21623 case AArch64ISD::MOVIedit:
21624 case AArch64ISD::MOVImsl:
21625 case AArch64ISD::MVNIshift:
21626 case AArch64ISD::MVNImsl:
21627 break;
21628 default:
21629 // FMOV could be supported, but isn't very useful, as it would only occur
21630 // if you passed a bitcasted floating point immediate to an eligible long
21631 // integer op (addl, smull, ...).
21632 return SDValue();
21633 }
21634
21635 if (!VT.is64BitVector())
21636 return SDValue();
21637
21638 SDLoc DL(N);
21639 unsigned NumElems = VT.getVectorNumElements();
21640 if (N.getValueType().is64BitVector()) {
21641 MVT ElementTy = VT.getVectorElementType();
21642 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
21643 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
21644 }
21645
21646 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
21647 DAG.getConstant(NumElems, DL, MVT::i64));
21648}
21649
21651 if (N.getOpcode() == ISD::BITCAST)
21652 N = N.getOperand(0);
21653 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21654 return false;
21655 if (N.getOperand(0).getValueType().isScalableVector())
21656 return false;
21657 return N.getConstantOperandAPInt(1) ==
21658 N.getOperand(0).getValueType().getVectorNumElements() / 2;
21659}
21660
21661/// Helper structure to keep track of ISD::SET_CC operands.
21667
21668/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
21673
21674/// Helper structure to keep track of SetCC information.
21679
21680/// Helper structure to be able to read SetCC information. If the IsAArch64
21681/// field is set to true, Info is an AArch64SetCCInfo, otherwise Info is a
21682/// GenericSetCCInfo.
21687
21688/// Check whether or not \p Op is a SET_CC operation, either a generic or
21689/// an
21690/// AArch64 lowered one.
21691/// \p SetCCInfo is filled accordingly.
21692/// \post SetCCInfo is meaningful only when this function returns true.
21693/// \return True when Op is a kind of SET_CC operation.
21695 // If this is a setcc, this is straightforward.
21696 if (Op.getOpcode() == ISD::SETCC) {
21697 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
21698 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
21699 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
21700 SetCCInfo.IsAArch64 = false;
21701 return true;
21702 }
21703 // Otherwise, check if this is a matching csel instruction.
21704 // In other words:
21705 // - csel 1, 0, cc
21706 // - csel 0, 1, !cc
21707 if (Op.getOpcode() != AArch64ISD::CSEL)
21708 return false;
21709 // Set the information about the operands.
21710 // TODO: we want the operands of the Cmp not the csel
21711 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
21712 SetCCInfo.IsAArch64 = true;
21713 SetCCInfo.Info.AArch64.CC =
21714 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
21715
21716 // Check that the operands match the constraints:
21717 // (1) Both operands must be constants.
21718 // (2) One must be 1 and the other must be 0.
21719 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
21720 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
21721
21722 // Check (1).
21723 if (!TValue || !FValue)
21724 return false;
21725
21726 // Check (2).
21727 if (!TValue->isOne()) {
21728 // Update the comparison when we are interested in !cc.
21729 std::swap(TValue, FValue);
21730 SetCCInfo.Info.AArch64.CC =
21732 }
21733 return TValue->isOne() && FValue->isZero();
21734}
21735
21736// Returns true if Op is setcc or zext of setcc.
21738 if (isSetCC(Op, Info))
21739 return true;
21740 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
21741 isSetCC(Op->getOperand(0), Info));
21742}
21743
21744// The folding we want to perform is:
21745// (add x, [zext] (setcc cc ...) )
21746// -->
21747// (csel x, (add x, 1), !cc ...)
21748//
21749// The latter will get matched to a CSINC instruction.
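// For example (illustrative), for "x + (a == 0)":
//   cmp  x1, #0
//   cset w8, eq
//   add  w0, w0, w8
// becomes
//   cmp   x1, #0
//   csinc w0, w0, w0, ne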
21751 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
21752 SDValue LHS = Op->getOperand(0);
21753 SDValue RHS = Op->getOperand(1);
21754 SetCCInfoAndKind InfoAndKind;
21755
21756 // If both operands are a SET_CC, then we don't want to perform this
21757 // folding and create another csel as this results in more instructions
21758 // (and higher register usage).
21759 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
21760 isSetCCOrZExtSetCC(RHS, InfoAndKind))
21761 return SDValue();
21762
21763 // If neither operand is a SET_CC, give up.
21764 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
21765 std::swap(LHS, RHS);
21766 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
21767 return SDValue();
21768 }
21769
21770 // FIXME: This could be generalized to work for FP comparisons.
21771 EVT CmpVT = InfoAndKind.IsAArch64
21772 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
21773 : InfoAndKind.Info.Generic.Opnd0->getValueType();
21774 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
21775 return SDValue();
21776
21777 SDValue CCVal;
21778 SDValue Cmp;
21779 SDLoc DL(Op);
21780 if (InfoAndKind.IsAArch64) {
21781 CCVal = DAG.getConstant(
21783 MVT::i32);
21784 Cmp = *InfoAndKind.Info.AArch64.Cmp;
21785 } else
21786 Cmp = getAArch64Cmp(
21787 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
21788 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
21789 DL);
21790
21791 EVT VT = Op->getValueType(0);
21792 LHS = DAG.getNode(ISD::ADD, DL, VT, RHS, DAG.getConstant(1, DL, VT));
21793 return DAG.getNode(AArch64ISD::CSEL, DL, VT, RHS, LHS, CCVal, Cmp);
21794}
21795
21796// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
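// Doing the vector add first means only one horizontal reduction is needed,
// e.g. (sketch) "add v0.4s, v0.4s, v1.4s" followed by "addv s0, v0.4s" instead
// of two reductions plus a scalar add and the extra cross-register-file moves.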
21798 EVT VT = N->getValueType(0);
21799 // Only handle scalar integer result types here; the UADDV inputs are vectors.
21800 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
21801 return SDValue();
21802
21803 SDValue LHS = N->getOperand(0);
21804 SDValue RHS = N->getOperand(1);
21805 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21806 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
21807 return SDValue();
21808
21809 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
21810 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
21811 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
21812 return SDValue();
21813
21814 SDValue Op1 = LHS->getOperand(0);
21815 SDValue Op2 = RHS->getOperand(0);
21816 EVT OpVT1 = Op1.getValueType();
21817 EVT OpVT2 = Op2.getValueType();
21818 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
21819 Op2.getOpcode() != AArch64ISD::UADDV ||
21820 OpVT1.getVectorElementType() != VT)
21821 return SDValue();
21822
21823 SDValue Val1 = Op1.getOperand(0);
21824 SDValue Val2 = Op2.getOperand(0);
21825 EVT ValVT = Val1->getValueType(0);
21826 SDLoc DL(N);
21827 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
21828 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
21829 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
21830 DAG.getConstant(0, DL, MVT::i64));
21831}
21832
21833/// Perform the scalar expression combine in the form of:
21834/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
21835/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
21837 EVT VT = N->getValueType(0);
21838 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
21839 return SDValue();
21840
21841 SDValue LHS = N->getOperand(0);
21842 SDValue RHS = N->getOperand(1);
21843
21844 // Handle commutativity.
21845 if (LHS.getOpcode() != AArch64ISD::CSEL &&
21846 LHS.getOpcode() != AArch64ISD::CSNEG) {
21847 std::swap(LHS, RHS);
21848 if (LHS.getOpcode() != AArch64ISD::CSEL &&
21849 LHS.getOpcode() != AArch64ISD::CSNEG) {
21850 return SDValue();
21851 }
21852 }
21853
21854 if (!LHS.hasOneUse())
21855 return SDValue();
21856
21858 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
21859
21860 // The CSEL should have a constant one operand, and the CSNEG should have a
21861 // one or negative-one operand.
21862 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
21863 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
21864 if (!CTVal || !CFVal)
21865 return SDValue();
21866
21867 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
21868 (CTVal->isOne() || CFVal->isOne())) &&
21869 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
21870 (CTVal->isOne() || CFVal->isAllOnes())))
21871 return SDValue();
21872
21873 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
21874 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
21875 !CFVal->isOne()) {
21876 std::swap(CTVal, CFVal);
21878 }
21879
21880 SDLoc DL(N);
21881 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
21882 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
21883 !CFVal->isAllOnes()) {
21884 APInt C = -1 * CFVal->getAPIntValue();
21885 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
21886 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
21888 }
21889
21890 // It might be neutral for larger constants, as the immediate needs to be
21891 // materialized in a register.
21892 APInt ADDC = CTVal->getAPIntValue();
21893 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21894 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
21895 return SDValue();
21896
21897 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
21898 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
21899 "Unexpected constant value");
21900
21901 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
21902 SDValue CCVal = getCondCode(DAG, AArch64CC);
21903 SDValue Cmp = LHS.getOperand(3);
21904
21905 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
21906}
21907
21908// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
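// UDOT/SDOT accumulate into their first operand, so when that accumulator is
// known to be zero a separate ADD of the result can be folded in as the
// accumulator instead, e.g. (sketch):
//   udot v0.4s, v1.16b, v2.16b   ; v0 is zero
//   add  v3.4s, v3.4s, v0.4s
// becomes
//   udot v3.4s, v1.16b, v2.16b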
21910 EVT VT = N->getValueType(0);
21911 if (N->getOpcode() != ISD::ADD)
21912 return SDValue();
21913
21914 SDValue Dot = N->getOperand(0);
21915 SDValue A = N->getOperand(1);
21916 // Handle commutativity
21917 auto isZeroDot = [](SDValue Dot) {
21918 return (Dot.getOpcode() == AArch64ISD::UDOT ||
21919 Dot.getOpcode() == AArch64ISD::SDOT ||
21920 Dot.getOpcode() == AArch64ISD::USDOT) &&
21922 };
21923 if (!isZeroDot(Dot))
21924 std::swap(Dot, A);
21925 if (!isZeroDot(Dot))
21926 return SDValue();
21927
21928 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
21929 Dot.getOperand(2));
21930}
21931
21933 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
21934}
21935
21936// Try to fold
21937//
21938// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
21939//
21940// The folding helps csel to be matched with csneg without generating a
21941// redundant neg instruction; this includes the negation of the csel
21942// expansion of an abs node lowered by lowerABS.
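// For example (sketch), "-abs(x)": lowerABS expands abs(x) to a csel of x and
// -x, and folding the outer negation into the select leaves csel(-x, x, cc),
// which the existing patterns match as a single csneg rather than csel + neg.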
21944 if (!isNegatedInteger(SDValue(N, 0)))
21945 return SDValue();
21946
21947 SDValue CSel = N->getOperand(1);
21948 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
21949 return SDValue();
21950
21951 SDValue N0 = CSel.getOperand(0);
21952 SDValue N1 = CSel.getOperand(1);
21953
21954 // If neither of them is a negation, the fold isn't worthwhile, as it would
21955 // introduce two additional negations while removing only one.
21956 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
21957 return SDValue();
21958
21959 SDLoc DL(N);
21960 EVT VT = CSel.getValueType();
21961
21962 SDValue N0N = DAG.getNegative(N0, DL, VT);
21963 SDValue N1N = DAG.getNegative(N1, DL, VT);
21964
21965 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
21966 CSel.getOperand(3));
21967}
21968
21969// The basic add/sub long vector instructions have variants with "2" on the end
21970// which act on the high-half of their inputs. They are normally matched by
21971// patterns like:
21972//
21973// (add (zeroext (extract_high LHS)),
21974// (zeroext (extract_high RHS)))
21975// -> uaddl2 vD, vN, vM
21976//
21977// However, if one of the extracts is something like a duplicate, this
21978// instruction can still be used profitably. This function puts the DAG into a
21979// more appropriate form for those patterns to trigger.
21982 SelectionDAG &DAG = DCI.DAG;
21983 if (DCI.isBeforeLegalizeOps())
21984 return SDValue();
21985
21986 MVT VT = N->getSimpleValueType(0);
21987 if (!VT.is128BitVector()) {
21988 if (N->getOpcode() == ISD::ADD)
21989 return performSetccAddFolding(N, DAG);
21990 return SDValue();
21991 }
21992
21993 // Make sure both branches are extended in the same way.
21994 SDValue LHS = N->getOperand(0);
21995 SDValue RHS = N->getOperand(1);
21996 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
21997 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
21998 LHS.getOpcode() != RHS.getOpcode())
21999 return SDValue();
22000
22001 unsigned ExtType = LHS.getOpcode();
22002
22003 // It's only worth doing this if at least one of the inputs is already an
22004 // extract, but we don't know which it'll be, so we have to try both.
22005 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
22006 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
22007 if (!RHS.getNode())
22008 return SDValue();
22009
22010 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
22011 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
22012 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
22013 if (!LHS.getNode())
22014 return SDValue();
22015
22016 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
22017 }
22018
22019 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
22020}
22021
22022static bool isCMP(SDValue Op) {
22023 return Op.getOpcode() == AArch64ISD::SUBS &&
22024 !Op.getNode()->hasAnyUseOfValue(0);
22025}
22026
22027// (CSEL 1 0 CC Cond) => CC
22028// (CSEL 0 1 CC Cond) => !CC
22029static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
22030 if (Op.getOpcode() != AArch64ISD::CSEL)
22031 return std::nullopt;
22032 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
22033 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
22034 return std::nullopt;
22035 SDValue OpLHS = Op.getOperand(0);
22036 SDValue OpRHS = Op.getOperand(1);
22037 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
22038 return CC;
22039 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
22040 return getInvertedCondCode(CC);
22041
22042 return std::nullopt;
22043}
22044
22045// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
22046// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
22047static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
22048 SDValue CmpOp = Op->getOperand(2);
22049 if (!isCMP(CmpOp))
22050 return SDValue();
22051
22052 if (IsAdd) {
22053 if (!isOneConstant(CmpOp.getOperand(1)))
22054 return SDValue();
22055 } else {
22056 if (!isNullConstant(CmpOp.getOperand(0)))
22057 return SDValue();
22058 }
22059
22060 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
22061 auto CC = getCSETCondCode(CsetOp);
22062 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
22063 return SDValue();
22064
22065 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
22066 Op->getOperand(0), Op->getOperand(1),
22067 CsetOp.getOperand(3));
22068}
22069
22070// (ADC x 0 cond) => (CINC x HS cond)
22072 SDValue LHS = N->getOperand(0);
22073 SDValue RHS = N->getOperand(1);
22074 SDValue Cond = N->getOperand(2);
22075
22076 if (!isNullConstant(RHS))
22077 return SDValue();
22078
22079 EVT VT = N->getValueType(0);
22080 SDLoc DL(N);
22081
22082 // (CINC x cc cond) <=> (CSINC x x !cc cond)
22084 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
22085}
22086
22089 SelectionDAG &DAG) {
22090 SDLoc DL(N);
22091 EVT VT = N->getValueType(0);
22092
22094 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
22095 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
22096 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
22097 if (Elt0->getOpcode() == ISD::FP_ROUND &&
22098 Elt1->getOpcode() == ISD::FP_ROUND &&
22099 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
22100 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
22101 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
22103 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22104 // Constant index.
22106 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
22107 Elt0->getOperand(0)->getOperand(0) ==
22108 Elt1->getOperand(0)->getOperand(0) &&
22109 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
22110 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
22111 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
22112 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
22113 SDValue HighLanes;
22114 if (Elt2->isUndef() && Elt3->isUndef()) {
22115 HighLanes = DAG.getPOISON(MVT::v2f32);
22116 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
22117 Elt3->getOpcode() == ISD::FP_ROUND &&
22118 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
22119 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
22120 Elt2->getConstantOperandVal(1) ==
22121 Elt3->getConstantOperandVal(1) &&
22122 Elt2->getOperand(0)->getOpcode() ==
22124 Elt3->getOperand(0)->getOpcode() ==
22126 // Constant index.
22127 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
22128 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
22129 Elt2->getOperand(0)->getOperand(0) ==
22130 Elt3->getOperand(0)->getOperand(0) &&
22131 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
22132 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
22133 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
22134 HighLanes =
22135 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
22136 }
22137 if (HighLanes) {
22138 SDValue DoubleToSingleSticky =
22139 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
22140 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
22141 DoubleToSingleSticky, HighLanes);
22142 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
22143 Elt0->getOperand(1));
22144 }
22145 }
22146 }
22147 }
22148
22149 if (VT == MVT::v2f64) {
22150 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
22151 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
22152 Elt1->getOpcode() == ISD::FP_EXTEND &&
22154 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22155 Elt0->getOperand(0)->getOperand(0) ==
22156 Elt1->getOperand(0)->getOperand(0) &&
22157 // Constant index.
22159 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
22160 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
22161 Elt1->getOperand(0)->getConstantOperandVal(1) &&
22162 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
22163 // ResultType's known minimum vector length.
22164 Elt0->getOperand(0)->getConstantOperandVal(1) %
22166 0) {
22167 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
22168 if (SrcVec.getValueType() == MVT::v4f16 ||
22169 SrcVec.getValueType() == MVT::v4bf16) {
22170 SDValue HalfToSingle =
22171 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
22172 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
22173 SDValue Extract =
22175 VT.changeVectorElementType(*DAG.getContext(), MVT::f32),
22176 HalfToSingle, SubvectorIdx);
22177 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
22178 }
22179 }
22180 }
22181
22182 // A build vector of two extracted elements is equivalent to an
22183 // extract subvector where the inner vector is any-extended to the
22184 // extract_vector_elt VT.
22185 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
22186 // (extract_elt_iXX_to_i32 vec Idx+1))
22187 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
22188
22189 // For now, only consider the v2i32 case, which arises as a result of
22190 // legalization.
22191 if (VT != MVT::v2i32)
22192 return SDValue();
22193
22194 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
22195 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
22196 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22197 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22198 // Constant index.
22199 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
22200 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
22201 // Both EXTRACT_VECTOR_ELT from same vector...
22202 Elt0->getOperand(0) == Elt1->getOperand(0) &&
22203 // ... and contiguous. First element's index +1 == second element's index.
22204 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
22205 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
22206 // ResultType's known minimum vector length.
22207 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
22208 SDValue VecToExtend = Elt0->getOperand(0);
22209 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(
22210 *DAG.getContext(), MVT::i32);
22211 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
22212 return SDValue();
22213
22214 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
22215
22216 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
22217 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
22218 SubvectorIdx);
22219 }
22220
22221 return SDValue();
22222}
22223
22224// A special combine for the sqdmulh family of instructions.
22225//   smin(sra(mul(sext v0, sext v1), SHIFT_AMOUNT), SATURATING_VAL)
22226// can be reduced to sqdmulh(...)
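// For example (a sketch for i16 elements, where SHIFT_AMOUNT is 15 and
// SATURATING_VAL is 32767):
//   smin(sra(mul(sext v0, sext v1), 15), 32767) --> sqdmulh(v0, v1)
// since sqdmulh computes sat((2 * v0 * v1) >> 16), i.e. (v0 * v1) >> 15
// clamped to the signed i16 range.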
22228
22229 if (N->getOpcode() != ISD::SMIN)
22230 return SDValue();
22231
22232 EVT DestVT = N->getValueType(0);
22233
22234 if (!DestVT.isVector() || DestVT.getScalarSizeInBits() > 64 ||
22235 DestVT.isScalableVector())
22236 return SDValue();
22237
22238 ConstantSDNode *Clamp = isConstOrConstSplat(N->getOperand(1));
22239
22240 if (!Clamp)
22241 return SDValue();
22242
22243 MVT ScalarType;
22244 unsigned ShiftAmt = 0;
22245 switch (Clamp->getSExtValue()) {
22246 case (1ULL << 15) - 1:
22247 ScalarType = MVT::i16;
22248 ShiftAmt = 16;
22249 break;
22250 case (1ULL << 31) - 1:
22251 ScalarType = MVT::i32;
22252 ShiftAmt = 32;
22253 break;
22254 default:
22255 return SDValue();
22256 }
22257
22258 SDValue Sra = N->getOperand(0);
22259 if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse())
22260 return SDValue();
22261
22262 ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1));
22263 if (!RightShiftVec)
22264 return SDValue();
22265 unsigned SExtValue = RightShiftVec->getSExtValue();
22266
22267 if (SExtValue != (ShiftAmt - 1))
22268 return SDValue();
22269
22270 SDValue Mul = Sra.getOperand(0);
22271 if (Mul.getOpcode() != ISD::MUL)
22272 return SDValue();
22273
22274 SDValue SExt0 = Mul.getOperand(0);
22275 SDValue SExt1 = Mul.getOperand(1);
22276
22277 if (SExt0.getOpcode() != ISD::SIGN_EXTEND ||
22278 SExt1.getOpcode() != ISD::SIGN_EXTEND)
22279 return SDValue();
22280
22281 EVT SExt0Type = SExt0.getOperand(0).getValueType();
22282 EVT SExt1Type = SExt1.getOperand(0).getValueType();
22283
22284 if (SExt0Type != SExt1Type || SExt0Type.getScalarType() != ScalarType ||
22285 SExt0Type.getFixedSizeInBits() > 128 || !SExt0Type.isPow2VectorType() ||
22286 SExt0Type.getVectorNumElements() == 1)
22287 return SDValue();
22288
22289 SDLoc DL(N);
22290 SDValue V0 = SExt0.getOperand(0);
22291 SDValue V1 = SExt1.getOperand(0);
22292
22293 // Ensure input vectors are extended to legal types
22294 if (SExt0Type.getFixedSizeInBits() < 64) {
22295 unsigned VecNumElements = SExt0Type.getVectorNumElements();
22296 EVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(64 / VecNumElements),
22297 VecNumElements);
22298 V0 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V0);
22299 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V1);
22300 }
22301
22302 SDValue SQDMULH =
22303 DAG.getNode(AArch64ISD::SQDMULH, DL, V0.getValueType(), V0, V1);
22304
22305 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, SQDMULH);
22306}
22307
22309 if (SDValue V = trySQDMULHCombine(N, DAG)) {
22310 return V;
22311 }
22312
22313 return SDValue();
22314}
22315
22318 SDLoc DL(N);
22319 EVT VT = N->getValueType(0);
22320 SDValue N0 = N->getOperand(0);
22321 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
22322 N0.getOpcode() == AArch64ISD::DUP) {
22323 SDValue Op = N0.getOperand(0);
22324 if (VT.getScalarType() == MVT::i32 &&
22325 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
22326 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
22327 return DAG.getNode(N0.getOpcode(), DL, VT, Op);
22328 }
22329
22330 // Performing the following combine produces a preferable form for ISEL.
22331 // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2))
22333 N0.hasOneUse()) {
22334 SDValue Op = N0.getOperand(0);
22335 SDValue ExtractIndexNode = N0.getOperand(1);
22336 if (!isa<ConstantSDNode>(ExtractIndexNode))
22337 return SDValue();
22338
22339 // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
22340 // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
22341 assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
22342 "Unexpected legalisation result!");
22343
22344 EVT SrcVectorType = Op.getValueType();
22345 // We also assume that SrcVectorType cannot be a V64 (see
22346 // LowerEXTRACT_VECTOR_ELT).
22347 assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
22348 "Unexpected legalisation result!");
22349
22350 unsigned ExtractIndex =
22351 cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
22352 MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
22353
22354 Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
22355 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
22356 DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
22357 }
22358
22359 return SDValue();
22360}
22361
22362// Check whether a node is an extend or shift operand.
22364 unsigned Opcode = N.getOpcode();
22365 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
22366 EVT SrcVT;
22367 if (Opcode == ISD::SIGN_EXTEND_INREG)
22368 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
22369 else
22370 SrcVT = N.getOperand(0).getValueType();
22371
22372 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
22373 } else if (Opcode == ISD::AND) {
22374 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
22375 if (!CSD)
22376 return false;
22377 uint64_t AndMask = CSD->getZExtValue();
22378 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
22379 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
22380 return isa<ConstantSDNode>(N.getOperand(1));
22381 }
22382
22383 return false;
22384}
22385
22386// (N - Y) + Z --> (Z - Y) + N
22387// when N is an extend or shift operand
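// For example (illustrative only, not from the source):
//   add(sub(sext(a), y), z) --> add(sub(z, y), sext(a))
// so that the extend (or shift) can later be folded into the ADD's
// extended/shifted-register operand.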
22389 SelectionDAG &DAG) {
22390 auto IsOneUseExtend = [](SDValue N) {
22391 return N.hasOneUse() && isExtendOrShiftOperand(N);
22392 };
22393
22394 // DAGCombiner will revert the combination when Z is a constant, causing an
22395 // infinite loop, so don't enable the combination when Z is constant.
22396 // Likewise, if Z is a one-use extend or shift, we also can't do the
22397 // optimization, as the combine would keep undoing itself.
22398 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
22399 return SDValue();
22400
22401 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
22402 return SDValue();
22403
22404 SDValue Shift = SUB.getOperand(0);
22405 if (!IsOneUseExtend(Shift))
22406 return SDValue();
22407
22408 SDLoc DL(N);
22409 EVT VT = N->getValueType(0);
22410
22411 SDValue Y = SUB.getOperand(1);
22412 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
22413 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
22414}
22415
22417 SelectionDAG &DAG) {
22418 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
22419 // commutative.
22420 if (N->getOpcode() != ISD::ADD)
22421 return SDValue();
22422
22423 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
22424 // shifted register is only available for i32 and i64.
22425 EVT VT = N->getValueType(0);
22426 if (VT != MVT::i32 && VT != MVT::i64)
22427 return SDValue();
22428
22429 SDLoc DL(N);
22430 SDValue LHS = N->getOperand(0);
22431 SDValue RHS = N->getOperand(1);
22432
22433 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
22434 return Val;
22435 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
22436 return Val;
22437
22438 uint64_t LHSImm = 0, RHSImm = 0;
22439 // If both operands are shifted by an immediate and the shift amount of one
22440 // operand is not greater than 4, swap LHS and RHS to put the operand with the
22441 // smaller shift amount on the RHS.
22442 //
22443 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc.), ADD with
22444 // an LSL shift of at most 4 has smaller latency and larger throughput than ADD
22445 // with a larger LSL shift. For the remaining processors, this is a no-op for
22446 // both performance and correctness.
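  // For example (illustrative only):
  //   add(shl(a, 2), shl(b, 6)) --> add(shl(b, 6), shl(a, 2))
  // so that the cheaper LSL #2 ends up on the RHS, where it can be folded into
  // the ADD's shifted-register form.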
22447 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
22448 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
22449 RHSImm > 4 && LHS.hasOneUse())
22450 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
22451
22452 return SDValue();
22453}
22454
22455// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
22456// This reassociates it back to allow the creation of more mls instructions.
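// For example (illustrative only):
//   sub(x, add(mul(a, b), mul(c, d))) --> sub(sub(x, mul(a, b)), mul(c, d))
// which can then be selected as two MLS instructions.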
22458 if (N->getOpcode() != ISD::SUB)
22459 return SDValue();
22460
22461 SDValue Add = N->getOperand(1);
22462 SDValue X = N->getOperand(0);
22463 if (Add.getOpcode() != ISD::ADD)
22464 return SDValue();
22465
22466 if (!Add.hasOneUse())
22467 return SDValue();
22469 return SDValue();
22470
22471 SDValue M1 = Add.getOperand(0);
22472 SDValue M2 = Add.getOperand(1);
22473 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
22474 M1.getOpcode() != AArch64ISD::UMULL)
22475 return SDValue();
22476 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
22477 M2.getOpcode() != AArch64ISD::UMULL)
22478 return SDValue();
22479
22480 EVT VT = N->getValueType(0);
22481 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
22482 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
22483}
22484
22485// Combine into mla/mls.
22486// This works on the patterns of:
22487// add v1, (mul v2, v3)
22488// sub v1, (mul v2, v3)
22489// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
22490// It will transform the add/sub to a scalable version, so that we can
22491// make use of SVE's MLA/MLS that will be generated for that pattern.
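// For example (an illustrative sketch): a fixed-length
//   add(v2i64 x, extract_subvector(MUL_PRED(pg, a, b), 0))
// becomes a scalable add of the widened x with MUL_PRED(pg, a, b), converted
// back to v2i64, which isel can then match as an SVE MLA.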
22492static SDValue
22494 SelectionDAG &DAG = DCI.DAG;
22495 // Make sure that the types are legal
22496 if (!DCI.isAfterLegalizeDAG())
22497 return SDValue();
22498 // Before using SVE's features, check first if it's available.
22499 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
22500 return SDValue();
22501
22502 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
22503 return SDValue();
22504
22505 if (!N->getValueType(0).isFixedLengthVector())
22506 return SDValue();
22507
22508 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
22509 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
22510 return SDValue();
22511
22512 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
22513 return SDValue();
22514
22515 SDValue MulValue = Op1->getOperand(0);
22516 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
22517 return SDValue();
22518
22519 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
22520 return SDValue();
22521
22522 EVT ScalableVT = MulValue.getValueType();
22523 if (!ScalableVT.isScalableVector())
22524 return SDValue();
22525
22526 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
22527 SDValue NewValue =
22528 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
22529 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
22530 };
22531
22532 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
22533 return res;
22534 else if (N->getOpcode() == ISD::ADD)
22535 return performOpt(N->getOperand(1), N->getOperand(0));
22536
22537 return SDValue();
22538}
22539
22540// Given an i64 add from a v1i64 extract, convert to a NEON v1i64 add. This can
22541// help, for example, to produce ssra from sshr+add.
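// For example (illustrative only):
//   add(i64 extract_elt(v1i64 x, 0), i64 y)
//     --> extract_elt(add(v1i64 x, scalar_to_vector(y)), 0)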
22543 EVT VT = N->getValueType(0);
22544 if (VT != MVT::i64 ||
22545 DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
22546 return SDValue();
22547 SDValue Op0 = N->getOperand(0);
22548 SDValue Op1 = N->getOperand(1);
22549
22550 // At least one of the operands should be an extract, and the other should be
22551 // something that is easy to convert to v1i64 type (in this case a load).
22552 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
22553 Op0.getOpcode() != ISD::LOAD)
22554 return SDValue();
22555 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
22556 Op1.getOpcode() != ISD::LOAD)
22557 return SDValue();
22558
22559 SDLoc DL(N);
22560 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22561 Op0.getOperand(0).getValueType() == MVT::v1i64) {
22562 Op0 = Op0.getOperand(0);
22563 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
22564 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22565 Op1.getOperand(0).getValueType() == MVT::v1i64) {
22566 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
22567 Op1 = Op1.getOperand(0);
22568 } else
22569 return SDValue();
22570
22571 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
22572 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
22573 DAG.getConstant(0, DL, MVT::i64));
22574}
22575
22578 if (!BV->hasOneUse())
22579 return false;
22580 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
22581 if (!Ld || !Ld->isSimple())
22582 return false;
22583 Loads.push_back(Ld);
22584 return true;
22585 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
22587 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
22588 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
22589 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
22590 return false;
22591 Loads.push_back(Ld);
22592 }
22593 return true;
22594 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
22595 // Try to find a tree of shuffles and concats from how IR shuffles of loads
22596 // are lowered. Note that this only comes up because we do not always visit
22597 // operands before uses. After that is fixed this can be removed; in the
22598 // meantime this is fairly specific to the lowering we expect from IR.
22599 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
22600 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
22601 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
22602 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
22603 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
22604 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
22605 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
22606 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
22607 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
22608 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
22609 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
22610 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
22611 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
22612 B.getOperand(1).getNumOperands() != 4)
22613 return false;
22614 auto SV1 = cast<ShuffleVectorSDNode>(B);
22615 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
22616 int NumElts = B.getValueType().getVectorNumElements();
22617 int NumSubElts = NumElts / 4;
22618 for (int I = 0; I < NumSubElts; I++) {
22619 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
22620 if (SV1->getMaskElt(I) != I ||
22621 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
22622 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
22623 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
22624 return false;
22625 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
22626 if (SV2->getMaskElt(I) != I ||
22627 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
22628 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
22629 return false;
22630 }
22631 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
22632 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
22633 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
22634 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
22635 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
22636 !Ld2->isSimple() || !Ld3->isSimple())
22637 return false;
22638 Loads.push_back(Ld0);
22639 Loads.push_back(Ld1);
22640 Loads.push_back(Ld2);
22641 Loads.push_back(Ld3);
22642 return true;
22643 }
22644 return false;
22645}
22646
22648 SelectionDAG &DAG,
22649 unsigned &NumSubLoads) {
22650 if (!Op0.hasOneUse() || !Op1.hasOneUse())
22651 return false;
22652
22653 SmallVector<LoadSDNode *> Loads0, Loads1;
22654 if (isLoadOrMultipleLoads(Op0, Loads0) &&
22655 isLoadOrMultipleLoads(Op1, Loads1)) {
22656 if (NumSubLoads && Loads0.size() != NumSubLoads)
22657 return false;
22658 NumSubLoads = Loads0.size();
22659 return Loads0.size() == Loads1.size() &&
22660 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
22661 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
22662 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
22664 Size / 8, 1);
22665 });
22666 }
22667
22668 if (Op0.getOpcode() != Op1.getOpcode())
22669 return false;
22670
22671 switch (Op0.getOpcode()) {
22672 case ISD::ADD:
22673 case ISD::SUB:
22675 DAG, NumSubLoads) &&
22677 DAG, NumSubLoads);
22678 case ISD::SIGN_EXTEND:
22679 case ISD::ANY_EXTEND:
22680 case ISD::ZERO_EXTEND:
22681 EVT XVT = Op0.getOperand(0).getValueType();
22682 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
22683 XVT.getScalarSizeInBits() != 32)
22684 return false;
22686 DAG, NumSubLoads);
22687 }
22688 return false;
22689}
22690
22691// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
22692// into a single load of twice the size, from which we extract the bottom and
22693// top parts so that the shl can use a shll2 instruction. The two loads in that
22694// example can also be larger trees of instructions, which are identical except
22695// for the leaves, which are all loads offset from the LHS, including
22696// buildvectors of multiple loads. For example the RHS tree could be
22697// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
22698// Whilst it can be common for the larger loads to replace LDP instructions
22699// (which doesn't gain anything on its own), the larger loads can help create
22700// more efficient code, and in buildvectors prevent the need for ld1 lane
22701// inserts, which can be slower than normal loads.
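// A minimal sketch of the simplest case (illustrative only):
//   add(zext(v8i8 load p), shl(zext(v8i8 load p+8), C))
//     --> L = v16i8 load p;
//         add(zext(low half of L), shl(zext(high half of L), C))
// where the second extend can then be selected as ushll2.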
22703 EVT VT = N->getValueType(0);
22704 if (!VT.isFixedLengthVector() ||
22705 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
22706 VT.getScalarSizeInBits() != 64))
22707 return SDValue();
22708
22709 SDValue Other = N->getOperand(0);
22710 SDValue Shift = N->getOperand(1);
22711 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
22712 std::swap(Shift, Other);
22713 APInt ShiftAmt;
22714 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
22715 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
22716 return SDValue();
22717
22718 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
22719 !ISD::isExtOpcode(Other.getOpcode()) ||
22720 Shift.getOperand(0).getOperand(0).getValueType() !=
22721 Other.getOperand(0).getValueType() ||
22722 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
22723 return SDValue();
22724
22725 SDValue Op0 = Other.getOperand(0);
22726 SDValue Op1 = Shift.getOperand(0).getOperand(0);
22727
22728 unsigned NumSubLoads = 0;
22729 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
22730 return SDValue();
22731
22732 // Attempt to rule out some unprofitable cases using heuristics (some working
22733 // around suboptimal code generation), notably if the extend would not be able
22734 // to use ushll2 instructions because the types are not large enough. Otherwise
22735 // zips will need to be created, which can increase the instruction count.
22736 unsigned NumElts = Op0.getValueType().getVectorNumElements();
22737 unsigned NumSubElts = NumElts / NumSubLoads;
22738 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
22739 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
22740 Op0.getValueType().getSizeInBits() < 128 &&
22742 return SDValue();
22743
22744 // Recreate the tree with the new combined loads.
22745 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
22746 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
22747 EVT DVT =
22749
22750 SmallVector<LoadSDNode *> Loads0, Loads1;
22751 if (isLoadOrMultipleLoads(Op0, Loads0) &&
22752 isLoadOrMultipleLoads(Op1, Loads1)) {
22753 EVT LoadVT = EVT::getVectorVT(
22754 *DAG.getContext(), Op0.getValueType().getScalarType(),
22755 Op0.getValueType().getVectorNumElements() / Loads0.size());
22756 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
22757
22758 SmallVector<SDValue> NewLoads;
22759 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
22760 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
22761 L0->getBasePtr(), L0->getPointerInfo(),
22762 L0->getBaseAlign());
22763 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
22764 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
22765 NewLoads.push_back(Load);
22766 }
22767 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
22768 }
22769
22771 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
22772 Ops.push_back(GenCombinedTree(O0, O1, DAG));
22773 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
22774 };
22775 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
22776
22777 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
22778 int Hi = NumSubElts, Lo = 0;
22779 for (unsigned i = 0; i < NumSubLoads; i++) {
22780 for (unsigned j = 0; j < NumSubElts; j++) {
22781 LowMask[i * NumSubElts + j] = Lo++;
22782 HighMask[i * NumSubElts + j] = Hi++;
22783 }
22784 Lo += NumSubElts;
22785 Hi += NumSubElts;
22786 }
22787 SDLoc DL(N);
22788 SDValue Ext0, Ext1;
22789 // Extract the top and bottom lanes, then extend the result. If the two
22790 // operands match, instead extend the result first and then extract the lanes,
22791 // as that produces slightly smaller code.
22792 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
22794 NewOp, DAG.getConstant(0, DL, MVT::i64));
22795 SDValue SubH =
22796 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
22797 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
22798 SDValue Extr0 =
22799 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
22800 SDValue Extr1 =
22801 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
22802 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
22803 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
22804 } else {
22806 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
22807 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
22808 DAG.getConstant(0, DL, MVT::i64));
22809 SDValue SubH =
22810 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
22811 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
22812 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
22813 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
22814 }
22815 SDValue NShift =
22816 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
22817 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
22818}
22819
22820// Attempt to combine the following patterns:
22821// SUB x, (CSET LO, (CMP a, b)) -> SBC x, 0, (CMP a, b)
22822// SUB (SUB x, y), (CSET LO, (CMP a, b)) -> SBC x, y, (CMP a, b)
22823// The CSET may be preceded by a ZEXT.
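// This typically arises from borrow computations, e.g. (illustrative):
//   x - (a u< b)  ==>  SUB x, (CSET LO, (CMP a, b))  ==>  SBC x, 0, (CMP a, b)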
22825 if (N->getOpcode() != ISD::SUB)
22826 return SDValue();
22827
22828 EVT VT = N->getValueType(0);
22829 if (VT != MVT::i32 && VT != MVT::i64)
22830 return SDValue();
22831
22832 SDValue N1 = N->getOperand(1);
22833 if (N1.getOpcode() == ISD::ZERO_EXTEND && N1.hasOneUse())
22834 N1 = N1.getOperand(0);
22835 if (!N1.hasOneUse() || getCSETCondCode(N1) != AArch64CC::LO)
22836 return SDValue();
22837
22838 SDValue Flags = N1.getOperand(3);
22839 if (Flags.getOpcode() != AArch64ISD::SUBS)
22840 return SDValue();
22841
22842 SDLoc DL(N);
22843 SDValue N0 = N->getOperand(0);
22844 if (N0->getOpcode() == ISD::SUB)
22845 return DAG.getNode(AArch64ISD::SBC, DL, VT, N0.getOperand(0),
22846 N0.getOperand(1), Flags);
22847 return DAG.getNode(AArch64ISD::SBC, DL, VT, N0, DAG.getConstant(0, DL, VT),
22848 Flags);
22849}
22850
22851// add(trunc(ashr(A, C)), trunc(lshr(A, BW-1))), with C >= BW
22852// ->
22853// X = trunc(ashr(A, C)); add(X, lshr(X, BW-1))
22854// The original converts into ashr+lshr+xtn+xtn+add. The second becomes
22855// ashr+xtn+usra. The first form has less total latency due to more parallelism,
22856// but more micro-ops and seems to be slower in practice.
22858 using namespace llvm::SDPatternMatch;
22859 EVT VT = N->getValueType(0);
22860 if (VT != MVT::v2i32 && VT != MVT::v4i16 && VT != MVT::v8i8)
22861 return SDValue();
22862
22863 SDValue AShr, LShr;
22864 if (!sd_match(N, m_Add(m_Trunc(m_Value(AShr)), m_Trunc(m_Value(LShr)))))
22865 return SDValue();
22866 if (AShr.getOpcode() != AArch64ISD::VASHR)
22867 std::swap(AShr, LShr);
22868 if (AShr.getOpcode() != AArch64ISD::VASHR ||
22869 LShr.getOpcode() != AArch64ISD::VLSHR ||
22870 AShr.getOperand(0) != LShr.getOperand(0) ||
22872 LShr.getConstantOperandVal(1) != VT.getScalarSizeInBits() * 2 - 1)
22873 return SDValue();
22874
22875 SDLoc DL(N);
22876 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, AShr);
22877 SDValue Shift = DAG.getNode(
22878 AArch64ISD::VLSHR, DL, VT, Trunc,
22879 DAG.getTargetConstant(VT.getScalarSizeInBits() - 1, DL, MVT::i32));
22880 return DAG.getNode(ISD::ADD, DL, VT, Trunc, Shift);
22881}
22882
22885 // Try to change sum of two reductions.
22886 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
22887 return Val;
22888 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
22889 return Val;
22890 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
22891 return Val;
22892 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
22893 return Val;
22894 if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
22895 return Val;
22897 return Val;
22898 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
22899 return Val;
22900 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
22901 return Val;
22902 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
22903 return Val;
22904 if (SDValue Val = performSubWithBorrowCombine(N, DCI.DAG))
22905 return Val;
22906 if (SDValue Val = performAddTruncShiftCombine(N, DCI.DAG))
22907 return Val;
22908
22909 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
22910 return Val;
22911
22912 return performAddSubLongCombine(N, DCI);
22913}
22914
22915// Massage DAGs which we can use the high-half "long" operations on into
22916// something isel will recognize better. E.g.
22917//
22918// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
22919// (aarch64_neon_umull (extract_high (v2i64 vec))
22920//                      (extract_high (v2i64 (dup128 scalar))))
22921//
22924 SelectionDAG &DAG) {
22925 if (DCI.isBeforeLegalizeOps())
22926 return SDValue();
22927
22928 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
22929 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
22930 assert(LHS.getValueType().is64BitVector() &&
22931 RHS.getValueType().is64BitVector() &&
22932 "unexpected shape for long operation");
22933
22934 // Either node could be a DUP, but it's not worth doing both of them (you'd
22935 // just as well use the non-high version) so look for a corresponding extract
22936 // operation on the other "wing".
22939 if (!RHS.getNode())
22940 return SDValue();
22943 if (!LHS.getNode())
22944 return SDValue();
22945 } else
22946 return SDValue();
22947
22948 if (IID == Intrinsic::not_intrinsic)
22949 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
22950
22951 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
22952 N->getOperand(0), LHS, RHS);
22953}
22954
22955static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
22956 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
22957 unsigned ElemBits = ElemTy.getSizeInBits();
22958
22959 int64_t ShiftAmount;
22960 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
22961 APInt SplatValue, SplatUndef;
22962 unsigned SplatBitSize;
22963 bool HasAnyUndefs;
22964 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
22965 HasAnyUndefs, ElemBits) ||
22966 SplatBitSize != ElemBits)
22967 return SDValue();
22968
22969 ShiftAmount = SplatValue.getSExtValue();
22970 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
22971 ShiftAmount = CVN->getSExtValue();
22972 } else
22973 return SDValue();
22974
22975 // If the shift amount is zero, remove the shift intrinsic.
22976 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
22977 return N->getOperand(1);
22978
22979 unsigned Opcode;
22980 bool IsRightShift;
22981 switch (IID) {
22982 default:
22983 llvm_unreachable("Unknown shift intrinsic");
22984 case Intrinsic::aarch64_neon_sqshl:
22985 Opcode = AArch64ISD::SQSHL_I;
22986 IsRightShift = false;
22987 break;
22988 case Intrinsic::aarch64_neon_uqshl:
22989 Opcode = AArch64ISD::UQSHL_I;
22990 IsRightShift = false;
22991 break;
22992 case Intrinsic::aarch64_neon_srshl:
22993 Opcode = AArch64ISD::SRSHR_I;
22994 IsRightShift = true;
22995 break;
22996 case Intrinsic::aarch64_neon_urshl:
22997 Opcode = AArch64ISD::URSHR_I;
22998 IsRightShift = true;
22999 break;
23000 case Intrinsic::aarch64_neon_sqshlu:
23001 Opcode = AArch64ISD::SQSHLU_I;
23002 IsRightShift = false;
23003 break;
23004 case Intrinsic::aarch64_neon_sshl:
23005 case Intrinsic::aarch64_neon_ushl:
23006 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
23007 // left shift for positive shift amounts. For negative shifts we can use a
23008 // VASHR/VLSHR as appropriate.
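    // For example (illustrative only): ushl(v, splat(-3)) becomes
    // (VLSHR v, #3), while sshl(v, splat(3)) becomes (VSHL v, #3).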
23009 if (ShiftAmount < 0) {
23010 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
23011 : AArch64ISD::VLSHR;
23012 ShiftAmount = -ShiftAmount;
23013 } else
23014 Opcode = AArch64ISD::VSHL;
23015 IsRightShift = false;
23016 break;
23017 }
23018
23019 EVT VT = N->getValueType(0);
23020 SDValue Op = N->getOperand(1);
23021 SDLoc DL(N);
23022 if (VT == MVT::i64) {
23023 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op);
23024 VT = MVT::v1i64;
23025 }
23026
23027 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
23028 Op = DAG.getNode(Opcode, DL, VT, Op,
23029 DAG.getSignedConstant(-ShiftAmount, DL, MVT::i32, true));
23030 if (N->getValueType(0) == MVT::i64)
23031 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
23032 DAG.getConstant(0, DL, MVT::i64));
23033 return Op;
23034 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
23035 Op = DAG.getNode(Opcode, DL, VT, Op,
23036 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
23037 if (N->getValueType(0) == MVT::i64)
23038 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
23039 DAG.getConstant(0, DL, MVT::i64));
23040 return Op;
23041 }
23042
23043 return SDValue();
23044}
23045
23046// The CRC32[BH] instructions ignore the high bits of their data operand. Since
23047// the intrinsics must be legal and take an i32, this means there's almost
23048// certainly going to be a zext in the DAG which we can eliminate.
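// For example (illustrative only): crc32b(w, zext(i8 x)) typically appears in
// the DAG as crc32b(w, and(x, 0xff)), and the AND can simply be dropped
// because CRC32B only reads the low 8 bits of its data operand anyway.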
23049static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
23050 SDValue AndN = N->getOperand(2);
23051 if (AndN.getOpcode() != ISD::AND)
23052 return SDValue();
23053
23055 if (!CMask || CMask->getZExtValue() != Mask)
23056 return SDValue();
23057
23058 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
23059 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
23060}
23061
23063 SelectionDAG &DAG) {
23064 SDLoc DL(N);
23065 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
23066 DAG.getNode(Opc, DL, N->getOperand(1).getSimpleValueType(),
23067 N->getOperand(1)),
23068 DAG.getConstant(0, DL, MVT::i64));
23069}
23070
23072 SDLoc DL(N);
23073 SDValue Op1 = N->getOperand(1);
23074 SDValue Op2 = N->getOperand(2);
23075 EVT ScalarTy = Op2.getValueType();
23076 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
23077 ScalarTy = MVT::i32;
23078
23079 // Lower index_vector(base, step) to mul(step_vector(1), splat(step)) + splat(base).
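  // For example (illustrative): index_vector(3, 2) over nxv4i32 becomes
  //   add(mul(step_vector(1) /* <0,1,2,...> */, splat(2)), splat(3)) = <3,5,7,...>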
23080 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
23081 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
23082 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
23083 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
23084 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
23085}
23086
23088 SDLoc DL(N);
23089 SDValue Scalar = N->getOperand(3);
23090 EVT ScalarTy = Scalar.getValueType();
23091
23092 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
23093 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
23094
23095 SDValue Passthru = N->getOperand(1);
23096 SDValue Pred = N->getOperand(2);
23097 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, DL, N->getValueType(0),
23098 Pred, Scalar, Passthru);
23099}
23100
23102 SDLoc DL(N);
23103 LLVMContext &Ctx = *DAG.getContext();
23104 EVT VT = N->getValueType(0);
23105
23106 assert(VT.isScalableVector() && "Expected a scalable vector.");
23107
23108 // Current lowering only supports the SVE-ACLE types.
23110 return SDValue();
23111
23112 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
23113 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
23114 EVT ByteVT =
23115 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
23116
23117 // Convert everything to the domain of EXT (i.e. bytes).
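  // For example (illustrative): ext(nxv4i32 a, nxv4i32 b, 2) becomes a
  // byte-wise AArch64ISD::EXT on nxv16i8 with byte index 2 * 4 = 8.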
23118 SDValue Op0 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(1));
23119 SDValue Op1 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(2));
23120 SDValue Op2 = DAG.getNode(ISD::MUL, DL, MVT::i32, N->getOperand(3),
23121 DAG.getConstant(ElemSize, DL, MVT::i32));
23122
23123 SDValue EXT = DAG.getNode(AArch64ISD::EXT, DL, ByteVT, Op0, Op1, Op2);
23124 return DAG.getNode(ISD::BITCAST, DL, VT, EXT);
23125}
23126
23129 SelectionDAG &DAG) {
23130 if (DCI.isBeforeLegalize())
23131 return SDValue();
23132
23133 SDValue Comparator = N->getOperand(3);
23134 if (Comparator.getOpcode() == AArch64ISD::DUP ||
23135 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
23136 unsigned IID = getIntrinsicID(N);
23137 EVT VT = N->getValueType(0);
23138 EVT CmpVT = N->getOperand(2).getValueType();
23139 SDValue Pred = N->getOperand(1);
23140 SDValue Imm;
23141 SDLoc DL(N);
23142
23143 switch (IID) {
23144 default:
23145 llvm_unreachable("Called with wrong intrinsic!");
23146 break;
23147
23148 // Signed comparisons
23149 case Intrinsic::aarch64_sve_cmpeq_wide:
23150 case Intrinsic::aarch64_sve_cmpne_wide:
23151 case Intrinsic::aarch64_sve_cmpge_wide:
23152 case Intrinsic::aarch64_sve_cmpgt_wide:
23153 case Intrinsic::aarch64_sve_cmplt_wide:
23154 case Intrinsic::aarch64_sve_cmple_wide: {
23155 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
23156 int64_t ImmVal = CN->getSExtValue();
23157 if (ImmVal >= -16 && ImmVal <= 15)
23158 Imm = DAG.getSignedConstant(ImmVal, DL, MVT::i32);
23159 else
23160 return SDValue();
23161 }
23162 break;
23163 }
23164 // Unsigned comparisons
23165 case Intrinsic::aarch64_sve_cmphs_wide:
23166 case Intrinsic::aarch64_sve_cmphi_wide:
23167 case Intrinsic::aarch64_sve_cmplo_wide:
23168 case Intrinsic::aarch64_sve_cmpls_wide: {
23169 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
23170 uint64_t ImmVal = CN->getZExtValue();
23171 if (ImmVal <= 127)
23172 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
23173 else
23174 return SDValue();
23175 }
23176 break;
23177 }
23178 }
23179
23180 if (!Imm)
23181 return SDValue();
23182
23183 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
23184 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
23185 N->getOperand(2), Splat, DAG.getCondCode(CC));
23186 }
23187
23188 return SDValue();
23189}
23190
23193 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23194
23195 SDLoc DL(Op);
23196 assert(Op.getValueType().isScalableVector() &&
23197 TLI.isTypeLegal(Op.getValueType()) &&
23198 "Expected legal scalable vector type!");
23199 assert(Op.getValueType() == Pg.getValueType() &&
23200 "Expected same type for PTEST operands");
23201
23202 // Ensure target specific opcodes are using legal type.
23203 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
23204 SDValue TVal = DAG.getConstant(1, DL, OutVT);
23205 SDValue FVal = DAG.getConstant(0, DL, OutVT);
23206
23207 // Ensure operands have type nxv16i1.
23208 if (Op.getValueType() != MVT::nxv16i1) {
23211 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
23212 else
23213 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
23214 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
23215 }
23216
23217 unsigned PTest = AArch64ISD::PTEST;
23219 PTest = AArch64ISD::PTEST_ANY;
23220 else if (Cond == AArch64CC::FIRST_ACTIVE)
23221 PTest = AArch64ISD::PTEST_FIRST;
23222
23223 // Set condition code (CC) flags.
23224 SDValue Test = DAG.getNode(PTest, DL, MVT::i32, Pg, Op);
23225
23226 // Convert CC to integer based on requested condition.
23227 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
23228 SDValue CC = getCondCode(DAG, getInvertedCondCode(Cond));
23229 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
23230 return DAG.getZExtOrTrunc(Res, DL, VT);
23231}
23232
23234 SelectionDAG &DAG) {
23235 SDLoc DL(N);
23236
23237 SDValue Pred = N->getOperand(1);
23238 SDValue VecToReduce = N->getOperand(2);
23239
23240 // NOTE: The integer reduction's result type is not always linked to the
23241 // operand's element type so we construct it from the intrinsic's result type.
23242 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
23243 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
23244
23245 // SVE reductions set the whole vector register with the first element
23246 // containing the reduction result, which we'll now extract.
23247 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
23248 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
23249 Zero);
23250}
23251
23253 SelectionDAG &DAG) {
23254 SDLoc DL(N);
23255
23256 SDValue Pred = N->getOperand(1);
23257 SDValue VecToReduce = N->getOperand(2);
23258
23259 EVT ReduceVT = VecToReduce.getValueType();
23260 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
23261
23262 // SVE reductions set the whole vector register with the first element
23263 // containing the reduction result, which we'll now extract.
23264 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
23265 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
23266 Zero);
23267}
23268
23270 SelectionDAG &DAG) {
23271 SDLoc DL(N);
23272
23273 SDValue Pred = N->getOperand(1);
23274 SDValue InitVal = N->getOperand(2);
23275 SDValue VecToReduce = N->getOperand(3);
23276 EVT ReduceVT = VecToReduce.getValueType();
23277
23278 // Ordered reductions use the first lane of the result vector as the
23279 // reduction's initial value.
23280 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
23281 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
23282 DAG.getUNDEF(ReduceVT), InitVal, Zero);
23283
23284 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
23285
23286 // SVE reductions set the whole vector register with the first element
23287 // containing the reduction result, which we'll now extract.
23288 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
23289 Zero);
23290}
23291
23293 SelectionDAG &DAG) {
23294 if (N->getValueType(0) != MVT::i16)
23295 return SDValue();
23296
23297 SDLoc DL(N);
23298 SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1));
23299 SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT);
23300 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast);
23301}
23302
23303// If a merged operation has no inactive lanes we can relax it to a predicated
23304// or unpredicated operation, which potentially allows better isel (perhaps
23305// using immediate forms) or relaxing register reuse requirements.
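// For example (illustrative): aarch64_sve_sqadd(pg, a, b) with an all-active
// pg can be lowered to a plain ISD::SADDSAT of a and b (the UnpredOp case).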
23307 SelectionDAG &DAG, bool UnpredOp = false,
23308 bool SwapOperands = false) {
23309 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
23310 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
23311 SDValue Pg = N->getOperand(1);
23312 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
23313 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
23314
23315 // ISD way to specify an all active predicate.
23316 if (isAllActivePredicate(DAG, Pg)) {
23317 if (UnpredOp)
23318 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
23319
23320 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
23321 }
23322
23323 // FUTURE: SplatVector(true)
23324 return SDValue();
23325}
23326
23327static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
23328 SDLoc DL(N);
23329 EVT VT = N->getValueType(0);
23330 SDValue Op1 = N->getOperand(1);
23331 SDValue Op2 = N->getOperand(2);
23332 SDValue Op3 = N->getOperand(3);
23333
23334 switch (IID) {
23335 default:
23336 llvm_unreachable("Called with wrong intrinsic!");
23337 case Intrinsic::aarch64_sve_bsl:
23338 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2);
23339 case Intrinsic::aarch64_sve_bsl1n:
23340 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, DAG.getNOT(DL, Op1, VT),
23341 Op2);
23342 case Intrinsic::aarch64_sve_bsl2n:
23343 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1,
23344 DAG.getNOT(DL, Op2, VT));
23345 case Intrinsic::aarch64_sve_nbsl:
23346 return DAG.getNOT(DL, DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2),
23347 VT);
23348 }
23349}
23350
23353 const AArch64Subtarget *Subtarget) {
23354 SelectionDAG &DAG = DCI.DAG;
23355 unsigned IID = getIntrinsicID(N);
23356 switch (IID) {
23357 default:
23358 break;
23359 case Intrinsic::aarch64_neon_vcvtfxs2fp:
23360 case Intrinsic::aarch64_neon_vcvtfxu2fp:
23361 return tryCombineFixedPointConvert(N, DCI, DAG);
23362 case Intrinsic::aarch64_neon_saddv:
23363 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
23364 case Intrinsic::aarch64_neon_uaddv:
23365 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
23366 case Intrinsic::aarch64_neon_sminv:
23367 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
23368 case Intrinsic::aarch64_neon_uminv:
23369 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
23370 case Intrinsic::aarch64_neon_smaxv:
23371 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
23372 case Intrinsic::aarch64_neon_umaxv:
23373 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
23374 case Intrinsic::aarch64_neon_fmax:
23375 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
23376 N->getOperand(1), N->getOperand(2));
23377 case Intrinsic::aarch64_neon_fmin:
23378 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
23379 N->getOperand(1), N->getOperand(2));
23380 case Intrinsic::aarch64_neon_fmaxnm:
23381 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
23382 N->getOperand(1), N->getOperand(2));
23383 case Intrinsic::aarch64_neon_fminnm:
23384 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
23385 N->getOperand(1), N->getOperand(2));
23386 case Intrinsic::aarch64_neon_smull:
23387 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
23388 N->getOperand(1), N->getOperand(2));
23389 case Intrinsic::aarch64_neon_umull:
23390 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
23391 N->getOperand(1), N->getOperand(2));
23392 case Intrinsic::aarch64_neon_pmull:
23393 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
23394 N->getOperand(1), N->getOperand(2));
23395 case Intrinsic::aarch64_neon_sqdmull:
23396 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
23397 case Intrinsic::aarch64_neon_sqshl:
23398 case Intrinsic::aarch64_neon_uqshl:
23399 case Intrinsic::aarch64_neon_sqshlu:
23400 case Intrinsic::aarch64_neon_srshl:
23401 case Intrinsic::aarch64_neon_urshl:
23402 case Intrinsic::aarch64_neon_sshl:
23403 case Intrinsic::aarch64_neon_ushl:
23404 return tryCombineShiftImm(IID, N, DAG);
23405 case Intrinsic::aarch64_neon_sabd:
23406 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
23407 N->getOperand(1), N->getOperand(2));
23408 case Intrinsic::aarch64_neon_uabd:
23409 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
23410 N->getOperand(1), N->getOperand(2));
23411 case Intrinsic::aarch64_neon_fcvtzs:
23412 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
23413 case Intrinsic::aarch64_neon_fcvtzu:
23414 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG);
23415 case Intrinsic::aarch64_neon_fcvtas:
23416 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG);
23417 case Intrinsic::aarch64_neon_fcvtau:
23418 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG);
23419 case Intrinsic::aarch64_neon_fcvtms:
23420 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG);
23421 case Intrinsic::aarch64_neon_fcvtmu:
23422 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG);
23423 case Intrinsic::aarch64_neon_fcvtns:
23424 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG);
23425 case Intrinsic::aarch64_neon_fcvtnu:
23426 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG);
23427 case Intrinsic::aarch64_neon_fcvtps:
23428 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG);
23429 case Intrinsic::aarch64_neon_fcvtpu:
23430 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG);
23431 case Intrinsic::aarch64_crc32b:
23432 case Intrinsic::aarch64_crc32cb:
23433 return tryCombineCRC32(0xff, N, DAG);
23434 case Intrinsic::aarch64_crc32h:
23435 case Intrinsic::aarch64_crc32ch:
23436 return tryCombineCRC32(0xffff, N, DAG);
23437 case Intrinsic::aarch64_sve_saddv:
23438 // There is no i64 version of SADDV because the sign is irrelevant.
23439 if (N->getOperand(2).getValueType().getVectorElementType() == MVT::i64)
23440 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
23441 else
23442 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
23443 case Intrinsic::aarch64_sve_uaddv:
23444 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
23445 case Intrinsic::aarch64_sve_smaxv:
23446 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
23447 case Intrinsic::aarch64_sve_umaxv:
23448 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
23449 case Intrinsic::aarch64_sve_sminv:
23450 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
23451 case Intrinsic::aarch64_sve_uminv:
23452 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
23453 case Intrinsic::aarch64_sve_orv:
23454 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
23455 case Intrinsic::aarch64_sve_eorv:
23456 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
23457 case Intrinsic::aarch64_sve_andv:
23458 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
23459 case Intrinsic::aarch64_sve_index:
23460 return LowerSVEIntrinsicIndex(N, DAG);
23461 case Intrinsic::aarch64_sve_dup:
23462 return LowerSVEIntrinsicDUP(N, DAG);
23463 case Intrinsic::aarch64_sve_dup_x:
23464 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
23465 N->getOperand(1));
23466 case Intrinsic::aarch64_sve_ext:
23467 return LowerSVEIntrinsicEXT(N, DAG);
23468 case Intrinsic::aarch64_sve_mul_u:
23469 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
23470 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23471 case Intrinsic::aarch64_sve_smulh_u:
23472 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
23473 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23474 case Intrinsic::aarch64_sve_umulh_u:
23475 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
23476 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23477 case Intrinsic::aarch64_sve_smin_u:
23478 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
23479 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23480 case Intrinsic::aarch64_sve_umin_u:
23481 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
23482 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23483 case Intrinsic::aarch64_sve_smax_u:
23484 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
23485 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23486 case Intrinsic::aarch64_sve_umax_u:
23487 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
23488 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23489 case Intrinsic::aarch64_sve_lsl_u:
23490 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
23491 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23492 case Intrinsic::aarch64_sve_lsr_u:
23493 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
23494 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23495 case Intrinsic::aarch64_sve_asr_u:
23496 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
23497 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23498 case Intrinsic::aarch64_sve_fadd_u:
23499 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
23500 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23501 case Intrinsic::aarch64_sve_fdiv_u:
23502 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
23503 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23504 case Intrinsic::aarch64_sve_fmax_u:
23505 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
23506 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23507 case Intrinsic::aarch64_sve_fmaxnm_u:
23508 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
23509 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23510 case Intrinsic::aarch64_sve_fmla_u:
23511 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
23512 N->getOperand(1), N->getOperand(3), N->getOperand(4),
23513 N->getOperand(2));
23514 case Intrinsic::aarch64_sve_fmin_u:
23515 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
23516 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23517 case Intrinsic::aarch64_sve_fminnm_u:
23518 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
23519 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23520 case Intrinsic::aarch64_sve_fmul_u:
23521 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
23522 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23523 case Intrinsic::aarch64_sve_fsub_u:
23524 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
23525 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23526 case Intrinsic::aarch64_sve_add_u:
23527 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
23528 N->getOperand(3));
23529 case Intrinsic::aarch64_sve_sub_u:
23530 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
23531 N->getOperand(3));
23532 case Intrinsic::aarch64_sve_subr:
23533 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
23534 case Intrinsic::aarch64_sve_and_u:
23535 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
23536 N->getOperand(3));
23537 case Intrinsic::aarch64_sve_bic_u:
23538 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
23539 N->getOperand(2), N->getOperand(3));
23540 case Intrinsic::aarch64_sve_saddwb:
23541 return DAG.getNode(AArch64ISD::SADDWB, SDLoc(N), N->getValueType(0),
23542 N->getOperand(1), N->getOperand(2));
23543 case Intrinsic::aarch64_sve_saddwt:
23544 return DAG.getNode(AArch64ISD::SADDWT, SDLoc(N), N->getValueType(0),
23545 N->getOperand(1), N->getOperand(2));
23546 case Intrinsic::aarch64_sve_uaddwb:
23547 return DAG.getNode(AArch64ISD::UADDWB, SDLoc(N), N->getValueType(0),
23548 N->getOperand(1), N->getOperand(2));
23549 case Intrinsic::aarch64_sve_uaddwt:
23550 return DAG.getNode(AArch64ISD::UADDWT, SDLoc(N), N->getValueType(0),
23551 N->getOperand(1), N->getOperand(2));
23552 case Intrinsic::aarch64_sve_eor_u:
23553 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
23554 N->getOperand(3));
23555 case Intrinsic::aarch64_sve_orr_u:
23556 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
23557 N->getOperand(3));
23558 case Intrinsic::aarch64_sve_sabd_u:
23559 if (SDValue V = convertMergedOpToPredOp(N, ISD::ABDS, DAG, true))
23560 return V;
23561 return DAG.getNode(AArch64ISD::ABDS_PRED, SDLoc(N), N->getValueType(0),
23562 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23563 case Intrinsic::aarch64_sve_uabd_u:
23564 if (SDValue V = convertMergedOpToPredOp(N, ISD::ABDU, DAG, true))
23565 return V;
23566 return DAG.getNode(AArch64ISD::ABDU_PRED, SDLoc(N), N->getValueType(0),
23567 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23568 case Intrinsic::aarch64_sve_sdiv_u:
23569 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
23570 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23571 case Intrinsic::aarch64_sve_udiv_u:
23572 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
23573 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23574 case Intrinsic::aarch64_sve_sqadd:
23575 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
23576 case Intrinsic::aarch64_sve_sqsub_u:
23577 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
23578 N->getOperand(2), N->getOperand(3));
23579 case Intrinsic::aarch64_sve_uqadd:
23580 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
23581 case Intrinsic::aarch64_sve_uqsub_u:
23582 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
23583 N->getOperand(2), N->getOperand(3));
23584 case Intrinsic::aarch64_sve_sqadd_x:
23585 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
23586 N->getOperand(1), N->getOperand(2));
23587 case Intrinsic::aarch64_sve_sqsub_x:
23588 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
23589 N->getOperand(1), N->getOperand(2));
23590 case Intrinsic::aarch64_sve_uqadd_x:
23591 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
23592 N->getOperand(1), N->getOperand(2));
23593 case Intrinsic::aarch64_sve_uqsub_x:
23594 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
23595 N->getOperand(1), N->getOperand(2));
23596 case Intrinsic::aarch64_sve_asrd:
23597 return DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, SDLoc(N), N->getValueType(0),
23598 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23599 case Intrinsic::aarch64_sve_cmphs:
23600 if (!N->getOperand(2).getValueType().isFloatingPoint())
23601 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
23602 N->getValueType(0), N->getOperand(1), N->getOperand(2),
23603 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
23604 break;
23605 case Intrinsic::aarch64_sve_cmphi:
23606 if (!N->getOperand(2).getValueType().isFloatingPoint())
23607 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
23608 N->getValueType(0), N->getOperand(1), N->getOperand(2),
23609 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
23610 break;
23611 case Intrinsic::aarch64_sve_fcmpge:
23612 case Intrinsic::aarch64_sve_cmpge:
23613 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
23614 N->getValueType(0), N->getOperand(1), N->getOperand(2),
23615 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
23616 break;
23617 case Intrinsic::aarch64_sve_fcmpgt:
23618 case Intrinsic::aarch64_sve_cmpgt:
23619 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
23620 N->getValueType(0), N->getOperand(1), N->getOperand(2),
23621 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
23622 break;
23623 case Intrinsic::aarch64_sve_fcmpeq:
23624 case Intrinsic::aarch64_sve_cmpeq:
23625 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
23626 N->getValueType(0), N->getOperand(1), N->getOperand(2),
23627 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
23628 break;
23629 case Intrinsic::aarch64_sve_fcmpne:
23630 case Intrinsic::aarch64_sve_cmpne:
23631 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
23632 N->getValueType(0), N->getOperand(1), N->getOperand(2),
23633 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
23634 break;
23635 case Intrinsic::aarch64_sve_fcmpuo:
23636 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
23637 N->getValueType(0), N->getOperand(1), N->getOperand(2),
23638 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
23639 break;
23640 case Intrinsic::aarch64_sve_fadda:
23641 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
23642 case Intrinsic::aarch64_sve_faddv:
23643 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
23644 case Intrinsic::aarch64_sve_fmaxnmv:
23645 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
23646 case Intrinsic::aarch64_sve_fmaxv:
23647 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
23648 case Intrinsic::aarch64_sve_fminnmv:
23649 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
23650 case Intrinsic::aarch64_sve_fminv:
23651 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
23652 case Intrinsic::aarch64_sve_sel:
23653 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
23654 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23655 case Intrinsic::aarch64_sve_cmpeq_wide:
23656 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
23657 case Intrinsic::aarch64_sve_cmpne_wide:
23658 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
23659 case Intrinsic::aarch64_sve_cmpge_wide:
23660 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
23661 case Intrinsic::aarch64_sve_cmpgt_wide:
23662 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
23663 case Intrinsic::aarch64_sve_cmplt_wide:
23664 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
23665 case Intrinsic::aarch64_sve_cmple_wide:
23666 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
23667 case Intrinsic::aarch64_sve_cmphs_wide:
23668 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
23669 case Intrinsic::aarch64_sve_cmphi_wide:
23670 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
23671 case Intrinsic::aarch64_sve_cmplo_wide:
23672 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
23673 case Intrinsic::aarch64_sve_cmpls_wide:
23674 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
23675 case Intrinsic::aarch64_sve_ptest_any:
23676 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
23678 case Intrinsic::aarch64_sve_ptest_first:
23679 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
23681 case Intrinsic::aarch64_sve_ptest_last:
23682 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
23684 case Intrinsic::aarch64_sve_whilelo:
23685 return DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, SDLoc(N), N->getValueType(0),
23686 N->getOperand(1), N->getOperand(2));
23687 case Intrinsic::aarch64_sve_bsl:
23688 case Intrinsic::aarch64_sve_bsl1n:
23689 case Intrinsic::aarch64_sve_bsl2n:
23690 case Intrinsic::aarch64_sve_nbsl:
23691 return combineSVEBitSel(IID, N, DAG);
23692 }
23693 return SDValue();
23694}
23695
23696static bool isCheapToExtend(const SDValue &N) {
23697 unsigned OC = N->getOpcode();
23698 return OC == ISD::LOAD || OC == ISD::MLOAD ||
23700}
23701
23702static SDValue
23704 SelectionDAG &DAG) {
23705 // If we have (sext (setcc A B)) and A and B are cheap to extend,
23706 // we can move the sext into the arguments and have the same result. For
23707 // example, if A and B are both loads, we can make those extending loads and
23708 // avoid an extra instruction. This pattern appears often in VLS code
23709 // generation where the inputs to the setcc have a different size to the
23710 // instruction that wants to use the result of the setcc.
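  // For example (illustrative): sext(setcc slt, (load A), (load B)) can become
  // setcc slt, (sext (load A)), (sext (load B)), letting the extends fold into
  // the loads as extending loads.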
23711 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
23712 N->getOperand(0)->getOpcode() == ISD::SETCC);
23713 const SDValue SetCC = N->getOperand(0);
23714
23715 const SDValue CCOp0 = SetCC.getOperand(0);
23716 const SDValue CCOp1 = SetCC.getOperand(1);
23717 if (!CCOp0->getValueType(0).isInteger() ||
23718 !CCOp1->getValueType(0).isInteger())
23719 return SDValue();
23720
23721 ISD::CondCode Code =
23722 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
23723
23724 ISD::NodeType ExtType =
23725 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23726
23727 if (isCheapToExtend(SetCC.getOperand(0)) &&
23728 isCheapToExtend(SetCC.getOperand(1))) {
23729 const SDValue Ext1 =
23730 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
23731 const SDValue Ext2 =
23732 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
23733
23734 return DAG.getSetCC(
23735 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
23736 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
23737 }
23738
23739 return SDValue();
23740}
23741
23742// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
23743// This comes from interleaved vectorization. It is performed late to capture
23744// uitofp converts too.
23746 SelectionDAG &DAG) {
23747 EVT VT = N->getValueType(0);
23748 if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
23749 N->getOpcode() != ISD::ZERO_EXTEND ||
23750 N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
23751 return SDValue();
23752
23753 unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1);
23754 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
23755 return SDValue();
23756
23757 EVT InVT = N->getOperand(0).getOperand(0).getValueType();
23758 auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0).getOperand(0));
23759 if (!Shuffle ||
23760 InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 ||
23761 InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
23762 return SDValue();
23763
23764 unsigned Idx;
23765 bool IsDeInterleave = ShuffleVectorInst::isDeInterleaveMaskOfFactor(
23766 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements()), 4, Idx);
23767 // An undef interleave shuffle can come up after other canonicalizations,
23768 // where the shuffle has been converted to
23769 // zext(extract(shuffle b, undef, [u,u,0,4]))
23770 bool IsUndefDeInterleave = false;
23771 if (!IsDeInterleave)
23772 IsUndefDeInterleave =
23773 Shuffle->getOperand(1).isUndef() &&
23774 all_of(
23775 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements() / 2),
23776 [](int M) { return M < 0; }) &&
23777 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
23778 Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2,
23779 VT.getVectorNumElements() / 2),
23780 4, Idx);
23781 if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
23782 return SDValue();
23783 SDLoc DL(N);
23784 SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23785 Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0));
23786 SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23787 Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1));
23788 SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
23789 VT, BC1, BC2);
23790 if ((Idx & 1) == 1)
23791 UZP = DAG.getNode(ISD::SRL, DL, VT, UZP,
23792 DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT));
23793 return DAG.getNode(
23794 ISD::AND, DL, VT, UZP,
23795 DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
23796}
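// Worked example of the combine above for VT = v8i16, InVT = v16i8,
// ExtOffset = 0 and a factor-4 de-interleave with Idx = 0:
//   zext(extract_subvector(shuffle a, b, <0,4,8,12,16,20,24,28,...>), 0)
// becomes
//   and(uzp1(nvcast<v8i16> a, nvcast<v8i16> b), 0x00ff)
// Idx 0/1 select UZP1 and Idx 2/3 select UZP2; an odd Idx inserts the extra
// SRL by the narrow element width before the mask, as emitted above.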
23797
23798 // This comes up, similar to the above, when lowering deinterleaving shuffles
23799 // from zexts. We have legalized the operations in the general case to
23800 // zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
23801 // the extract is to the low half and the uzp is uzp1. There would be an extra
23802 // shift if the uzp was uzp2 to grab the upper half. Due to the combine above
23803 // there could also be an existing and / shift that can be combined in, either
23804 // before or after the extract.
23805 static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
23806 EVT VT = N->getValueType(0);
23807 if (N->getOpcode() != ISD::ZERO_EXTEND ||
23808 (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
23809 return SDValue();
23810
23811 SDValue Op = N->getOperand(0);
23812 unsigned ExtOffset = (unsigned)-1;
23813 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23814 ExtOffset = Op.getConstantOperandVal(1);
23815 Op = Op.getOperand(0);
23816 }
23817
23818 unsigned Shift = 0;
23819 APInt Mask = APInt::getLowBitsSet(VT.getScalarSizeInBits(),
23820 Op.getValueType().getScalarSizeInBits());
23821
23822 if (Op.getOpcode() == AArch64ISD::VLSHR) {
23823 Shift = Op.getConstantOperandVal(1);
23824 Op = Op.getOperand(0);
23825 Mask = Mask.lshr(Shift);
23826 }
23827 if (Op.getOpcode() == ISD::AND &&
23828 ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) {
23829 Op = Op.getOperand(0);
23830 Mask = Mask.zext(VT.getScalarSizeInBits());
23831 } else if (Op.getOpcode() == AArch64ISD::BICi) {
23832 Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
23833 Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2));
23834 Mask = Mask.zext(VT.getScalarSizeInBits());
23835 Op = Op.getOperand(0);
23836 }
23837
23838 if (ExtOffset == (unsigned)-1) {
23839 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23840 ExtOffset = Op.getConstantOperandVal(1);
23841 Op = Op.getOperand(0);
23842 } else
23843 return SDValue();
23844 }
23845 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
23846 return SDValue();
23847
23848 if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
23849 return SDValue();
23850 if (Op.getOpcode() == AArch64ISD::UZP2)
23851 Shift += VT.getScalarSizeInBits() / 2;
23852
23853 SDLoc DL(N);
23854 SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23855 Op.getOperand(ExtOffset == 0 ? 0 : 1));
23856 if (Shift != 0)
23857 BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC,
23858 DAG.getTargetConstant(Shift, DL, MVT::i32));
23859 return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
23860}
23861
23862 static SDValue performExtendCombine(SDNode *N,
23863 TargetLowering::DAGCombinerInfo &DCI,
23864 SelectionDAG &DAG) {
23865 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
23866 // we can convert that DUP into another extract_high (of a bigger DUP), which
23867 // helps the backend to decide that an sabdl2 would be useful, saving a real
23868 // extract_high operation.
23869 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
23870 N->getOperand(0).getValueType().is64BitVector() &&
23871 (N->getOperand(0).getOpcode() == ISD::ABDU ||
23872 N->getOperand(0).getOpcode() == ISD::ABDS)) {
23873 SDNode *ABDNode = N->getOperand(0).getNode();
23874 SDValue NewABD =
23875 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
23876 if (!NewABD.getNode())
23877 return SDValue();
23878
23879 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
23880 }
23881
23882 if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG))
23883 return R;
23884 if (SDValue R = performZExtUZPCombine(N, DAG))
23885 return R;
23886
23887 if (N->getValueType(0).isFixedLengthVector() &&
23888 N->getOpcode() == ISD::SIGN_EXTEND &&
23889 N->getOperand(0)->getOpcode() == ISD::SETCC)
23890 return performSignExtendSetCCCombine(N, DCI, DAG);
23891
23892 // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
23893 // that the top half of the result register must be unused, due to the
23894 // any_extend. This means that we can replace this pattern with (rev16
23895 // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
23896 // ...)), which is what this pattern would otherwise be lowered to.
23897 // Only apply this optimisation if the any_extend in the original pattern
23898 // extends to i32 or i64, because this type will become the input type to
23899 // REV16 in the new pattern, so it must be a legitimate REV16 input type.
23900 SDValue Bswap = N->getOperand(0);
23901 if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
23902 Bswap.getValueType() == MVT::i16 &&
23903 (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
23904 SDLoc DL(N);
23905 SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
23906 Bswap->getOperand(0));
23907 return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
23908 NewAnyExtend);
23909 }
23910
23911 return SDValue();
23912}
23913
23914 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
23915 SDValue SplatVal, unsigned NumVecElts) {
23916 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
23917 Align OrigAlignment = St.getAlign();
23918 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
23919
23920 // Create scalar stores. This is at least as good as the code sequence for a
23921 // split unaligned store which is a dup.s, ext.b, and two stores.
23922 // Most of the time the three stores should be replaced by store pair
23923 // instructions (stp).
23924 SDLoc DL(&St);
23925 SDValue BasePtr = St.getBasePtr();
23926 uint64_t BaseOffset = 0;
23927
23928 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
23929 SDValue NewST1 =
23930 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
23931 OrigAlignment, St.getMemOperand()->getFlags());
23932
23933 // As this is in ISel, we will not merge this add, which may degrade results.
23934 if (BasePtr->getOpcode() == ISD::ADD &&
23935 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
23936 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
23937 BasePtr = BasePtr->getOperand(0);
23938 }
23939
23940 unsigned Offset = EltOffset;
23941 while (--NumVecElts) {
23942 Align Alignment = commonAlignment(OrigAlignment, Offset);
23943 SDValue OffsetPtr =
23944 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
23945 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
23946 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
23947 PtrInfo.getWithOffset(Offset), Alignment,
23948 St.getMemOperand()->getFlags());
23949 Offset += EltOffset;
23950 }
23951 return NewST1;
23952}
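// For example, splatting X (i64) into a v2i64 store at p produces
//   str X, [p]
//   str X, [p, #8]
// which the load/store optimizer is expected to merge into "stp X, X, [p]".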
23953
23954// Returns an SVE type that ContentTy can be trivially sign or zero extended
23955// into.
23956static MVT getSVEContainerType(EVT ContentTy) {
23957 assert(ContentTy.isSimple() && "No SVE containers for extended types");
23958
23959 switch (ContentTy.getSimpleVT().SimpleTy) {
23960 default:
23961 llvm_unreachable("No known SVE container for this MVT type");
23962 case MVT::nxv2i8:
23963 case MVT::nxv2i16:
23964 case MVT::nxv2i32:
23965 case MVT::nxv2i64:
23966 case MVT::nxv2f32:
23967 case MVT::nxv2f64:
23968 return MVT::nxv2i64;
23969 case MVT::nxv4i8:
23970 case MVT::nxv4i16:
23971 case MVT::nxv4i32:
23972 case MVT::nxv4f32:
23973 return MVT::nxv4i32;
23974 case MVT::nxv8i8:
23975 case MVT::nxv8i16:
23976 case MVT::nxv8f16:
23977 case MVT::nxv8bf16:
23978 return MVT::nxv8i16;
23979 case MVT::nxv16i8:
23980 return MVT::nxv16i8;
23981 }
23982}
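// Example mappings: nxv2i16 -> nxv2i64, nxv4i16 -> nxv4i32, nxv8i8 -> nxv8i16.
// The container keeps the element count and widens each lane to the packed
// SVE element size for that count.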
23983
23984 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
23985 SDLoc DL(N);
23986 EVT VT = N->getValueType(0);
23987
23989 return SDValue();
23990
23991 EVT ContainerVT = VT;
23992 if (ContainerVT.isInteger())
23993 ContainerVT = getSVEContainerType(ContainerVT);
23994
23995 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
23996 SDValue Ops[] = { N->getOperand(0), // Chain
23997 N->getOperand(2), // Pg
23998 N->getOperand(3), // Base
23999 DAG.getValueType(VT) };
24000
24001 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
24002 SDValue LoadChain = SDValue(Load.getNode(), 1);
24003
24004 if (ContainerVT.isInteger() && (VT != ContainerVT))
24005 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
24006
24007 return DAG.getMergeValues({ Load, LoadChain }, DL);
24008}
24009
24010 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
24011 SDLoc DL(N);
24012 EVT VT = N->getValueType(0);
24013 EVT PtrTy = N->getOperand(3).getValueType();
24014
24015 EVT LoadVT = VT;
24016 if (VT.isFloatingPoint())
24017 LoadVT = VT.changeTypeToInteger();
24018
24019 auto *MINode = cast<MemIntrinsicSDNode>(N);
24020 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
24021 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
24022 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
24023 MINode->getOperand(2), PassThru,
24024 MINode->getMemoryVT(), MINode->getMemOperand(),
24025 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
24026
24027 if (VT.isFloatingPoint()) {
24028 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
24029 return DAG.getMergeValues(Ops, DL);
24030 }
24031
24032 return L;
24033}
24034
24035template <unsigned Opcode>
24036 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
24037 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
24038 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
24039 "Unsupported opcode.");
24040 SDLoc DL(N);
24041 EVT VT = N->getValueType(0);
24042
24043 EVT LoadVT = VT;
24044 if (VT.isFloatingPoint())
24045 LoadVT = VT.changeTypeToInteger();
24046
24047 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
24048 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
24049 SDValue LoadChain = SDValue(Load.getNode(), 1);
24050
24051 if (VT.isFloatingPoint())
24052 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
24053
24054 return DAG.getMergeValues({Load, LoadChain}, DL);
24055}
24056
24057 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
24058 SDLoc DL(N);
24059 SDValue Data = N->getOperand(2);
24060 EVT DataVT = Data.getValueType();
24061 EVT HwSrcVt = getSVEContainerType(DataVT);
24062 SDValue InputVT = DAG.getValueType(DataVT);
24063
24064 if (DataVT.isFloatingPoint())
24065 InputVT = DAG.getValueType(HwSrcVt);
24066
24067 SDValue SrcNew;
24068 if (Data.getValueType().isFloatingPoint())
24069 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
24070 else
24071 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
24072
24073 SDValue Ops[] = { N->getOperand(0), // Chain
24074 SrcNew,
24075 N->getOperand(4), // Base
24076 N->getOperand(3), // Pg
24077 InputVT
24078 };
24079
24080 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
24081}
24082
24083 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
24084 SDLoc DL(N);
24085
24086 SDValue Data = N->getOperand(2);
24087 EVT DataVT = Data.getValueType();
24088 EVT PtrTy = N->getOperand(4).getValueType();
24089
24090 if (DataVT.isFloatingPoint())
24091 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
24092
24093 auto *MINode = cast<MemIntrinsicSDNode>(N);
24094 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
24095 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
24096 MINode->getMemoryVT(), MINode->getMemOperand(),
24097 ISD::UNINDEXED, false, false);
24098}
24099
24100/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
24101/// load store optimizer pass will merge them to store pair stores. This should
24102/// be better than a movi to create the vector zero followed by a vector store
24103 /// if the zero constant is not re-used, since one instruction and one register
24104/// live range will be removed.
24105///
24106/// For example, the final generated code should be:
24107///
24108/// stp xzr, xzr, [x0]
24109///
24110/// instead of:
24111///
24112/// movi v0.2d, #0
24113/// str q0, [x0]
24114///
24115 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
24116 SDValue StVal = St.getValue();
24117 EVT VT = StVal.getValueType();
24118
24119 // Avoid scalarizing zero splat stores for scalable vectors.
24120 if (VT.isScalableVector())
24121 return SDValue();
24122
24123 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
24124 // 2, 3 or 4 i32 elements.
24125 int NumVecElts = VT.getVectorNumElements();
24126 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
24127 VT.getVectorElementType().getSizeInBits() == 64) ||
24128 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
24129 VT.getVectorElementType().getSizeInBits() == 32)))
24130 return SDValue();
24131
24132 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
24133 return SDValue();
24134
24135 // If the zero constant has more than one use then the vector store could be
24136 // better since the constant mov will be amortized and stp q instructions
24137 // should be able to be formed.
24138 if (!StVal.hasOneUse())
24139 return SDValue();
24140
24141 // If the store is truncating then it's going down to i16 or smaller, which
24142 // means it can be implemented in a single store anyway.
24143 if (St.isTruncatingStore())
24144 return SDValue();
24145
24146 // If the immediate offset of the address operand is too large for the stp
24147 // instruction, then bail out.
24148 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
24149 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
24150 if (Offset < -512 || Offset > 504)
24151 return SDValue();
24152 }
24153
24154 for (int I = 0; I < NumVecElts; ++I) {
24155 SDValue EltVal = StVal.getOperand(I);
24156 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
24157 return SDValue();
24158 }
24159
24160 // Use a CopyFromReg WZR/XZR here to prevent
24161 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
24162 SDLoc DL(&St);
24163 unsigned ZeroReg;
24164 EVT ZeroVT;
24165 if (VT.getVectorElementType().getSizeInBits() == 32) {
24166 ZeroReg = AArch64::WZR;
24167 ZeroVT = MVT::i32;
24168 } else {
24169 ZeroReg = AArch64::XZR;
24170 ZeroVT = MVT::i64;
24171 }
24172 SDValue SplatVal =
24173 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
24174 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
24175}
24176
24177/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
24178/// value. The load store optimizer pass will merge them to store pair stores.
24179/// This has better performance than a splat of the scalar followed by a split
24180/// vector store. Even if the stores are not merged it is four stores vs a dup,
24181/// followed by an ext.b and two stores.
24182 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
24183 SDValue StVal = St.getValue();
24184 EVT VT = StVal.getValueType();
24185
24186 // Don't replace floating point stores, they possibly won't be transformed to
24187 // stp because of the store pair suppress pass.
24188 if (VT.isFloatingPoint())
24189 return SDValue();
24190
24191 // We can express a splat as store pair(s) for 2 or 4 elements.
24192 unsigned NumVecElts = VT.getVectorNumElements();
24193 if (NumVecElts != 4 && NumVecElts != 2)
24194 return SDValue();
24195
24196 // If the store is truncating then it's going down to i16 or smaller, which
24197 // means it can be implemented in a single store anyway.
24198 if (St.isTruncatingStore())
24199 return SDValue();
24200
24201 // Check that this is a splat.
24202 // Make sure that each of the relevant vector element locations are inserted
24203 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
24204 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
24205 SDValue SplatVal;
24206 for (unsigned I = 0; I < NumVecElts; ++I) {
24207 // Check for insert vector elements.
24208 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
24209 return SDValue();
24210
24211 // Check that same value is inserted at each vector element.
24212 if (I == 0)
24213 SplatVal = StVal.getOperand(1);
24214 else if (StVal.getOperand(1) != SplatVal)
24215 return SDValue();
24216
24217 // Check insert element index.
24218 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
24219 if (!CIndex)
24220 return SDValue();
24221 uint64_t IndexVal = CIndex->getZExtValue();
24222 if (IndexVal >= NumVecElts)
24223 return SDValue();
24224 IndexNotInserted.reset(IndexVal);
24225
24226 StVal = StVal.getOperand(0);
24227 }
24228 // Check that all vector element locations were inserted to.
24229 if (IndexNotInserted.any())
24230 return SDValue();
24231
24232 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
24233}
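// Illustrative outcome for a v4i32 splat of w1 stored to [x0]: four scalar
// stores at offsets 0, 4, 8 and 12, which typically merge into
//   stp w1, w1, [x0]
//   stp w1, w1, [x0, #8]
// instead of "dup v0.4s, w1" followed by "str q0, [x0]".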
24234
24235 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24236 SelectionDAG &DAG,
24237 const AArch64Subtarget *Subtarget) {
24238
24239 StoreSDNode *S = cast<StoreSDNode>(N);
24240 if (S->isVolatile() || S->isIndexed())
24241 return SDValue();
24242
24243 SDValue StVal = S->getValue();
24244 EVT VT = StVal.getValueType();
24245
24246 if (!VT.isFixedLengthVector())
24247 return SDValue();
24248
24249 // If we get a splat of zeros, convert this vector store to a store of
24250 // scalars. They will be merged into store pairs of xzr thereby removing one
24251 // instruction and one register.
24252 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
24253 return ReplacedZeroSplat;
24254
24255 // FIXME: The logic for deciding if an unaligned store should be split should
24256 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
24257 // a call to that function here.
24258
24259 if (!Subtarget->isMisaligned128StoreSlow())
24260 return SDValue();
24261
24262 // Don't split at -Oz.
24263 if (DAG.getMachineFunction().getFunction().hasMinSize())
24264 return SDValue();
24265
24266 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
24267 // those up regresses performance on micro-benchmarks and olden/bh.
24268 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
24269 return SDValue();
24270
24271 // Split unaligned 16B stores. They are terrible for performance.
24272 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
24273 // extensions can use this to mark that it does not want splitting to happen
24274 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
24275 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
24276 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
24277 S->getAlign() <= Align(2))
24278 return SDValue();
24279
24280 // If we get a splat of a scalar convert this vector store to a store of
24281 // scalars. They will be merged into store pairs thereby removing two
24282 // instructions.
24283 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
24284 return ReplacedSplat;
24285
24286 SDLoc DL(S);
24287
24288 // Split VT into two.
24289 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
24290 unsigned NumElts = HalfVT.getVectorNumElements();
24291 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
24292 DAG.getConstant(0, DL, MVT::i64));
24293 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
24294 DAG.getConstant(NumElts, DL, MVT::i64));
24295 SDValue BasePtr = S->getBasePtr();
24296 SDValue NewST1 =
24297 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
24298 S->getAlign(), S->getMemOperand()->getFlags());
24299 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
24300 DAG.getConstant(8, DL, MVT::i64));
24301 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
24302 S->getPointerInfo(), S->getAlign(),
24303 S->getMemOperand()->getFlags());
24304}
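// Sketch of the split performed above for an unaligned 128-bit store of
// v4i32 %v to %p on subtargets where misaligned 128-bit stores are slow:
//   store<(align 4)> %v, %p
// becomes two 64-bit half stores
//   store (extract_subvector %v, 0), %p
//   store (extract_subvector %v, 2), %p + 8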
24305
24306 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
24307 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
24308
24309 // splice(pg, op1, undef) -> op1
24310 if (N->getOperand(2).isUndef())
24311 return N->getOperand(1);
24312
24313 return SDValue();
24314}
24315
24316 static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
24317 const AArch64Subtarget *Subtarget) {
24318 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
24319 N->getOpcode() == AArch64ISD::UUNPKLO) &&
24320 "Unexpected Opcode!");
24321
24322 // uunpklo/hi undef -> undef
24323 if (N->getOperand(0).isUndef())
24324 return DAG.getUNDEF(N->getValueType(0));
24325
24326 // If this is a masked load followed by an UUNPKLO, fold this into a masked
24327 // extending load. We can do this even if this is already a masked
24328 // {z,}extload.
24329 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
24330 N->getOpcode() == AArch64ISD::UUNPKLO) {
24331 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
24332 SDValue Mask = MLD->getMask();
24333 SDLoc DL(N);
24334
24335 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
24336 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
24337 (MLD->getPassThru()->isUndef() ||
24338 isZerosVector(MLD->getPassThru().getNode()))) {
24339 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
24340 unsigned PgPattern = Mask->getConstantOperandVal(0);
24341 EVT VT = N->getValueType(0);
24342
24343 // Ensure we can double the size of the predicate pattern
24344 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
24345 if (NumElts &&
24346 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
24347 Mask = getPTrue(DAG, DL,
24348 VT.changeVectorElementType(*DAG.getContext(), MVT::i1),
24349 PgPattern);
24350 SDValue PassThru = DAG.getConstant(0, DL, VT);
24351 SDValue NewLoad = DAG.getMaskedLoad(
24352 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
24353 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
24354 MLD->getAddressingMode(), ISD::ZEXTLOAD);
24355
24356 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
24357
24358 return NewLoad;
24359 }
24360 }
24361 }
24362
24363 return SDValue();
24364}
24365
24366 static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
24367 if (N->getOpcode() != AArch64ISD::UZP1)
24368 return false;
24369 SDValue Op0 = N->getOperand(0);
24370 EVT SrcVT = Op0->getValueType(0);
24371 EVT DstVT = N->getValueType(0);
24372 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
24373 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
24374 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
24375}
24376
24377// Try to combine rounding shifts where the operands come from an extend, and
24378// the result is truncated and combined into one vector.
24379// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
24380 static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
24381 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
24382 SDValue Op0 = N->getOperand(0);
24383 SDValue Op1 = N->getOperand(1);
24384 EVT ResVT = N->getValueType(0);
24385
24386 unsigned RshOpc = Op0.getOpcode();
24387 if (RshOpc != AArch64ISD::RSHRNB_I)
24388 return SDValue();
24389
24390 // Same op code and imm value?
24391 SDValue ShiftValue = Op0.getOperand(1);
24392 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
24393 return SDValue();
24394
24395 // Same unextended operand value?
24396 SDValue Lo = Op0.getOperand(0);
24397 SDValue Hi = Op1.getOperand(0);
24398 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
24399 Hi.getOpcode() != AArch64ISD::UUNPKHI)
24400 return SDValue();
24401 SDValue OrigArg = Lo.getOperand(0);
24402 if (OrigArg != Hi.getOperand(0))
24403 return SDValue();
24404
24405 SDLoc DL(N);
24406 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
24407 getPredicateForVector(DAG, DL, ResVT), OrigArg,
24408 ShiftValue);
24409}
24410
24411// Try to simplify:
24412// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
24413// t2 = nxv8i16 srl(t1, ShiftValue)
24414// to
24415// t1 = nxv8i16 rshrnb(X, shiftvalue).
24416// rshrnb will zero the top half bits of each element. Therefore, this combine
24417// should only be performed when a following instruction with the rshrnb
24418// as an operand does not care about the top half of each element. For example,
24419// a uzp1 or a truncating store.
24420 static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
24421 const AArch64Subtarget *Subtarget) {
24422 EVT VT = Srl->getValueType(0);
24423 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
24424 return SDValue();
24425
24426 EVT ResVT;
24427 if (VT == MVT::nxv8i16)
24428 ResVT = MVT::nxv16i8;
24429 else if (VT == MVT::nxv4i32)
24430 ResVT = MVT::nxv8i16;
24431 else if (VT == MVT::nxv2i64)
24432 ResVT = MVT::nxv4i32;
24433 else
24434 return SDValue();
24435
24436 SDLoc DL(Srl);
24437 unsigned ShiftValue;
24438 SDValue RShOperand;
24439 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
24440 return SDValue();
24441 SDValue Rshrnb = DAG.getNode(
24442 AArch64ISD::RSHRNB_I, DL, ResVT,
24443 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
24444 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Rshrnb);
24445}
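// Worked example, assuming nxv8i16 X and ShiftValue == 4:
//   srl(add(X, 8), 4)        ; 8 == 1 << (4 - 1), i.e. a rounding shift
// becomes
//   nvcast<nxv8i16>(rshrnb(X, 4))
// RSHRNB narrows into the low half of each element and zeroes the high half,
// which is fine as long as the user (a uzp1 or truncating store) ignores it.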
24446
24447 static SDValue isNVCastToHalfWidthElements(SDValue V) {
24448 if (V.getOpcode() != AArch64ISD::NVCAST)
24449 return SDValue();
24450
24451 SDValue Op = V.getOperand(0);
24452 if (!Op.getValueType().isVector() ||
24453 V.getValueType().getVectorElementCount() !=
24454 Op.getValueType().getVectorElementCount() * 2)
24455 return SDValue();
24456
24457 return Op;
24458}
24459
24460 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
24461 const AArch64Subtarget *Subtarget) {
24462 SDLoc DL(N);
24463 SDValue Op0 = N->getOperand(0);
24464 SDValue Op1 = N->getOperand(1);
24465 EVT ResVT = N->getValueType(0);
24466
24467 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
24468 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
24469 Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
24470 Op0.getOperand(0) == Op1.getOperand(0)) {
24471
24472 SDValue SourceVec = Op0.getOperand(0);
24473 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
24474 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
24475 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
24476 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
24477 EVT OpVT = Op0.getOperand(1).getValueType();
24478 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
24479 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
24480 DAG.getUNDEF(WidenedResVT));
24481 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
24482 DAG.getConstant(0, DL, OpVT));
24483 }
24484 }
24485
24486 // Following optimizations only work with uzp1.
24487 if (N->getOpcode() == AArch64ISD::UZP2)
24488 return SDValue();
24489
24490 // uzp1(x, undef) -> concat(truncate(x), undef)
24491 if (Op1.isUndef()) {
24492 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
24493 switch (ResVT.getSimpleVT().SimpleTy) {
24494 default:
24495 break;
24496 case MVT::v16i8:
24497 BCVT = MVT::v8i16;
24498 HalfVT = MVT::v8i8;
24499 break;
24500 case MVT::v8i16:
24501 BCVT = MVT::v4i32;
24502 HalfVT = MVT::v4i16;
24503 break;
24504 case MVT::v4i32:
24505 BCVT = MVT::v2i64;
24506 HalfVT = MVT::v2i32;
24507 break;
24508 }
24509 if (BCVT != MVT::Other) {
24510 SDValue BC = DAG.getBitcast(BCVT, Op0);
24511 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
24512 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
24513 DAG.getUNDEF(HalfVT));
24514 }
24515 }
24516
24517 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
24518 return Urshr;
24519
24520 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
24521 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
24522 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
24523 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
24524 }
24525 }
24526
24527 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
24528 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
24529 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
24530 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
24531 }
24532 }
24533
24534 // uzp1<ty>(nvcast(unpklo(uzp1<ty>(x, y))), z) => uzp1<ty>(x, z)
24535 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
24536 if (PreCast.getOpcode() == AArch64ISD::UUNPKLO) {
24537 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
24538 SDValue X = PreCast.getOperand(0).getOperand(0);
24539 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
24540 }
24541 }
24542 }
24543
24544 // uzp1<ty>(x, nvcast(unpkhi(uzp1<ty>(y, z)))) => uzp1<ty>(x, z)
24545 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
24546 if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) {
24547 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
24548 SDValue Z = PreCast.getOperand(0).getOperand(1);
24549 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
24550 }
24551 }
24552 }
24553
24554 // These optimizations only work on little endian.
24555 if (!DAG.getDataLayout().isLittleEndian())
24556 return SDValue();
24557
24558 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
24559 // Example:
24560 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
24561 // to
24562 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
24563 if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) &&
24564 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
24565 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
24566 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
24567 Op1.getOperand(0));
24568 }
24569 }
24570
24571 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
24572 return SDValue();
24573
24574 SDValue SourceOp0 = peekThroughBitcasts(Op0);
24575 SDValue SourceOp1 = peekThroughBitcasts(Op1);
24576
24577 // truncating uzp1(x, y) -> xtn(concat (x, y))
24578 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
24579 EVT Op0Ty = SourceOp0.getValueType();
24580 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
24581 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
24582 SDValue Concat =
24583 DAG.getNode(ISD::CONCAT_VECTORS, DL,
24584 Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
24585 SourceOp0, SourceOp1);
24586 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
24587 }
24588 }
24589
24590 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
24591 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
24592 SourceOp1.getOpcode() != ISD::TRUNCATE)
24593 return SDValue();
24594 SourceOp0 = SourceOp0.getOperand(0);
24595 SourceOp1 = SourceOp1.getOperand(0);
24596
24597 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
24598 !SourceOp0.getValueType().isSimple())
24599 return SDValue();
24600
24601 EVT ResultTy;
24602
24603 switch (SourceOp0.getSimpleValueType().SimpleTy) {
24604 case MVT::v2i64:
24605 ResultTy = MVT::v4i32;
24606 break;
24607 case MVT::v4i32:
24608 ResultTy = MVT::v8i16;
24609 break;
24610 case MVT::v8i16:
24611 ResultTy = MVT::v16i8;
24612 break;
24613 default:
24614 return SDValue();
24615 }
24616
24617 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
24618 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
24619 SDValue UzpResult =
24620 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
24621
24622 EVT BitcastResultTy;
24623
24624 switch (ResVT.getSimpleVT().SimpleTy) {
24625 case MVT::v2i32:
24626 BitcastResultTy = MVT::v2i64;
24627 break;
24628 case MVT::v4i16:
24629 BitcastResultTy = MVT::v4i32;
24630 break;
24631 case MVT::v8i8:
24632 BitcastResultTy = MVT::v8i16;
24633 break;
24634 default:
24635 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
24636 }
24637
24638 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
24639 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
24640}
24641
24642 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
24643 unsigned Opc = N->getOpcode();
24644
24645 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
24646 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
24647 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
24648 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
24649 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
24650 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
24651 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
24652 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
24653
24654 SDLoc DL(N);
24655 SDValue Chain = N->getOperand(0);
24656 SDValue Pg = N->getOperand(1);
24657 SDValue Base = N->getOperand(2);
24658 SDValue Offset = N->getOperand(3);
24659 SDValue Ty = N->getOperand(4);
24660
24661 EVT ResVT = N->getValueType(0);
24662
24663 const auto OffsetOpc = Offset.getOpcode();
24664 const bool OffsetIsZExt =
24665 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
24666 const bool OffsetIsSExt =
24667 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
24668
24669 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
24670 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
24671 SDValue ExtPg = Offset.getOperand(0);
24672 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
24673 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
24674
24675 // If the predicate for the sign- or zero-extended offset is the
24676 // same as the predicate used for this load and the sign-/zero-extension
24677 // was from a 32-bits...
24678 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
24679 SDValue UnextendedOffset = Offset.getOperand(1);
24680
24681 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
24682 if (Signed)
24683 NewOpc = getSignExtendedGatherOpcode(NewOpc);
24684
24685 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
24686 {Chain, Pg, Base, UnextendedOffset, Ty});
24687 }
24688 }
24689
24690 return SDValue();
24691}
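// Sketch of the fold above: when the gather's 64-bit offset vector is itself
// a predicated sign/zero extension from 32 bits under the same predicate,
// e.g.
//   GLD1_MERGE_ZERO(chain, pg, base, sext_inreg_merge_passthru(pg, offs), ty)
// the explicit extend is dropped and the gather switches to its SXTW (or
// UXTW) addressing variant, which extends the 32-bit offsets itself.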
24692
24693/// Optimize a vector shift instruction and its operand if shifted out
24694/// bits are not used.
24695 static SDValue performVectorShiftCombine(SDNode *N,
24696 const AArch64TargetLowering &TLI,
24697 TargetLowering::DAGCombinerInfo &DCI) {
24698 assert(N->getOpcode() == AArch64ISD::VASHR ||
24699 N->getOpcode() == AArch64ISD::VLSHR);
24700
24701 SDValue Op = N->getOperand(0);
24702 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
24703
24704 unsigned ShiftImm = N->getConstantOperandVal(1);
24705 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
24706
24707 // Remove sign_extend_inreg (ashr(shl(x)) based on the number of sign bits.
24708 if (N->getOpcode() == AArch64ISD::VASHR &&
24709 Op.getOpcode() == AArch64ISD::VSHL &&
24710 N->getOperand(1) == Op.getOperand(1))
24711 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
24712 return Op.getOperand(0);
24713
24714 // If the shift is exact, the shifted out bits matter.
24715 if (N->getFlags().hasExact())
24716 return SDValue();
24717
24718 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
24719 APInt DemandedMask = ~ShiftedOutBits;
24720
24721 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
24722 return SDValue(N, 0);
24723
24724 return SDValue();
24725}
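// Two concrete cases handled above, for illustration:
//  * VASHR(VSHL(x, 24), 24) folds to x when x already has more than 24 known
//    sign bits per lane (the pair was only an in-register sign extension).
//  * For VLSHR(y, 8) only bits 8 and above of y are demanded, so
//    SimplifyDemandedBits may simplify the node computing y.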
24726
24727 static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
24728 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
24729 // This transform works in partnership with performSetCCPunpkCombine to
24730 // remove unnecessary transfer of predicates into standard registers and back
24731 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
24732 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
24733 MVT::i1) {
24734 SDValue CC = N->getOperand(0)->getOperand(0);
24735 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
24736 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
24737 DAG.getVectorIdxConstant(0, SDLoc(N)));
24738 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
24739 }
24740
24741 return SDValue();
24742}
24743
24744/// Target-specific DAG combine function for post-increment LD1 (lane) and
24745/// post-increment LD1R.
24746 static SDValue performPostLD1Combine(SDNode *N,
24747 TargetLowering::DAGCombinerInfo &DCI,
24748 bool IsLaneOp) {
24749 if (DCI.isBeforeLegalizeOps())
24750 return SDValue();
24751
24752 SelectionDAG &DAG = DCI.DAG;
24753 EVT VT = N->getValueType(0);
24754
24755 if (!VT.is128BitVector() && !VT.is64BitVector())
24756 return SDValue();
24757
24758 // If it is not LOAD, can not do such combine.
24759 unsigned LoadIdx = IsLaneOp ? 1 : 0;
24760 LoadSDNode *LD = dyn_cast<LoadSDNode>(N->getOperand(LoadIdx).getNode());
24761 if (!LD)
24762 return SDValue();
24763
24764 // If the Generic combiner already helped form a pre- or post-indexed load,
24765 // skip forming one here.
24766 if (LD->isIndexed())
24767 return SDValue();
24768
24769 // The vector lane must be a constant in the LD1LANE opcode.
24770 SDValue Lane;
24771 if (IsLaneOp) {
24772 Lane = N->getOperand(2);
24773 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
24774 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
24775 return SDValue();
24776 if (LaneC->getZExtValue() == 0 && isNullOrNullSplat(N->getOperand(0)))
24777 return SDValue();
24778 }
24779
24780 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
24781 EVT MemVT = LoadSDN->getMemoryVT();
24782 // Check if memory operand is the same type as the vector element.
24783 if (MemVT != VT.getVectorElementType())
24784 return SDValue();
24785
24786 // Check if there are other uses. If so, do not combine as it will introduce
24787 // an extra load.
24788 for (SDUse &U : LD->uses()) {
24789 if (U.getResNo() == 1) // Ignore uses of the chain result.
24790 continue;
24791 if (U.getUser() != N)
24792 return SDValue();
24793 }
24794
24795 // If there is one use and it can splat the value, prefer that operation.
24796 // TODO: This could be expanded to more operations if they reliably use the
24797 // index variants.
24798 if (N->hasOneUse()) {
24799 unsigned UseOpc = N->user_begin()->getOpcode();
24800 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
24801 return SDValue();
24802 }
24803
24804 SDValue Addr = LD->getOperand(1);
24805 SDValue Vector = N->getOperand(0);
24806 // Search for a use of the address operand that is an increment.
24807 for (SDUse &Use : Addr->uses()) {
24808 SDNode *User = Use.getUser();
24809 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
24810 continue;
24811
24812 // If the increment is a constant, it must match the memory ref size.
24813 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
24814 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
24815 uint32_t IncVal = CInc->getZExtValue();
24816 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
24817 if (IncVal != NumBytes)
24818 continue;
24819 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
24820 }
24821
24822 // To avoid cycle construction make sure that neither the load nor the add
24823 // are predecessors to each other or the Vector.
24824 SmallPtrSet<const SDNode *, 32> Visited;
24825 SmallVector<const SDNode *, 16> Worklist;
24826 Visited.insert(Addr.getNode());
24827 Worklist.push_back(User);
24828 Worklist.push_back(LD);
24829 Worklist.push_back(Vector.getNode());
24830 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
24831 SDNode::hasPredecessorHelper(User, Visited, Worklist))
24832 continue;
24833
24834 SmallVector<SDValue, 8> Ops;
24835 Ops.push_back(LD->getOperand(0)); // Chain
24836 if (IsLaneOp) {
24837 Ops.push_back(Vector); // The vector to be inserted
24838 Ops.push_back(Lane); // The lane to be inserted in the vector
24839 }
24840 Ops.push_back(Addr);
24841 Ops.push_back(Inc);
24842
24843 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
24844 SDVTList SDTys = DAG.getVTList(Tys);
24845 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
24846 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
24847 MemVT,
24848 LoadSDN->getMemOperand());
24849
24850 // Update the uses.
24851 SDValue NewResults[] = {
24852 SDValue(LD, 0), // The result of load
24853 SDValue(UpdN.getNode(), 2) // Chain
24854 };
24855 DCI.CombineTo(LD, NewResults);
24856 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
24857 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
24858
24859 break;
24860 }
24861 return SDValue();
24862}
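// Illustrative match for the non-lane form: a load that is duplicated into a
// vector while its address is also advanced by the element size, e.g.
//   %l = load i32, %p ; %v = dup %l ; %p.next = add %p, 4
// becomes LD1DUPpost ("ld1r { v0.4s }, [x0], #4") with the incremented
// address produced as the write-back result.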
24863
24864/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
24865/// address translation.
24866static bool performTBISimplification(SDValue Addr,
24867 TargetLowering::DAGCombinerInfo &DCI,
24868 SelectionDAG &DAG) {
24869 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
24870 // If MTE is enabled, TBI only applies to the top 4 bits.
24871 // Both arm64 and arm64e processes on Darwin may run with MTE enabled.
24872 unsigned NumIgnoreBits =
24873 Subtarget.hasMTE() || Subtarget.isTargetDarwin() ? 4 : 8;
24874 APInt DemandedMask = APInt::getLowBitsSet(64, 64 - NumIgnoreBits);
24875 KnownBits Known;
24876 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
24877 !DCI.isBeforeLegalizeOps());
24878 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24879 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
24880 DCI.CommitTargetLoweringOpt(TLO);
24881 return true;
24882 }
24883 return false;
24884}
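// Example: with TBI the hardware ignores address bits [63:56] during
// translation (only [63:60] when MTE may be in use), so DemandedMask covers
// just the low 56 (or 60) bits. SimplifyDemandedBits can then remove address
// computation that only affects the ignored top bits, such as an OR with a
// constant whose set bits all lie in that range.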
24885
24886static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
24887 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
24888 "Expected STORE dag node in input!");
24889
24890 if (auto Store = dyn_cast<StoreSDNode>(N)) {
24891 if (!Store->isTruncatingStore() || Store->isIndexed())
24892 return SDValue();
24893 SDValue Ext = Store->getValue();
24894 auto ExtOpCode = Ext.getOpcode();
24895 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
24896 ExtOpCode != ISD::ANY_EXTEND)
24897 return SDValue();
24898 SDValue Orig = Ext->getOperand(0);
24899 if (Store->getMemoryVT() != Orig.getValueType())
24900 return SDValue();
24901 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
24902 Store->getBasePtr(), Store->getMemOperand());
24903 }
24904
24905 return SDValue();
24906}
24907
24908// A custom combine to lower load <3 x i8> as the more efficient sequence
24909// below:
24910// ldrb wX, [x0, #2]
24911// ldrh wY, [x0]
24912// orr wX, wY, wX, lsl #16
24913// fmov s0, wX
24914//
24915// Note that an alternative sequence with even fewer (although usually more
24916// complex/expensive) instructions would be:
24917// ld1r.4h { v0 }, [x0], #2
24918// ld1.b { v0 }[2], [x0]
24919//
24920// Generating this sequence unfortunately results in noticeably worse codegen
24921// for code that extends the loaded v3i8, due to legalization breaking vector
24922// shuffle detection in a way that is very difficult to work around.
24923// TODO: Revisit once v3i8 legalization has been improved in general.
24924static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
24925 EVT MemVT = LD->getMemoryVT();
24926 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
24927 LD->getBaseAlign() >= 4)
24928 return SDValue();
24929
24930 SDLoc DL(LD);
24931 MachineFunction &MF = DAG.getMachineFunction();
24932 SDValue Chain = LD->getChain();
24933 SDValue BasePtr = LD->getBasePtr();
24934 MachineMemOperand *MMO = LD->getMemOperand();
24935 assert(LD->getOffset().isUndef() && "undef offset expected");
24936
24937 // Load 2 x i8, then 1 x i8.
24938 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
24939 TypeSize Offset2 = TypeSize::getFixed(2);
24940 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
24941 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
24942 MF.getMachineMemOperand(MMO, 2, 1));
24943
24944 // Extend to i32.
24945 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
24946 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
24947
24948 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
24949 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
24950 DAG.getConstant(16, DL, MVT::i32));
24951 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
24952 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
24953
24954 // Extract v3i8 again.
24955 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
24956 DAG.getConstant(0, DL, MVT::i64));
24957 SDValue TokenFactor = DAG.getNode(
24958 ISD::TokenFactor, DL, MVT::Other,
24959 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
24960 return DAG.getMergeValues({Extract, TokenFactor}, DL);
24961}
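// Byte-level view of the packing above (little endian), for memory bytes
// {b0, b1, b2} at p:
//   L16 = b1:b0 (i16 load at p), L8 = b2 (i8 load at p+2)
//   Or  = (b2 << 16) | (b1 << 8) | b0, bitcast to v4i8 <b0, b1, b2, 0>
// and the v3i8 result is the low three lanes of that v4i8.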
24962
24963// Perform TBI simplification if supported by the target and try to break up
24964 // nontemporal loads larger than 256 bits for odd types so LDNPQ 256-bit
24965// load instructions can be selected.
24966static SDValue performLOADCombine(SDNode *N,
24967 TargetLowering::DAGCombinerInfo &DCI,
24968 SelectionDAG &DAG,
24969 const AArch64Subtarget *Subtarget) {
24970 if (Subtarget->supportsAddressTopByteIgnored())
24971 performTBISimplification(N->getOperand(1), DCI, DAG);
24972
24973 LoadSDNode *LD = cast<LoadSDNode>(N);
24974 EVT RegVT = LD->getValueType(0);
24975 EVT MemVT = LD->getMemoryVT();
24976 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24977 SDLoc DL(LD);
24978
24979 // Cast ptr32 and ptr64 pointers to the default address space before a load.
24980 unsigned AddrSpace = LD->getAddressSpace();
24981 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
24982 AddrSpace == ARM64AS::PTR32_UPTR) {
24983 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24984 if (PtrVT != LD->getBasePtr().getSimpleValueType()) {
24985 SDValue Cast =
24986 DAG.getAddrSpaceCast(DL, PtrVT, LD->getBasePtr(), AddrSpace, 0);
24987 return DAG.getExtLoad(LD->getExtensionType(), DL, RegVT, LD->getChain(),
24988 Cast, LD->getPointerInfo(), MemVT,
24989 LD->getBaseAlign(),
24990 LD->getMemOperand()->getFlags());
24991 }
24992 }
24993
24994 if (LD->isVolatile() || !Subtarget->isLittleEndian())
24995 return SDValue(N, 0);
24996
24997 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
24998 return Res;
24999
25000 if (!LD->isNonTemporal())
25001 return SDValue(N, 0);
25002
25003 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
25004 MemVT.getSizeInBits() % 256 == 0 ||
25005 256 % MemVT.getScalarSizeInBits() != 0)
25006 return SDValue(N, 0);
25007
25008 SDValue Chain = LD->getChain();
25009 SDValue BasePtr = LD->getBasePtr();
25010 SDNodeFlags Flags = LD->getFlags();
25011 SmallVector<SDValue, 4> LoadOps;
25012 SmallVector<SDValue, 4> LoadOpsChain;
25013 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
25014 // and a final load smaller than 256 bits. This way we can utilize 256-bit
25015 // loads and reduce the number of load instructions generated.
25016 MVT NewVT =
25017 MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
25018 256 / MemVT.getVectorElementType().getSizeInBits());
25019 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
25020 // Create all 256-bit loads starting from offset 0 and up to Num256Loads-1*32.
25021 for (unsigned I = 0; I < Num256Loads; I++) {
25022 unsigned PtrOffset = I * 32;
25023 SDValue NewPtr = DAG.getMemBasePlusOffset(
25024 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
25025 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
25026 SDValue NewLoad = DAG.getLoad(
25027 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
25028 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
25029 LoadOps.push_back(NewLoad);
25030 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
25031 }
25032
25033 // Process remaining bits of the load operation.
25034 // This is done by creating an UNDEF vector to match the size of the
25035 // 256-bit loads and inserting the remaining load to it. We extract the
25036 // original load type at the end using EXTRACT_SUBVECTOR instruction.
25037 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
25038 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
25039 MVT RemainingVT = MVT::getVectorVT(
25040 MemVT.getVectorElementType().getSimpleVT(),
25041 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
25042 SDValue NewPtr = DAG.getMemBasePlusOffset(
25043 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
25044 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
25045 SDValue RemainingLoad =
25046 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
25047 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
25048 LD->getMemOperand()->getFlags(), LD->getAAInfo());
25049 SDValue UndefVector = DAG.getUNDEF(NewVT);
25050 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
25051 SDValue ExtendedRemainingLoad =
25052 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
25053 {UndefVector, RemainingLoad, InsertIdx});
25054 LoadOps.push_back(ExtendedRemainingLoad);
25055 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
25056 EVT ConcatVT =
25057 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
25058 LoadOps.size() * NewVT.getVectorNumElements());
25059 SDValue ConcatVectors =
25060 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
25061 // Extract the original vector type size.
25062 SDValue ExtractSubVector =
25063 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
25064 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
25065 SDValue TokenFactor =
25066 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
25067 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
25068}
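// Illustrative split for a non-temporal load of v20i16 (320 bits): one
// v16i16 (256-bit) load at offset 0, plus a v4i16 load at offset 32 that is
// inserted into an undef v16i16; the pieces are concatenated, the original
// v20i16 is extracted back out, and a TokenFactor joins the load chains.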
25069
25070static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
25071 EVT VecVT = Op.getValueType();
25072 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
25073 "Need boolean vector type.");
25074
25075 if (Depth > 3)
25076 return EVT();
25077
25078 // We can get the base type from a vector compare or truncate.
25079 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
25080 return Op.getOperand(0).getValueType();
25081
25082 // If an operand is a bool vector, continue looking.
25083 EVT BaseVT;
25084 for (SDValue Operand : Op->op_values()) {
25085 if (Operand.getValueType() != VecVT)
25086 continue;
25087
25088 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
25089 if (!BaseVT.isSimple())
25090 BaseVT = OperandVT;
25091 else if (OperandVT != BaseVT)
25092 return EVT();
25093 }
25094
25095 return BaseVT;
25096}
25097
25098// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
25099// iN, we can use a trick that extracts the i^th bit from the i^th element and
25100// then performs a vector add to get a scalar bitmask. This requires that each
25101// element's bits are either all 1 or all 0.
25102static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
25103 SDLoc DL(N);
25104 SDValue ComparisonResult(N, 0);
25105 EVT VecVT = ComparisonResult.getValueType();
25106 assert(VecVT.isVector() && "Must be a vector type");
25107
25108 unsigned NumElts = VecVT.getVectorNumElements();
25109 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
25110 return SDValue();
25111
25112 if (VecVT.getVectorElementType() != MVT::i1 &&
25113 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
25114 return SDValue();
25115
25116 // If we can find the original types to work on instead of a vector of i1,
25117 // we can avoid extend/extract conversion instructions.
25118 if (VecVT.getVectorElementType() == MVT::i1) {
25119 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
25120 if (!VecVT.isSimple()) {
25121 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
25122 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
25123 }
25124 }
25125 VecVT = VecVT.changeVectorElementTypeToInteger();
25126
25127 // Large vectors don't map directly to this conversion, so to avoid too many
25128 // edge cases, we don't apply it here. The conversion will likely still be
25129 // applied later via multiple smaller vectors, whose results are concatenated.
25130 if (VecVT.getSizeInBits() > 128)
25131 return SDValue();
25132
25133 // Ensure that all elements' bits are either 0s or 1s.
25134 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
25135
25136 bool IsLE = DAG.getDataLayout().isLittleEndian();
25137 SmallVector<SDValue, 16> MaskConstants;
25139 VecVT == MVT::v16i8) {
25140 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
25141 // per entry. We split it into two halves, apply the mask, zip the halves to
25142 // create 8x 16-bit values, and then perform the vector reduce.
25143 for (unsigned Half = 0; Half < 2; ++Half) {
25144 for (unsigned I = 0; I < 8; ++I) {
25145 // On big-endian targets, the lane order in sub-byte vector elements
25146 // gets reversed, so we need to flip the bit index.
25147 unsigned MaskBit = IsLE ? (1u << I) : (1u << (7 - I));
25148 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
25149 }
25150 }
25151 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
25152 SDValue RepresentativeBits =
25153 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
25154
25155 SDValue UpperRepresentativeBits =
25156 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
25157 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
25158 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
25159 RepresentativeBits, UpperRepresentativeBits);
25160 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
25161 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
25162 }
25163
25164 // All other vector sizes.
25165 unsigned NumEl = VecVT.getVectorNumElements();
25166 for (unsigned I = 0; I < NumEl; ++I) {
25167 unsigned MaskBit = IsLE ? (1u << I) : (1u << (NumEl - 1 - I));
25168 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
25169 }
25170
25171 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
25172 SDValue RepresentativeBits =
25173 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
25174 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
25175 NumElts, VecVT.getVectorElementType().getSizeInBits()));
25176 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
25177}
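// Worked example for a v4i32 all-ones/all-zeros input <-1, 0, -1, -1>:
//   Mask               = <1, 2, 4, 8>
//   RepresentativeBits = <1, 0, 4, 8>
//   vecreduce_add      = 13 = 0b1101
// so lane i of the comparison contributes bit i of the scalar bitmask (the
// per-lane bit is mirrored on big-endian targets).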
25178
25179static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
25180 StoreSDNode *Store) {
25181 if (!Store->isTruncatingStore())
25182 return SDValue();
25183
25184 SDLoc DL(Store);
25185 SDValue VecOp = Store->getValue();
25186 EVT VT = VecOp.getValueType();
25187 EVT MemVT = Store->getMemoryVT();
25188
25189 if (!MemVT.isVector() || !VT.isVector() ||
25190 MemVT.getVectorElementType() != MVT::i1)
25191 return SDValue();
25192
25193 // If we are storing a vector that we are currently building, let
25194 // `scalarizeVectorStore()` handle this more efficiently.
25195 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
25196 return SDValue();
25197
25198 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
25199 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
25200 if (!VectorBits)
25201 return SDValue();
25202
25203 EVT StoreVT =
25204 EVT::getIntegerVT(*DAG.getContext(), MemVT.getStoreSizeInBits());
25205 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
25206 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
25207 Store->getMemOperand());
25208}
25209
25210// Combine store (fp_to_int X) to use vector semantics around the conversion
25211// when NEON is available. This allows us to store the in-vector result directly
25212// without transferring the result into a GPR in the process.
25213static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
25214 TargetLowering::DAGCombinerInfo &DCI,
25215 SelectionDAG &DAG,
25216 const AArch64Subtarget *Subtarget) {
25217 // Limit to post-legalization in order to avoid peeling truncating stores.
25218 if (DCI.isBeforeLegalize())
25219 return SDValue();
25220 if (!Subtarget->isNeonAvailable())
25221 return SDValue();
25222 // Source operand is already a vector.
25223 SDValue Value = ST->getValue();
25224 if (Value.getValueType().isVector())
25225 return SDValue();
25226
25227 // Look through potential assertions.
25228 while (Value->isAssert())
25229 Value = Value.getOperand(0);
25230
25231 if (Value.getOpcode() != ISD::FP_TO_SINT &&
25232 Value.getOpcode() != ISD::FP_TO_UINT)
25233 return SDValue();
25234 if (!Value->hasOneUse())
25235 return SDValue();
25236
25237 SDValue FPSrc = Value.getOperand(0);
25238 EVT SrcVT = FPSrc.getValueType();
25239 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
25240 return SDValue();
25241
25242 // No support for assignments such as i64 = fp_to_sint i32
25243 EVT VT = Value.getSimpleValueType();
25244 if (VT != SrcVT.changeTypeToInteger())
25245 return SDValue();
25246
25247 // Create a 128-bit element vector to avoid widening. The floating point
25248 // conversion is transformed into a single element conversion via a pattern.
25249 unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
25250 EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
25251 EVT VecDstVT = VecSrcVT.changeTypeToInteger();
25252 SDLoc DL(ST);
25253 SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
25254 SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
25255
25256 SDValue Zero = DAG.getVectorIdxConstant(0, DL);
25257 SDValue Extracted =
25258 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
25259
25260 DCI.CombineTo(ST->getValue().getNode(), Extracted);
25261 return SDValue(ST, 0);
25262}
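// Intended lowering, sketched for "store i32 (fp_to_sint float %f), %p" when
// NEON is available:
//   fcvtzs s0, s0      ; convert in the FP/vector register file
//   str    s0, [x0]
// instead of "fcvtzs w8, s0" followed by "str w8, [x0]".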
25263
25264bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
25265 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
25266 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
25267 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
25268}
25269
25270// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
25271static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
25272 const AArch64Subtarget *Subtarget) {
25273 SDValue Value = ST->getValue();
25274 EVT ValueVT = Value.getValueType();
25275
25276 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
25277 Value.getOpcode() != ISD::TRUNCATE ||
25278 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
25279 return SDValue();
25280
25281 assert(ST->getOffset().isUndef() && "undef offset expected");
25282 SDLoc DL(ST);
25283 auto WideVT = EVT::getVectorVT(
25284 *DAG.getContext(),
25285 Value->getOperand(0).getValueType().getVectorElementType(), 4);
25286 SDValue UndefVector = DAG.getUNDEF(WideVT);
25287 SDValue WideTrunc = DAG.getNode(
25288 ISD::INSERT_SUBVECTOR, DL, WideVT,
25289 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
25290 SDValue Cast = DAG.getNode(
25291 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
25292 WideTrunc);
25293
25294 MachineFunction &MF = DAG.getMachineFunction();
25295 SDValue Chain = ST->getChain();
25296 MachineMemOperand *MMO = ST->getMemOperand();
25297 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
25298 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
25299 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
25300 TypeSize Offset2 = TypeSize::getFixed(2);
25301 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
25302 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
25303
25304 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
25305 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
25306 TypeSize Offset1 = TypeSize::getFixed(1);
25307 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
25308 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
25309
25310 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
25311 DAG.getConstant(0, DL, MVT::i64));
25312 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
25313 MF.getMachineMemOperand(MMO, 0, 1));
25314 return Chain;
25315}
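// Illustrative sketch: for a store of (trunc <3 x i32> %v to <3 x i8>), %v is
// widened to 4 lanes and bitcast to v16i8 (so IdxScale == 4); the bytes at
// vector indices 8, 4 and 0 are then stored to offsets 2, 1 and 0 of the
// original base pointer, each as a single byte store.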
25316
25317static unsigned getFPSubregForVT(EVT VT) {
25318 assert(VT.isSimple() && "Expected simple VT");
25319 switch (VT.getSimpleVT().SimpleTy) {
25320 case MVT::aarch64mfp8:
25321 return AArch64::bsub;
25322 case MVT::f16:
25323 return AArch64::hsub;
25324 case MVT::f32:
25325 return AArch64::ssub;
25326 case MVT::f64:
25327 return AArch64::dsub;
25328 default:
25329 llvm_unreachable("Unexpected VT!");
25330 }
25331}
25332
25333static SDValue performSTORECombine(SDNode *N,
25334 TargetLowering::DAGCombinerInfo &DCI,
25335 SelectionDAG &DAG,
25336 const AArch64Subtarget *Subtarget) {
25337 StoreSDNode *ST = cast<StoreSDNode>(N);
25338 SDValue Chain = ST->getChain();
25339 SDValue Value = ST->getValue();
25340 SDValue Ptr = ST->getBasePtr();
25341 EVT ValueVT = Value.getValueType();
25342 EVT MemVT = ST->getMemoryVT();
25343 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25344 SDLoc DL(ST);
25345
25346 if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
25347 return Res;
25348
25349 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
25350 EVT EltVT = VT.getVectorElementType();
25351 return EltVT == MVT::f32 || EltVT == MVT::f64;
25352 };
25353
25354 // Cast ptr32 and ptr64 pointers to the default address space before a store.
25355 unsigned AddrSpace = ST->getAddressSpace();
25356 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
25357 AddrSpace == ARM64AS::PTR32_UPTR) {
25358 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25359 if (PtrVT != Ptr.getSimpleValueType()) {
25360 SDValue Cast = DAG.getAddrSpaceCast(DL, PtrVT, Ptr, AddrSpace, 0);
25361 return DAG.getStore(Chain, DL, Value, Cast, ST->getPointerInfo(),
25362 ST->getBaseAlign(), ST->getMemOperand()->getFlags(),
25363 ST->getAAInfo());
25364 }
25365 }
25366
25367 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
25368 return Res;
25369
25370 // If this is an FP_ROUND followed by a store, fold this into a truncating
25371 // store. We can do this even if this is already a truncstore.
25372 // We purposefully don't care about legality of the nodes here as we know
25373 // they can be split down into something legal.
25374 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
25375 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
25376 Subtarget->useSVEForFixedLengthVectors() &&
25377 ValueVT.isFixedLengthVector() &&
25378 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
25379 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
25380 return DAG.getTruncStore(Chain, DL, Value.getOperand(0), Ptr, MemVT,
25381 ST->getMemOperand());
25382
25383 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
25384 return Split;
25385
25386 if (Subtarget->supportsAddressTopByteIgnored() &&
25387 performTBISimplification(N->getOperand(2), DCI, DAG))
25388 return SDValue(N, 0);
25389
25390 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
25391 return Store;
25392
25393 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
25394 return Store;
25395
25396 if (ST->isTruncatingStore() &&
25397 isHalvingTruncateOfLegalScalableType(ValueVT, MemVT)) {
25398 if (SDValue Rshrnb =
25399 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
25400 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
25401 MemVT, ST->getMemOperand());
25402 }
25403 }
25404
25405 // This is an integer vector_extract_elt followed by a (possibly truncating)
25406 // store. We may be able to replace this with a store of an FP subregister.
25407 if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() &&
25408 Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
25409
25410 SDValue Vector = Value.getOperand(0);
25411 SDValue ExtIdx = Value.getOperand(1);
25412 EVT VectorVT = Vector.getValueType();
25413 EVT ElemVT = VectorVT.getVectorElementType();
25414
25415 if (!ValueVT.isInteger())
25416 return SDValue();
25417
25418 // Propagate zero constants (applying this fold may miss optimizations).
25419 if (ISD::isConstantSplatVectorAllZeros(Vector.getNode())) {
25420 SDValue ZeroElt = DAG.getConstant(0, DL, ValueVT);
25421 DAG.ReplaceAllUsesWith(Value, ZeroElt);
25422 return SDValue();
25423 }
25424
25425 if (ValueVT != MemVT && !ST->isTruncatingStore())
25426 return SDValue();
25427
25428 // This could generate an additional extract if the index is non-zero and
25429 // the extracted value has multiple uses.
25430 auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
25431 if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse())
25432 return SDValue();
25433
25434 // These can lower to st1, which is preferable if we're unlikely to fold the
25435 // addressing into the store.
25436 if (Subtarget->isNeonAvailable() && ElemVT == MemVT &&
25437 (VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst &&
25438 !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD)
25439 return SDValue();
25440
25441 if (MemVT == MVT::i64 || MemVT == MVT::i32) {
25442 // Heuristic: If there are other users of w/x integer scalars extracted
25443 // from this vector that won't fold into the store -- abandon folding.
25444 // Applying this fold may disrupt paired stores.
25445 for (const auto &Use : Vector->uses()) {
25446 if (Use.getResNo() != Vector.getResNo())
25447 continue;
25448 const SDNode *User = Use.getUser();
25449 if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25450 (!User->hasOneUse() ||
25451 (*User->user_begin())->getOpcode() != ISD::STORE))
25452 return SDValue();
25453 }
25454 }
25455
25456 SDValue ExtVector = Vector;
25457 if (!ExtCst || !ExtCst->isZero()) {
25458 // Handle extracting from lanes != 0.
25459 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
25460 Value.getValueType(), Vector, ExtIdx);
25461 SDValue Zero = DAG.getVectorIdxConstant(0, DL);
25462 ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT,
25463 DAG.getUNDEF(VectorVT), Ext, Zero);
25464 }
25465
25466 EVT FPMemVT = MemVT == MVT::i8
25467 ? MVT::aarch64mfp8
25468 : EVT::getFloatingPointVT(MemVT.getFixedSizeInBits());
25469 SDValue FPSubreg = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
25470 FPMemVT, ExtVector);
25471
25472 return DAG.getStore(ST->getChain(), DL, FPSubreg, ST->getBasePtr(),
25473 ST->getMemOperand());
25474 }
25475
25476 return SDValue();
25477}
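// Illustrative effect of the extract-to-FP-store path above (sketch):
//   mov w8, v0.s[1]             mov v1.s[0], v0.s[1]
//   str w8, [x0]          ==>   str s1, [x0]
// For lane 0 the extra lane move is not needed and the element can be stored
// directly from the source vector's FP subregister.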
25478
25479static bool
25480isSequentialConcatOfVectorInterleave(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
25481 if (N->getOpcode() != ISD::CONCAT_VECTORS)
25482 return false;
25483
25484 unsigned NumParts = N->getNumOperands();
25485
25486 // We should be concatenating each sequential result from a
25487 // VECTOR_INTERLEAVE.
25488 SDNode *InterleaveOp = N->getOperand(0).getNode();
25489 if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
25490 InterleaveOp->getNumOperands() != NumParts)
25491 return false;
25492
25493 for (unsigned I = 0; I < NumParts; I++)
25494 if (N->getOperand(I) != SDValue(InterleaveOp, I))
25495 return false;
25496
25497 Ops.append(InterleaveOp->op_begin(), InterleaveOp->op_end());
25498 return true;
25499}
25500
25501static SDValue getNarrowMaskForInterleavedOps(SelectionDAG &DAG, SDLoc &DL,
25502 SDValue WideMask,
25503 unsigned RequiredNumParts) {
25504 if (WideMask->getOpcode() == ISD::CONCAT_VECTORS) {
25505 SmallVector<SDValue, 4> MaskInterleaveOps;
25506 if (!isSequentialConcatOfVectorInterleave(WideMask.getNode(),
25507 MaskInterleaveOps))
25508 return SDValue();
25509
25510 if (MaskInterleaveOps.size() != RequiredNumParts)
25511 return SDValue();
25512
25513 // Make sure the inputs to the vector interleave are identical.
25514 if (!llvm::all_equal(MaskInterleaveOps))
25515 return SDValue();
25516
25517 return MaskInterleaveOps[0];
25518 }
25519
25520 if (WideMask->getOpcode() != ISD::SPLAT_VECTOR)
25521 return SDValue();
25522
25523 ElementCount EC = WideMask.getValueType().getVectorElementCount();
25524 assert(EC.isKnownMultipleOf(RequiredNumParts) &&
25525 "Expected element count divisible by number of parts");
25526 EC = EC.divideCoefficientBy(RequiredNumParts);
25527 return DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
25528 WideMask->getOperand(0));
25529}
25530
25531static SDValue performInterleavedMaskedStoreCombine(
25532 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
25533 if (!DCI.isBeforeLegalize())
25534 return SDValue();
25535
25536 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
25537 SDValue WideValue = MST->getValue();
25538
25539 // Bail out if the stored value has an unexpected number of uses, since we'll
25540 // have to perform manual interleaving and may as well just use normal masked
25541 // stores. Also, discard masked stores that are truncating or indexed.
25542 if (!WideValue.hasOneUse() || !ISD::isNormalMaskedStore(MST) ||
25543 !MST->isSimple() || !MST->getOffset().isUndef())
25544 return SDValue();
25545
25546 SmallVector<SDValue, 4> ValueInterleaveOps;
25547 if (!isSequentialConcatOfVectorInterleave(WideValue.getNode(),
25548 ValueInterleaveOps))
25549 return SDValue();
25550
25551 unsigned NumParts = ValueInterleaveOps.size();
25552 if (NumParts != 2 && NumParts != 4)
25553 return SDValue();
25554
25555 // At the moment we're unlikely to see a fixed-width vector interleave as
25556 // we usually generate shuffles instead.
25557 EVT SubVecTy = ValueInterleaveOps[0].getValueType();
25558 if (!SubVecTy.isScalableVT() ||
25559 SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
25560 !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
25561 return SDValue();
25562
25563 SDLoc DL(N);
25564 SDValue NarrowMask =
25565 getNarrowMaskForInterleavedOps(DAG, DL, MST->getMask(), NumParts);
25566 if (!NarrowMask)
25567 return SDValue();
25568
25569 const Intrinsic::ID IID =
25570 NumParts == 2 ? Intrinsic::aarch64_sve_st2 : Intrinsic::aarch64_sve_st4;
25571 SmallVector<SDValue, 8> NewStOps;
25572 NewStOps.append({MST->getChain(), DAG.getConstant(IID, DL, MVT::i32)});
25573 NewStOps.append(ValueInterleaveOps);
25574 NewStOps.append({NarrowMask, MST->getBasePtr()});
25575 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, NewStOps);
25576}
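// Illustrative sketch: a masked store of
//   concat(vector_interleave(a, b))  with mask  concat(vector_interleave(m, m))
// is replaced by the equivalent of the llvm.aarch64.sve.st2 intrinsic taking
// (a, b, m, ptr); the four-part form maps onto llvm.aarch64.sve.st4 instead.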
25577
25578static SDValue performMSTORECombine(SDNode *N,
25579 TargetLowering::DAGCombinerInfo &DCI,
25580 SelectionDAG &DAG,
25581 const AArch64Subtarget *Subtarget) {
25582 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
25583 SDValue Value = MST->getValue();
25584 SDValue Mask = MST->getMask();
25585 SDLoc DL(N);
25586
25587 if (SDValue Res = performInterleavedMaskedStoreCombine(N, DCI, DAG))
25588 return Res;
25589
25590 // If this is a UZP1 followed by a masked store, fold this into a masked
25591 // truncating store. We can do this even if this is already a masked
25592 // truncstore.
25593 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
25594 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
25595 Value.getValueType().isInteger()) {
25596 Value = Value.getOperand(0);
25597 if (Value.getOpcode() == ISD::BITCAST) {
25598 EVT HalfVT =
25599 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
25600 EVT InVT = Value.getOperand(0).getValueType();
25601
25602 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
25603 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
25604 unsigned PgPattern = Mask->getConstantOperandVal(0);
25605
25606 // Ensure we can double the size of the predicate pattern
25607 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
25608 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
25609 MinSVESize) {
25610 Mask = getPTrue(
25611 DAG, DL, InVT.changeVectorElementType(*DAG.getContext(), MVT::i1),
25612 PgPattern);
25613 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
25614 MST->getBasePtr(), MST->getOffset(), Mask,
25615 MST->getMemoryVT(), MST->getMemOperand(),
25616 MST->getAddressingMode(),
25617 /*IsTruncating=*/true);
25618 }
25619 }
25620 }
25621 }
25622
25623 if (MST->isTruncatingStore()) {
25624 EVT ValueVT = Value->getValueType(0);
25625 EVT MemVT = MST->getMemoryVT();
25626 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
25627 return SDValue();
25628 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
25629 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
25630 MST->getOffset(), MST->getMask(),
25631 MST->getMemoryVT(), MST->getMemOperand(),
25632 MST->getAddressingMode(), true);
25633 }
25634 }
25635
25636 return SDValue();
25637}
25638
25639/// \return true if part of the index was folded into the Base.
25640static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
25641 SDLoc DL, SelectionDAG &DAG) {
25642 // This function assumes a vector of i64 indices.
25643 EVT IndexVT = Index.getValueType();
25644 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
25645 return false;
25646
25647 // Simplify:
25648 // BasePtr = Ptr
25649 // Index = X + splat(Offset)
25650 // ->
25651 // BasePtr = Ptr + Offset * scale.
25652 // Index = X
25653 if (Index.getOpcode() == ISD::ADD) {
25654 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
25655 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
25656 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
25657 Index = Index.getOperand(0);
25658 return true;
25659 }
25660 }
25661
25662 // Simplify:
25663 // BasePtr = Ptr
25664 // Index = (X + splat(Offset)) << splat(Shift)
25665 // ->
25666 // BasePtr = Ptr + (Offset << Shift) * scale
25667 // Index = X << splat(shift)
25668 if (Index.getOpcode() == ISD::SHL &&
25669 Index.getOperand(0).getOpcode() == ISD::ADD) {
25670 SDValue Add = Index.getOperand(0);
25671 SDValue ShiftOp = Index.getOperand(1);
25672 SDValue OffsetOp = Add.getOperand(1);
25673 if (auto Shift = DAG.getSplatValue(ShiftOp))
25674 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
25675 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
25676 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
25677 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
25678 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
25679 Add.getOperand(0), ShiftOp);
25680 return true;
25681 }
25682 }
25683
25684 return false;
25685}
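// Worked example (illustrative): with Scale = 4 and Index = X + splat(16),
// the splat is folded away as BasePtr += 16 * 4 and Index = X. For the
// shifted form, Index = (X + splat(2)) << splat(3) becomes
// BasePtr += (2 << 3) * 4 and Index = X << splat(3).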
25686
25687// Analyse the specified address returning true if a more optimal addressing
25688// mode is available. When returning true all parameters are updated to reflect
25689// their recommended values.
25690static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
25691 SDValue &BasePtr, SDValue &Index,
25692 SelectionDAG &DAG) {
25693 // Try to iteratively fold parts of the index into the base pointer to
25694 // simplify the index as much as possible.
25695 bool Changed = false;
25696 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
25697 Changed = true;
25698
25699 // Only consider element types that are pointer sized as smaller types can
25700 // be easily promoted.
25701 EVT IndexVT = Index.getValueType();
25702 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
25703 return Changed;
25704
25705 // Can indices be trivially shrunk?
25706 EVT DataVT = N->getOperand(1).getValueType();
25707 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
25708 // will later be re-extended to 64 bits in legalization
25709 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
25710 return Changed;
25711 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
25712 EVT NewIndexVT =
25713 IndexVT.changeVectorElementType(*DAG.getContext(), MVT::i32);
25714 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
25715 return true;
25716 }
25717
25718 // Match:
25719 // Index = step(const)
25720 int64_t Stride = 0;
25721 if (Index.getOpcode() == ISD::STEP_VECTOR) {
25722 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
25723 }
25724 // Match:
25725 // Index = step(const) << shift(const)
25726 else if (Index.getOpcode() == ISD::SHL &&
25727 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
25728 SDValue RHS = Index.getOperand(1);
25729 if (auto *Shift =
25730 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
25731 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
25732 Stride = Step << Shift->getZExtValue();
25733 }
25734 }
25735
25736 // Return early because no supported pattern is found.
25737 if (Stride == 0)
25738 return Changed;
25739
25740 if (Stride < std::numeric_limits<int32_t>::min() ||
25741 Stride > std::numeric_limits<int32_t>::max())
25742 return Changed;
25743
25744 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
25745 unsigned MaxVScale =
25747 int64_t LastElementOffset =
25748 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
25749
25750 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
25751 LastElementOffset > std::numeric_limits<int32_t>::max())
25752 return Changed;
25753
25754 EVT NewIndexVT = IndexVT.changeVectorElementType(*DAG.getContext(), MVT::i32);
25755 // Stride does not scale explicitly by 'Scale', because it happens in
25756 // the gather/scatter addressing mode.
25757 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride, true));
25758 return true;
25759}
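// For example (illustrative): an nxv4i64 index of step(8) is replaced by an
// nxv4i32 step vector, provided both the stride and the offset of the last
// element (min-elements * stride * max vscale) fit in the signed 32-bit range.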
25760
25761static SDValue performMaskedGatherScatterCombine(
25762 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
25763 if (!DCI.isBeforeLegalize())
25764 return SDValue();
25765 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
25766
25767 SDLoc DL(MGS);
25768 SDValue Chain = MGS->getChain();
25769 SDValue Scale = MGS->getScale();
25770 SDValue Index = MGS->getIndex();
25771 SDValue Mask = MGS->getMask();
25772 SDValue BasePtr = MGS->getBasePtr();
25773 ISD::MemIndexType IndexType = MGS->getIndexType();
25774
25775 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
25776 return SDValue();
25777
25778 // Here we catch such cases early and change MGATHER's IndexType to allow
25779 // the use of an Index that's more legalisation friendly.
25780 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
25781 SDValue PassThru = MGT->getPassThru();
25782 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
25783 return DAG.getMaskedGather(
25784 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
25785 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
25786 }
25787 if (auto *MSC = dyn_cast<MaskedScatterSDNode>(MGS)) {
25788 SDValue Data = MSC->getValue();
25789 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
25790 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
25791 DL, Ops, MSC->getMemOperand(), IndexType,
25792 MSC->isTruncatingStore());
25793 }
25794 auto *HG = cast<MaskedHistogramSDNode>(MGS);
25795 SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
25796 Index, Scale, HG->getIntID()};
25797 return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
25798 DL, Ops, HG->getMemOperand(), IndexType);
25799}
25800
25801/// Target-specific DAG combine function for NEON load/store intrinsics
25802/// to merge base address updates.
25803static SDValue performNEONPostLDSTCombine(SDNode *N,
25804 TargetLowering::DAGCombinerInfo &DCI,
25805 SelectionDAG &DAG) {
25806 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
25807 return SDValue();
25808
25809 unsigned AddrOpIdx = N->getNumOperands() - 1;
25810 SDValue Addr = N->getOperand(AddrOpIdx);
25811
25812 // Search for a use of the address operand that is an increment.
25813 for (SDUse &Use : Addr->uses()) {
25814 SDNode *User = Use.getUser();
25815 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
25816 continue;
25817
25818 // Check that the add is independent of the load/store. Otherwise, folding
25819 // it would create a cycle.
25820 SmallPtrSet<const SDNode *, 32> Visited;
25821 SmallVector<const SDNode *, 16> Worklist;
25822 Visited.insert(Addr.getNode());
25823 Worklist.push_back(N);
25824 Worklist.push_back(User);
25825 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
25826 SDNode::hasPredecessorHelper(User, Visited, Worklist))
25827 continue;
25828
25829 // Find the new opcode for the updating load/store.
25830 bool IsStore = false;
25831 bool IsLaneOp = false;
25832 bool IsDupOp = false;
25833 unsigned NewOpc = 0;
25834 unsigned NumVecs = 0;
25835 unsigned IntNo = N->getConstantOperandVal(1);
25836 switch (IntNo) {
25837 default: llvm_unreachable("unexpected intrinsic for Neon base update");
25838 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
25839 NumVecs = 2; break;
25840 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
25841 NumVecs = 3; break;
25842 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
25843 NumVecs = 4; break;
25844 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
25845 NumVecs = 2; IsStore = true; break;
25846 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
25847 NumVecs = 3; IsStore = true; break;
25848 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
25849 NumVecs = 4; IsStore = true; break;
25850 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
25851 NumVecs = 2; break;
25852 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
25853 NumVecs = 3; break;
25854 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
25855 NumVecs = 4; break;
25856 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
25857 NumVecs = 2; IsStore = true; break;
25858 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
25859 NumVecs = 3; IsStore = true; break;
25860 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
25861 NumVecs = 4; IsStore = true; break;
25862 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
25863 NumVecs = 2; IsDupOp = true; break;
25864 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
25865 NumVecs = 3; IsDupOp = true; break;
25866 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
25867 NumVecs = 4; IsDupOp = true; break;
25868 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
25869 NumVecs = 2; IsLaneOp = true; break;
25870 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
25871 NumVecs = 3; IsLaneOp = true; break;
25872 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
25873 NumVecs = 4; IsLaneOp = true; break;
25874 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
25875 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
25876 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
25877 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
25878 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
25879 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
25880 }
25881
25882 EVT VecTy;
25883 if (IsStore)
25884 VecTy = N->getOperand(2).getValueType();
25885 else
25886 VecTy = N->getValueType(0);
25887
25888 // If the increment is a constant, it must match the memory ref size.
25889 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
25890 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
25891 uint32_t IncVal = CInc->getZExtValue();
25892 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
25893 if (IsLaneOp || IsDupOp)
25894 NumBytes /= VecTy.getVectorNumElements();
25895 if (IncVal != NumBytes)
25896 continue;
25897 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
25898 }
25899 SmallVector<SDValue, 8> Ops;
25900 Ops.push_back(N->getOperand(0)); // Incoming chain
25901 // Load lane and store have vector list as input.
25902 if (IsLaneOp || IsStore)
25903 for (unsigned i = 2; i < AddrOpIdx; ++i)
25904 Ops.push_back(N->getOperand(i));
25905 Ops.push_back(Addr); // Base register
25906 Ops.push_back(Inc);
25907
25908 // Return Types.
25909 EVT Tys[6];
25910 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
25911 unsigned n;
25912 for (n = 0; n < NumResultVecs; ++n)
25913 Tys[n] = VecTy;
25914 Tys[n++] = MVT::i64; // Type of write back register
25915 Tys[n] = MVT::Other; // Type of the chain
25916 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
25917
25918 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
25919 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
25920 MemInt->getMemoryVT(),
25921 MemInt->getMemOperand());
25922
25923 // Update the uses.
25924 std::vector<SDValue> NewResults;
25925 for (unsigned i = 0; i < NumResultVecs; ++i) {
25926 NewResults.push_back(SDValue(UpdN.getNode(), i));
25927 }
25928 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
25929 DCI.CombineTo(N, NewResults);
25930 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
25931
25932 break;
25933 }
25934 return SDValue();
25935}
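// Illustrative effect (sketch): a NEON ld2 whose address is also incremented
// by the vector-list size, e.g.
//   ld2 { v0.4s, v1.4s }, [x0]  ;  add x0, x0, #32
// is merged into the post-indexed form
//   ld2 { v0.4s, v1.4s }, [x0], #32
// and likewise for the other ld/st, dup and lane variants handled above.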
25936
25937// Checks to see if the value is the prescribed width and returns information
25938// about its extension mode.
25939static
25940bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
25941 ExtType = ISD::NON_EXTLOAD;
25942 switch(V.getNode()->getOpcode()) {
25943 default:
25944 return false;
25945 case ISD::LOAD: {
25946 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
25947 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
25948 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
25949 ExtType = LoadNode->getExtensionType();
25950 return true;
25951 }
25952 return false;
25953 }
25954 case ISD::AssertSext: {
25955 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
25956 if ((TypeNode->getVT() == MVT::i8 && width == 8)
25957 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
25958 ExtType = ISD::SEXTLOAD;
25959 return true;
25960 }
25961 return false;
25962 }
25963 case ISD::AssertZext: {
25964 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
25965 if ((TypeNode->getVT() == MVT::i8 && width == 8)
25966 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
25967 ExtType = ISD::ZEXTLOAD;
25968 return true;
25969 }
25970 return false;
25971 }
25972 case ISD::Constant:
25973 case ISD::TargetConstant: {
25974 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
25975 1LL << (width - 1);
25976 }
25977 }
25978
25979 return true;
25980}
25981
25982// This function does a whole lot of voodoo to determine if the tests are
25983// equivalent without and with a mask. Essentially what happens is that given a
25984// DAG resembling:
25985//
25986// +-------------+ +-------------+ +-------------+ +-------------+
25987// | Input | | AddConstant | | CompConstant| | CC |
25988// +-------------+ +-------------+ +-------------+ +-------------+
25989// | | | |
25990// V V | +----------+
25991// +-------------+ +----+ | |
25992// | ADD | |0xff| | |
25993// +-------------+ +----+ | |
25994// | | | |
25995// V V | |
25996// +-------------+ | |
25997// | AND | | |
25998// +-------------+ | |
25999// | | |
26000// +-----+ | |
26001// | | |
26002// V V V
26003// +-------------+
26004// | CMP |
26005// +-------------+
26006//
26007// The AND node may be safely removed for some combinations of inputs. In
26008// particular we need to take into account the extension type of the Input,
26009// the exact values of AddConstant, CompConstant, and CC, along with the nominal
26010// width of the input (this can work for inputs of any width; the above graph
26011// is specific to 8 bits).
26012//
26013// The specific equations were worked out by generating output tables for each
26014// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
26015// problem was simplified by working with 4 bit inputs, which means we only
26016// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
26017// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
26018// patterns present in both extensions (0,7). For every distinct set of
26019// AddConstant and CompConstant bit patterns we can consider the masked and
26020// unmasked versions to be equivalent if the result of this function is true for
26021// all 16 distinct bit patterns of the current extension type of Input (w0).
26022//
26023// sub w8, w0, w1
26024// and w10, w8, #0x0f
26025// cmp w8, w2
26026// cset w9, AArch64CC
26027// cmp w10, w2
26028// cset w11, AArch64CC
26029// cmp w9, w11
26030// cset w0, eq
26031// ret
26032//
26033// Since the above function shows when the outputs are equivalent it defines
26034// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
26035// would be expensive to run during compiles. The equations below were written
26036// in a test harness that confirmed they gave outputs equivalent to the above
26037// function for all inputs, so they can instead be used to determine whether the
26038// removal is legal.
26039//
26040// isEquivalentMaskless() is the test for whether the AND can be removed; it is
26041// factored out of the DAG recognition because the DAG can take several forms.
26042
26043static bool isEquivalentMaskless(unsigned CC, unsigned width,
26044 ISD::LoadExtType ExtType, int AddConstant,
26045 int CompConstant) {
26046 // By being careful about our equations and writing them only in terms of
26047 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
26048 // make them generally applicable to all bit widths.
26049 int MaxUInt = (1 << width);
26050
26051 // For the purposes of these comparisons sign extending the type is
26052 // equivalent to zero extending the add and displacing it by half the integer
26053 // width. Provided we are careful and make sure our equations are valid over
26054 // the whole range we can just adjust the input and avoid writing equations
26055 // for sign extended inputs.
26056 if (ExtType == ISD::SEXTLOAD)
26057 AddConstant -= (1 << (width-1));
26058
26059 switch(CC) {
26060 case AArch64CC::LE:
26061 case AArch64CC::GT:
26062 if ((AddConstant == 0) ||
26063 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
26064 (AddConstant >= 0 && CompConstant < 0) ||
26065 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
26066 return true;
26067 break;
26068 case AArch64CC::LT:
26069 case AArch64CC::GE:
26070 if ((AddConstant == 0) ||
26071 (AddConstant >= 0 && CompConstant <= 0) ||
26072 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
26073 return true;
26074 break;
26075 case AArch64CC::HI:
26076 case AArch64CC::LS:
26077 if ((AddConstant >= 0 && CompConstant < 0) ||
26078 (AddConstant <= 0 && CompConstant >= -1 &&
26079 CompConstant < AddConstant + MaxUInt))
26080 return true;
26081 break;
26082 case AArch64CC::PL:
26083 case AArch64CC::MI:
26084 if ((AddConstant == 0) ||
26085 (AddConstant > 0 && CompConstant <= 0) ||
26086 (AddConstant < 0 && CompConstant <= AddConstant))
26087 return true;
26088 break;
26089 case AArch64CC::LO:
26090 case AArch64CC::HS:
26091 if ((AddConstant >= 0 && CompConstant <= 0) ||
26092 (AddConstant <= 0 && CompConstant >= 0 &&
26093 CompConstant <= AddConstant + MaxUInt))
26094 return true;
26095 break;
26096 case AArch64CC::EQ:
26097 case AArch64CC::NE:
26098 if ((AddConstant > 0 && CompConstant < 0) ||
26099 (AddConstant < 0 && CompConstant >= 0 &&
26100 CompConstant < AddConstant + MaxUInt) ||
26101 (AddConstant >= 0 && CompConstant >= 0 &&
26102 CompConstant >= AddConstant) ||
26103 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
26104 return true;
26105 break;
26106 case AArch64CC::VS:
26107 case AArch64CC::VC:
26108 case AArch64CC::AL:
26109 case AArch64CC::NV:
26110 return true;
26111 case AArch64CC::Invalid:
26112 break;
26113 }
26114
26115 return false;
26116}
26117
26118// (X & C) >u Mask --> (X & (C & ~Mask)) != 0
26119// (X & C) <u Pow2 --> (X & (C & ~(Pow2-1))) == 0
26120static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
26121 SDNode *AndNode, SelectionDAG &DAG,
26122 unsigned CCIndex, unsigned CmpIndex,
26123 unsigned CC) {
26124 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
26125 if (!SubsC)
26126 return SDValue();
26127
26128 APInt SubsAP = SubsC->getAPIntValue();
26129 if (CC == AArch64CC::HI) {
26130 if (!SubsAP.isMask())
26131 return SDValue();
26132 } else if (CC == AArch64CC::LO) {
26133 if (!SubsAP.isPowerOf2())
26134 return SDValue();
26135 } else
26136 return SDValue();
26137
26138 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
26139 if (!AndC)
26140 return SDValue();
26141
26142 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
26143
26144 SDLoc DL(N);
26145 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
26146 SDValue ANDS = DAG.getNode(
26147 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
26148 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
26149 SDValue AArch64_CC =
26150 DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
26151 N->getOperand(CCIndex)->getValueType(0));
26152
26153 // For now, only performCSELCombine and performBRCONDCombine call this
26154 // function. And both of them pass 2 for CCIndex, 3 for CmpIndex with 4
26155 // operands. So just initialize the ops directly to simplify the code. If we
26156 // ever have another case with different CCIndex/CmpIndex values, this will
26157 // need to be rewritten with a loop over the operands.
26158 // TODO: Do we need to assert number of operand is 4 here?
26159 assert((CCIndex == 2 && CmpIndex == 3) &&
26160 "Expected CCIndex to be 2 and CmpIndex to be 3.");
26161 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
26162 ANDS.getValue(1)};
26163 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
26164}
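// Worked example (illustrative): with C = 0xff0f and Mask = 0xff,
//   (X & 0xff0f) >u 0xff
// holds exactly when X has a bit set in 0xff0f & ~0xff == 0xff00, so it can be
// emitted as ANDS with 0xff00 followed by a NE check, saving the separate
// compare.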
26165
26166static
26167SDValue performCONDCombine(SDNode *N,
26168 TargetLowering::DAGCombinerInfo &DCI,
26169 SelectionDAG &DAG, unsigned CCIndex,
26170 unsigned CmpIndex) {
26171 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
26172 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
26173 unsigned CondOpcode = SubsNode->getOpcode();
26174
26175 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
26176 !SubsNode->hasOneUse())
26177 return SDValue();
26178
26179 // There is a SUBS feeding this condition. Is it fed by a mask we can
26180 // use?
26181
26182 SDNode *AndNode = SubsNode->getOperand(0).getNode();
26183 unsigned MaskBits = 0;
26184
26185 if (AndNode->getOpcode() != ISD::AND)
26186 return SDValue();
26187
26188 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
26189 CmpIndex, CC))
26190 return Val;
26191
26192 // X & M ?= C --> (C << clz(M)) ?= (X << clz(M)) where M is a non-empty
26193 // sequence of ones starting at the least significant bit with the remainder
26194 // zero and C is a constant s.t. (C & ~M) == 0 that cannot be materialised
26195 // into a SUBS (immediate). The transformed form can be matched into a SUBS
26196 // (shifted register).
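// For example (illustrative), with a 32-bit X, M = 0x3ffff and C = 0x12345
// (not a legal arithmetic immediate):
//   (X & 0x3ffff) == 0x12345  ==>  (0x12345 << 14) == (X << 14)
// which the SUBS (shifted register) form can match directly, so the separate
// AND is no longer needed.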
26197 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && AndNode->hasOneUse() &&
26198 isa<ConstantSDNode>(AndNode->getOperand(1)) &&
26199 isa<ConstantSDNode>(SubsNode->getOperand(1))) {
26200 SDValue X = AndNode->getOperand(0);
26201 APInt M = AndNode->getConstantOperandAPInt(1);
26202 APInt C = SubsNode->getConstantOperandAPInt(1);
26203
26204 if (M.isMask() && C.isSubsetOf(M) && !isLegalArithImmed(C.getZExtValue())) {
26205 SDLoc DL(SubsNode);
26206 EVT VT = SubsNode->getValueType(0);
26207 unsigned ShiftAmt = M.countl_zero();
26208 SDValue ShiftedX = DAG.getNode(
26209 ISD::SHL, DL, VT, X, DAG.getShiftAmountConstant(ShiftAmt, VT, DL));
26210 SDValue ShiftedC = DAG.getConstant(C << ShiftAmt, DL, VT);
26211 SDValue NewSubs = DAG.getNode(AArch64ISD::SUBS, DL, SubsNode->getVTList(),
26212 ShiftedC, ShiftedX);
26213 DCI.CombineTo(SubsNode, NewSubs, NewSubs.getValue(1));
26214 return SDValue(N, 0);
26215 }
26216 }
26217
26218 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
26219 uint32_t CNV = CN->getZExtValue();
26220 if (CNV == 255)
26221 MaskBits = 8;
26222 else if (CNV == 65535)
26223 MaskBits = 16;
26224 }
26225
26226 if (!MaskBits)
26227 return SDValue();
26228
26229 SDValue AddValue = AndNode->getOperand(0);
26230
26231 if (AddValue.getOpcode() != ISD::ADD)
26232 return SDValue();
26233
26234 // The basic dag structure is correct, grab the inputs and validate them.
26235
26236 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
26237 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
26238 SDValue SubsInputValue = SubsNode->getOperand(1);
26239
26240 // The mask is present and the provenance of all the values is a smaller type;
26241 // let's see if the mask is superfluous.
26242
26243 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
26244 !isa<ConstantSDNode>(SubsInputValue.getNode()))
26245 return SDValue();
26246
26247 ISD::LoadExtType ExtType;
26248
26249 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
26250 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
26251 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
26252 return SDValue();
26253
26254 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
26255 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
26256 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
26257 return SDValue();
26258
26259 // The AND is not necessary, remove it.
26260
26261 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
26262 SubsNode->getValueType(1));
26263 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
26264
26265 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
26266 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
26267
26268 return SDValue(N, 0);
26269}
26270
26271// Optimize compare with zero and branch.
26272static SDValue performBRCONDCombine(SDNode *N,
26273 TargetLowering::DAGCombinerInfo &DCI,
26274 SelectionDAG &DAG) {
26275 MachineFunction &MF = DAG.getMachineFunction();
26276 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
26277 // will not be produced, as they are conditional branch instructions that do
26278 // not set flags.
26279 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
26280 return SDValue();
26281
26282 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
26283 N = NV.getNode();
26284 SDValue Chain = N->getOperand(0);
26285 SDValue Dest = N->getOperand(1);
26286 SDValue CCVal = N->getOperand(2);
26287 SDValue Cmp = N->getOperand(3);
26288
26289 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
26290 unsigned CC = CCVal->getAsZExtVal();
26291 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
26292 return SDValue();
26293
26294 // Fold away brcond(NE, cmp(csel(1, 0, CC, Cmp), 1)) -> brcond(~CC, Cmp)
26295 if (isCMP(Cmp) && CC == AArch64CC::NE && isOneConstant(Cmp.getOperand(1))) {
26296 SDValue CSel = Cmp.getOperand(0);
26297 auto CSelCC = getCSETCondCode(CSel);
26298 if (CSelCC) {
26299 SDLoc DL(N);
26300 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), Chain, Dest,
26301 getCondCode(DAG, getInvertedCondCode(*CSelCC)),
26302 CSel.getOperand(3));
26303 }
26304 }
26305
26306 unsigned CmpOpc = Cmp.getOpcode();
26307 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
26308 return SDValue();
26309
26310 // Only attempt folding if there is only one use of the flag and no use of the
26311 // value.
26312 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
26313 return SDValue();
26314
26315 SDValue LHS = Cmp.getOperand(0);
26316 SDValue RHS = Cmp.getOperand(1);
26317
26318 assert(LHS.getValueType() == RHS.getValueType() &&
26319 "Expected the value type to be the same for both operands!");
26320 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
26321 return SDValue();
26322
26323 if (isNullConstant(LHS))
26324 std::swap(LHS, RHS);
26325
26326 if (!isNullConstant(RHS))
26327 return SDValue();
26328
26329 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
26330 LHS.getOpcode() == ISD::SRL)
26331 return SDValue();
26332
26333 // Fold the compare into the branch instruction.
26334 SDValue BR;
26335 if (CC == AArch64CC::EQ)
26336 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
26337 else
26338 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
26339
26340 // Do not add new nodes to DAG combiner worklist.
26341 DCI.CombineTo(N, BR, false);
26342
26343 return SDValue();
26344}
26345
26346static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
26347 unsigned CC = N->getConstantOperandVal(2);
26348 SDValue SUBS = N->getOperand(3);
26349 SDValue Zero, CTTZ;
26350
26351 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
26352 Zero = N->getOperand(0);
26353 CTTZ = N->getOperand(1);
26354 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
26355 Zero = N->getOperand(1);
26356 CTTZ = N->getOperand(0);
26357 } else
26358 return SDValue();
26359
26360 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
26361 (CTTZ.getOpcode() == ISD::TRUNCATE &&
26362 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
26363 return SDValue();
26364
26365 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
26366 "Illegal type in CTTZ folding");
26367
26368 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
26369 return SDValue();
26370
26371 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
26372 ? CTTZ.getOperand(0).getOperand(0)
26373 : CTTZ.getOperand(0);
26374
26375 if (X != SUBS.getOperand(0))
26376 return SDValue();
26377
26378 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
26379 ? CTTZ.getOperand(0).getValueSizeInBits()
26380 : CTTZ.getValueSizeInBits();
26381 SDValue BitWidthMinusOne =
26382 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
26383 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
26384 BitWidthMinusOne);
26385}
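// Illustrative example for i32 (sketch): the select
//   csel w0, wzr, w9, eq      ; w9 = cttz(x), flags from (subs wzr, x, #0)
// becomes
//   and  w0, w9, #0x1f
// because the AArch64 cttz expansion (rbit + clz) already yields 32 for x == 0,
// and 32 & 31 == 0, making the explicit select redundant.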
26386
26387// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
26388// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
26389// Where x and y are constants and x != y
26390
26391// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
26392// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
26393// Where x and y are constants and x != y
26394static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
26395 SDValue L = Op->getOperand(0);
26396 SDValue R = Op->getOperand(1);
26397 AArch64CC::CondCode OpCC =
26398 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
26399
26400 SDValue OpCmp = Op->getOperand(3);
26401 if (!isCMP(OpCmp))
26402 return SDValue();
26403
26404 SDValue CmpLHS = OpCmp.getOperand(0);
26405 SDValue CmpRHS = OpCmp.getOperand(1);
26406
26407 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
26408 std::swap(CmpLHS, CmpRHS);
26409 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
26410 return SDValue();
26411
26412 SDValue X = CmpLHS->getOperand(0);
26413 SDValue Y = CmpLHS->getOperand(1);
26414 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
26415 return SDValue();
26416 }
26417
26418 // If one of the constants is an opaque constant, the x and y SDNodes can still
26419 // differ while the real values are the same, so compare the APInts to make sure the
26420 // code is correct.
26421 ConstantSDNode *CX = cast<ConstantSDNode>(X);
26422 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
26423 if (CX->getAPIntValue() == CY->getAPIntValue())
26424 return SDValue();
26425
26426 AArch64CC::CondCode CC =
26427 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
26428 SDValue Cond = CmpLHS->getOperand(3);
26429
26430 if (CmpRHS == Y)
26431 CC = getInvertedCondCode(CC);
26432 else if (CmpRHS != X)
26433 return SDValue();
26434
26435 if (OpCC == AArch64CC::NE)
26436 CC = getInvertedCondCode(CC);
26437 else if (OpCC != AArch64CC::EQ)
26438 return SDValue();
26439
26440 SDLoc DL(Op);
26441 EVT VT = Op->getValueType(0);
26442
26443 SDValue CCValue = getCondCode(DAG, CC);
26444 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
26445}
26446
26447// Reassociate the true/false expressions of a CSEL instruction to obtain a
26448// common subexpression with the comparison instruction. For example, change
26449// (CSEL (ADD (ADD x y) -c) f LO (SUBS x c)) to
26450// (CSEL (ADD (SUBS x c) y) f LO (SUBS x c)) such that (SUBS x c) is a common
26451// subexpression.
26452static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
26453 SDValue SubsNode = N->getOperand(3);
26454 if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse())
26455 return SDValue();
26456
26457 SDValue CmpOpToMatch = SubsNode.getOperand(1);
26458 SDValue CmpOpOther = SubsNode.getOperand(0);
26459 EVT VT = N->getValueType(0);
26460
26461 unsigned ExpectedOpcode;
26462 SDValue ExpectedOp;
26463 SDValue SubsOp;
26464 auto *CmpOpConst = dyn_cast<ConstantSDNode>(CmpOpToMatch);
26465 if (CmpOpConst) {
26466 ExpectedOpcode = ISD::ADD;
26467 ExpectedOp =
26468 DAG.getConstant(-CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
26469 CmpOpConst->getValueType(0));
26470 SubsOp = DAG.getConstant(CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
26471 CmpOpConst->getValueType(0));
26472 } else {
26473 ExpectedOpcode = ISD::SUB;
26474 ExpectedOp = CmpOpToMatch;
26475 SubsOp = CmpOpToMatch;
26476 }
26477
26478 // Get the operand that can be reassociated with the SUBS instruction.
26479 auto GetReassociationOp = [&](SDValue Op, SDValue ExpectedOp) {
26480 if (Op.getOpcode() != ExpectedOpcode)
26481 return SDValue();
26482 if (Op.getOperand(0).getOpcode() != ISD::ADD ||
26483 !Op.getOperand(0).hasOneUse())
26484 return SDValue();
26485 SDValue X = Op.getOperand(0).getOperand(0);
26486 SDValue Y = Op.getOperand(0).getOperand(1);
26487 if (X != CmpOpOther)
26488 std::swap(X, Y);
26489 if (X != CmpOpOther)
26490 return SDValue();
26491 if (ExpectedOp != Op.getOperand(1))
26492 return SDValue();
26493 return Y;
26494 };
26495
26496 // Try the reassociation using the given constant and condition code.
26497 auto Fold = [&](AArch64CC::CondCode NewCC, SDValue ExpectedOp,
26498 SDValue SubsOp) {
26499 SDValue TReassocOp = GetReassociationOp(N->getOperand(0), ExpectedOp);
26500 SDValue FReassocOp = GetReassociationOp(N->getOperand(1), ExpectedOp);
26501 if (!TReassocOp && !FReassocOp)
26502 return SDValue();
26503
26504 SDValue NewCmp =
26505 DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
26506 DAG.getVTList(VT, FlagsVT), CmpOpOther, SubsOp);
26507
26508 auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
26509 if (!ReassocOp)
26510 return N->getOperand(OpNum);
26511 SDValue Res = DAG.getNode(ISD::ADD, SDLoc(N->getOperand(OpNum)), VT,
26512 NewCmp.getValue(0), ReassocOp);
26513 DAG.ReplaceAllUsesWith(N->getOperand(OpNum), Res);
26514 return Res;
26515 };
26516
26517 SDValue TValReassoc = Reassociate(TReassocOp, 0);
26518 SDValue FValReassoc = Reassociate(FReassocOp, 1);
26519 return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc,
26520 getCondCode(DAG, NewCC), NewCmp.getValue(1));
26521 };
26522
26523 auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
26524
26525 // First, try to eliminate the compare instruction by searching for a
26526 // subtraction with the same constant.
26527 if (SDValue R = Fold(CC, ExpectedOp, SubsOp))
26528 return R;
26529
26530 if (!CmpOpConst) {
26531 // Try again with the operands of the SUBS instruction and the condition
26532 // swapped. Due to canonicalization, this only helps for non-constant
26533 // operands of the SUBS instruction.
26534 std::swap(CmpOpToMatch, CmpOpOther);
26535 if (SDValue R = Fold(getSwappedCondition(CC), CmpOpToMatch, CmpOpToMatch))
26536 return R;
26537 return SDValue();
26538 }
26539
26540 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero())
26541 return SDValue();
26542
26543 // Next, search for a subtraction with a slightly different constant. By
26544 // adjusting the condition code, we can still eliminate the compare
26545 // instruction. Adjusting the constant is only valid if it does not result
26546 // in signed/unsigned wrap for signed/unsigned comparisons, respectively.
26547 // Since such comparisons are trivially true/false, we should not encounter
26548 // them here but check for them nevertheless to be on the safe side.
26549 auto CheckedFold = [&](bool Check, APInt NewCmpConst,
26550 AArch64CC::CondCode NewCC) {
26551 auto ExpectedOp = DAG.getConstant(-NewCmpConst, SDLoc(CmpOpConst),
26552 CmpOpConst->getValueType(0));
26553 auto SubsOp = DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst),
26554 CmpOpConst->getValueType(0));
26555 return Check ? Fold(NewCC, ExpectedOp, SubsOp) : SDValue();
26556 };
26557 switch (CC) {
26558 case AArch64CC::EQ:
26559 case AArch64CC::LS:
26560 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
26561 CmpOpConst->getAPIntValue() + 1, AArch64CC::LO);
26562 case AArch64CC::NE:
26563 case AArch64CC::HI:
26564 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
26565 CmpOpConst->getAPIntValue() + 1, AArch64CC::HS);
26566 case AArch64CC::LO:
26567 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
26568 CmpOpConst->getAPIntValue() - 1, AArch64CC::LS);
26569 case AArch64CC::HS:
26570 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
26571 CmpOpConst->getAPIntValue() - 1, AArch64CC::HI);
26572 case AArch64CC::LT:
26573 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
26574 CmpOpConst->getAPIntValue() - 1, AArch64CC::LE);
26575 case AArch64CC::LE:
26576 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
26577 CmpOpConst->getAPIntValue() + 1, AArch64CC::LT);
26578 case AArch64CC::GT:
26579 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
26580 CmpOpConst->getAPIntValue() + 1, AArch64CC::GE);
26581 case AArch64CC::GE:
26582 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
26583 CmpOpConst->getAPIntValue() - 1, AArch64CC::GT);
26584 default:
26585 return SDValue();
26586 }
26587}
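// Illustrative example of the constant adjustment above: a CSEL whose flags
// come from (SUBS x, 10) with condition LS, and whose true value contains
// (ADD (ADD x y) -11), is rewritten to use (SUBS x, 11) with condition LO so
// that the subtraction x - 11 becomes a common subexpression; x <=u 10 and
// x <u 11 are equivalent, and the switch above applies the analogous +/-1
// adjustment for each condition code.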
26588
26589static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG) {
26590 AArch64CC::CondCode OpCC =
26591 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
26592
26593 if (OpCC != AArch64CC::NE)
26594 return SDValue();
26595
26596 SDValue PTest = Op->getOperand(3);
26597 if (PTest.getOpcode() != AArch64ISD::PTEST_ANY)
26598 return SDValue();
26599
26600 SDValue TruePred = PTest.getOperand(0);
26601 SDValue AnyPred = PTest.getOperand(1);
26602
26603 if (TruePred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
26604 TruePred = TruePred.getOperand(0);
26605
26606 if (AnyPred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
26607 AnyPred = AnyPred.getOperand(0);
26608
26609 if (TruePred != AnyPred && !isAllActivePredicate(DAG, TruePred))
26610 return SDValue();
26611
26612 SDValue LastB = Op->getOperand(0);
26613 SDValue Default = Op->getOperand(1);
26614
26615 if (LastB.getOpcode() != AArch64ISD::LASTB || LastB.getOperand(0) != AnyPred)
26616 return SDValue();
26617
26618 return DAG.getNode(AArch64ISD::CLASTB_N, SDLoc(Op), Op->getValueType(0),
26619 AnyPred, Default, LastB.getOperand(1));
26620}
26621
26622// Optimize CSEL instructions
26623static SDValue performCSELCombine(SDNode *N,
26624 TargetLowering::DAGCombinerInfo &DCI,
26625 SelectionDAG &DAG) {
26626 // CSEL x, x, cc -> x
26627 if (N->getOperand(0) == N->getOperand(1))
26628 return N->getOperand(0);
26629
26630 if (SDValue R = foldCSELOfCSEL(N, DAG))
26631 return R;
26632
26633 // Try to reassociate the true/false expressions so that we can do CSE with
26634 // a SUBS instruction used to perform the comparison.
26635 if (SDValue R = reassociateCSELOperandsForCSE(N, DAG))
26636 return R;
26637
26638 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
26639 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
26640 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
26641 return Folded;
26642
26643 // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x)
26644 // if SUB(y, x) already exists and we can produce a swapped predicate for cc.
26645 SDValue Cond = N->getOperand(3);
26646 if (DCI.isAfterLegalizeDAG() && Cond.getOpcode() == AArch64ISD::SUBS &&
26647 Cond.hasOneUse() && Cond->hasNUsesOfValue(0, 0) &&
26648 DAG.doesNodeExist(ISD::SUB, N->getVTList(),
26649 {Cond.getOperand(1), Cond.getOperand(0)}) &&
26650 !DAG.doesNodeExist(ISD::SUB, N->getVTList(),
26651 {Cond.getOperand(0), Cond.getOperand(1)}) &&
26652 !isNullConstant(Cond.getOperand(1))) {
26653 AArch64CC::CondCode OldCond =
26654 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
26655 AArch64CC::CondCode NewCond = getSwappedCondition(OldCond);
26656 if (NewCond != AArch64CC::AL) {
26657 SDLoc DL(N);
26658 SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
26659 Cond.getOperand(1), Cond.getOperand(0));
26660 return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0),
26661 N->getOperand(1), getCondCode(DAG, NewCond),
26662 Sub.getValue(1));
26663 }
26664 }
26665
26666 // CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
26667 if (SDValue CondLast = foldCSELofLASTB(N, DAG))
26668 return CondLast;
26669
26670 return performCONDCombine(N, DCI, DAG, 2, 3);
26671}
26672
26673// Try to re-use an already extended operand of a vector SetCC feeding an
26674// extended select. Doing so avoids requiring another full extension of the
26675// SET_CC result when lowering the select.
26676static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
26677 EVT Op0MVT = Op->getOperand(0).getValueType();
26678 if (!Op0MVT.isVector() || Op->use_empty())
26679 return SDValue();
26680
26681 // Make sure that all uses of Op are VSELECTs with matching result types where
26682 // the result type has a larger element type than the SetCC operand.
26683 SDNode *FirstUse = *Op->user_begin();
26684 if (FirstUse->getOpcode() != ISD::VSELECT)
26685 return SDValue();
26686 EVT UseMVT = FirstUse->getValueType(0);
26687 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
26688 return SDValue();
26689 if (any_of(Op->users(), [&UseMVT](const SDNode *N) {
26690 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
26691 }))
26692 return SDValue();
26693
26694 APInt V;
26695 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
26696 return SDValue();
26697
26698 SDLoc DL(Op);
26699 SDValue Op0ExtV;
26700 SDValue Op1ExtV;
26701 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
26702 // Check if the first operand of the SET_CC is already extended. If it is,
26703 // split the SET_CC and re-use the extended version of the operand.
26704 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
26705 Op->getOperand(0));
26706 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
26707 Op->getOperand(0));
26708 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
26709 Op0ExtV = SDValue(Op0SExt, 0);
26710 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
26711 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
26712 Op0ExtV = SDValue(Op0ZExt, 0);
26713 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
26714 } else
26715 return SDValue();
26716
26717 return DAG.getNode(ISD::SETCC, DL,
26718 UseMVT.changeVectorElementType(*DAG.getContext(), MVT::i1),
26719 Op0ExtV, Op1ExtV, Op->getOperand(2));
26720}
26721
26722static SDValue
26723performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
26724 SelectionDAG &DAG) {
26725 SDValue Vec = N->getOperand(0);
26726 if (DCI.isBeforeLegalize() &&
26727 Vec.getValueType().getVectorElementType() == MVT::i1 &&
26728 Vec.getValueType().isFixedLengthVector() &&
26729 Vec.getValueType().isPow2VectorType()) {
26730 SDLoc DL(N);
26731 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
26732 DAG);
26733 }
26734
26735 return SDValue();
26736}
26737
26738static SDValue performSETCCCombine(SDNode *N,
26739 TargetLowering::DAGCombinerInfo &DCI,
26740 SelectionDAG &DAG) {
26741 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
26742 SDValue LHS = N->getOperand(0);
26743 SDValue RHS = N->getOperand(1);
26744 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
26745 SDLoc DL(N);
26746 EVT VT = N->getValueType(0);
26747
26748 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
26749 return V;
26750
26751 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
26752 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
26753 LHS->getOpcode() == AArch64ISD::CSEL &&
26754 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
26755 LHS->hasOneUse()) {
26756 // Invert CSEL's condition.
26757 auto OldCond =
26758 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
26759 auto NewCond = getInvertedCondCode(OldCond);
26760
26761 // csel 0, 1, !cond, X
26762 SDValue CSEL = DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(),
26763 LHS.getOperand(0), LHS.getOperand(1),
26764 getCondCode(DAG, NewCond), LHS.getOperand(3));
26765 return DAG.getZExtOrTrunc(CSEL, DL, VT);
26766 }
26767
26768 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
26769 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
26770 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
26771 LHS->hasOneUse()) {
26772 EVT TstVT = LHS->getValueType(0);
26773 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64 &&
26774 LHS->getConstantOperandVal(1) < TstVT.getFixedSizeInBits()) {
26775 // This pattern gets optimized better in emitComparison.
26776 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
26777 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
26778 DAG.getSignedConstant(TstImm, DL, TstVT));
26779 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
26780 }
26781 }
26782
26783 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
26784 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
26785 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
26786 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
26787 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
26788 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
26789 (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
26790 LHS->getOpcode() == ISD::BITCAST) {
26791 EVT ToVT = LHS->getValueType(0);
26792 EVT FromVT = LHS->getOperand(0).getValueType();
26793 if (FromVT.isFixedLengthVector() &&
26794 FromVT.getVectorElementType() == MVT::i1) {
26795 bool IsNull = isNullConstant(RHS);
26796 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
26797 DL, MVT::i1, LHS->getOperand(0));
26798 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
26799 LHS);
26800 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
26801 }
26802 }
26803
26804 // Try to perform the memcmp when the result is tested for [in]equality with 0
26805 if (SDValue V = performOrXorChainCombine(N, DAG))
26806 return V;
26807
26808 EVT CmpVT = LHS.getValueType();
26809
26810 // NOTE: This exists as a combine only because it proved too awkward to match
26811 // splat(1) across all the NEON types during isel.
26812 APInt SplatLHSVal;
26813 if (CmpVT.isInteger() && Cond == ISD::SETGT &&
26814 ISD::isConstantSplatVector(LHS.getNode(), SplatLHSVal) &&
26815 SplatLHSVal.isOne())
26816 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, CmpVT), RHS, ISD::SETGE);
26817
26818 return SDValue();
26819}
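// Illustrative sketch of the SRL rule above (widths are placeholders): for an
// i32 value x,
//   setcc (srl x, 3), 0, ne
// is rewritten as
//   setcc (and x, 0xfffffff8), 0, ne
// which emitComparison can typically select as a single TST with a logical
// immediate instead of a shift followed by a compare.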
26820
26821// Replace a flag-setting operator (e.g. ANDS) with the generic version
26822// (e.g. AND) if the flag result is unused.
26823static SDValue performFlagSettingCombine(SDNode *N,
26824 TargetLowering::DAGCombinerInfo &DCI,
26825 unsigned GenericOpcode) {
26826 SDLoc DL(N);
26827 SDValue LHS = N->getOperand(0);
26828 SDValue RHS = N->getOperand(1);
26829 EVT VT = N->getValueType(0);
26830
26831 // If the flag result isn't used, convert back to a generic opcode.
26832 if (!N->hasAnyUseOfValue(1)) {
26833 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
26834 return DCI.CombineTo(N, Res, SDValue(N, 1));
26835 }
26836
26837 // Combine equivalent generic nodes into this node, re-using the result.
26838 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
26839 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS},
26840 /*AllowCommute=*/true))
26841 DCI.CombineTo(Generic, SDValue(N, 0));
26842
26843 return SDValue();
26844}
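// Illustrative sketch (value numbers are placeholders): for
//   t0: i32, i32(flags) = AArch64ISD::ANDS tA, tB
// with no user of the flags result, the node is rewritten as a plain
//   ISD::AND tA, tB
// and, conversely, when the flags are used, any equivalent plain AND of the
// same operands is CSE'd onto result 0 of the ANDS.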
26845
26846static SDValue performANDSCombine(SDNode *N,
26847 TargetLowering::DAGCombinerInfo &DCI) {
26848 SelectionDAG &DAG = DCI.DAG;
26849 if (SDValue R = performFlagSettingCombine(N, DCI, ISD::AND))
26850 return R;
26851
26852 // If we have no uses of the AND value, use performANDORCSELCombine to try to
26853 // convert ANDS(CSET(CMP), CSET(CMP)) into CMP(CSET(CCMP(CMP))). The outer
26854 // CMP(CSET should be removed by other combines, folded into the use of the
26855 // CMP.
26856 if (!N->hasAnyUseOfValue(0))
26857 if (SDValue R = performANDORCSELCombine(N, DAG))
26858 return DAG.getNode(AArch64ISD::SUBS, SDLoc(N), N->getVTList(), R,
26859 DAG.getConstant(0, SDLoc(N), N->getValueType(0)));
26860
26861 return SDValue();
26862}
26863
26864static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
26865 // setcc_merge_zero pred
26866 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
26867 // => extract_subvector (inner setcc_merge_zero)
26868 SDValue Pred = N->getOperand(0);
26869 SDValue LHS = N->getOperand(1);
26870 SDValue RHS = N->getOperand(2);
26871 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
26872
26873 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
26874 LHS->getOpcode() != ISD::SIGN_EXTEND)
26875 return SDValue();
26876
26877 SDValue Extract = LHS->getOperand(0);
26878 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
26879 Extract->getValueType(0) != N->getValueType(0) ||
26880 Extract->getConstantOperandVal(1) != 0)
26881 return SDValue();
26882
26883 SDValue InnerSetCC = Extract->getOperand(0);
26884 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
26885 return SDValue();
26886
26887 // By this point we've effectively got
26888 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
26889 // lanes are already zero then the trunc(sext()) sequence is redundant and we
26890 // can operate on A directly.
26891 SDValue InnerPred = InnerSetCC.getOperand(0);
26892 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
26893 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
26894 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
26895 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
26896 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
26897 return Extract;
26898
26899 return SDValue();
26900}
26901
26902static bool isSignExtInReg(const SDValue &V) {
26903 if (V.getOpcode() != AArch64ISD::VASHR ||
26904 V.getOperand(0).getOpcode() != AArch64ISD::VSHL)
26905 return false;
26906
26907 unsigned BitWidth = V->getValueType(0).getScalarSizeInBits();
26908 unsigned ShiftAmtR = V.getConstantOperandVal(1);
26909 unsigned ShiftAmtL = V.getOperand(0).getConstantOperandVal(1);
26910 return (ShiftAmtR == ShiftAmtL && ShiftAmtR == (BitWidth - 1));
26911}
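// Illustrative sketch of the pattern matched above: for 32-bit lanes,
//   VASHR (VSHL x, 31), 31
// moves bit 0 of each lane into the sign bit and shifts it back, i.e. it
// sign-extends the low bit across the whole lane (a sign-extend-in-register).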
26912
26913static SDValue
26914performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
26915 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
26916 "Unexpected opcode!");
26917
26918 SelectionDAG &DAG = DCI.DAG;
26919 SDValue Pred = N->getOperand(0);
26920 SDValue LHS = N->getOperand(1);
26921 SDValue RHS = N->getOperand(2);
26922 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
26923
26924 if (SDValue V = performSetCCPunpkCombine(N, DAG))
26925 return V;
26926
26927 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
26928 LHS->getOpcode() == ISD::SIGN_EXTEND &&
26929 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
26930 // setcc_merge_zero(
26931 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
26932 // => setcc_merge_zero(pred, ...)
26933 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
26934 LHS->getOperand(0)->getOperand(0) == Pred)
26935 return LHS->getOperand(0);
26936
26937 // setcc_merge_zero(
26938 // all_active, extend(nxvNi1 ...), != splat(0))
26939 // -> nxvNi1 ...
26940 if (isAllActivePredicate(DAG, Pred))
26941 return LHS->getOperand(0);
26942
26943 // setcc_merge_zero(
26944 // pred, extend(nxvNi1 ...), != splat(0))
26945 // -> nxvNi1 and(pred, ...)
26946 if (DCI.isAfterLegalizeDAG())
26947 // Do this after legalization to allow more folds on setcc_merge_zero
26948 // to be recognized.
26949 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
26950 LHS->getOperand(0), Pred);
26951 }
26952
26953 // setcc_merge_zero(
26954 // pred, insert_subvector(undef, signext_inreg(vNi1), 0), != splat(0))
26955 // => setcc_merge_zero(
26956 // pred, insert_subvector(undef, shl(vNi1), 0), != splat(0))
26957 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
26958 LHS->getOpcode() == ISD::INSERT_SUBVECTOR && LHS.hasOneUse()) {
26959 SDValue L0 = LHS->getOperand(0);
26960 SDValue L1 = LHS->getOperand(1);
26961 SDValue L2 = LHS->getOperand(2);
26962
26963 if (L0.isUndef() && isNullConstant(L2) && isSignExtInReg(L1)) {
26964 SDLoc DL(N);
26965 SDValue Shl = L1.getOperand(0);
26966 SDValue NewLHS = DAG.getNode(ISD::INSERT_SUBVECTOR, DL,
26967 LHS.getValueType(), L0, Shl, L2);
26968 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, N->getValueType(0),
26969 Pred, NewLHS, RHS, N->getOperand(3));
26970 }
26971 }
26972
26973 return SDValue();
26974}
26975
26976// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
26977// as well as whether the test should be inverted. This code is required to
26978// catch these cases (as opposed to standard dag combines) because
26979// AArch64ISD::TBZ is matched during legalization.
26980static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
26981 SelectionDAG &DAG) {
26982
26983 if (!Op->hasOneUse())
26984 return Op;
26985
26986 // We don't handle undef/constant-fold cases below, as they should have
26987 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
26988 // etc.)
26989
26990 // (tbz (trunc x), b) -> (tbz x, b)
26991 // This case is just here to enable more of the below cases to be caught.
26992 if (Op->getOpcode() == ISD::TRUNCATE &&
26993 Bit < Op->getValueType(0).getSizeInBits()) {
26994 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26995 }
26996
26997 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
26998 if (Op->getOpcode() == ISD::ANY_EXTEND &&
26999 Bit < Op->getOperand(0).getValueSizeInBits()) {
27000 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
27001 }
27002
27003 if (Op->getNumOperands() != 2)
27004 return Op;
27005
27006 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
27007 if (!C)
27008 return Op;
27009
27010 switch (Op->getOpcode()) {
27011 default:
27012 return Op;
27013
27014 // (tbz (and x, m), b) -> (tbz x, b)
27015 case ISD::AND:
27016 if ((C->getZExtValue() >> Bit) & 1)
27017 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
27018 return Op;
27019
27020 // (tbz (shl x, c), b) -> (tbz x, b-c)
27021 case ISD::SHL:
27022 if (C->getZExtValue() <= Bit &&
27023 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
27024 Bit = Bit - C->getZExtValue();
27025 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
27026 }
27027 return Op;
27028
27029 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
27030 case ISD::SRA:
27031 Bit = Bit + C->getZExtValue();
27032 if (Bit >= Op->getValueType(0).getSizeInBits())
27033 Bit = Op->getValueType(0).getSizeInBits() - 1;
27034 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
27035
27036 // (tbz (srl x, c), b) -> (tbz x, b+c)
27037 case ISD::SRL:
27038 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
27039 Bit = Bit + C->getZExtValue();
27040 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
27041 }
27042 return Op;
27043
27044 // (tbz (xor x, -1), b) -> (tbnz x, b)
27045 case ISD::XOR:
27046 if ((C->getZExtValue() >> Bit) & 1)
27047 Invert = !Invert;
27048 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
27049 }
27050}
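// Illustrative sketch (bit positions are placeholders): starting from
//   tbz (and (srl x, 2), 0x10), #4
// the AND is looked through because bit 4 of the mask is set, then the SRL
// case folds the shift amount into the bit index, leaving
//   tbz x, #6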
27051
27052// Optimize test single bit zero/non-zero and branch.
27053static SDValue performTBZCombine(SDNode *N,
27054 TargetLowering::DAGCombinerInfo &DCI,
27055 SelectionDAG &DAG) {
27056 unsigned Bit = N->getConstantOperandVal(2);
27057 bool Invert = false;
27058 SDValue TestSrc = N->getOperand(1);
27059 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
27060
27061 if (TestSrc == NewTestSrc)
27062 return SDValue();
27063
27064 unsigned NewOpc = N->getOpcode();
27065 if (Invert) {
27066 if (NewOpc == AArch64ISD::TBZ)
27067 NewOpc = AArch64ISD::TBNZ;
27068 else {
27069 assert(NewOpc == AArch64ISD::TBNZ);
27070 NewOpc = AArch64ISD::TBZ;
27071 }
27072 }
27073
27074 SDLoc DL(N);
27075 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
27076 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
27077}
27078
27079// Swap vselect operands when doing so may allow the select to be implemented
27080// as a predicated operation.
27081//
27082// (vselect (setcc (condcode) (_) (_)) (a) (op (a) (b)))
27083// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
27084static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
27085 auto SelectA = N->getOperand(1);
27086 auto SelectB = N->getOperand(2);
27087 auto NTy = N->getValueType(0);
27088
27089 if (!NTy.isScalableVector())
27090 return SDValue();
27091 SDValue SetCC = N->getOperand(0);
27092 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
27093 return SDValue();
27094
27095 switch (SelectB.getOpcode()) {
27096 default:
27097 return SDValue();
27098 case ISD::FMUL:
27099 case ISD::FSUB:
27100 case ISD::FADD:
27101 break;
27102 }
27103 if (SelectA != SelectB.getOperand(0))
27104 return SDValue();
27105
27106 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
27107 ISD::CondCode InverseCC =
27108 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
27109 auto InverseSetCC =
27110 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
27111 SetCC.getOperand(1), InverseCC);
27112
27113 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
27114 {InverseSetCC, SelectB, SelectA});
27115}
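// Illustrative sketch (operand names are placeholders), for scalable vectors:
//   vselect (setcc a, b, setge), x, (fadd x, y)
// becomes
//   vselect (setcc a, b, setlt), (fadd x, y), x
// so the FADD can be selected as a predicated operation that leaves the
// inactive lanes equal to x, avoiding a separate sel.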
27116
27117// vselect (v1i1 setcc) ->
27118// vselect (v1iXX setcc) (XX is the size of the compared operand type)
27119// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
27120// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
27121// such VSELECT.
27122static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
27123 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
27124 return SwapResult;
27125
27126 SDValue N0 = N->getOperand(0);
27127 SDValue IfTrue = N->getOperand(1);
27128 SDValue IfFalse = N->getOperand(2);
27129 EVT ResVT = N->getValueType(0);
27130 EVT CCVT = N0.getValueType();
27131
27132 if (isAllActivePredicate(DAG, N0))
27133 return N->getOperand(1);
27134
27135 if (isAllInactivePredicate(N0))
27136 return N->getOperand(2);
27137
27138 if (isMergePassthruOpcode(IfTrue.getOpcode()) && IfTrue.hasOneUse()) {
27139 // vselect A, (merge_passthru_op all_active, B,{Bn,} -), C
27140 // vselect A, (merge_passthru_op -, B,{Bn,} undef), C
27141 // vselect A, (merge_passthru_op A, B,{Bn,} -), C
27142 // -> merge_passthru_op A, B,{Bn,} C
27143 if (isAllActivePredicate(DAG, IfTrue->getOperand(0)) ||
27144 IfTrue->getOperand(IfTrue.getNumOperands() - 1).isUndef() ||
27145 IfTrue->getOperand(0) == N0) {
27146 SmallVector<SDValue> Ops(IfTrue->op_begin(), IfTrue->op_end());
27147 Ops[0] = N0;
27148 Ops[IfTrue.getNumOperands() - 1] = IfFalse;
27149
27150 return DAG.getNode(IfTrue.getOpcode(), SDLoc(N), ResVT, Ops);
27151 }
27152 }
27153
27154 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
27155 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
27156 // supported types.
27157 SDValue SetCC = N->getOperand(0);
27158 if (SetCC.getOpcode() == ISD::SETCC &&
27159 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
27160 SDValue CmpLHS = SetCC.getOperand(0);
27161 EVT VT = CmpLHS.getValueType();
27162 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
27163 SDNode *SplatLHS = N->getOperand(1).getNode();
27164 SDNode *SplatRHS = N->getOperand(2).getNode();
27165 APInt SplatLHSVal;
27166 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
27167 VT.isSimple() &&
27168 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
27169 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
27170 VT.getSimpleVT().SimpleTy) &&
27171 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
27172 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
27173 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
27174 unsigned NumElts = VT.getVectorNumElements();
27175 SmallVector<SDValue> Ops(
27176 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
27177 VT.getScalarType()));
27178 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
27179
27180 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
27181 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
27182 return Or;
27183 }
27184 }
27185
27186 EVT CmpVT = N0.getOperand(0).getValueType();
27187 if (N0.getOpcode() != ISD::SETCC ||
27188 CCVT.getVectorElementCount() != ResVT.getVectorElementCount() ||
27189 CCVT.getVectorElementType() != MVT::i1 ||
27190 CmpVT.getVectorElementType().isFloatingPoint())
27191 return SDValue();
27192
27193 // Only combine when the result type is of the same size as the compared
27194 // operands.
27195 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
27196 return SDValue();
27197
27198 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
27199 N0.getOperand(0), N0.getOperand(1),
27200 cast<CondCodeSDNode>(N0.getOperand(2))->get());
27201 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
27202 IfTrue, IfFalse);
27203}
27204
27205/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
27206/// the compare-mask instructions rather than going via NZCV, even if LHS and
27207/// RHS are really scalar. This replaces any scalar setcc in the above pattern
27208/// with a vector one followed by a DUP shuffle on the result.
27209static SDValue performSelectCombine(SDNode *N,
27210 TargetLowering::DAGCombinerInfo &DCI) {
27211 SelectionDAG &DAG = DCI.DAG;
27212 SDValue N0 = N->getOperand(0);
27213 EVT ResVT = N->getValueType(0);
27214
27215 if (N0.getOpcode() != ISD::SETCC)
27216 return SDValue();
27217
27218 if (ResVT.isScalableVT())
27219 return SDValue();
27220
27221 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
27222 // scalar SetCCResultType. We also don't expect vectors, because we assume
27223 // that selects fed by vector SETCCs are canonicalized to VSELECT.
27224 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
27225 "Scalar-SETCC feeding SELECT has unexpected result type!");
27226
27227 // Don't try to do this optimization when the setcc itself has i1 operands.
27228 // There are no legal vectors of i1, so this would be pointless. v1f16 is
27229 // ruled out to prevent the creation of setcc that need to be scalarized.
27230 EVT SrcVT = N0.getOperand(0).getValueType();
27231 if (SrcVT == MVT::i1 ||
27232 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
27233 return SDValue();
27234
27235 // If NumMaskElts == 0, the comparison is larger than the select result. The
27236 // largest real NEON comparison is 64 bits per lane, which means the result is
27237 // at most 32 bits and an illegal vector. Just bail out for now.
27238 unsigned NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
27239 if (!ResVT.isVector() || NumMaskElts == 0)
27240 return SDValue();
27241
27242 // Avoid creating vectors with excessive VFs before legalization.
27243 if (DCI.isBeforeLegalize() && NumMaskElts != ResVT.getVectorNumElements())
27244 return SDValue();
27245
27246 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
27247 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
27248
27249 // Also bail out if the vector CCVT isn't the same size as ResVT.
27250 // This can happen if the SETCC operand size doesn't divide the ResVT size
27251 // (e.g., f64 vs v3f32).
27252 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
27253 return SDValue();
27254
27255 // Make sure we didn't create illegal types, if we're not supposed to.
27256 assert(DCI.isBeforeLegalize() ||
27257 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
27258
27259 // First perform a vector comparison, where lane 0 is the one we're interested
27260 // in.
27261 SDLoc DL(N0);
27262 SDValue LHS =
27263 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
27264 SDValue RHS =
27265 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
27266 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
27267
27268 // Now duplicate the comparison mask we want across all other lanes.
27269 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
27270 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
27271 Mask = DAG.getNode(ISD::BITCAST, DL,
27272 ResVT.changeVectorElementTypeToInteger(), Mask);
27273
27274 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
27275}
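// Illustrative sketch (value numbers and types are placeholders): for
//   t0: i1    = setcc f32 a, f32 b, setogt
//   t1: v4f32 = select t0, vL, vR
// the scalar compare is rebuilt as
//   t2: v4i32 = setcc (scalar_to_vector a), (scalar_to_vector b), setogt
// lane 0 of t2 is broadcast to all lanes with a DUP-style shuffle, and the
// result is used as the mask of a vector select, avoiding a trip through NZCV.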
27276
27277static SDValue performDUPCombine(SDNode *N,
27278 TargetLowering::DAGCombinerInfo &DCI) {
27279 EVT VT = N->getValueType(0);
27280 SDLoc DL(N);
27281 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
27282 // 128bit vector version.
27283 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
27284 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
27285 SmallVector<SDValue> Ops(N->ops());
27286 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
27287 DCI.DAG.getVTList(LVT), Ops)) {
27288 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
27289 DCI.DAG.getConstant(0, DL, MVT::i64));
27290 }
27291 }
27292
27293 if (N->getOpcode() == AArch64ISD::DUP) {
27294 SDValue Op = N->getOperand(0);
27295
27296 // Optimize DUP(extload/zextload i8/i16/i32) to avoid GPR->FPR transfer.
27297 // For example:
27298 // v4i32 = DUP (i32 (zextloadi8 addr))
27299 // =>
27300 // v4i32 = SCALAR_TO_VECTOR (i32 (zextloadi8 addr)) ; Matches to ldr b0
27301 // v4i32 = DUPLANE32 (v4i32), 0
27302 if (auto *LD = dyn_cast<LoadSDNode>(Op)) {
27303 ISD::LoadExtType ExtType = LD->getExtensionType();
27304 EVT MemVT = LD->getMemoryVT();
27305 EVT ElemVT = VT.getVectorElementType();
27306 if ((ExtType == ISD::EXTLOAD || ExtType == ISD::ZEXTLOAD) &&
27307 (MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) &&
27308 ElemVT != MemVT && LD->hasOneUse()) {
27309 EVT Vec128VT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
27310 128 / ElemVT.getSizeInBits());
27311 SDValue ScalarToVec =
27312 DCI.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, Op);
27313 return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, ScalarToVec,
27314 DCI.DAG.getConstant(0, DL, MVT::i64));
27315 }
27316 }
27317
27318 // If the instruction is known to produce a scalar in SIMD registers, we can
27319 // duplicate it across the vector lanes using DUPLANE instead of moving it
27320 // to a GPR first. For example, this allows us to handle:
27321 // v4i32 = DUP (i32 (FCMGT (f32, f32)))
27322 // FIXME: Ideally, we should be able to handle all instructions that
27323 // produce a scalar value in FPRs.
27324 if (Op.getOpcode() == AArch64ISD::FCMEQ ||
27325 Op.getOpcode() == AArch64ISD::FCMGE ||
27326 Op.getOpcode() == AArch64ISD::FCMGT) {
27327 EVT ElemVT = VT.getVectorElementType();
27328 EVT ExpandedVT = VT;
27329 // Insert into a 128-bit vector to match DUPLANE's pattern.
27330 if (VT.getSizeInBits() != 128)
27331 ExpandedVT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
27332 128 / ElemVT.getSizeInBits());
27333 SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64);
27334 SDValue Vec = DCI.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpandedVT,
27335 DCI.DAG.getUNDEF(ExpandedVT), Op, Zero);
27336 return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, Vec, Zero);
27337 }
27338
27339 if (DCI.isAfterLegalizeDAG()) {
27340 // If scalar dup's operand is extract_vector_elt, try to combine them into
27341 // duplane. For example,
27342 //
27343 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
27344 // t18: v4i32 = AArch64ISD::DUP t21
27345 // ==>
27346 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
27347 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
27348 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
27349 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
27350 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
27351 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
27352 EXTRACT_VEC_ELT.getOperand(1));
27353 }
27354 }
27355 }
27356
27357 return performPostLD1Combine(N, DCI, false);
27358 }
27359
27360 return SDValue();
27361}
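// Illustrative sketch of the first case above (value numbers are
// placeholders): if the DAG already contains
//   t1: v4i32 = AArch64ISD::DUP t0
// then a 64-bit
//   t2: v2i32 = AArch64ISD::DUP t0
// is replaced with
//   extract_subvector t1, 0
// so both widths share the single 128-bit dup.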
27362
27363/// Get rid of unnecessary NVCASTs (that don't change the type).
27364static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
27365 if (N->getValueType(0) == N->getOperand(0).getValueType())
27366 return N->getOperand(0);
27367 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
27368 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
27369 N->getOperand(0).getOperand(0));
27370
27371 return SDValue();
27372}
27373
27374// If all users of the globaladdr are of the form (globaladdr + constant), find
27375// the smallest constant, fold it into the globaladdr's offset and rewrite the
27376// globaladdr as (globaladdr + constant) - constant.
27377static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
27378 const AArch64Subtarget *Subtarget,
27379 const TargetMachine &TM) {
27380 auto *GN = cast<GlobalAddressSDNode>(N);
27381 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
27382 AArch64II::MO_NO_FLAG)
27383 return SDValue();
27384
27385 uint64_t MinOffset = -1ull;
27386 for (SDNode *N : GN->users()) {
27387 if (N->getOpcode() != ISD::ADD)
27388 return SDValue();
27389 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
27390 if (!C)
27391 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
27392 if (!C)
27393 return SDValue();
27394 MinOffset = std::min(MinOffset, C->getZExtValue());
27395 }
27396 uint64_t Offset = MinOffset + GN->getOffset();
27397
27398 // Require that the new offset is larger than the existing one. Otherwise, we
27399 // can end up oscillating between two possible DAGs, for example,
27400 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
27401 if (Offset <= uint64_t(GN->getOffset()))
27402 return SDValue();
27403
27404 // Check whether folding this offset is legal. It must not go out of bounds of
27405 // the referenced object to avoid violating the code model, and must be
27406 // smaller than 2^20 because this is the largest offset expressible in all
27407 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
27408 // stores an immediate signed 21 bit offset.)
27409 //
27410 // This check also prevents us from folding negative offsets, which will end
27411 // up being treated in the same way as large positive ones. They could also
27412 // cause code model violations, and aren't really common enough to matter.
27413 if (Offset >= (1 << 20))
27414 return SDValue();
27415
27416 const GlobalValue *GV = GN->getGlobal();
27417 Type *T = GV->getValueType();
27418 if (!T->isSized() ||
27420 return SDValue();
27421
27422 SDLoc DL(GN);
27423 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
27424 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
27425 DAG.getConstant(MinOffset, DL, MVT::i64));
27426}
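// Illustrative sketch (symbol and offsets are placeholders): if the only uses
// of @g are
//   add (globaladdr @g), 16    and    add (globaladdr @g), 24
// the address is rewritten as (globaladdr @g + 16) - 16, after which generic
// combines fold the users to (globaladdr @g + 16) and
// add (globaladdr @g + 16), 8, letting the ADRP/ADD pair materialize @g+16.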
27427
27428static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
27429 const AArch64Subtarget *Subtarget) {
27430 SDValue BR = N->getOperand(0);
27431 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
27432 !BR.getValueType().isScalarInteger())
27433 return SDValue();
27434
27435 SDLoc DL(N);
27436 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
27437}
27438
27439// Turns the vector of indices into a vector of byte offsets by scaling Offset
27440// by (BitWidth / 8).
27441static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
27442 SDLoc DL, unsigned BitWidth) {
27443 assert(Offset.getValueType().isScalableVector() &&
27444 "This method is only for scalable vectors of offsets");
27445
27446 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
27447 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
27448
27449 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
27450}
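// Illustrative example: with BitWidth == 32 the indices are shifted left by
// log2(32 / 8) == 2, so an nxv2i64 index vector <0, 1, 2, ...> becomes the
// byte-offset vector <0, 4, 8, ...>.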
27451
27452/// Check if the value of \p OffsetInBytes can be used as an immediate for
27453/// the gather load/prefetch and scatter store instructions with vector base and
27454/// immediate offset addressing mode:
27455///
27456/// [<Zn>.[S|D]{, #<imm>}]
27457///
27458/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
27459inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
27460 unsigned ScalarSizeInBytes) {
27461 // The immediate is not a multiple of the scalar size.
27462 if (OffsetInBytes % ScalarSizeInBytes)
27463 return false;
27464
27465 // The immediate is out of range.
27466 if (OffsetInBytes / ScalarSizeInBytes > 31)
27467 return false;
27468
27469 return true;
27470}
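// Illustrative example: for 4-byte scalars (ScalarSizeInBytes == 4) the valid
// immediates are 0, 4, 8, ..., 124, i.e. multiples of 4 up to 31 * 4.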
27471
27472/// Check if the value of \p Offset represents a valid immediate for the SVE
27473/// gather load/prefetch and scatter store instructions with vector base and
27474/// immediate offset addressing mode:
27475///
27476/// [<Zn>.[S|D]{, #<imm>}]
27477///
27478/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
27479inline static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
27480 unsigned ScalarSizeInBytes) {
27481 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
27482 return OffsetConst && isValidImmForSVEVecImmAddrMode(
27483 OffsetConst->getZExtValue(), ScalarSizeInBytes);
27484}
27485
27486static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
27487 unsigned Opcode,
27488 bool OnlyPackedOffsets = true) {
27489 const SDValue Src = N->getOperand(2);
27490 const EVT SrcVT = Src->getValueType(0);
27491 assert(SrcVT.isScalableVector() &&
27492 "Scatter stores are only possible for SVE vectors");
27493
27494 SDLoc DL(N);
27495 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
27496
27497 // Make sure that the source data will fit into an SVE register
27498 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
27499 return SDValue();
27500
27501 // For FPs, ACLE only supports _packed_ single and double precision types.
27502 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
27503 if (SrcElVT.isFloatingPoint())
27504 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
27505 ((Opcode != AArch64ISD::SST1Q_PRED &&
27506 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
27507 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
27508 return SDValue();
27509
27510 // Depending on the addressing mode, this is either a pointer or a vector of
27511 // pointers (that fits into one register)
27512 SDValue Base = N->getOperand(4);
27513 // Depending on the addressing mode, this is either a single offset or a
27514 // vector of offsets (that fits into one register)
27515 SDValue Offset = N->getOperand(5);
27516
27517 // For "scalar + vector of indices", just scale the indices. This only
27518 // applies to non-temporal scatters because there's no instruction that takes
27519 // indices.
27520 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
27521 Offset =
27522 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcVT.getScalarSizeInBits());
27523 Opcode = AArch64ISD::SSTNT1_PRED;
27524 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
27525 Offset =
27526 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcVT.getScalarSizeInBits());
27527 Opcode = AArch64ISD::SST1Q_PRED;
27528 }
27529
27530 // In the case of non-temporal scatter stores there's only one SVE instruction
27531 // per data size: "vector + scalar", i.e.
27532 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
27533 // Since we do have intrinsics that allow the arguments to be in a different
27534 // order, we may need to swap them to match the spec.
27535 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
27536 Offset.getValueType().isVector())
27537 std::swap(Base, Offset);
27538
27539 // SST1_IMM requires that the offset is an immediate that is:
27540 // * a multiple of #SizeInBytes,
27541 // * in the range [0, 31 x #SizeInBytes],
27542 // where #SizeInBytes is the size in bytes of the stored items. For
27543 // immediates outside that range and non-immediate scalar offsets use SST1 or
27544 // SST1_UXTW instead.
27545 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
27546 if (!isValidImmForSVEVecImmAddrMode(Offset,
27547 SrcVT.getScalarSizeInBits() / 8)) {
27548 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
27549 Opcode = AArch64ISD::SST1_UXTW_PRED;
27550 else
27551 Opcode = AArch64ISD::SST1_PRED;
27552
27553 std::swap(Base, Offset);
27554 }
27555 }
27556
27557 auto &TLI = DAG.getTargetLoweringInfo();
27558 if (!TLI.isTypeLegal(Base.getValueType()))
27559 return SDValue();
27560
27561 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
27562 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extended to
27563 // nxv2i64. Legalize accordingly.
27564 if (!OnlyPackedOffsets &&
27565 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
27566 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
27567
27568 if (!TLI.isTypeLegal(Offset.getValueType()))
27569 return SDValue();
27570
27571 // Source value type that is representable in hardware
27572 EVT HwSrcVt = getSVEContainerType(SrcVT);
27573
27574 // Keep the original type of the input data to store - this is needed to be
27575 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
27576 // FP values we want the integer equivalent, so just use HwSrcVt.
27577 SDValue InputVT = DAG.getValueType(SrcVT);
27578 if (SrcVT.isFloatingPoint())
27579 InputVT = DAG.getValueType(HwSrcVt);
27580
27581 SDVTList VTs = DAG.getVTList(MVT::Other);
27582 SDValue SrcNew;
27583
27584 if (Src.getValueType().isFloatingPoint())
27585 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
27586 else
27587 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
27588
27589 SDValue Ops[] = {N->getOperand(0), // Chain
27590 SrcNew,
27591 N->getOperand(3), // Pg
27592 Base,
27593 Offset,
27594 InputVT};
27595
27596 return DAG.getNode(Opcode, DL, VTs, Ops);
27597}
27598
27599static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
27600 unsigned Opcode,
27601 bool OnlyPackedOffsets = true) {
27602 const EVT RetVT = N->getValueType(0);
27603 assert(RetVT.isScalableVector() &&
27604 "Gather loads are only possible for SVE vectors");
27605
27606 SDLoc DL(N);
27607
27608 // Make sure that the loaded data will fit into an SVE register
27609 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
27610 return SDValue();
27611
27612 // Depending on the addressing mode, this is either a pointer or a vector of
27613 // pointers (that fits into one register)
27614 SDValue Base = N->getOperand(3);
27615 // Depending on the addressing mode, this is either a single offset or a
27616 // vector of offsets (that fits into one register)
27617 SDValue Offset = N->getOperand(4);
27618
27619 // For "scalar + vector of indices", scale the indices to obtain unscaled
27620 // offsets. This applies to non-temporal and quadword gathers, which do not
27621 // have an addressing mode with scaled offset.
27622 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
27623 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
27624 RetVT.getScalarSizeInBits());
27625 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
27626 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
27627 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
27628 RetVT.getScalarSizeInBits());
27629 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
27630 }
27631
27632 // In the case of non-temporal gather loads and quadword gather loads there's
27633 // only one addressing mode: "vector + scalar", e.g.
27634 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
27635 // Since we do have intrinsics that allow the arguments to be in a different
27636 // order, we may need to swap them to match the spec.
27637 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
27638 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
27639 Offset.getValueType().isVector())
27640 std::swap(Base, Offset);
27641
27642 // GLD{FF}1_IMM requires that the offset is an immediate that is:
27643 // * a multiple of #SizeInBytes,
27644 // * in the range [0, 31 x #SizeInBytes],
27645 // where #SizeInBytes is the size in bytes of the loaded items. For
27646 // immediates outside that range and non-immediate scalar offsets use
27647 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
27648 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
27649 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
27650 if (!isValidImmForSVEVecImmAddrMode(Offset,
27651 RetVT.getScalarSizeInBits() / 8)) {
27652 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
27653 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
27654 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
27655 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
27656 else
27657 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
27658 ? AArch64ISD::GLD1_MERGE_ZERO
27659 : AArch64ISD::GLDFF1_MERGE_ZERO;
27660
27661 std::swap(Base, Offset);
27662 }
27663 }
27664
27665 auto &TLI = DAG.getTargetLoweringInfo();
27666 if (!TLI.isTypeLegal(Base.getValueType()))
27667 return SDValue();
27668
27669 // Some gather load variants allow unpacked offsets, but only as nxv2i32
27670 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extended to
27671 // nxv2i64. Legalize accordingly.
27672 if (!OnlyPackedOffsets &&
27673 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
27674 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
27675
27676 // Return value type that is representable in hardware
27677 EVT HwRetVt = getSVEContainerType(RetVT);
27678
27679 // Keep the original output value type around - this is needed to be able to
27680 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
27681 // values we want the integer equivalent, so just use HwRetVT.
27682 SDValue OutVT = DAG.getValueType(RetVT);
27683 if (RetVT.isFloatingPoint())
27684 OutVT = DAG.getValueType(HwRetVt);
27685
27686 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
27687 SDValue Ops[] = {N->getOperand(0), // Chain
27688 N->getOperand(2), // Pg
27689 Base, Offset, OutVT};
27690
27691 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
27692 SDValue LoadChain = SDValue(Load.getNode(), 1);
27693
27694 if (RetVT.isInteger() && (RetVT != HwRetVt))
27695 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
27696
27697 // If the original return value was FP, bitcast accordingly. Doing it here
27698 // means that we can avoid adding TableGen patterns for FPs.
27699 if (RetVT.isFloatingPoint())
27700 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
27701
27702 return DAG.getMergeValues({Load, LoadChain}, DL);
27703}
27704
27705static SDValue
27706performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
27707 SelectionDAG &DAG) {
27708 SDLoc DL(N);
27709 SDValue Src = N->getOperand(0);
27710 unsigned Opc = Src->getOpcode();
27711
27712 // Sign extend of an unsigned unpack -> signed unpack
27713 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
27714
27715 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
27716 : AArch64ISD::SUNPKLO;
27717
27718 // Push the sign extend to the operand of the unpack
27719 // This is necessary where, for example, the operand of the unpack
27720 // is another unpack:
27721 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
27722 // ->
27723 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
27724 // ->
27725 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
27726 SDValue ExtOp = Src->getOperand(0);
27727 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
27728 EVT EltTy = VT.getVectorElementType();
27729 (void)EltTy;
27730
27731 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
27732 "Sign extending from an invalid type");
27733
27734 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
27735
27736 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
27737 ExtOp, DAG.getValueType(ExtVT));
27738
27739 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
27740 }
27741
27742 // Sign extend of CSET -> CSETM.
27743 if (Opc == AArch64ISD::CSEL &&
27744 cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1) {
27745 EVT VT = N->getValueType(0);
27746 SDValue TVal = Src.getOperand(0);
27747 SDValue FVal = Src.getOperand(1);
27748
27749 // SIGN_EXTEND_INREG (CSEL 0, 1, cc, NZCV), i1 --> CSEL 0, -1, cc, NZCV
27750 if (isNullConstant(TVal) && isOneConstant(FVal))
27751 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal,
27752 DAG.getAllOnesConstant(DL, VT), Src.getOperand(2),
27753 Src.getOperand(3));
27754
27755 // SIGN_EXTEND_INREG (CSEL 1, 0, cc, NZCV), i1 --> CSEL -1, 0, cc, NZCV
27756 if (isOneConstant(TVal) && isNullConstant(FVal))
27757 return DAG.getNode(AArch64ISD::CSEL, DL, VT,
27758 DAG.getAllOnesConstant(DL, VT), FVal,
27759 Src.getOperand(2), Src.getOperand(3));
27760 }
27761
27762 if (DCI.isBeforeLegalizeOps())
27763 return SDValue();
27764
27765 if (!EnableCombineMGatherIntrinsics)
27766 return SDValue();
27767
27768 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
27769 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
27770 unsigned NewOpc;
27771 unsigned MemVTOpNum = 4;
27772 switch (Opc) {
27773 case AArch64ISD::LD1_MERGE_ZERO:
27774 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
27775 MemVTOpNum = 3;
27776 break;
27777 case AArch64ISD::LDNF1_MERGE_ZERO:
27778 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
27779 MemVTOpNum = 3;
27780 break;
27781 case AArch64ISD::LDFF1_MERGE_ZERO:
27782 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
27783 MemVTOpNum = 3;
27784 break;
27785 case AArch64ISD::GLD1_MERGE_ZERO:
27786 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
27787 break;
27788 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
27789 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
27790 break;
27791 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
27792 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
27793 break;
27794 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
27795 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
27796 break;
27797 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
27798 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
27799 break;
27800 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
27801 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
27802 break;
27803 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
27804 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
27805 break;
27806 case AArch64ISD::GLDFF1_MERGE_ZERO:
27807 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
27808 break;
27809 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
27810 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
27811 break;
27812 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
27813 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
27814 break;
27815 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
27816 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
27817 break;
27818 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
27819 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
27820 break;
27821 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
27822 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
27823 break;
27824 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
27825 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
27826 break;
27827 case AArch64ISD::GLDNT1_MERGE_ZERO:
27828 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
27829 break;
27830 default:
27831 return SDValue();
27832 }
27833
27834 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
27835 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
27836
27837 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
27838 return SDValue();
27839
27840 EVT DstVT = N->getValueType(0);
27841 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
27842
27843 SmallVector<SDValue, 5> Ops;
27844 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
27845 Ops.push_back(Src->getOperand(I));
27846
27847 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
27848 DCI.CombineTo(N, ExtLoad);
27849 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
27850
27851 // Return N so it doesn't get rechecked
27852 return SDValue(N, 0);
27853}
27854
27855/// Legalize the gather prefetch (scalar + vector addressing mode) when the
27856/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
27857/// != nxv2i32) do not need legalization.
27859 const unsigned OffsetPos = 4;
27860 SDValue Offset = N->getOperand(OffsetPos);
27861
27862 // Not an unpacked vector, bail out.
27863 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
27864 return SDValue();
27865
27866 // Extend the unpacked offset vector to 64-bit lanes.
27867 SDLoc DL(N);
27868 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
27870 // Replace the offset operand with the 64-bit one.
27871 Ops[OffsetPos] = Offset;
27872
27873 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
27874}
27875
27876/// Combines a node carrying the intrinsic
27877/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
27878/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
27879/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
27880/// sve gather prefetch instruction with vector plus immediate addressing mode.
27882 unsigned ScalarSizeInBytes) {
27883 const unsigned ImmPos = 4, OffsetPos = 3;
27884 // No need to combine the node if the immediate is valid...
27885 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
27886 return SDValue();
27887
27888 // ...otherwise swap the offset base with the offset...
27890 std::swap(Ops[ImmPos], Ops[OffsetPos]);
27891 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
27892 // `aarch64_sve_prfb_gather_uxtw_index`.
27893 SDLoc DL(N);
27894 Ops[1] = DAG.getTargetConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index,
27895 DL, MVT::i64);
27896
27897 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
27898}
27899
27900// Return true if the vector operation can guarantee that only the first lane
27901// of its result contains data, with all bits in the other lanes set to zero.
27902static bool isLanes1toNKnownZero(SDValue Op) {
27903 switch (Op.getOpcode()) {
27904 default:
27905 return false;
27906 case AArch64ISD::ANDV_PRED:
27907 case AArch64ISD::EORV_PRED:
27908 case AArch64ISD::FADDA_PRED:
27909 case AArch64ISD::FADDV_PRED:
27910 case AArch64ISD::FMAXNMV_PRED:
27911 case AArch64ISD::FMAXV_PRED:
27912 case AArch64ISD::FMINNMV_PRED:
27913 case AArch64ISD::FMINV_PRED:
27914 case AArch64ISD::ORV_PRED:
27915 case AArch64ISD::SADDV_PRED:
27916 case AArch64ISD::SMAXV_PRED:
27917 case AArch64ISD::SMINV_PRED:
27918 case AArch64ISD::UADDV_PRED:
27919 case AArch64ISD::UMAXV_PRED:
27920 case AArch64ISD::UMINV_PRED:
27921 return true;
27922 }
27923}
27924
27925// Return true if the vector operation can guarantee that the first lane of its
27926// result is active.
27927static bool isLane0KnownActive(SDValue Op) {
27928 switch (Op.getOpcode()) {
27929 default:
27930 return false;
27931 case AArch64ISD::REINTERPRET_CAST:
27932 return isLane0KnownActive(Op->getOperand(0));
27933 case ISD::SPLAT_VECTOR:
27934 return isOneConstant(Op.getOperand(0));
27935 case AArch64ISD::PTRUE:
27936 return Op.getConstantOperandVal(0) == AArch64SVEPredPattern::all;
27937 };
27938}
27939
27940static SDValue removeRedundantInsertVectorElt(SDNode *N) {
27941 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
27942 SDValue InsertVec = N->getOperand(0);
27943 SDValue InsertElt = N->getOperand(1);
27944 SDValue InsertIdx = N->getOperand(2);
27945
27946 // We only care about inserts into the first element...
27947 if (!isNullConstant(InsertIdx))
27948 return SDValue();
27949 // ...of a zero'd vector...
27950 if (!isZerosVector(InsertVec.getNode()))
27951 return SDValue();
27952 // ...where the inserted data was previously extracted...
27953 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
27954 return SDValue();
27955
27956 SDValue ExtractVec = InsertElt.getOperand(0);
27957 SDValue ExtractIdx = InsertElt.getOperand(1);
27958
27959 // ...from the first element of a vector.
27960 if (!isNullConstant(ExtractIdx))
27961 return SDValue();
27962
27963 // If we get here we are effectively trying to zero lanes 1-N of a vector.
27964
27965 // Ensure there's no type conversion going on.
27966 if (N->getValueType(0) != ExtractVec.getValueType())
27967 return SDValue();
27968
27969 if (!isLanes1toNKnownZero(ExtractVec))
27970 return SDValue();
27971
27972 // The explicit zeroing is redundant.
27973 return ExtractVec;
27974}
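// Illustrative sketch (value numbers and types are placeholders):
//   t0: nxv4i32 = AArch64ISD::UADDV_PRED p, v   ; lanes 1..N known to be zero
//   t1: i32     = extract_vector_elt t0, 0
//   t2: nxv4i32 = insert_vector_elt (zero vector), t1, 0
// Because t0 already has zeros in lanes 1..N, t2 can be replaced by t0.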
27975
27976static SDValue
27977performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
27978 if (SDValue Res = removeRedundantInsertVectorElt(N))
27979 return Res;
27980
27981 // Turn INSERT_VECTOR_ELT(undef, Elt, Idx) into SPLAT_VECTOR(Elt)
27982 // Do not bother with inserts into lane 0 because there are patterns to select
27983 // them using INSERT_SUBREG hsub/ssub/dsub.
27984 SDLoc DL(N);
27985 SDValue Vec = N->getOperand(0);
27986 SDValue Elt = N->getOperand(1);
27987 SDValue Idx = N->getOperand(2);
27988 EVT VecVT = Vec.getValueType();
27989 if (VecVT.isScalableVector() && Vec->isUndef() && !isNullConstant(Idx))
27990 return DCI.DAG.getNode(ISD::SPLAT_VECTOR, DL, VecVT, Elt);
27991
27992 return performPostLD1Combine(N, DCI, true);
27993}
27994
27995static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
27996 TargetLowering::DAGCombinerInfo &DCI,
27997 const AArch64Subtarget *Subtarget) {
27998 SDValue N0 = N->getOperand(0);
27999 EVT VT = N->getValueType(0);
28000
28001 // If this fpext feeds an fp_round, don't fold it; let the pair fold instead.
28002 if (N->hasOneUse() && N->user_begin()->getOpcode() == ISD::FP_ROUND)
28003 return SDValue();
28004
28005 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
28006 EVT EltVT = VT.getVectorElementType();
28007 return EltVT == MVT::f32 || EltVT == MVT::f64;
28008 };
28009
28010 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
28011 // We purposefully don't care about legality of the nodes here as we know
28012 // they can be split down into something legal.
28013 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
28014 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
28015 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
28016 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
28017 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
28018 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
28019 LN0->getChain(), LN0->getBasePtr(),
28020 N0.getValueType(), LN0->getMemOperand());
28021 DCI.CombineTo(N, ExtLoad);
28022 DCI.CombineTo(
28023 N0.getNode(),
28024 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
28025 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
28026 ExtLoad.getValue(1));
28027 return SDValue(N, 0); // Return N so it doesn't get rechecked!
28028 }
28029
28030 return SDValue();
28031}
28032
28033static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
28034 const AArch64Subtarget *Subtarget) {
28035 EVT VT = N->getValueType(0);
28036
28037 // Don't expand for NEON, SVE2 or SME
28038 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
28039 return SDValue();
28040
28041 SDLoc DL(N);
28042
28043 SDValue Mask = N->getOperand(0);
28044 SDValue In1 = N->getOperand(1);
28045 SDValue In2 = N->getOperand(2);
28046
28047 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
28048 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
28049 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
28050 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
28051}
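// Illustrative sketch of the expansion above (operand names are placeholders):
// without SVE2/SME a scalable bitwise select
//   (mask, a, b)
// is lowered as
//   or (and mask, a), (and (not mask), b)
// using only base-SVE logical operations.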
28052
28053static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
28054 EVT VT = N->getValueType(0);
28055
28056 SDValue Insert = N->getOperand(0);
28057 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
28058 return SDValue();
28059
28060 if (!Insert.getOperand(0).isUndef())
28061 return SDValue();
28062
28063 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
28064 uint64_t IdxDupLane = N->getConstantOperandVal(1);
28065 if (IdxInsert != 0 || IdxDupLane != 0)
28066 return SDValue();
28067
28068 SDValue Bitcast = Insert.getOperand(1);
28069 if (Bitcast.getOpcode() != ISD::BITCAST)
28070 return SDValue();
28071
28072 SDValue Subvec = Bitcast.getOperand(0);
28073 EVT SubvecVT = Subvec.getValueType();
28074 if (!SubvecVT.is128BitVector())
28075 return SDValue();
28076 EVT NewSubvecVT =
28077 getPackedSVEVectorVT(SubvecVT.getVectorElementType());
28078
28079 SDLoc DL(N);
28080 SDValue NewInsert =
28081 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
28082 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
28083 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
28084 NewInsert, N->getOperand(1));
28085 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
28086}
28087
28088// Try to combine mull with uzp1.
28089static SDValue tryCombineMULLWithUZP1(SDNode *N,
28090 TargetLowering::DAGCombinerInfo &DCI,
28091 SelectionDAG &DAG) {
28092 if (DCI.isBeforeLegalizeOps())
28093 return SDValue();
28094
28095 SDValue LHS = N->getOperand(0);
28096 SDValue RHS = N->getOperand(1);
28097
28098 SDValue ExtractHigh;
28099 SDValue ExtractLow;
28100 SDValue TruncHigh;
28101 SDValue TruncLow;
28102 SDLoc DL(N);
28103
28104 // Check the operands are trunc and extract_high.
28105 if (isEssentiallyExtractHighSubvector(LHS) &&
28106 RHS.getOpcode() == ISD::TRUNCATE) {
28107 TruncHigh = RHS;
28108 if (LHS.getOpcode() == ISD::BITCAST)
28109 ExtractHigh = LHS.getOperand(0);
28110 else
28111 ExtractHigh = LHS;
28112 } else if (isEssentiallyExtractHighSubvector(RHS) &&
28113 LHS.getOpcode() == ISD::TRUNCATE) {
28114 TruncHigh = LHS;
28115 if (RHS.getOpcode() == ISD::BITCAST)
28116 ExtractHigh = RHS.getOperand(0);
28117 else
28118 ExtractHigh = RHS;
28119 } else
28120 return SDValue();
28121
28122 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
28123 // with uzp1.
28124 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
28125 SDValue TruncHighOp = TruncHigh.getOperand(0);
28126 EVT TruncHighOpVT = TruncHighOp.getValueType();
28127 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
28128 DAG.isSplatValue(TruncHighOp, false))
28129 return SDValue();
28130
28131 // Check there is other extract_high with same source vector.
28132 // For example,
28133 //
28134 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
28135 // t12: v4i16 = truncate t11
28136 // t31: v4i32 = AArch64ISD::SMULL t18, t12
28137 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
28138 // t16: v4i16 = truncate t15
28139 // t30: v4i32 = AArch64ISD::SMULL t23, t1
28140 //
28141 // This dagcombine assumes the two extract_high uses same source vector in
28142 // order to detect the pair of the mull. If they have different source vector,
28143 // this code will not work.
28144 // TODO: Should also try to look through a bitcast.
28145 bool HasFoundMULLow = true;
28146 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
28147 if (ExtractHighSrcVec->use_size() != 2)
28148 HasFoundMULLow = false;
28149
28150 // Find ExtractLow.
28151 for (SDNode *User : ExtractHighSrcVec.getNode()->users()) {
28152 if (User == ExtractHigh.getNode())
28153 continue;
28154
28155 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
28156 !isNullConstant(User->getOperand(1))) {
28157 HasFoundMULLow = false;
28158 break;
28159 }
28160
28161 ExtractLow.setNode(User);
28162 }
28163
28164 if (!ExtractLow || !ExtractLow->hasOneUse())
28165 HasFoundMULLow = false;
28166
28167 // Check ExtractLow's user.
28168 if (HasFoundMULLow) {
28169 SDNode *ExtractLowUser = *ExtractLow.getNode()->user_begin();
28170 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
28171 HasFoundMULLow = false;
28172 } else {
28173 if (ExtractLowUser->getOperand(0) == ExtractLow) {
28174 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
28175 TruncLow = ExtractLowUser->getOperand(1);
28176 else
28177 HasFoundMULLow = false;
28178 } else {
28179 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
28180 TruncLow = ExtractLowUser->getOperand(0);
28181 else
28182 HasFoundMULLow = false;
28183 }
28184 }
28185 }
28186
28187 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
28188 // with uzp1.
28189 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
28190 EVT TruncHighVT = TruncHigh.getValueType();
28191 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
28192 SDValue TruncLowOp =
28193 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
28194 EVT TruncLowOpVT = TruncLowOp.getValueType();
28195 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
28196 DAG.isSplatValue(TruncLowOp, false)))
28197 return SDValue();
28198
28199 // Create uzp1, extract_high and extract_low.
28200 if (TruncHighOpVT != UZP1VT)
28201 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
28202 if (TruncLowOpVT != UZP1VT)
28203 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
28204
28205 SDValue UZP1 =
28206 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
28207 SDValue HighIdxCst =
28208 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
28209 SDValue NewTruncHigh =
28210 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
28211 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
28212
28213 if (HasFoundMULLow) {
28214 EVT TruncLowVT = TruncLow.getValueType();
28215 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
28216 UZP1, ExtractLow.getOperand(1));
28217 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
28218 }
28219
28220 return SDValue(N, 0);
28221}
28222
28223static SDValue performMULLCombine(SDNode *N,
28224 TargetLowering::DAGCombinerInfo &DCI,
28225 SelectionDAG &DAG) {
28226 if (SDValue Val =
28227 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
28228 return Val;
28229
28230 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
28231 return Val;
28232
28233 return SDValue();
28234}
28235
28236static SDValue performPTestFirstCombine(SDNode *N,
28237 TargetLowering::DAGCombinerInfo &DCI,
28238 SelectionDAG &DAG) {
28239 if (DCI.isBeforeLegalize())
28240 return SDValue();
28241
28242 SDLoc DL(N);
28243 auto Mask = N->getOperand(0);
28244 auto Pred = N->getOperand(1);
28245
28246 if (!isLane0KnownActive(Mask))
28247 return SDValue();
28248
28249 if (Pred->getOpcode() == AArch64ISD::REINTERPRET_CAST)
28250 Pred = Pred->getOperand(0);
28251
28252 if (Pred->getOpcode() == ISD::CONCAT_VECTORS) {
28253 Pred = Pred->getOperand(0);
28254 Pred = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pred);
28255 return DAG.getNode(AArch64ISD::PTEST_FIRST, DL, N->getValueType(0), Mask,
28256 Pred);
28257 }
28258
28259 return SDValue();
28260}
28261
28262static SDValue
28263performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
28264 SelectionDAG &DAG) {
28265 SDLoc DL(N);
28266
28267 // If a DUP(Op0) already exists, reuse it for the scalar_to_vector.
28268 if (DCI.isAfterLegalizeDAG()) {
28269 if (SDNode *LN = DCI.DAG.getNodeIfExists(AArch64ISD::DUP, N->getVTList(),
28270 N->getOperand(0)))
28271 return SDValue(LN, 0);
28272 }
28273
28274 // Let's do below transform.
28275 //
28276 // t34: v4i32 = AArch64ISD::UADDLV t2
28277 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
28278 // t7: i64 = zero_extend t35
28279 // t20: v1i64 = scalar_to_vector t7
28280 // ==>
28281 // t34: v4i32 = AArch64ISD::UADDLV t2
28282 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
28283 // t40: v1i64 = AArch64ISD::NVCAST t39
28284 if (DCI.isBeforeLegalizeOps())
28285 return SDValue();
28286
28287 EVT VT = N->getValueType(0);
28288 if (VT != MVT::v1i64)
28289 return SDValue();
28290
28291 SDValue ZEXT = N->getOperand(0);
28292 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
28293 return SDValue();
28294
28295 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
28296 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
28297 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
28298 return SDValue();
28299
28300 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
28301 return SDValue();
28302
28303 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
28304 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
28305 UADDLV.getValueType() != MVT::v4i32 ||
28306 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
28307 return SDValue();
28308
28309 // Let's generate new sequence with AArch64ISD::NVCAST.
28310 SDValue EXTRACT_SUBVEC =
28311 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
28312 DAG.getConstant(0, DL, MVT::i64));
28313 SDValue NVCAST =
28314 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
28315
28316 return NVCAST;
28317}
28318
28319static SDValue performVectorDeinterleaveCombine(
28320 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
28321 if (!DCI.isBeforeLegalize())
28322 return SDValue();
28323
28324 unsigned NumParts = N->getNumOperands();
28325 if (NumParts != 2 && NumParts != 4)
28326 return SDValue();
28327
28328 EVT SubVecTy = N->getValueType(0);
28329
28330 // At the moment we're unlikely to see a fixed-width vector deinterleave as
28331 // we usually generate shuffles instead.
28332 unsigned MinNumElements = SubVecTy.getVectorMinNumElements();
28333 if (!SubVecTy.isScalableVector() ||
28334 SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
28335 !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
28336 return SDValue();
28337
28338 // Make sure each input operand is the correct extract_subvector of the same
28339 // wider vector.
28340 SDValue Op0 = N->getOperand(0);
28341 for (unsigned I = 0; I < NumParts; I++) {
28342 SDValue OpI = N->getOperand(I);
28343 if (OpI->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
28344 OpI->getOperand(0) != Op0->getOperand(0))
28345 return SDValue();
28346 if (OpI->getConstantOperandVal(1) != (I * MinNumElements))
28347 return SDValue();
28348 }
28349
28350 // Normal loads are currently already handled by the InterleavedAccessPass so
28351 // we don't expect to see them here. Bail out if the masked load has an
28352 // unexpected number of uses, since we want to avoid a situation where we have
28353 // both deinterleaving loads and normal loads in the same block. Also, discard
28354 // masked loads that are extending, indexed, have an unexpected offset or have
28355 // an unsupported passthru value until we find a valid use case.
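// Editor's illustrative sketch (not from the upstream source): the code below
// rewrites, e.g., a vector deinterleave of extract_subvectors of one wide
// masked load into a single @llvm.aarch64.sve.ld2.sret (or ld4.sret) node
// that uses the narrowed, de-interleaved predicate.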
28356 auto MaskedLoad = dyn_cast<MaskedLoadSDNode>(Op0->getOperand(0));
28357 if (!MaskedLoad || !MaskedLoad->hasNUsesOfValue(NumParts, 0) ||
28358 !MaskedLoad->isSimple() || !ISD::isNormalMaskedLoad(MaskedLoad) ||
28359 !MaskedLoad->getOffset().isUndef() ||
28360 (!MaskedLoad->getPassThru()->isUndef() &&
28361 !isZerosVector(MaskedLoad->getPassThru().getNode())))
28362 return SDValue();
28363
28364 // Now prove that the mask is an interleave of identical masks.
28365 SDLoc DL(N);
28366 SDValue NarrowMask =
28367 getNarrowMaskForInterleavedOps(DAG, DL, MaskedLoad->getMask(), NumParts);
28368 if (!NarrowMask)
28369 return SDValue();
28370
28371 const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret
28372 : Intrinsic::aarch64_sve_ld4_sret;
28373 SDValue NewLdOps[] = {MaskedLoad->getChain(),
28374 DAG.getConstant(IID, DL, MVT::i32), NarrowMask,
28375 MaskedLoad->getBasePtr()};
28376 SDValue Res;
28377 if (NumParts == 2)
28378 Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
28379 {SubVecTy, SubVecTy, MVT::Other}, NewLdOps);
28380 else
28381 Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
28382 {SubVecTy, SubVecTy, SubVecTy, SubVecTy, MVT::Other},
28383 NewLdOps);
28384
28385 // We can now generate a structured load!
28386 SmallVector<SDValue, 4> ResOps(NumParts);
28387 for (unsigned Idx = 0; Idx < NumParts; Idx++)
28388 ResOps[Idx] = SDValue(Res.getNode(), Idx);
28389
28390 // Replace uses of the original chain result with the new chain result.
28391 DAG.ReplaceAllUsesOfValueWith(SDValue(MaskedLoad, 1),
28392 SDValue(Res.getNode(), NumParts));
28393 return DCI.CombineTo(N, ResOps, false);
28394}
28395
28396/// If the operand is a bitwise AND with a constant RHS, and the shift has a
28397/// constant RHS and is the only use, we can pull it out of the shift, i.e.
28398///
28399/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
28400///
28401/// We prefer this canonical form to match existing isel patterns.
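/// For illustration (editor's example; constants assumed):
///   (shl (and x, 0xff), 2) --> (and (shl x, 2), 0x3fc)
/// because 0xff << 2 constant-folds to 0x3fc.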
28402static SDValue performSHLCombine(SDNode *N,
28403 TargetLowering::DAGCombinerInfo &DCI,
28404 SelectionDAG &DAG) {
28405 if (DCI.isBeforeLegalizeOps())
28406 return SDValue();
28407
28408 SDValue Op0 = N->getOperand(0);
28409 if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse())
28410 return SDValue();
28411
28412 SDValue C1 = Op0->getOperand(1);
28413 SDValue C2 = N->getOperand(1);
28414 if (!isa<ConstantSDNode>(C1) || !isa<ConstantSDNode>(C2))
28415 return SDValue();
28416
28417 // Might be folded into shifted op, do not lower.
28418 if (N->hasOneUse()) {
28419 unsigned UseOpc = N->user_begin()->getOpcode();
28420 if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
28421 UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
28422 return SDValue();
28423 }
28424
28425 SDLoc DL(N);
28426 EVT VT = N->getValueType(0);
28427
28428 // Don't combine unless (shl C1, C2) can be constant folded. Otherwise,
28429 // DAGCombiner will simplify (and (op x...), (op y...)) -> (op (and x, y))
28430 // causing infinite loop. Result may also be worse.
28431 SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2);
28432 if (!isa<ConstantSDNode>(NewRHS))
28433 return SDValue();
28434
28435 SDValue X = Op0->getOperand(0);
28436 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2);
28437 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
28438}
28439
28440static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
28441 unsigned IntrinsicID = N->getConstantOperandVal(1);
28442 auto Register =
28443 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
28444 : AArch64SysReg::RNDRRS);
28445 SDLoc DL(N);
28446 SDValue A = DAG.getNode(
28447 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
28448 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
28449 SDValue B = DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
28450 DAG.getConstant(0, DL, MVT::i32),
28451 DAG.getConstant(0, DL, MVT::i32),
28452 getCondCode(DAG, AArch64CC::NE), A.getValue(1));
28453 return DAG.getMergeValues(
28454 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
28455}
28456
28457static SDValue performCTPOPCombine(SDNode *N,
28458 TargetLowering::DAGCombinerInfo &DCI,
28459 SelectionDAG &DAG) {
28460 using namespace llvm::SDPatternMatch;
28461 if (!DCI.isBeforeLegalize())
28462 return SDValue();
28463
28464 // ctpop(zext(bitcast(vector_mask))) -> neg(signed_reduce_add(vector_mask))
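// e.g. (editor's sketch, concrete width assumed):
//   ctpop(zext(bitcast <8 x i1> %m to i8))
//     ==> sub(0, vecreduce_add(sign_extend <8 x i1> %m))
// Each active lane sign-extends to -1, so negating the sum gives the count.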
28465 SDValue Mask;
28466 if (!sd_match(N->getOperand(0), m_ZExt(m_BitCast(m_Value(Mask)))))
28467 return SDValue();
28468
28469 EVT VT = N->getValueType(0);
28470 EVT MaskVT = Mask.getValueType();
28471
28472 if (VT.isVector() || !MaskVT.isFixedLengthVector() ||
28473 MaskVT.getVectorElementType() != MVT::i1)
28474 return SDValue();
28475
28476 EVT ReduceInVT =
28478
28479 SDLoc DL(N);
28480 // Sign extend to best fit ZeroOrNegativeOneBooleanContent.
28481 SDValue ExtMask = DAG.getNode(ISD::SIGN_EXTEND, DL, ReduceInVT, Mask);
28482 SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, ExtMask);
28483 return DAG.getNegative(NegPopCount, DL, VT);
28484}
28485
28486SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
28487 DAGCombinerInfo &DCI) const {
28488 SelectionDAG &DAG = DCI.DAG;
28489 switch (N->getOpcode()) {
28490 default:
28491 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
28492 break;
28493 case ISD::VECTOR_DEINTERLEAVE:
28494 return performVectorDeinterleaveCombine(N, DCI, DAG);
28495 case ISD::VECREDUCE_AND:
28496 case ISD::VECREDUCE_OR:
28497 case ISD::VECREDUCE_XOR:
28498 return performVecReduceBitwiseCombine(N, DCI, DAG);
28499 case ISD::ADD:
28500 case ISD::SUB:
28501 return performAddSubCombine(N, DCI);
28502 case ISD::BUILD_VECTOR:
28503 return performBuildVectorCombine(N, DCI, DAG);
28504 case ISD::SMIN:
28505 return performSMINCombine(N, DAG);
28506 case ISD::TRUNCATE:
28507 return performTruncateCombine(N, DAG, DCI);
28508 case AArch64ISD::ANDS:
28509 return performANDSCombine(N, DCI);
28510 case AArch64ISD::ADC:
28511 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
28512 return R;
28513 return foldADCToCINC(N, DAG);
28514 case AArch64ISD::SBC:
28515 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
28516 case AArch64ISD::ADCS:
28517 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
28518 return R;
28519 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
28520 case AArch64ISD::SBCS:
28521 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
28522 return R;
28523 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
28524 case AArch64ISD::ADDS:
28525 return performFlagSettingCombine(N, DCI, ISD::ADD);
28526 case AArch64ISD::SUBS:
28527 return performFlagSettingCombine(N, DCI, ISD::SUB);
28528 case AArch64ISD::BICi:
28529 return performBICiCombine(N, DAG, DCI);
28530 case ISD::XOR:
28531 return performXorCombine(N, DAG, DCI, Subtarget);
28532 case ISD::MUL:
28533 return performMulCombine(N, DAG, DCI, Subtarget);
28534 case ISD::SINT_TO_FP:
28535 case ISD::UINT_TO_FP:
28536 return performIntToFpCombine(N, DAG, DCI, Subtarget);
28537 case ISD::FP_TO_SINT:
28538 case ISD::FP_TO_UINT:
28539 case ISD::FP_TO_SINT_SAT:
28540 case ISD::FP_TO_UINT_SAT:
28541 return performFpToIntCombine(N, DAG, DCI, Subtarget);
28542 case ISD::OR:
28543 return performORCombine(N, DCI, Subtarget, *this);
28544 case ISD::AND:
28545 return performANDCombine(N, DCI);
28546 case ISD::FADD:
28547 return performFADDCombine(N, DCI);
28548 case ISD::INTRINSIC_WO_CHAIN:
28549 return performIntrinsicCombine(N, DCI, Subtarget);
28550 case ISD::ANY_EXTEND:
28551 case ISD::ZERO_EXTEND:
28552 case ISD::SIGN_EXTEND:
28553 return performExtendCombine(N, DCI, DAG);
28554 case ISD::SIGN_EXTEND_INREG:
28555 return performSignExtendInRegCombine(N, DCI, DAG);
28556 case ISD::CONCAT_VECTORS:
28557 return performConcatVectorsCombine(N, DCI, DAG);
28558 case ISD::EXTRACT_SUBVECTOR:
28559 return performExtractSubvectorCombine(N, DCI, DAG);
28560 case ISD::INSERT_SUBVECTOR:
28561 return performInsertSubvectorCombine(N, DCI, DAG);
28562 case ISD::SELECT:
28563 return performSelectCombine(N, DCI);
28564 case ISD::VSELECT:
28565 return performVSelectCombine(N, DCI.DAG);
28566 case ISD::SETCC:
28567 return performSETCCCombine(N, DCI, DAG);
28568 case ISD::LOAD:
28569 return performLOADCombine(N, DCI, DAG, Subtarget);
28570 case ISD::STORE:
28571 return performSTORECombine(N, DCI, DAG, Subtarget);
28572 case ISD::MSTORE:
28573 return performMSTORECombine(N, DCI, DAG, Subtarget);
28574 case ISD::MGATHER:
28575 case ISD::MSCATTER:
28576 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
28577 return performMaskedGatherScatterCombine(N, DCI, DAG);
28578 case ISD::FP_EXTEND:
28579 return performFPExtendCombine(N, DAG, DCI, Subtarget);
28580 case AArch64ISD::BRCOND:
28581 return performBRCONDCombine(N, DCI, DAG);
28582 case AArch64ISD::TBNZ:
28583 case AArch64ISD::TBZ:
28584 return performTBZCombine(N, DCI, DAG);
28585 case AArch64ISD::CSEL:
28586 return performCSELCombine(N, DCI, DAG);
28587 case AArch64ISD::DUP:
28588 case AArch64ISD::DUPLANE8:
28589 case AArch64ISD::DUPLANE16:
28590 case AArch64ISD::DUPLANE32:
28591 case AArch64ISD::DUPLANE64:
28592 return performDUPCombine(N, DCI);
28593 case AArch64ISD::DUPLANE128:
28594 return performDupLane128Combine(N, DAG);
28595 case AArch64ISD::NVCAST:
28596 return performNVCASTCombine(N, DAG);
28597 case AArch64ISD::SPLICE:
28598 return performSpliceCombine(N, DAG);
28599 case AArch64ISD::UUNPKLO:
28600 case AArch64ISD::UUNPKHI:
28601 return performUnpackCombine(N, DAG, Subtarget);
28602 case AArch64ISD::UZP1:
28603 case AArch64ISD::UZP2:
28604 return performUzpCombine(N, DAG, Subtarget);
28605 case AArch64ISD::SETCC_MERGE_ZERO:
28606 return performSetccMergeZeroCombine(N, DCI);
28607 case AArch64ISD::REINTERPRET_CAST:
28608 return performReinterpretCastCombine(N, DAG);
28609 case AArch64ISD::GLD1_MERGE_ZERO:
28610 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
28611 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
28612 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
28613 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
28614 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
28615 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
28616 case AArch64ISD::GLD1S_MERGE_ZERO:
28617 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
28618 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
28619 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
28620 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
28621 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
28622 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
28623 return performGLD1Combine(N, DAG);
28624 case AArch64ISD::VASHR:
28625 case AArch64ISD::VLSHR:
28626 return performVectorShiftCombine(N, *this, DCI);
28627 case AArch64ISD::SUNPKLO:
28628 return performSunpkloCombine(N, DAG);
28629 case AArch64ISD::BSP:
28630 return performBSPExpandForSVE(N, DAG, Subtarget);
28631 case ISD::INSERT_VECTOR_ELT:
28632 return performInsertVectorEltCombine(N, DCI);
28633 case ISD::EXTRACT_VECTOR_ELT:
28634 return performExtractVectorEltCombine(N, DCI, Subtarget);
28635 case ISD::VECREDUCE_ADD:
28636 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
28637 case ISD::GET_ACTIVE_LANE_MASK:
28638 return performActiveLaneMaskCombine(N, DCI, Subtarget);
28639 case AArch64ISD::UADDV:
28640 return performUADDVCombine(N, DAG);
28641 case AArch64ISD::SMULL:
28642 case AArch64ISD::UMULL:
28643 case AArch64ISD::PMULL:
28644 return performMULLCombine(N, DCI, DAG);
28645 case AArch64ISD::PTEST_FIRST:
28646 return performPTestFirstCombine(N, DCI, DAG);
28647 case ISD::INTRINSIC_VOID:
28648 case ISD::INTRINSIC_W_CHAIN:
28649 switch (N->getConstantOperandVal(1)) {
28650 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
28651 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
28652 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
28653 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
28654 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
28655 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
28656 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
28657 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
28658 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
28659 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
28660 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
28661 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
28662 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
28663 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
28664 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
28665 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
28666 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
28667 case Intrinsic::aarch64_neon_ld2:
28668 case Intrinsic::aarch64_neon_ld3:
28669 case Intrinsic::aarch64_neon_ld4:
28670 case Intrinsic::aarch64_neon_ld1x2:
28671 case Intrinsic::aarch64_neon_ld1x3:
28672 case Intrinsic::aarch64_neon_ld1x4:
28673 case Intrinsic::aarch64_neon_ld2lane:
28674 case Intrinsic::aarch64_neon_ld3lane:
28675 case Intrinsic::aarch64_neon_ld4lane:
28676 case Intrinsic::aarch64_neon_ld2r:
28677 case Intrinsic::aarch64_neon_ld3r:
28678 case Intrinsic::aarch64_neon_ld4r:
28679 case Intrinsic::aarch64_neon_st2:
28680 case Intrinsic::aarch64_neon_st3:
28681 case Intrinsic::aarch64_neon_st4:
28682 case Intrinsic::aarch64_neon_st1x2:
28683 case Intrinsic::aarch64_neon_st1x3:
28684 case Intrinsic::aarch64_neon_st1x4:
28685 case Intrinsic::aarch64_neon_st2lane:
28686 case Intrinsic::aarch64_neon_st3lane:
28687 case Intrinsic::aarch64_neon_st4lane:
28688 return performNEONPostLDSTCombine(N, DCI, DAG);
28689 case Intrinsic::aarch64_sve_ldnt1:
28690 return performLDNT1Combine(N, DAG);
28691 case Intrinsic::aarch64_sve_ld1rq:
28692 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
28693 case Intrinsic::aarch64_sve_ld1ro:
28694 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
28695 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
28696 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
28697 case Intrinsic::aarch64_sve_ldnt1_gather:
28698 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
28699 case Intrinsic::aarch64_sve_ldnt1_gather_index:
28700 return performGatherLoadCombine(N, DAG,
28701 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
28702 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
28703 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
28704 case Intrinsic::aarch64_sve_ld1:
28705 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
28706 case Intrinsic::aarch64_sve_ldnf1:
28707 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
28708 case Intrinsic::aarch64_sve_ldff1:
28709 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
28710 case Intrinsic::aarch64_sve_st1:
28711 return performST1Combine(N, DAG);
28712 case Intrinsic::aarch64_sve_stnt1:
28713 return performSTNT1Combine(N, DAG);
28714 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
28715 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
28716 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
28717 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
28718 case Intrinsic::aarch64_sve_stnt1_scatter:
28719 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
28720 case Intrinsic::aarch64_sve_stnt1_scatter_index:
28721 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
28722 case Intrinsic::aarch64_sve_ld1_gather:
28723 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
28724 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
28725 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
28726 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
28727 case Intrinsic::aarch64_sve_ld1q_gather_index:
28728 return performGatherLoadCombine(N, DAG,
28729 AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
28730 case Intrinsic::aarch64_sve_ld1_gather_index:
28731 return performGatherLoadCombine(N, DAG,
28732 AArch64ISD::GLD1_SCALED_MERGE_ZERO);
28733 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
28734 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
28735 /*OnlyPackedOffsets=*/false);
28736 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
28737 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
28738 /*OnlyPackedOffsets=*/false);
28739 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
28740 return performGatherLoadCombine(N, DAG,
28741 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
28742 /*OnlyPackedOffsets=*/false);
28743 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
28744 return performGatherLoadCombine(N, DAG,
28745 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
28746 /*OnlyPackedOffsets=*/false);
28747 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
28748 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
28749 case Intrinsic::aarch64_sve_ldff1_gather:
28750 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
28751 case Intrinsic::aarch64_sve_ldff1_gather_index:
28752 return performGatherLoadCombine(N, DAG,
28753 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
28754 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
28755 return performGatherLoadCombine(N, DAG,
28756 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
28757 /*OnlyPackedOffsets=*/false);
28758 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
28759 return performGatherLoadCombine(N, DAG,
28760 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
28761 /*OnlyPackedOffsets=*/false);
28762 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
28763 return performGatherLoadCombine(N, DAG,
28764 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
28765 /*OnlyPackedOffsets=*/false);
28766 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
28767 return performGatherLoadCombine(N, DAG,
28768 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
28769 /*OnlyPackedOffsets=*/false);
28770 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
28771 return performGatherLoadCombine(N, DAG,
28772 AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
28773 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
28774 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
28775 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
28776 case Intrinsic::aarch64_sve_st1q_scatter_index:
28777 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
28778 case Intrinsic::aarch64_sve_st1_scatter:
28779 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
28780 case Intrinsic::aarch64_sve_st1_scatter_index:
28781 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
28782 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
28783 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
28784 /*OnlyPackedOffsets=*/false);
28785 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
28786 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
28787 /*OnlyPackedOffsets=*/false);
28788 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
28789 return performScatterStoreCombine(N, DAG,
28790 AArch64ISD::SST1_SXTW_SCALED_PRED,
28791 /*OnlyPackedOffsets=*/false);
28792 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
28793 return performScatterStoreCombine(N, DAG,
28794 AArch64ISD::SST1_UXTW_SCALED_PRED,
28795 /*OnlyPackedOffsets=*/false);
28796 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
28797 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
28798 case Intrinsic::aarch64_rndr:
28799 case Intrinsic::aarch64_rndrrs:
28800 return performRNDRCombine(N, DAG);
28801 case Intrinsic::aarch64_sme_ldr_zt:
28802 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
28803 DAG.getVTList(MVT::Other), N->getOperand(0),
28804 N->getOperand(2), N->getOperand(3));
28805 case Intrinsic::aarch64_sme_str_zt:
28806 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
28807 DAG.getVTList(MVT::Other), N->getOperand(0),
28808 N->getOperand(2), N->getOperand(3));
28809 default:
28810 break;
28811 }
28812 break;
28813 case ISD::GlobalAddress:
28814 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
28815 case ISD::CTLZ:
28816 return performCTLZCombine(N, DAG, Subtarget);
28817 case ISD::SCALAR_TO_VECTOR:
28818 return performScalarToVectorCombine(N, DCI, DAG);
28819 case ISD::SHL:
28820 return performSHLCombine(N, DCI, DAG);
28821 case ISD::CTPOP:
28822 return performCTPOPCombine(N, DCI, DAG);
28823 }
28824 return SDValue();
28825}
28826
28827// Check if the return value is used as only a return value, as otherwise
28828// we can't perform a tail-call. In particular, we need to check for
28829// target ISD nodes that are returns and any other "odd" constructs
28830// that the generic analysis code won't necessarily catch.
28831bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
28832 SDValue &Chain) const {
28833 if (N->getNumValues() != 1)
28834 return false;
28835 if (!N->hasNUsesOfValue(1, 0))
28836 return false;
28837
28838 SDValue TCChain = Chain;
28839 SDNode *Copy = *N->user_begin();
28840 if (Copy->getOpcode() == ISD::CopyToReg) {
28841 // If the copy has a glue operand, we conservatively assume it isn't safe to
28842 // perform a tail call.
28843 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
28844 MVT::Glue)
28845 return false;
28846 TCChain = Copy->getOperand(0);
28847 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
28848 return false;
28849
28850 bool HasRet = false;
28851 for (SDNode *Node : Copy->users()) {
28852 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
28853 return false;
28854 HasRet = true;
28855 }
28856
28857 if (!HasRet)
28858 return false;
28859
28860 Chain = TCChain;
28861 return true;
28862}
28863
28864// Return whether an instruction can potentially be optimized to a tail
28865// call. This will cause the optimizers to attempt to move, or duplicate,
28866// return instructions to help enable tail call optimizations for this
28867// instruction.
28868bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
28869 return CI->isTailCall();
28870}
28871
28872bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
28873 Register Offset, bool IsPre,
28874 MachineRegisterInfo &MRI) const {
28875 auto CstOffset = getIConstantVRegVal(Offset, MRI);
28876 if (!CstOffset || CstOffset->isZero())
28877 return false;
28878
28879 // All of the indexed addressing mode instructions take a signed 9 bit
28880 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
28881 // encodes the sign/indexing direction.
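// (Editor's note, illustrative: isInt<9> accepts offsets in [-256, 255], the
// immediate range of the pre/post-indexed load/store forms.)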
28882 return isInt<9>(CstOffset->getSExtValue());
28883}
28884
28885bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
28886 SDValue &Base,
28887 SDValue &Offset,
28888 SelectionDAG &DAG) const {
28889 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
28890 return false;
28891
28892 // Non-null if there is exactly one user of the loaded value (ignoring chain).
28893 SDNode *ValOnlyUser = nullptr;
28894 for (SDUse &U : N->uses()) {
28895 if (U.getResNo() == 1)
28896 continue; // Ignore chain.
28897 if (ValOnlyUser == nullptr)
28898 ValOnlyUser = U.getUser();
28899 else {
28900 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
28901 break;
28902 }
28903 }
28904
28905 auto IsUndefOrZero = [](SDValue V) {
28906 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
28907 };
28908
28909 // If the only user of the value is a scalable vector splat, it is
28910 // preferable to do a replicating load (ld1r*).
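// Editor's illustrative example (assumed assembly, not from the source): a
// load whose only use is splatted into a scalable vector is better matched as
//   ld1rw { z0.s }, p0/z, [x0]
// than as an indexed scalar load followed by a separate broadcast.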
28911 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
28912 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
28913 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
28914 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
28915 return false;
28916
28917 Base = Op->getOperand(0);
28918 // All of the indexed addressing mode instructions take a signed
28919 // 9 bit immediate offset.
28920 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
28921 int64_t RHSC = RHS->getSExtValue();
28922 if (Op->getOpcode() == ISD::SUB)
28923 RHSC = -(uint64_t)RHSC;
28924 if (!isInt<9>(RHSC))
28925 return false;
28926 // When big-endian VLD1/VST1 are used for vector load and store, these
28927 // only allow an offset that's equal to the store size.
28928 EVT MemType = cast<MemSDNode>(N)->getMemoryVT();
28929 if (!Subtarget->isLittleEndian() && MemType.isVector() &&
28930 (uint64_t)RHSC != MemType.getStoreSize())
28931 return false;
28932 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
28933 // when dealing with subtraction.
28934 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
28935 return true;
28936 }
28937 return false;
28938}
28939
28940bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
28941 SDValue &Offset,
28942 ISD::MemIndexedMode &AM,
28943 SelectionDAG &DAG) const {
28944 EVT VT;
28945 SDValue Ptr;
28946 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
28947 VT = LD->getMemoryVT();
28948 Ptr = LD->getBasePtr();
28949 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
28950 VT = ST->getMemoryVT();
28951 Ptr = ST->getBasePtr();
28952 } else
28953 return false;
28954
28955 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
28956 return false;
28957 AM = ISD::PRE_INC;
28958 return true;
28959}
28960
28961bool AArch64TargetLowering::getPostIndexedAddressParts(
28962 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
28963 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
28964 EVT VT;
28965 SDValue Ptr;
28966 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
28967 VT = LD->getMemoryVT();
28968 Ptr = LD->getBasePtr();
28969 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
28970 VT = ST->getMemoryVT();
28971 Ptr = ST->getBasePtr();
28972 } else
28973 return false;
28974
28975 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
28976 return false;
28977 // Post-indexing updates the base, so it's not a valid transform
28978 // if that's not the same as the load's pointer.
28979 if (Ptr != Base)
28980 return false;
28981 AM = ISD::POST_INC;
28982 return true;
28983}
28984
28985static void replaceBoolVectorBitcast(SDNode *N,
28986 SmallVectorImpl<SDValue> &Results,
28987 SelectionDAG &DAG) {
28988 SDLoc DL(N);
28989 SDValue Op = N->getOperand(0);
28990 EVT VT = N->getValueType(0);
28991 [[maybe_unused]] EVT SrcVT = Op.getValueType();
28992 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
28993 "Must be bool vector.");
28994
28995 // Special handling for Clang's __builtin_convertvector. For vectors with <8
28996 // elements, it adds a vector concatenation with undef(s). If we encounter
28997 // this here, we can skip the concat.
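// e.g. (editor's sketch): (concat_vectors v4i1 %m, undef, undef, undef) is
// treated as just %m when building the scalar bitmask.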
28998 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
28999 bool AllUndef = true;
29000 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
29001 AllUndef &= Op.getOperand(I).isUndef();
29002
29003 if (AllUndef)
29004 Op = Op.getOperand(0);
29005 }
29006
29007 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
29008 if (VectorBits)
29009 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
29010}
29011
29012static void CustomNonLegalBITCASTResults(SDNode *N,
29013 SmallVectorImpl<SDValue> &Results,
29014 SelectionDAG &DAG, EVT ExtendVT,
29015 EVT CastVT) {
29016 SDLoc DL(N);
29017 SDValue Op = N->getOperand(0);
29018 EVT VT = N->getValueType(0);
29019
29020 // Use SCALAR_TO_VECTOR for lane zero
29021 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
29022 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
29023 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
29024 Results.push_back(
29025 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
29026}
29027
29028void AArch64TargetLowering::ReplaceBITCASTResults(
29029 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
29030 SDLoc DL(N);
29031 SDValue Op = N->getOperand(0);
29032 EVT VT = N->getValueType(0);
29033 EVT SrcVT = Op.getValueType();
29034
29035 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
29036 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
29037 return;
29038 }
29039
29040 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
29041 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
29042 return;
29043 }
29044
29045 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
29046 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
29047 return;
29048 }
29049
29050 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
29051 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
29052 "Expected fp->int bitcast!");
29053
29054 // Bitcasting between unpacked vector types of different element counts is
29055 // not a NOP because the live elements are laid out differently.
29056 // 01234567
29057 // e.g. nxv2i32 = XX??XX??
29058 // nxv4f16 = X?X?X?X?
29059 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
29060 return;
29061
29062 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
29063 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
29064 return;
29065 }
29066
29067 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
29068 !VT.isVector())
29069 return replaceBoolVectorBitcast(N, Results, DAG);
29070
29071 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
29072 return;
29073
29074 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
29075 DAG.getUNDEF(MVT::i32), Op);
29076 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
29077 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
29078}
29079
29080static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
29081 SelectionDAG &DAG,
29082 const AArch64Subtarget *Subtarget) {
29083 EVT VT = N->getValueType(0);
29084 if (!VT.is256BitVector() ||
29085 (VT.getScalarType().isFloatingPoint() &&
29086 !N->getFlags().hasAllowReassociation()) ||
29087 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
29088 VT.getScalarType() == MVT::bf16)
29089 return;
29090
29091 SDValue X = N->getOperand(0);
29092 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
29093 if (!Shuf) {
29094 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
29095 X = N->getOperand(1);
29096 if (!Shuf)
29097 return;
29098 }
29099
29100 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
29101 return;
29102
29103 // Check the mask is 1,0,3,2,5,4,...
29104 ArrayRef<int> Mask = Shuf->getMask();
29105 for (int I = 0, E = Mask.size(); I < E; I++)
29106 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
29107 return;
29108
29109 SDLoc DL(N);
29110 auto LoHi = DAG.SplitVector(X, DL);
29111 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
29112 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
29113 LoHi.first, LoHi.second);
29114
29115 // Shuffle the elements back into order.
29116 SmallVector<int> NMask;
29117 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
29118 NMask.push_back(I);
29119 NMask.push_back(I);
29120 }
29121 Results.push_back(
29122 DAG.getVectorShuffle(VT, DL,
29123 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
29124 DAG.getUNDEF(LoHi.first.getValueType())),
29125 DAG.getUNDEF(VT), NMask));
29126}
29127
29128static void ReplaceReductionResults(SDNode *N,
29129 SmallVectorImpl<SDValue> &Results,
29130 SelectionDAG &DAG, unsigned InterOp,
29131 unsigned AcrossOp) {
29132 EVT LoVT, HiVT;
29133 SDValue Lo, Hi;
29134 SDLoc DL(N);
29135 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
29136 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
29137 SDValue InterVal = DAG.getNode(InterOp, DL, LoVT, Lo, Hi);
29138 SDValue SplitVal = DAG.getNode(AcrossOp, DL, LoVT, InterVal);
29139 Results.push_back(SplitVal);
29140}
29141
29142void AArch64TargetLowering::ReplaceExtractSubVectorResults(
29143 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
29144 SDValue In = N->getOperand(0);
29145 EVT InVT = In.getValueType();
29146
29147 // Common code will handle these just fine.
29148 if (!InVT.isScalableVector() || !InVT.isInteger())
29149 return;
29150
29151 SDLoc DL(N);
29152 EVT VT = N->getValueType(0);
29153
29154 // The following checks bail if this is not a halving operation.
29155
29156 ElementCount ResEC = VT.getVectorElementCount();
29157
29158 if (InVT.getVectorElementCount() != (ResEC * 2))
29159 return;
29160
29161 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
29162 if (!CIndex)
29163 return;
29164
29165 unsigned Index = CIndex->getZExtValue();
29166 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
29167 return;
29168
29169 unsigned Opcode = (Index == 0) ? (unsigned)ISD::ANY_EXTEND_VECTOR_INREG
29170 : (unsigned)AArch64ISD::UUNPKHI;
29171 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
29172
29173 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
29174 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
29175}
29176
29177void AArch64TargetLowering::ReplaceGetActiveLaneMaskResults(
29178 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
29179 assert((Subtarget->hasSVE2p1() ||
29180 (Subtarget->hasSME2() && Subtarget->isStreaming())) &&
29181 "Custom lower of get.active.lane.mask missing required feature.");
29182
29183 assert(N->getValueType(0) == MVT::nxv32i1 &&
29184 "Unexpected result type for get.active.lane.mask");
29185
29186 SDLoc DL(N);
29187 SDValue Idx = N->getOperand(0);
29188 SDValue TC = N->getOperand(1);
29189
29190 assert(Idx.getValueType().getFixedSizeInBits() <= 64 &&
29191 "Unexpected operand type for get.active.lane.mask");
29192
29193 if (Idx.getValueType() != MVT::i64) {
29194 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
29195 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
29196 }
29197
29198 SDValue ID =
29199 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
29200 EVT HalfVT = N->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
29201 auto WideMask =
29202 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {HalfVT, HalfVT}, {ID, Idx, TC});
29203
29204 Results.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0),
29205 {WideMask.getValue(0), WideMask.getValue(1)}));
29206}
29207
29208// Create an even/odd pair of X registers holding integer value V.
29209static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
29210 SDLoc DL(V.getNode());
29211 auto [VLo, VHi] = DAG.SplitScalar(V, DL, MVT::i64, MVT::i64);
29212 if (DAG.getDataLayout().isBigEndian())
29213 std::swap (VLo, VHi);
29214 SDValue RegClass =
29215 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, DL, MVT::i32);
29216 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, DL, MVT::i32);
29217 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, DL, MVT::i32);
29218 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
29219 return SDValue(
29220 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops), 0);
29221}
29222
29223static void ReplaceCMP_SWAP_128Results(SDNode *N,
29224 SmallVectorImpl<SDValue> &Results,
29225 SelectionDAG &DAG,
29226 const AArch64Subtarget *Subtarget) {
29227 assert(N->getValueType(0) == MVT::i128 &&
29228 "AtomicCmpSwap on types less than 128 should be legal");
29229
29230 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
29231 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
29232 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
29233 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
29234 SDValue Ops[] = {
29235 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
29236 createGPRPairNode(DAG, N->getOperand(3)), // Store value
29237 N->getOperand(1), // Ptr
29238 N->getOperand(0), // Chain in
29239 };
29240
29241 unsigned Opcode;
29242 switch (MemOp->getMergedOrdering()) {
29243 case AtomicOrdering::Monotonic:
29244 Opcode = AArch64::CASPX;
29245 break;
29246 case AtomicOrdering::Acquire:
29247 Opcode = AArch64::CASPAX;
29248 break;
29249 case AtomicOrdering::Release:
29250 Opcode = AArch64::CASPLX;
29251 break;
29252 case AtomicOrdering::AcquireRelease:
29253 case AtomicOrdering::SequentiallyConsistent:
29254 Opcode = AArch64::CASPALX;
29255 break;
29256 default:
29257 llvm_unreachable("Unexpected ordering!");
29258 }
29259
29260 MachineSDNode *CmpSwap = DAG.getMachineNode(
29261 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
29262 DAG.setNodeMemRefs(CmpSwap, {MemOp});
29263
29264 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
29265 if (DAG.getDataLayout().isBigEndian())
29266 std::swap(SubReg1, SubReg2);
29267 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
29268 SDValue(CmpSwap, 0));
29269 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
29270 SDValue(CmpSwap, 0));
29271 Results.push_back(
29272 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
29273 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
29274 return;
29275 }
29276
29277 unsigned Opcode;
29278 switch (MemOp->getMergedOrdering()) {
29279 case AtomicOrdering::Monotonic:
29280 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
29281 break;
29282 case AtomicOrdering::Acquire:
29283 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
29284 break;
29285 case AtomicOrdering::Release:
29286 Opcode = AArch64::CMP_SWAP_128_RELEASE;
29287 break;
29288 case AtomicOrdering::AcquireRelease:
29289 case AtomicOrdering::SequentiallyConsistent:
29290 Opcode = AArch64::CMP_SWAP_128;
29291 break;
29292 default:
29293 llvm_unreachable("Unexpected ordering!");
29294 }
29295
29296 SDLoc DL(N);
29297 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
29298 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
29299 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
29300 New.first, New.second, N->getOperand(0)};
29301 SDNode *CmpSwap = DAG.getMachineNode(
29302 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
29303 Ops);
29304 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
29305
29306 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
29307 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
29308 Results.push_back(SDValue(CmpSwap, 3));
29309}
29310
29311static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
29312 AtomicOrdering Ordering) {
29313 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
29314 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
29315 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
29316 // ATOMIC_LOAD_CLR at any point.
29317 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
29318 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
29319 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
29320 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
29321
29322 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
29323 // The operand will need to be XORed in a separate step.
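// (Editor's note, illustrative: "atomicrmw and %p, %v" is emitted as LDCLRP
// with ~%v, since LDCLRP performs Mem &= ~Rs.)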
29324 switch (Ordering) {
29325 case AtomicOrdering::Monotonic:
29326 return AArch64::LDCLRP;
29327 break;
29328 case AtomicOrdering::Acquire:
29329 return AArch64::LDCLRPA;
29330 break;
29331 case AtomicOrdering::Release:
29332 return AArch64::LDCLRPL;
29333 break;
29334 case AtomicOrdering::AcquireRelease:
29335 case AtomicOrdering::SequentiallyConsistent:
29336 return AArch64::LDCLRPAL;
29337 break;
29338 default:
29339 llvm_unreachable("Unexpected ordering!");
29340 }
29341 }
29342
29343 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
29344 switch (Ordering) {
29345 case AtomicOrdering::Monotonic:
29346 return AArch64::LDSETP;
29347 break;
29348 case AtomicOrdering::Acquire:
29349 return AArch64::LDSETPA;
29350 break;
29351 case AtomicOrdering::Release:
29352 return AArch64::LDSETPL;
29353 break;
29354 case AtomicOrdering::AcquireRelease:
29355 case AtomicOrdering::SequentiallyConsistent:
29356 return AArch64::LDSETPAL;
29357 break;
29358 default:
29359 llvm_unreachable("Unexpected ordering!");
29360 }
29361 }
29362
29363 if (ISDOpcode == ISD::ATOMIC_SWAP) {
29364 switch (Ordering) {
29365 case AtomicOrdering::Monotonic:
29366 return AArch64::SWPP;
29367 break;
29368 case AtomicOrdering::Acquire:
29369 return AArch64::SWPPA;
29370 break;
29371 case AtomicOrdering::Release:
29372 return AArch64::SWPPL;
29373 break;
29374 case AtomicOrdering::AcquireRelease:
29375 case AtomicOrdering::SequentiallyConsistent:
29376 return AArch64::SWPPAL;
29377 break;
29378 default:
29379 llvm_unreachable("Unexpected ordering!");
29380 }
29381 }
29382
29383 llvm_unreachable("Unexpected ISDOpcode!");
29384}
29385
29386static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
29387 SmallVectorImpl<SDValue> &Results,
29388 SelectionDAG &DAG,
29389 const AArch64Subtarget *Subtarget) {
29390 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
29391 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
29392 // rather than the CASP instructions, because CASP has register classes for
29393 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
29394 // to present them as single operands. LSE128 instructions use the GPR64
29395 // register class (because the pair does not have to be sequential), like
29396 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
29397
29398 assert(N->getValueType(0) == MVT::i128 &&
29399 "AtomicLoadXXX on types less than 128 should be legal");
29400
29401 if (!Subtarget->hasLSE128())
29402 return;
29403
29404 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
29405 const SDValue &Chain = N->getOperand(0);
29406 const SDValue &Ptr = N->getOperand(1);
29407 const SDValue &Val128 = N->getOperand(2);
29408 std::pair<SDValue, SDValue> Val2x64 =
29409 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
29410
29411 const unsigned ISDOpcode = N->getOpcode();
29412 const unsigned MachineOpcode =
29413 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
29414
29415 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
29416 SDLoc DL(Val128);
29417 Val2x64.first =
29418 DAG.getNode(ISD::XOR, DL, MVT::i64,
29419 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.first);
29420 Val2x64.second =
29421 DAG.getNode(ISD::XOR, DL, MVT::i64,
29422 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.second);
29423 }
29424
29425 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
29426 if (DAG.getDataLayout().isBigEndian())
29427 std::swap(Ops[0], Ops[1]);
29428
29429 MachineSDNode *AtomicInst =
29430 DAG.getMachineNode(MachineOpcode, SDLoc(N),
29431 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
29432
29433 DAG.setNodeMemRefs(AtomicInst, {MemOp});
29434
29435 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
29436 if (DAG.getDataLayout().isBigEndian())
29437 std::swap(Lo, Hi);
29438
29439 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
29440 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
29441}
29442
29443void AArch64TargetLowering::ReplaceNodeResults(
29444 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
29445 switch (N->getOpcode()) {
29446 default:
29447 llvm_unreachable("Don't know how to custom expand this");
29448 case ISD::BITCAST:
29449 ReplaceBITCASTResults(N, Results, DAG);
29450 return;
29451 case ISD::VECREDUCE_ADD:
29452 case ISD::VECREDUCE_SMAX:
29453 case ISD::VECREDUCE_SMIN:
29454 case ISD::VECREDUCE_UMAX:
29455 case ISD::VECREDUCE_UMIN:
29456 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
29457 return;
29458 case ISD::ADD:
29459 case ISD::FADD:
29460 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
29461 return;
29462
29463 case ISD::CTPOP:
29464 case ISD::PARITY:
29465 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
29466 Results.push_back(Result);
29467 return;
29468 case AArch64ISD::SADDV:
29469 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
29470 return;
29471 case AArch64ISD::UADDV:
29472 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
29473 return;
29474 case AArch64ISD::SMINV:
29475 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
29476 return;
29477 case AArch64ISD::UMINV:
29478 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
29479 return;
29480 case AArch64ISD::SMAXV:
29481 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
29482 return;
29483 case AArch64ISD::UMAXV:
29484 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
29485 return;
29486 case ISD::MULHS:
29487 if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
29488 Results.push_back(
29489 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
29490 return;
29491 case ISD::MULHU:
29492 if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
29493 Results.push_back(
29494 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
29495 return;
29496 case ISD::FP_TO_UINT:
29497 case ISD::FP_TO_SINT:
29498 case ISD::STRICT_FP_TO_SINT:
29499 case ISD::STRICT_FP_TO_UINT:
29500 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
29501 // Let normal code take care of it by not adding anything to Results.
29502 return;
29503 case ISD::ATOMIC_CMP_SWAP:
29504 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
29505 return;
29506 case ISD::ATOMIC_LOAD_CLR:
29507 assert(N->getValueType(0) != MVT::i128 &&
29508 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
29509 break;
29510 case ISD::ATOMIC_LOAD_AND:
29511 case ISD::ATOMIC_LOAD_OR:
29512 case ISD::ATOMIC_SWAP: {
29513 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
29514 "Expected 128-bit atomicrmw.");
29515 // These need custom type legalisation so we go directly to instruction.
29516 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
29517 return;
29518 }
29519 case ISD::ADDRSPACECAST: {
29520 SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG);
29521 Results.push_back(V);
29522 return;
29523 }
29524 case ISD::ATOMIC_LOAD:
29525 case ISD::LOAD: {
29526 MemSDNode *LoadNode = cast<MemSDNode>(N);
29527 EVT MemVT = LoadNode->getMemoryVT();
29528 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
29529 // targets.
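// Editor's illustrative example (assumed assembly): a non-temporal v8i32 load
// becomes a single "ldnp q0, q1, [x0]" whose two halves are bitcast and
// concatenated back into the original 256-bit vector type.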
29530 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
29531 MemVT.getSizeInBits() == 256u &&
29532 (MemVT.getScalarSizeInBits() == 8u ||
29533 MemVT.getScalarSizeInBits() == 16u ||
29534 MemVT.getScalarSizeInBits() == 32u ||
29535 MemVT.getScalarSizeInBits() == 64u)) {
29536
29537 EVT HalfVT = MemVT.getHalfNumVectorElementsVT(*DAG.getContext());
29538 SDValue Result = DAG.getMemIntrinsicNode(
29539 AArch64ISD::LDNP, SDLoc(N),
29540 DAG.getVTList({MVT::v2i64, MVT::v2i64, MVT::Other}),
29541 {LoadNode->getChain(), LoadNode->getBasePtr()},
29542 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
29543
29544 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
29545 DAG.getBitcast(HalfVT, Result.getValue(0)),
29546 DAG.getBitcast(HalfVT, Result.getValue(1)));
29547 Results.append({Pair, Result.getValue(2) /* Chain */});
29548 return;
29549 }
29550
29551 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
29552 LoadNode->getMemoryVT() != MVT::i128) {
29553 // Non-volatile or atomic loads are optimized later in AArch64's load/store
29554 // optimizer.
29555 return;
29556 }
29557
29558 if (SDValue(N, 0).getValueType() == MVT::i128) {
29559 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
29560 bool isLoadAcquire =
29561 AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire;
29562 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
29563
29564 if (isLoadAcquire)
29565 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
29566
29567 SDValue Result = DAG.getMemIntrinsicNode(
29568 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
29569 {LoadNode->getChain(), LoadNode->getBasePtr()},
29570 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
29571
29572 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
29573
29574 SDValue Pair =
29575 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
29576 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
29577 Results.append({Pair, Result.getValue(2) /* Chain */});
29578 }
29579 return;
29580 }
29581 case ISD::EXTRACT_SUBVECTOR:
29582 ReplaceExtractSubVectorResults(N, Results, DAG);
29583 return;
29584 case ISD::INSERT_SUBVECTOR:
29585 case ISD::CONCAT_VECTORS:
29586 // Custom lowering has been requested for INSERT_SUBVECTOR and
29587 // CONCAT_VECTORS -- but delegate to common code for result type
29588 // legalisation
29589 return;
29590 case ISD::GET_ACTIVE_LANE_MASK:
29591 ReplaceGetActiveLaneMaskResults(N, Results, DAG);
29592 return;
29593 case ISD::INTRINSIC_WO_CHAIN: {
29594 EVT VT = N->getValueType(0);
29595
29596 Intrinsic::ID IntID =
29597 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
29598 switch (IntID) {
29599 default:
29600 return;
29601 case Intrinsic::aarch64_sve_clasta_n: {
29602 assert((VT == MVT::i8 || VT == MVT::i16) &&
29603 "custom lowering for unexpected type");
29604 SDLoc DL(N);
29605 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
29606 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
29607 N->getOperand(1), Op2, N->getOperand(3));
29608 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
29609 return;
29610 }
29611 case Intrinsic::aarch64_sve_clastb_n: {
29612 assert((VT == MVT::i8 || VT == MVT::i16) &&
29613 "custom lowering for unexpected type");
29614 SDLoc DL(N);
29615 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
29616 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
29617 N->getOperand(1), Op2, N->getOperand(3));
29618 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
29619 return;
29620 }
29621 case Intrinsic::aarch64_sve_lasta: {
29622 assert((VT == MVT::i8 || VT == MVT::i16) &&
29623 "custom lowering for unexpected type");
29624 SDLoc DL(N);
29625 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
29626 N->getOperand(1), N->getOperand(2));
29627 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
29628 return;
29629 }
29630 case Intrinsic::aarch64_sve_lastb: {
29631 assert((VT == MVT::i8 || VT == MVT::i16) &&
29632 "custom lowering for unexpected type");
29633 SDLoc DL(N);
29634 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
29635 N->getOperand(1), N->getOperand(2));
29636 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
29637 return;
29638 }
29639 case Intrinsic::aarch64_sme_in_streaming_mode: {
29640 SDLoc DL(N);
29641 SDValue Chain = DAG.getEntryNode();
29642
29643 SDValue RuntimePStateSM =
29644 getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
29645 Results.push_back(
29646 DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
29647 return;
29648 }
29649 case Intrinsic::experimental_vector_match: {
29650 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
29651 return;
29652
29653 // NOTE: Only trivial type promotion is supported.
29654 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
29655 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
29656 return;
29657
29658 SDLoc DL(N);
29659 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
29660 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
29661 return;
29662 }
29663 }
29664 }
29665 case ISD::READ_REGISTER: {
29666 SDLoc DL(N);
29667 assert(N->getValueType(0) == MVT::i128 &&
29668 "READ_REGISTER custom lowering is only for 128-bit sysregs");
29669 SDValue Chain = N->getOperand(0);
29670 SDValue SysRegName = N->getOperand(1);
29671
29672 SDValue Result = DAG.getNode(
29673 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
29674 Chain, SysRegName);
29675
29676 // Sysregs are not endian. Result.getValue(0) always contains the lower half
29677 // of the 128-bit System Register value.
29678 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
29679 Result.getValue(0), Result.getValue(1));
29680 Results.push_back(Pair);
29681 Results.push_back(Result.getValue(2)); // Chain
29682 return;
29683 }
29684 }
29685}
29686
29687bool AArch64TargetLowering::useLoadStackGuardNode(const Module &M) const {
29688 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
29689 return TargetLowering::useLoadStackGuardNode(M);
29690 return true;
29691}
29692
29693unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
29694 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
29695 // reciprocal if there are three or more FDIVs.
29696 return 3;
29697}
29698
29699TargetLoweringBase::LegalizeTypeAction
29700AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
29701 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
29702 // v4i16, v2i32 instead of to promote.
29703 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
29704 VT == MVT::v1f32)
29705 return TypeWidenVector;
29706
29707 return TargetLoweringBase::getPreferredVectorAction(VT);
29708}
29709
29710// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
29711// provided the address is 16-byte aligned.
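// Editor's illustrative example (assumed IR): with LSE2, an aligned
//   %v = load atomic i128, ptr %p monotonic, align 16
// can be selected as a single LDP rather than being expanded to a CAS or
// LL/SC loop.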
29712bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
29713 if (!Subtarget->hasLSE2())
29714 return false;
29715
29716 if (auto LI = dyn_cast<LoadInst>(I))
29717 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
29718 LI->getAlign() >= Align(16);
29719
29720 if (auto SI = dyn_cast<StoreInst>(I))
29721 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
29722 SI->getAlign() >= Align(16);
29723
29724 return false;
29725}
29726
29727bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
29728 if (!Subtarget->hasLSE128())
29729 return false;
29730
29731 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
29732 // will clobber the two registers.
29733 if (const auto *SI = dyn_cast<StoreInst>(I))
29734 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
29735 SI->getAlign() >= Align(16) &&
29736 (SI->getOrdering() == AtomicOrdering::Release ||
29737 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
29738
29739 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
29740 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
29741 RMW->getAlign() >= Align(16) &&
29742 (RMW->getOperation() == AtomicRMWInst::Xchg ||
29743 RMW->getOperation() == AtomicRMWInst::And ||
29744 RMW->getOperation() == AtomicRMWInst::Or);
29745
29746 return false;
29747}
29748
29749bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
29750 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
29751 return false;
29752
29753 if (auto LI = dyn_cast<LoadInst>(I))
29754 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
29755 LI->getAlign() >= Align(16) &&
29756 LI->getOrdering() == AtomicOrdering::Acquire;
29757
29758 if (auto SI = dyn_cast<StoreInst>(I))
29759 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
29760 SI->getAlign() >= Align(16) &&
29761 SI->getOrdering() == AtomicOrdering::Release;
29762
29763 return false;
29764}
29765
29766bool AArch64TargetLowering::shouldInsertFencesForAtomic(
29767 const Instruction *I) const {
29768 if (isOpSuitableForRCPC3(I))
29769 return false;
29770 if (isOpSuitableForLSE128(I))
29771 return false;
29772 if (isOpSuitableForLDPSTP(I))
29773 return true;
29774 return false;
29775}
29776
29777bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
29778 const Instruction *I) const {
29779 // Store-Release instructions only provide seq_cst guarantees when paired with
29780 // Load-Acquire instructions. MSVC CRT does not use these instructions to
29781 // implement seq_cst loads and stores, so we need additional explicit fences
29782 // after memory writes.
29783 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
29784 return false;
29785
29786 if (auto *SI = dyn_cast<StoreInst>(I))
29787 return SI->getOrdering() == AtomicOrdering::SequentiallyConsistent;
29788
29789 auto *CAS = dyn_cast<AtomicCmpXchgInst>(I);
29790 auto *RMW = dyn_cast<AtomicRMWInst>(I);
29791 // Not a store.
29792 if (!CAS && !RMW)
29793 return false;
29794
29795 // Fence only needed for seq_cst.
29796 if (CAS &&
29797 CAS->getSuccessOrdering() != AtomicOrdering::SequentiallyConsistent)
29798 return false;
29799 if (RMW && RMW->getOrdering() != AtomicOrdering::SequentiallyConsistent)
29800 return false;
29801
29802 // We do not need a fence if we have LSE atomics.
29803 return !Subtarget->hasLSE();
29804}
29805
29806// Loads and stores less than 128-bits are already atomic; ones above that
29807// are doomed anyway, so defer to the default libcall and blame the OS when
29808// things go wrong.
29809TargetLowering::AtomicExpansionKind
29810AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
29811 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
29812 if (Size != 128)
29813 return AtomicExpansionKind::None;
29814 if (isOpSuitableForRCPC3(SI))
29815 return AtomicExpansionKind::None;
29816 if (isOpSuitableForLSE128(SI))
29817 return AtomicExpansionKind::Expand;
29818 if (isOpSuitableForLDPSTP(SI))
29819 return AtomicExpansionKind::None;
29820 return AtomicExpansionKind::Expand;
29821}
29822
29823// Loads and stores less than 128-bits are already atomic; ones above that
29824// are doomed anyway, so defer to the default libcall and blame the OS when
29825// things go wrong.
29826TargetLowering::AtomicExpansionKind
29827AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
29828 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
29829
29830 if (Size != 128)
29831 return AtomicExpansionKind::None;
29832 if (isOpSuitableForRCPC3(LI))
29833 return AtomicExpansionKind::None;
29834 // No LSE128 loads
29835 if (isOpSuitableForLDPSTP(LI))
29836 return AtomicExpansionKind::None;
29837
29838 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
29839 // implement atomicrmw without spilling. If the target address is also on the
29840 // stack and close enough to the spill slot, this can lead to a situation
29841 // where the monitor always gets cleared and the atomic operation can never
29842 // succeed. So at -O0 lower this operation to a CAS loop.
29843 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
29844 return AtomicExpansionKind::CmpXChg;
29845
29846 // Using CAS for an atomic load has a better chance of succeeding under high
29847 // contention situations. So use it if available.
29848 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
29849 : AtomicExpansionKind::LLSC;
29850}
29851
29852// Return true if the atomic operation expansion will lower to use a library
29853// call, and is thus ineligible to use an LLSC expansion.
29854static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
29855 const AtomicRMWInst *RMW) {
29856 if (!RMW->isFloatingPointOperation())
29857 return false;
29858 switch (RMW->getType()->getScalarType()->getTypeID()) {
29859 case Type::FloatTyID:
29860 case Type::DoubleTyID:
29861 case Type::HalfTyID:
29862 case Type::BFloatTyID:
29863 // Will use soft float
29864 return !Subtarget.hasFPARMv8();
29865 default:
29866 // fp128 will emit library calls.
29867 return true;
29868 }
29869
29870 llvm_unreachable("covered type switch");
29871}
29872
29873// The "default" for integer RMW operations is to expand to an LL/SC loop.
29874// However, with the LSE instructions (or outline-atomics mode, which provides
29875// library routines in place of the LSE-instructions), we can directly emit many
29876// operations instead.
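// Editor's illustrative example (assumed): with LSE, "atomicrmw add ptr %p,
// i32 %v" is selected directly as LDADD instead of an LDXR/STXR retry loop
// (or, in outline-atomics mode, becomes a call to a helper routine).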
29877TargetLowering::AtomicExpansionKind
29878AArch64TargetLowering::shouldExpandAtomicRMWInIR(
29879 const AtomicRMWInst *AI) const {
29880 Type *Ty = AI->getType();
29881 unsigned Size = Ty->getPrimitiveSizeInBits();
29882 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
29883
29884 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
29885 (AI->getOperation() == AtomicRMWInst::Xchg ||
29886 AI->getOperation() == AtomicRMWInst::Or ||
29887 AI->getOperation() == AtomicRMWInst::And);
29888 if (CanUseLSE128)
29889 return AtomicExpansionKind::None;
29890
29891 // If LSFE available, use atomic FP instructions in preference to expansion
29892 if (Subtarget->hasLSFE() && (AI->getOperation() == AtomicRMWInst::FAdd ||
29893 AI->getOperation() == AtomicRMWInst::FMax ||
29894 AI->getOperation() == AtomicRMWInst::FMin ||
29895 AI->getOperation() == AtomicRMWInst::FMaximum ||
29896 AI->getOperation() == AtomicRMWInst::FMinimum))
29897 return AtomicExpansionKind::None;
29898
29899 // Leave 128 bits to LLSC or CmpXChg.
29900 if (Size < 128 && !AI->isFloatingPointOperation()) {
29901 if (Subtarget->hasLSE()) {
29902 // Nand is not supported in LSE.
29903 switch (AI->getOperation()) {
29904 case AtomicRMWInst::Xchg:
29905 case AtomicRMWInst::Add:
29906 case AtomicRMWInst::Sub:
29907 case AtomicRMWInst::And:
29908 case AtomicRMWInst::Or:
29909 case AtomicRMWInst::Xor:
29910 case AtomicRMWInst::Max:
29911 case AtomicRMWInst::Min:
29912 case AtomicRMWInst::UMax:
29913 case AtomicRMWInst::UMin:
29914 return AtomicExpansionKind::None;
29915 default:
29916 break;
29917 }
29918 }
29919 if (Subtarget->outlineAtomics()) {
29920 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
29921 // Don't outline them unless
29922 // (1) high level <atomic> support approved:
29923 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
29924 // (2) low level libgcc and compiler-rt support implemented by:
29925 // min/max outline atomics helpers
29926 switch (AI->getOperation()) {
29927 case AtomicRMWInst::Xchg:
29928 case AtomicRMWInst::Add:
29929 case AtomicRMWInst::Sub:
29930 case AtomicRMWInst::And:
29931 case AtomicRMWInst::Or:
29932 case AtomicRMWInst::Xor:
29933 return AtomicExpansionKind::None;
29934 default:
29935 break;
29936 }
29937 }
29938 }
29939
29940 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
29941 // implement atomicrmw without spilling. If the target address is also on the
29942 // stack and close enough to the spill slot, this can lead to a situation
29943 // where the monitor always gets cleared and the atomic operation can never
29944 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
29945 // we have a single CAS instruction that can replace the loop.
29946 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
29947 Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
29948 return AtomicExpansionKind::CmpXChg;
29949
29950 return AtomicExpansionKind::LLSC;
29951}
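// Illustrative summary: an i32 "atomicrmw add" is usually left intact with
// +lse (selected to an LDADD variant), becomes a call to an outline-atomics
// helper such as __aarch64_ldadd4_acq_rel when outlining is enabled, and
// otherwise expands to a CAS loop (at -O0 or for libcall-bound FP ops) or an
// LDXR/STXR loop.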
29952
29953TargetLowering::AtomicExpansionKind
29954AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
29955 const AtomicCmpXchgInst *AI) const {
29956 // If subtarget has LSE, leave cmpxchg intact for codegen.
29957 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
29958 return AtomicExpansionKind::None;
29959 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
29960 // implement cmpxchg without spilling. If the address being exchanged is also
29961 // on the stack and close enough to the spill slot, this can lead to a
29962 // situation where the monitor always gets cleared and the atomic operation
29963 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
29964 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
29965 return AtomicExpansionKind::None;
29966
29967 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
29968 // it.
29969 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
29970 if (Size > 64)
29971 return AtomicExpansionKind::None;
29972
29973 return AtomicExpansionKind::LLSC;
29974}
29975
29976Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
29977 Type *ValueTy, Value *Addr,
29978 AtomicOrdering Ord) const {
29979 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29980 bool IsAcquire = isAcquireOrStronger(Ord);
29981
29982 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
29983 // intrinsic must return {i64, i64} and we have to recombine them into a
29984 // single i128 here.
29985 if (ValueTy->getPrimitiveSizeInBits() == 128) {
29986 Intrinsic::ID Int =
29987 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
29988
29989 Value *LoHi =
29990 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
29991
29992 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
29993 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
29994
29995 auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
29996 Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
29997 Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
29998
29999 Value *Or = Builder.CreateOr(
30000 Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
30001 return Builder.CreateBitCast(Or, ValueTy);
30002 }
30003
30004 Type *Tys[] = { Addr->getType() };
30005 Intrinsic::ID Int =
30006 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
30007
30008 const DataLayout &DL = M->getDataLayout();
30009 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
30010 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
30011 CI->addParamAttr(0, Attribute::get(Builder.getContext(),
30012 Attribute::ElementType, IntEltTy));
30013 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
30014
30015 return Builder.CreateBitCast(Trunc, ValueTy);
30016}
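// For reference, the 128-bit path above emits IR roughly of the form:
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
//   %lo = extractvalue { i64, i64 } %lohi, 0  ; zero-extended to i128
//   %hi = extractvalue { i64, i64 } %lohi, 1  ; zero-extended to i128
//   %val = or i128 %lo, (%hi shifted left by 64)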
30017
30018void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
30019 IRBuilderBase &Builder) const {
30020 Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {});
30021}
30022
30023Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
30024 Value *Val, Value *Addr,
30025 AtomicOrdering Ord) const {
30026 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
30027 bool IsRelease = isReleaseOrStronger(Ord);
30028
30029 // Since the intrinsics must have legal type, the i128 intrinsics take two
30030 // parameters: "i64, i64". We must marshal Val into the appropriate form
30031 // before the call.
30032 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
30033 Intrinsic::ID Int =
30034 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
30035 Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int);
30036 Type *Int64Ty = Type::getInt64Ty(M->getContext());
30037 Type *Int128Ty = Type::getInt128Ty(M->getContext());
30038
30039 Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
30040
30041 Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
30042 Value *Hi =
30043 Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
30044 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
30045 }
30046
30047 Intrinsic::ID Int =
30048 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
30049 Type *Tys[] = { Addr->getType() };
30050 Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys);
30051
30052 const DataLayout &DL = M->getDataLayout();
30053 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
30054 Val = Builder.CreateBitCast(Val, IntValTy);
30055
30056 CallInst *CI = Builder.CreateCall(
30057 Stxr, {Builder.CreateZExtOrBitCast(
30058 Val, Stxr->getFunctionType()->getParamType(0)),
30059 Addr});
30060 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
30061 Attribute::ElementType, Val->getType()));
30062 return CI;
30063}
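// Note: the store-exclusive intrinsics (stxr/stlxr and stxp/stlxp) return an
// i32 status -- 0 on success, 1 if the exclusive monitor was lost -- which
// AtomicExpand uses as the retry condition for the generated loop.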
30064
30066 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
30067 const DataLayout &DL) const {
30068 if (!Ty->isArrayTy()) {
30069 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
30070 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
30071 }
30072
30073 // All non-aggregate members of the type must have the same type.
30074 SmallVector<EVT> ValueVTs;
30075 ComputeValueVTs(*this, DL, Ty, ValueVTs);
30076 return all_equal(ValueVTs);
30077}
30078
30079bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
30080 EVT) const {
30081 return false;
30082}
30083
30084static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
30085 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
30086 Function *ThreadPointerFunc = Intrinsic::getOrInsertDeclaration(
30087 M, Intrinsic::thread_pointer, IRB.getPtrTy());
30088 return IRB.CreatePointerCast(
30089 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
30090 Offset),
30091 IRB.getPtrTy(0));
30092}
30093
30095 IRBuilderBase &IRB, const LibcallLoweringInfo &Libcalls) const {
30096 // Android provides a fixed TLS slot for the stack cookie. See the definition
30097 // of TLS_SLOT_STACK_GUARD in
30098 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
30099 if (Subtarget->isTargetAndroid())
30100 return UseTlsOffset(IRB, 0x28);
30101
30102 // Fuchsia is similar.
30103 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
30104 if (Subtarget->isTargetFuchsia())
30105 return UseTlsOffset(IRB, -0x10);
30106
30107 return TargetLowering::getIRStackGuard(IRB, Libcalls);
30108}
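// In effect, on Android the stack cookie is loaded from a fixed offset off
// the thread pointer (roughly [TPIDR_EL0, #0x28]), and on Fuchsia from
// [TPIDR_EL0, #-0x10], instead of going through a global variable.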
30109
30111 Module &M, const LibcallLoweringInfo &Libcalls) const {
30112 // MSVC CRT provides functionalities for stack protection.
30113 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
30114 Libcalls.getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
30115
30116 RTLIB::LibcallImpl SecurityCookieVar =
30117 Libcalls.getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
30118 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
30119 SecurityCookieVar != RTLIB::Unsupported) {
30120 // MSVC CRT has a global variable holding security cookie.
30121 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
30122 PointerType::getUnqual(M.getContext()));
30123
30124 // MSVC CRT has a function to validate security cookie.
30125 FunctionCallee SecurityCheckCookie =
30126 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
30127 Type::getVoidTy(M.getContext()),
30128 PointerType::getUnqual(M.getContext()));
30129 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
30130 F->setCallingConv(CallingConv::Win64);
30131 F->addParamAttr(0, Attribute::AttrKind::InReg);
30132 }
30133 return;
30134 }
30136}
30137
30139 IRBuilderBase &IRB, const LibcallLoweringInfo &Libcalls) const {
30140 // Android provides a fixed TLS slot for the SafeStack pointer. See the
30141 // definition of TLS_SLOT_SAFESTACK in
30142 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
30143 if (Subtarget->isTargetAndroid())
30144 return UseTlsOffset(IRB, 0x48);
30145
30146 return TargetLowering::getSafeStackPointerLocation(IRB, Libcalls);
30147}
30148
30149/// If a physical register, this returns the register that receives the
30150/// exception address on entry to an EH pad.
30152 const Constant *PersonalityFn) const {
30153 // FIXME: This is a guess. Has this been defined yet?
30154 return AArch64::X0;
30155}
30156
30157/// If a physical register, this returns the register that receives the
30158/// exception typeid on entry to a landing pad.
30160 const Constant *PersonalityFn) const {
30161 // FIXME: This is a guess. Has this been defined yet?
30162 return AArch64::X1;
30163}
30164
30166 const Instruction &AndI) const {
30167 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
30168 // this is likely to fold the and/cmp/br into a single tbz instruction. It
30169 // may be beneficial to sink in other cases, but we would have to check that
30170 // the cmp would not get folded into the br to form a cbz for these to be
30171 // beneficial.
30172 ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
30173 if (!Mask)
30174 return false;
30175 return Mask->getValue().isPowerOf2();
30176}
30177
30181 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
30182 SelectionDAG &DAG) const {
30183 // Does baseline recommend not to perform the fold by default?
30185 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
30186 return false;
30187 // Else, if this is a vector shift, prefer 'shl'.
30188 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
30189}
30190
30193 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
30195 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
30198 ExpansionFactor);
30199}
30200
30202 // Update IsSplitCSR in AArch64FunctionInfo.
30203 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
30204 AFI->setIsSplitCSR(true);
30205}
30206
30208 MachineBasicBlock *Entry,
30209 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
30210 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
30211 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
30212 if (!IStart)
30213 return;
30214
30215 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
30216 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
30217 MachineBasicBlock::iterator MBBI = Entry->begin();
30218 for (const MCPhysReg *I = IStart; *I; ++I) {
30219 const TargetRegisterClass *RC = nullptr;
30220 if (AArch64::GPR64RegClass.contains(*I))
30221 RC = &AArch64::GPR64RegClass;
30222 else if (AArch64::FPR64RegClass.contains(*I))
30223 RC = &AArch64::FPR64RegClass;
30224 else
30225 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
30226
30227 Register NewVR = MRI->createVirtualRegister(RC);
30228 // Create copy from CSR to a virtual register.
30229 // FIXME: this currently does not emit CFI pseudo-instructions, it works
30230 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
30231 // nounwind. If we want to generalize this later, we may need to emit
30232 // CFI pseudo-instructions.
30233 assert(Entry->getParent()->getFunction().hasFnAttribute(
30234 Attribute::NoUnwind) &&
30235 "Function should be nounwind in insertCopiesSplitCSR!");
30236 Entry->addLiveIn(*I);
30237 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
30238 .addReg(*I);
30239
30240 // Insert the copy-back instructions right before the terminator.
30241 for (auto *Exit : Exits)
30242 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
30243 TII->get(TargetOpcode::COPY), *I)
30244 .addReg(NewVR);
30245 }
30246}
30247
30248bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
30249 // Integer division on AArch64 is expensive. However, when aggressively
30250 // optimizing for code size, we prefer to use a div instruction, as it is
30251 // usually smaller than the alternative sequence.
30252 // The exception to this is vector division. Since AArch64 doesn't have vector
30253 // integer division, leaving the division as-is is a loss even in terms of
30254 // size, because it will have to be scalarized, while the alternative code
30255 // sequence can be performed in vector form.
30256 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
30257 return OptSize && !VT.isVector();
30258}
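// For example, with minsize a "sdiv i32 %x, 7" is kept as a single SDIV,
// whereas a "sdiv <4 x i32>" by a constant is still turned into a
// multiply/shift sequence, since scalarizing four SDIVs (plus lane moves)
// would be larger.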
30259
30261 const MachineFunction &MF) const {
30262 // Avoid merging stores into fixed-length vectors when Neon is unavailable.
30263 // In future, we could allow this when SVE is available, but currently,
30264 // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
30265 // the general lowering may introduce stack spills/reloads).
30266 if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
30267 return false;
30268
30269 // Do not merge to float value size (128 bits) if no implicit float attribute
30270 // is set.
30271 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
30272 return !NoFloat || MemVT.getSizeInBits() <= 64;
30273}
30274
30276 // We want inc-of-add for scalars and sub-of-not for vectors.
30277 return VT.isScalarInteger();
30278}
30279
30281 EVT VT) const {
30282 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
30283 // legalize.
30284 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
30285 return false;
30286 if (FPVT == MVT::v8bf16)
30287 return false;
30288 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
30289}
30290
30292 // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
30293 // avoid vselect becoming bsl / unrolling.
30294 return !VT.isFixedLengthVector();
30295}
30296
30300 const TargetInstrInfo *TII) const {
30301 assert(MBBI->isCall() && MBBI->getCFIType() &&
30302 "Invalid call instruction for a KCFI check");
30303
30304 switch (MBBI->getOpcode()) {
30305 case AArch64::BLR:
30306 case AArch64::BLRNoIP:
30307 case AArch64::TCRETURNri:
30308 case AArch64::TCRETURNrix16x17:
30309 case AArch64::TCRETURNrix17:
30310 case AArch64::TCRETURNrinotx16:
30311 break;
30312 default:
30313 llvm_unreachable("Unexpected CFI call opcode");
30314 }
30315
30316 MachineOperand &Target = MBBI->getOperand(0);
30317 assert(Target.isReg() && "Invalid target operand for an indirect call");
30318 Target.setIsRenamable(false);
30319
30320 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
30321 .addReg(Target.getReg())
30322 .addImm(MBBI->getCFIType())
30323 .getInstr();
30324}
30325
30327 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
30328}
30329
30330unsigned
30331AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
30332 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
30333 return getPointerTy(DL).getSizeInBits();
30334
30335 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
30336}
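// For reference, the AAPCS64 va_list is
//   struct { void *__stack; void *__gr_top; void *__vr_top;
//            int __gr_offs; int __vr_offs; };
// i.e. three pointers plus two 32-bit offsets, whereas Darwin and Windows use
// a simple pointer-sized va_list (hence the first return above).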
30337
30338void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
30339 MachineFrameInfo &MFI = MF.getFrameInfo();
30340 // If we have any vulnerable SVE stack objects then the stack protector
30341 // needs to be placed at the top of the SVE stack area, as the SVE locals
30342 // are placed above the other locals, so we allocate it as if it were a
30343 // scalable vector.
30344 // FIXME: It may be worthwhile having a specific interface for this rather
30345 // than doing it here in finalizeLowering.
30346 if (MFI.hasStackProtectorIndex()) {
30347 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
30348 if (MFI.hasScalableStackID(i) &&
30353 break;
30354 }
30355 }
30356 }
30359}
30360
30361// Unlike X86, we let frame lowering assign offsets to all catch objects.
30363
30364bool AArch64TargetLowering::shouldLocalize(
30365 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
30366 auto &MF = *MI.getMF();
30367 auto &MRI = MF.getRegInfo();
30368 auto maxUses = [](unsigned RematCost) {
30369 // A cost of 1 means remats are basically free.
30370 if (RematCost == 1)
30371 return std::numeric_limits<unsigned>::max();
30372 if (RematCost == 2)
30373 return 2U;
30374
30375 // Remat is too expensive, only sink if there's one user.
30376 if (RematCost > 2)
30377 return 1U;
30378 llvm_unreachable("Unexpected remat cost");
30379 };
30380
30381 unsigned Opc = MI.getOpcode();
30382 switch (Opc) {
30383 case TargetOpcode::G_GLOBAL_VALUE: {
30384 // On Darwin, TLS global vars get selected into function calls, which
30385 // we don't want localized, as they can get moved into the middle of
30386 // another call sequence.
30387 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
30388 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
30389 return false;
30390 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
30391 }
30392 case TargetOpcode::G_FCONSTANT:
30393 case TargetOpcode::G_CONSTANT: {
30394 const ConstantInt *CI;
30395 unsigned AdditionalCost = 0;
30396
30397 if (Opc == TargetOpcode::G_CONSTANT)
30398 CI = MI.getOperand(1).getCImm();
30399 else {
30400 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
30401 // We try to estimate cost of 32/64b fpimms, as they'll likely be
30402 // materialized as integers.
30403 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
30404 break;
30405 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
30406 bool OptForSize = MF.getFunction().hasOptSize();
30408 OptForSize))
30409 return true; // Constant should be cheap.
30410 CI =
30411 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
30412 // FP materialization also costs an extra move, from gpr to fpr.
30413 AdditionalCost = 1;
30414 }
30415 APInt Imm = CI->getValue();
30418 assert(Cost.isValid() && "Expected a valid imm cost");
30419
30420 unsigned RematCost = Cost.getValue();
30421 RematCost += AdditionalCost;
30422 Register Reg = MI.getOperand(0).getReg();
30423 unsigned MaxUses = maxUses(RematCost);
30424 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
30425 if (MaxUses == std::numeric_limits<unsigned>::max())
30426 --MaxUses;
30427 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
30428 }
30429 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
30430 // localizable.
30431 case AArch64::ADRP:
30432 case AArch64::G_ADD_LOW:
30433 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
30434 case TargetOpcode::G_PTR_ADD:
30435 return true;
30436 default:
30437 break;
30438 }
30440}
30441
30443 // Fallback for scalable vectors.
30444 // Note that if EnableSVEGISel is true, we allow scalable vector types for
30445 // all instructions, regardless of whether they are actually supported.
30446 if (!EnableSVEGISel) {
30447 if (Inst.getType()->isScalableTy()) {
30448 return true;
30449 }
30450
30451 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
30452 if (Inst.getOperand(i)->getType()->isScalableTy())
30453 return true;
30454
30455 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
30456 if (AI->getAllocatedType()->isScalableTy())
30457 return true;
30458 }
30459 }
30460
30461 // Checks to allow the use of SME instructions
30462 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
30463 auto CallAttrs = SMECallAttrs(*Base, &getRuntimeLibcallsInfo());
30464 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
30465 CallAttrs.requiresPreservingZT0() ||
30466 CallAttrs.requiresPreservingAllZAState())
30467 return true;
30468 }
30469 return false;
30470}
30471
30472// Return the largest legal scalable vector type that matches VT's element type.
30476 "Expected legal fixed length vector!");
30477 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
30478 default:
30479 llvm_unreachable("unexpected element type for SVE container");
30480 case MVT::i8:
30481 return EVT(MVT::nxv16i8);
30482 case MVT::i16:
30483 return EVT(MVT::nxv8i16);
30484 case MVT::i32:
30485 return EVT(MVT::nxv4i32);
30486 case MVT::i64:
30487 return EVT(MVT::nxv2i64);
30488 case MVT::bf16:
30489 return EVT(MVT::nxv8bf16);
30490 case MVT::f16:
30491 return EVT(MVT::nxv8f16);
30492 case MVT::f32:
30493 return EVT(MVT::nxv4f32);
30494 case MVT::f64:
30495 return EVT(MVT::nxv2f64);
30496 }
30497}
30498
30499// Return a predicate with active lanes corresponding to the extent of VT.
30501 EVT VT) {
30504 "Expected legal fixed length vector!");
30505
30506 std::optional<unsigned> PgPattern =
30508 assert(PgPattern && "Unexpected element count for SVE predicate");
30509
30510 MVT MaskVT;
30511 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
30512 default:
30513 llvm_unreachable("unexpected element type for SVE predicate");
30514 case MVT::i8:
30515 MaskVT = MVT::nxv16i1;
30516 break;
30517 case MVT::i16:
30518 case MVT::f16:
30519 case MVT::bf16:
30520 MaskVT = MVT::nxv8i1;
30521 break;
30522 case MVT::i32:
30523 case MVT::f32:
30524 MaskVT = MVT::nxv4i1;
30525 break;
30526 case MVT::i64:
30527 case MVT::f64:
30528 MaskVT = MVT::nxv2i1;
30529 break;
30530 }
30531
30532 return getPTrue(DAG, DL, MaskVT, *PgPattern);
30533}
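// For example, a fixed-length v8f32 operation yields an nxv4i1 predicate
// equivalent to "ptrue p<n>.s, vl8", i.e. exactly the first eight 32-bit
// lanes are active.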
30534
30536 EVT VT) {
30538 "Expected legal scalable vector!");
30539 auto PredTy = VT.changeVectorElementType(*DAG.getContext(), MVT::i1);
30540 return DAG.getConstant(1, DL, PredTy);
30541}
30542
30544 if (VT.isFixedLengthVector())
30545 return getPredicateForFixedLengthVector(DAG, DL, VT);
30546
30547 return getPredicateForScalableVector(DAG, DL, VT);
30548}
30549
30550// Grow V to consume an entire SVE register.
30552 assert(VT.isScalableVector() &&
30553 "Expected to convert into a scalable vector!");
30554 assert(V.getValueType().isFixedLengthVector() &&
30555 "Expected a fixed length vector operand!");
30556 SDLoc DL(V);
30557 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30558 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
30559}
30560
30561// Shrink V so it's just big enough to maintain a VT's worth of data.
30564 "Expected to convert into a fixed length vector!");
30565 assert(V.getValueType().isScalableVector() &&
30566 "Expected a scalable vector operand!");
30567 SDLoc DL(V);
30568 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30569 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
30570}
30571
30572// Convert all fixed length vector loads larger than NEON to masked_loads.
30573SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
30574 SDValue Op, SelectionDAG &DAG) const {
30575 auto Load = cast<LoadSDNode>(Op);
30576
30577 SDLoc DL(Op);
30578 EVT VT = Op.getValueType();
30579 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30580 EVT LoadVT = ContainerVT;
30581 EVT MemVT = Load->getMemoryVT();
30582
30583 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
30584
30585 if (VT.isFloatingPoint()) {
30586 LoadVT = ContainerVT.changeTypeToInteger();
30587 MemVT = MemVT.changeTypeToInteger();
30588 }
30589
30590 SDValue NewLoad = DAG.getMaskedLoad(
30591 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
30592 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
30593 Load->getAddressingMode(), Load->getExtensionType());
30594
30595 SDValue Result = NewLoad;
30596 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
30597 EVT ExtendVT = ContainerVT.changeVectorElementType(
30598 *DAG.getContext(), Load->getMemoryVT().getVectorElementType());
30599
30600 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
30601 Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
30602 Pg, Result, DAG.getUNDEF(ContainerVT));
30603 } else if (VT.isFloatingPoint()) {
30604 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
30605 }
30606
30607 Result = convertFromScalableVector(DAG, VT, Result);
30608 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
30609 return DAG.getMergeValues(MergedValues, DL);
30610}
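// For example, with 256-bit SVE a legal v8f32 load becomes a masked load of
// nxv4i32 governed by a "vl8" predicate, is bitcast back to nxv4f32, and is
// then extracted back down to the original v8f32 type.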
30611
30613 SelectionDAG &DAG) {
30614 SDLoc DL(Mask);
30615 EVT InVT = Mask.getValueType();
30616 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30618
30619 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
30620 return Pg;
30621
30622 bool InvertCond = false;
30623 if (isBitwiseNot(Mask)) {
30624 InvertCond = true;
30625 Mask = Mask.getOperand(0);
30626 }
30627
30628 SDValue Op1, Op2;
30629 ISD::CondCode CC;
30630
30631 // When Mask is the result of a SETCC, it's better to regenerate the compare.
30632 if (Mask.getOpcode() == ISD::SETCC) {
30633 Op1 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(0));
30634 Op2 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(1));
30635 CC = cast<CondCodeSDNode>(Mask.getOperand(2))->get();
30636 } else {
30637 Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
30638 Op2 = DAG.getConstant(0, DL, ContainerVT);
30639 CC = ISD::SETNE;
30640 }
30641
30642 if (InvertCond)
30643 CC = getSetCCInverse(CC, Op1.getValueType());
30644
30645 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
30646 {Pg, Op1, Op2, DAG.getCondCode(CC)});
30647}
30648
30649// Convert all fixed length vector loads larger than NEON to masked_loads.
30650SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
30651 SDValue Op, SelectionDAG &DAG) const {
30653
30654 SDLoc DL(Op);
30655 EVT VT = Op.getValueType();
30656 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30657
30658 SDValue Mask = Load->getMask();
30659 // If this is an extending load and the mask type is not the same as
30660 // load's type then we have to extend the mask type.
30661 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
30662 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
30663 "Incorrect mask type");
30664 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Mask);
30665 }
30667
30668 SDValue PassThru;
30669 bool IsPassThruZeroOrUndef = false;
30670
30671 if (Load->getPassThru()->isUndef()) {
30672 PassThru = DAG.getUNDEF(ContainerVT);
30673 IsPassThruZeroOrUndef = true;
30674 } else {
30675 if (ContainerVT.isInteger())
30676 PassThru = DAG.getConstant(0, DL, ContainerVT);
30677 else
30678 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
30679 if (isZerosVector(Load->getPassThru().getNode()))
30680 IsPassThruZeroOrUndef = true;
30681 }
30682
30683 SDValue NewLoad = DAG.getMaskedLoad(
30684 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
30685 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
30686 Load->getAddressingMode(), Load->getExtensionType());
30687
30688 SDValue Result = NewLoad;
30689 if (!IsPassThruZeroOrUndef) {
30690 SDValue OldPassThru =
30691 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
30692 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
30693 }
30694
30695 Result = convertFromScalableVector(DAG, VT, Result);
30696 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
30697 return DAG.getMergeValues(MergedValues, DL);
30698}
30699
30700// Convert all fixed length vector stores larger than NEON to masked_stores.
30701SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
30702 SDValue Op, SelectionDAG &DAG) const {
30703 auto Store = cast<StoreSDNode>(Op);
30704
30705 SDLoc DL(Op);
30706 EVT VT = Store->getValue().getValueType();
30707 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30708 EVT MemVT = Store->getMemoryVT();
30709
30710 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
30711 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
30712
30713 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
30714 EVT TruncVT = ContainerVT.changeVectorElementType(
30715 *DAG.getContext(), Store->getMemoryVT().getVectorElementType());
30716 MemVT = MemVT.changeTypeToInteger();
30717 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
30718 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
30719 DAG.getUNDEF(TruncVT));
30720 NewValue =
30721 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
30722 } else if (VT.isFloatingPoint()) {
30723 MemVT = MemVT.changeTypeToInteger();
30724 NewValue =
30725 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
30726 }
30727
30728 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
30729 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
30730 Store->getMemOperand(), Store->getAddressingMode(),
30731 Store->isTruncatingStore());
30732}
30733
30734SDValue AArch64TargetLowering::LowerMSTORE(SDValue Op,
30735 SelectionDAG &DAG) const {
30736 SDLoc DL(Op);
30738 EVT VT = Store->getValue().getValueType();
30739 if (VT.isFixedLengthVector())
30740 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
30741
30742 if (!Store->isCompressingStore())
30743 return SDValue();
30744
30745 EVT MaskVT = Store->getMask().getValueType();
30746 EVT MaskExtVT = getPromotedVTForPredicate(MaskVT);
30747 EVT MaskReduceVT = MaskExtVT.getScalarType();
30748 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30749
30750 SDValue MaskExt =
30751 DAG.getNode(ISD::ZERO_EXTEND, DL, MaskExtVT, Store->getMask());
30752 SDValue CntActive =
30753 DAG.getNode(ISD::VECREDUCE_ADD, DL, MaskReduceVT, MaskExt);
30754 if (MaskReduceVT != MVT::i64)
30755 CntActive = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CntActive);
30756
30757 SDValue CompressedValue =
30758 DAG.getNode(ISD::VECTOR_COMPRESS, DL, VT, Store->getValue(),
30759 Store->getMask(), DAG.getPOISON(VT));
30760 SDValue CompressedMask =
30761 DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT, Zero, CntActive);
30762
30763 return DAG.getMaskedStore(Store->getChain(), DL, CompressedValue,
30764 Store->getBasePtr(), Store->getOffset(),
30765 CompressedMask, Store->getMemoryVT(),
30766 Store->getMemOperand(), Store->getAddressingMode(),
30767 Store->isTruncatingStore(),
30768 /*isCompressing=*/false);
30769}
30770
30771SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
30772 SDValue Op, SelectionDAG &DAG) const {
30774
30775 SDLoc DL(Op);
30776 EVT VT = Store->getValue().getValueType();
30777 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30778
30779 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
30781
30782 return DAG.getMaskedStore(
30783 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
30784 Mask, Store->getMemoryVT(), Store->getMemOperand(),
30785 Store->getAddressingMode(), Store->isTruncatingStore(),
30786 Store->isCompressingStore());
30787}
30788
30789SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
30790 SDValue Op, SelectionDAG &DAG) const {
30791 SDLoc DL(Op);
30792 EVT VT = Op.getValueType();
30793 EVT EltVT = VT.getVectorElementType();
30794
30795 bool Signed = Op.getOpcode() == ISD::SDIV;
30796 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
30797
30798 bool Negated;
30799 uint64_t SplatVal;
30800 // NOTE: SRAD cannot be used to represent sdiv-by-one.
30801 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) &&
30802 SplatVal > 1) {
30803 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30804 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
30805 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32);
30806
30808 SDValue Res =
30809 DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, DL, ContainerVT, Pg, Op1, Op2);
30810 if (Negated)
30811 Res = DAG.getNode(ISD::SUB, DL, ContainerVT,
30812 DAG.getConstant(0, DL, ContainerVT), Res);
30813
30814 return convertFromScalableVector(DAG, VT, Res);
30815 }
30816
30817 // Scalable vector i32/i64 DIV is supported.
30818 if (EltVT == MVT::i32 || EltVT == MVT::i64)
30819 return LowerToPredicatedOp(Op, DAG, PredOpcode);
30820
30821 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
30822 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
30823 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
30824 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30825
30826 // If the wider type is legal: extend, op, and truncate.
30827 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
30828 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
30829 SDValue Op0 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(0));
30830 SDValue Op1 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(1));
30831 SDValue Div = DAG.getNode(Op.getOpcode(), DL, WideVT, Op0, Op1);
30832 return DAG.getNode(ISD::TRUNCATE, DL, VT, Div);
30833 }
30834
30835 auto HalveAndExtendVector = [&DAG, &DL, &HalfVT, &PromVT,
30836 &ExtendOpcode](SDValue Op) {
30837 SDValue IdxZero = DAG.getConstant(0, DL, MVT::i64);
30838 SDValue IdxHalf =
30839 DAG.getConstant(HalfVT.getVectorNumElements(), DL, MVT::i64);
30840 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxZero);
30841 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxHalf);
30842 return std::pair<SDValue, SDValue>(
30843 {DAG.getNode(ExtendOpcode, DL, PromVT, Lo),
30844 DAG.getNode(ExtendOpcode, DL, PromVT, Hi)});
30845 };
30846
30847 // If wider type is not legal: split, extend, op, trunc and concat.
30848 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
30849 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
30850 SDValue Lo = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0LoExt, Op1LoExt);
30851 SDValue Hi = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0HiExt, Op1HiExt);
30852 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Lo);
30853 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Hi);
30854 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoTrunc, HiTrunc});
30855}
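// For example, a v16i8 sdiv is widened to v16i16 when that type is legal;
// otherwise it is split into two halves, each half is extended to v8i16,
// divided (recursing until a supported i32/i64 element type is reached),
// truncated, and the halves are concatenated again.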
30856
30857SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
30858 SDValue Op, SelectionDAG &DAG) const {
30859 EVT VT = Op.getValueType();
30860 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30861
30862 SDLoc DL(Op);
30863 SDValue Val = Op.getOperand(0);
30864 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
30865 Val = convertToScalableVector(DAG, ContainerVT, Val);
30866
30867 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
30868 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
30869
30870 // Repeatedly unpack Val until the result is of the desired element type.
30871 switch (ContainerVT.getSimpleVT().SimpleTy) {
30872 default:
30873 llvm_unreachable("unimplemented container type");
30874 case MVT::nxv16i8:
30875 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
30876 if (VT.getVectorElementType() == MVT::i16)
30877 break;
30878 [[fallthrough]];
30879 case MVT::nxv8i16:
30880 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
30881 if (VT.getVectorElementType() == MVT::i32)
30882 break;
30883 [[fallthrough]];
30884 case MVT::nxv4i32:
30885 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
30886 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
30887 break;
30888 }
30889
30890 return convertFromScalableVector(DAG, VT, Val);
30891}
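// For example, sign-extending i8 data held in an nxv16i8 container up to
// i64 elements applies SUNPKLO three times:
//   nxv16i8 -> nxv8i16 -> nxv4i32 -> nxv2i64.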
30892
30893SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
30894 SDValue Op, SelectionDAG &DAG) const {
30895 EVT VT = Op.getValueType();
30896 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30897
30898 SDLoc DL(Op);
30899 SDValue Val = Op.getOperand(0);
30900 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
30901 Val = convertToScalableVector(DAG, ContainerVT, Val);
30902
30903 // Repeatedly truncate Val until the result is of the desired element type.
30904 switch (ContainerVT.getSimpleVT().SimpleTy) {
30905 default:
30906 llvm_unreachable("unimplemented container type");
30907 case MVT::nxv2i64:
30908 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
30909 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
30910 if (VT.getVectorElementType() == MVT::i32)
30911 break;
30912 [[fallthrough]];
30913 case MVT::nxv4i32:
30914 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
30915 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
30916 if (VT.getVectorElementType() == MVT::i16)
30917 break;
30918 [[fallthrough]];
30919 case MVT::nxv8i16:
30920 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
30921 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
30922 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
30923 break;
30924 }
30925
30926 return convertFromScalableVector(DAG, VT, Val);
30927}
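// For example, truncating i64 elements down to i8 applies UZP1 at each step,
//   nxv2i64 -> nxv4i32 -> nxv8i16 -> nxv16i8,
// keeping the low half of every wider element each time.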
30928
30929SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
30930 SDValue Op, SelectionDAG &DAG) const {
30931 EVT VT = Op.getValueType();
30932 EVT InVT = Op.getOperand(0).getValueType();
30933 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
30934
30935 SDLoc DL(Op);
30936 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30937 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
30938
30939 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
30940}
30941
30942SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
30943 SDValue Op, SelectionDAG &DAG) const {
30944 EVT VT = Op.getValueType();
30945 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30946
30947 SDLoc DL(Op);
30948 EVT InVT = Op.getOperand(0).getValueType();
30949 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30950 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
30951
30952 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
30953 Op.getOperand(1), Op.getOperand(2));
30954
30955 return convertFromScalableVector(DAG, VT, ScalableRes);
30956}
30957
30958// Convert vector operation 'Op' to an equivalent predicated operation whereby
30959// the original operation's type is used to construct a suitable predicate.
30960// NOTE: The results for inactive lanes are undefined.
30961SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
30962 SelectionDAG &DAG,
30963 unsigned NewOp) const {
30964 EVT VT = Op.getValueType();
30965 SDLoc DL(Op);
30966 auto Pg = getPredicateForVector(DAG, DL, VT);
30967
30968 if (VT.isFixedLengthVector()) {
30969 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
30970 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30971
30972 // Create list of operands by converting existing ones to scalable types.
30973 SmallVector<SDValue, 4> Operands = {Pg};
30974 for (const SDValue &V : Op->op_values()) {
30975 if (isa<CondCodeSDNode>(V)) {
30976 Operands.push_back(V);
30977 continue;
30978 }
30979
30980 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
30981 EVT VTArg = VTNode->getVT().getVectorElementType();
30982 EVT NewVTArg =
30983 ContainerVT.changeVectorElementType(*DAG.getContext(), VTArg);
30984 Operands.push_back(DAG.getValueType(NewVTArg));
30985 continue;
30986 }
30987
30988 assert(isTypeLegal(V.getValueType()) &&
30989 "Expected only legal fixed-width types");
30990 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
30991 }
30992
30993 if (isMergePassthruOpcode(NewOp))
30994 Operands.push_back(DAG.getUNDEF(ContainerVT));
30995
30996 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
30997 return convertFromScalableVector(DAG, VT, ScalableRes);
30998 }
30999
31000 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
31001
31002 SmallVector<SDValue, 4> Operands = {Pg};
31003 for (const SDValue &V : Op->op_values()) {
31004 assert((!V.getValueType().isVector() ||
31005 V.getValueType().isScalableVector()) &&
31006 "Only scalable vectors are supported!");
31007 Operands.push_back(V);
31008 }
31009
31010 if (isMergePassthruOpcode(NewOp))
31011 Operands.push_back(DAG.getUNDEF(VT));
31012
31013 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
31014}
31015
31016// If a fixed length vector operation has no side effects when applied to
31017// undefined elements, we can safely use scalable vectors to perform the same
31018// operation without needing to worry about predication.
31019SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
31020 SelectionDAG &DAG) const {
31021 EVT VT = Op.getValueType();
31023 "Only expected to lower fixed length vector operation!");
31024 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31025
31026 // Create list of operands by converting existing ones to scalable types.
31028 for (const SDValue &V : Op->op_values()) {
31029 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
31030
31031 // Pass through non-vector operands.
31032 if (!V.getValueType().isVector()) {
31033 Ops.push_back(V);
31034 continue;
31035 }
31036
31037 // "cast" fixed length vector to a scalable vector.
31038 assert(V.getValueType().isFixedLengthVector() &&
31039 isTypeLegal(V.getValueType()) &&
31040 "Only fixed length vectors are supported!");
31041 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
31042 }
31043
31044 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
31045 return convertFromScalableVector(DAG, VT, ScalableRes);
31046}
31047
31048SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
31049 SelectionDAG &DAG) const {
31050 SDLoc DL(ScalarOp);
31051 SDValue AccOp = ScalarOp.getOperand(0);
31052 SDValue VecOp = ScalarOp.getOperand(1);
31053 EVT SrcVT = VecOp.getValueType();
31054 EVT ResVT = SrcVT.getVectorElementType();
31055
31056 EVT ContainerVT = SrcVT;
31057 if (SrcVT.isFixedLengthVector()) {
31058 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
31059 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
31060 }
31061
31062 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
31063 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
31064
31065 // Convert operands to Scalable.
31066 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
31067 DAG.getUNDEF(ContainerVT), AccOp, Zero);
31068
31069 // Perform reduction.
31070 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
31071 Pg, AccOp, VecOp);
31072
31073 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
31074}
31075
31076SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
31077 SelectionDAG &DAG) const {
31078 SDLoc DL(ReduceOp);
31079 SDValue Op = ReduceOp.getOperand(0);
31080 EVT OpVT = Op.getValueType();
31081 EVT VT = ReduceOp.getValueType();
31082
31083 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
31084 return SDValue();
31085
31086 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
31087
31088 switch (ReduceOp.getOpcode()) {
31089 default:
31090 return SDValue();
31091 case ISD::VECREDUCE_OR:
31092 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
31093 // The predicate can be 'Op' because
31094 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
31095 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
31096 else
31097 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
31098 case ISD::VECREDUCE_AND: {
31099 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
31100 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
31101 }
31102 case ISD::VECREDUCE_XOR: {
31103 SDValue ID =
31104 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
31105 if (OpVT == MVT::nxv1i1) {
31106 // Emulate a CNTP on .Q using .D and a different governing predicate.
31107 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
31108 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
31109 }
31110 SDValue Cntp =
31111 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
31112 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
31113 }
31114 }
31115
31116 return SDValue();
31117}
31118
31119SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
31120 SDValue ScalarOp,
31121 SelectionDAG &DAG) const {
31122 SDLoc DL(ScalarOp);
31123 SDValue VecOp = ScalarOp.getOperand(0);
31124 EVT SrcVT = VecOp.getValueType();
31125
31127 SrcVT,
31128 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
31129 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
31130 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
31131 }
31132
31133 // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
31134 if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
31135 VecOp.getOpcode() == ISD::ZERO_EXTEND) {
31136 SDValue BoolVec = VecOp.getOperand(0);
31137 if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
31138 // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
31139 SDValue CntpOp = DAG.getNode(
31140 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
31141 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
31142 BoolVec, BoolVec);
31143 return DAG.getAnyExtOrTrunc(CntpOp, DL, ScalarOp.getValueType());
31144 }
31145 }
31146
31147 // UADDV always returns an i64 result.
31148 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
31149 SrcVT.getVectorElementType();
31150 EVT RdxVT = SrcVT;
31151 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
31152 RdxVT = getPackedSVEVectorVT(ResVT);
31153
31154 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
31155 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
31156 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
31157 Rdx, DAG.getConstant(0, DL, MVT::i64));
31158
31159 // The VEC_REDUCE nodes expect an element size result.
31160 if (ResVT != ScalarOp.getValueType())
31161 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
31162
31163 return Res;
31164}
31165
31166SDValue
31167AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
31168 SelectionDAG &DAG) const {
31169 EVT VT = Op.getValueType();
31170 SDLoc DL(Op);
31171
31172 EVT InVT = Op.getOperand(1).getValueType();
31173 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
31174 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
31175 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
31176
31177 // Convert the mask to a predicate (NOTE: We don't need to worry about
31178 // inactive lanes since VSELECT is safe when given undefined elements).
31179 EVT MaskVT = Op.getOperand(0).getValueType();
31180 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
31181 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
31182 Mask = DAG.getNode(
31184 MaskContainerVT.changeVectorElementType(*DAG.getContext(), MVT::i1),
31185 Mask);
31186
31187 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
31188 Mask, Op1, Op2);
31189
31190 return convertFromScalableVector(DAG, VT, ScalableRes);
31191}
31192
31193SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
31194 SDValue Op, SelectionDAG &DAG) const {
31195 SDLoc DL(Op);
31196 EVT InVT = Op.getOperand(0).getValueType();
31197 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
31198
31199 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
31200 "Only expected to lower fixed length vector operation!");
31201 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
31202 "Expected integer result of the same bit length as the inputs!");
31203
31204 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
31205 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
31206 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
31207
31208 EVT CmpVT = Pg.getValueType();
31209 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
31210 {Pg, Op1, Op2, Op.getOperand(2)});
31211
31212 EVT PromoteVT = ContainerVT.changeTypeToInteger();
31213 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
31214 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
31215}
31216
31217SDValue
31218AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
31219 SelectionDAG &DAG) const {
31220 SDLoc DL(Op);
31221 auto SrcOp = Op.getOperand(0);
31222 EVT VT = Op.getValueType();
31223 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
31224 EVT ContainerSrcVT =
31226
31227 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
31228 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
31229 return convertFromScalableVector(DAG, VT, Op);
31230}
31231
31232SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
31233 SDValue Op, SelectionDAG &DAG) const {
31234 SDLoc DL(Op);
31235 unsigned NumOperands = Op->getNumOperands();
31236
31237 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
31238 "Unexpected number of operands in CONCAT_VECTORS");
31239
31240 auto SrcOp1 = Op.getOperand(0);
31241 auto SrcOp2 = Op.getOperand(1);
31242 EVT VT = Op.getValueType();
31243 EVT SrcVT = SrcOp1.getValueType();
31244
31245 // Match a splat of 128b segments that fit in a single register.
31246 if (SrcVT.is128BitVector() && all_equal(Op.getNode()->op_values())) {
31247 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31248 SDValue Splat =
31249 DAG.getNode(AArch64ISD::DUPLANE128, DL, ContainerVT,
31250 convertToScalableVector(DAG, ContainerVT, SrcOp1),
31251 DAG.getConstant(0, DL, MVT::i64, /*isTarget=*/true));
31252 return convertFromScalableVector(DAG, VT, Splat);
31253 }
31254
31255 if (NumOperands > 2) {
31257 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
31258 for (unsigned I = 0; I < NumOperands; I += 2)
31259 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
31260 Op->getOperand(I), Op->getOperand(I + 1)));
31261
31262 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
31263 }
31264
31265 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31266
31268 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
31269 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
31270
31271 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
31272
31273 return convertFromScalableVector(DAG, VT, Op);
31274}
31275
31276SDValue
31277AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
31278 SelectionDAG &DAG) const {
31279 EVT VT = Op.getValueType();
31280 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
31281
31282 SDLoc DL(Op);
31283 SDValue Val = Op.getOperand(0);
31284 SDValue Pg = getPredicateForVector(DAG, DL, VT);
31285 EVT SrcVT = Val.getValueType();
31286 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31287 EVT ExtendVT = ContainerVT.changeVectorElementType(
31288 *DAG.getContext(), SrcVT.getVectorElementType());
31289
31290 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
31291 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
31292
31293 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
31294 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
31295 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
31296 Pg, Val, DAG.getUNDEF(ContainerVT));
31297
31298 return convertFromScalableVector(DAG, VT, Val);
31299}
31300
31301SDValue
31302AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
31303 SelectionDAG &DAG) const {
31304 EVT VT = Op.getValueType();
31305 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
31306
31307 SDLoc DL(Op);
31308 SDValue Val = Op.getOperand(0);
31309 EVT SrcVT = Val.getValueType();
31310 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
31311 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
31312 *DAG.getContext(), VT.getVectorElementType());
31313 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
31314
31315 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
31316 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
31317 Op.getOperand(1), DAG.getUNDEF(RoundVT));
31318 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
31319 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
31320
31321 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
31322 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
31323}
31324
31325SDValue
31326AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
31327 SelectionDAG &DAG) const {
31328 EVT VT = Op.getValueType();
31329 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
31330
31331 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
31332 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
31333 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
31334
31335 SDLoc DL(Op);
31336 SDValue Val = Op.getOperand(0);
31337 EVT SrcVT = Val.getValueType();
31338 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
31339 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
31340
31341 if (VT.bitsGE(SrcVT)) {
31343
31344 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
31345 VT.changeTypeToInteger(), Val);
31346
31347 // Safe to use a larger than specified operand because by promoting the
31348 // value nothing has changed from an arithmetic point of view.
31349 Val =
31350 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
31351 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
31352 DAG.getUNDEF(ContainerDstVT));
31353 return convertFromScalableVector(DAG, VT, Val);
31354 } else {
31355 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
31356 *DAG.getContext(), ContainerDstVT.getVectorElementType());
31358
31359 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
31360 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
31361 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
31362 Val = convertFromScalableVector(DAG, SrcVT, Val);
31363
31364 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
31365 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
31366 }
31367}
31368
31369SDValue
31370AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
31371 SelectionDAG &DAG) const {
31372 SDLoc DL(Op);
31373 EVT OpVT = Op.getValueType();
31374
31375 if (OpVT.isScalableVector() && Op->getNumOperands() == 3) {
31376 // aarch64_sve_ld3 only supports packed datatypes.
31377 EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
31378 Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
31380 DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
31381
31382 // Write out unmodified operands.
31384 for (unsigned I = 0; I < 3; ++I) {
31385 SDValue Ptr =
31386 DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
31387 SDValue V = getSVESafeBitCast(PackedVT, Op.getOperand(I), DAG);
31388 Chains.push_back(
31389 DAG.getStore(DAG.getEntryNode(), DL, V, Ptr, MachinePointerInfo()));
31390 }
31391
31392 Intrinsic::ID IntID = Intrinsic::aarch64_sve_ld3_sret;
31393 EVT PredVT = PackedVT.changeVectorElementType(*DAG.getContext(), MVT::i1);
31394
31396 Ops.push_back(DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains));
31397 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
31398 Ops.push_back(DAG.getConstant(1, DL, PredVT));
31399 Ops.push_back(StackPtr);
31400
31401 // Read back and deinterleave data.
31402 SDVTList VTs = DAG.getVTList(PackedVT, PackedVT, PackedVT, MVT::Other);
31403 SDValue LD3 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops);
31404
31406 Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(0), DAG));
31407 Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(1), DAG));
31408 Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(2), DAG));
31409 return DAG.getMergeValues(Results, DL);
31410 }
31411
31412 // Are multi-register uzp instructions available?
31413 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
31414 OpVT.isScalableVector() && OpVT.getVectorElementType() != MVT::i1) {
31415 Intrinsic::ID IntID;
31416 switch (Op->getNumOperands()) {
31417 default:
31418 return SDValue();
31419 case 2:
31420 IntID = Intrinsic::aarch64_sve_uzp_x2;
31421 break;
31422 case 4:
31423 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
31424 OpVT.getScalarSizeInBits() == 64)
31425 return SDValue();
31426 IntID = Intrinsic::aarch64_sve_uzp_x4;
31427 break;
31428 }
31429
31431 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
31432 Ops.append(Op->op_values().begin(), Op->op_values().end());
31433 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
31434 }
31435
31436 if (Op->getNumOperands() != 2)
31437 return SDValue();
31438
31439 if (OpVT == MVT::v1i64 || OpVT == MVT::v1f64)
31440 return DAG.getMergeValues({Op.getOperand(0), Op.getOperand(1)}, DL);
31441
31442 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
31443 Op.getOperand(1));
31444 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
31445 Op.getOperand(1));
31446 return DAG.getMergeValues({Even, Odd}, DL);
31447}
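// Editorial sketch (not part of the upstream source): the two-operand fallback
// above maps vector.deinterleave2 onto UZP1/UZP2, i.e. the even and odd lanes
// of the concatenated operands. A scalar reference model, with a hypothetical
// helper name, assuming equally sized inputs:
static std::pair<std::vector<int>, std::vector<int>>
referenceDeinterleave2(const std::vector<int> &A, const std::vector<int> &B) {
  assert(A.size() == B.size() && "operands must have the same length");
  std::vector<int> Concat(A);
  Concat.insert(Concat.end(), B.begin(), B.end());
  std::vector<int> Even, Odd; // UZP1 result, UZP2 result.
  for (size_t I = 0; I < Concat.size(); ++I)
    (I % 2 == 0 ? Even : Odd).push_back(Concat[I]);
  return {Even, Odd};
}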
31448
31449SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
31450 SelectionDAG &DAG) const {
31451 SDLoc DL(Op);
31452 EVT OpVT = Op.getValueType();
31453
31454 if (OpVT.isScalableVector() && Op->getNumOperands() == 3) {
31455 // aarch64_sve_st3 only supports packed datatypes.
31456 EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
31457 SmallVector<SDValue> InVecs;
31458 for (SDValue V : Op->ops())
31459 InVecs.push_back(getSVESafeBitCast(PackedVT, V, DAG));
31460
31461 Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
31462 SDValue StackPtr =
31463 DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
31464
31465 Intrinsic::ID IntID = Intrinsic::aarch64_sve_st3;
31466 EVT PredVT = PackedVT.changeVectorElementType(*DAG.getContext(), MVT::i1);
31467
31468 SmallVector<SDValue> Ops;
31469 Ops.push_back(DAG.getEntryNode());
31470 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
31471 Ops.append(InVecs);
31472 Ops.push_back(DAG.getConstant(1, DL, PredVT));
31473 Ops.push_back(StackPtr);
31474
31475 // Interleave operands and store.
31476 SDValue Chain = DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops);
31477
31478 // Read back the interleaved data.
31479 SmallVector<SDValue> Results;
31480 for (unsigned I = 0; I < 3; ++I) {
31481 SDValue Ptr =
31482 DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
31483 SDValue L = DAG.getLoad(PackedVT, DL, Chain, Ptr, MachinePointerInfo());
31484 Results.push_back(getSVESafeBitCast(OpVT, L, DAG));
31485 }
31486
31487 return DAG.getMergeValues(Results, DL);
31488 }
31489
31490 // Are multi-register zip instructions available?
31491 // If so, use them for packed types. Interleaves of unpacked types can be
31492 // selected using trn1.
31493 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
31494 OpVT.isScalableVector() && isPackedVectorType(OpVT, DAG)) {
31495 Intrinsic::ID IntID;
31496 switch (Op->getNumOperands()) {
31497 default:
31498 return SDValue();
31499 case 2:
31500 IntID = Intrinsic::aarch64_sve_zip_x2;
31501 break;
31502 case 4:
31503 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
31504 OpVT.getScalarSizeInBits() == 64)
31505 return SDValue();
31506 IntID = Intrinsic::aarch64_sve_zip_x4;
31507 break;
31508 }
31509
31510 SmallVector<SDValue> Ops;
31511 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
31512 Ops.append(Op->op_values().begin(), Op->op_values().end());
31513 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
31514 }
31515
31516 if (Op->getNumOperands() != 2)
31517 return SDValue();
31518
31519 if (OpVT == MVT::v1i64 || OpVT == MVT::v1f64)
31520 return DAG.getMergeValues({Op.getOperand(0), Op.getOperand(1)}, DL);
31521
31522 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
31523 Op.getOperand(1));
31524 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
31525 Op.getOperand(1));
31526 return DAG.getMergeValues({Lo, Hi}, DL);
31527}
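// Editorial sketch (not part of the upstream source): the two-operand fallback
// above maps vector.interleave2 onto ZIP1/ZIP2. ZIP1 interleaves the low halves
// of both operands and ZIP2 the high halves. Scalar reference model with a
// hypothetical helper name:
static std::pair<std::vector<int>, std::vector<int>>
referenceInterleave2(const std::vector<int> &A, const std::vector<int> &B) {
  assert(A.size() == B.size() && "operands must have the same length");
  size_t Half = A.size() / 2;
  std::vector<int> Lo, Hi; // ZIP1 result, ZIP2 result.
  for (size_t I = 0; I < Half; ++I) {
    Lo.push_back(A[I]);
    Lo.push_back(B[I]);
    Hi.push_back(A[Half + I]);
    Hi.push_back(B[Half + I]);
  }
  return {Lo, Hi};
}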
31528
31529SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
31530 SelectionDAG &DAG) const {
31531 // FIXME: Maybe share some code with LowerMGather/Scatter?
31532 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
31533 SDLoc DL(HG);
31534 SDValue Chain = HG->getChain();
31535 SDValue Inc = HG->getInc();
31536 SDValue Mask = HG->getMask();
31537 SDValue Ptr = HG->getBasePtr();
31538 SDValue Index = HG->getIndex();
31539 SDValue Scale = HG->getScale();
31540 SDValue IntID = HG->getIntID();
31541
31542 // The Intrinsic ID determines the type of update operation.
31543 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
31544 // Right now, we only support 'add' as an update.
31545 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
31546 "Unexpected histogram update operation");
31547
31548 EVT IndexVT = Index.getValueType();
31549 LLVMContext &Ctx = *DAG.getContext();
31550 ElementCount EC = IndexVT.getVectorElementCount();
31551 EVT MemVT = EVT::getVectorVT(Ctx, HG->getMemoryVT(), EC);
31552 EVT IncExtVT =
31553 EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
31554 EVT IncSplatVT = EVT::getVectorVT(Ctx, IncExtVT, EC);
31555 bool ExtTrunc = IncSplatVT != MemVT;
31556
31557 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
31558 SDValue PassThru = DAG.getSplatVector(IncSplatVT, DL, Zero);
31559 SDValue IncSplat = DAG.getSplatVector(
31560 IncSplatVT, DL, DAG.getAnyExtOrTrunc(Inc, DL, IncExtVT));
31561 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
31562
31563 MachineMemOperand *MMO = HG->getMemOperand();
31564 // Create an MMO for the gather, without load|store flags.
31565 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
31567 MMO->getAlign(), MMO->getAAInfo());
31568 ISD::MemIndexType IndexType = HG->getIndexType();
31569 SDValue Gather = DAG.getMaskedGather(
31570 DAG.getVTList(IncSplatVT, MVT::Other), MemVT, DL, Ops, GMMO, IndexType,
31571 ExtTrunc ? ISD::EXTLOAD : ISD::NON_EXTLOAD);
31572
31573 SDValue GChain = Gather.getValue(1);
31574
31575 // Perform the histcnt, multiply by inc, add to bucket data.
31576 SDValue ID =
31577 DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncExtVT);
31578 SDValue HistCnt =
31579 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
31580 SDValue Mul = DAG.getNode(ISD::MUL, DL, IncSplatVT, HistCnt, IncSplat);
31581 SDValue Add = DAG.getNode(ISD::ADD, DL, IncSplatVT, Gather, Mul);
31582
31583 // Create an MMO for the scatter, without load|store flags.
31584 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
31586 MMO->getAlign(), MMO->getAAInfo());
31587
31588 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
31589 SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
31590 ScatterOps, SMMO, IndexType, ExtTrunc);
31591 return Scatter;
31592}
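// Editorial sketch (not part of the upstream source): the intrinsic lowered
// above has simple scalar semantics - every active lane adds Inc to its bucket,
// and lanes that share a bucket index all accumulate. Hypothetical reference
// model of that semantics:
static void referenceHistogramAdd(std::vector<int64_t> &Buckets,
                                  const std::vector<size_t> &Index,
                                  const std::vector<bool> &Mask, int64_t Inc) {
  assert(Index.size() == Mask.size() && "index and mask lengths must match");
  for (size_t I = 0; I < Index.size(); ++I) {
    if (!Mask[I])
      continue;
    assert(Index[I] < Buckets.size() && "bucket index out of range");
    Buckets[Index[I]] += Inc;
  }
}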
31593
31594/// If a PARTIAL_REDUCE_MLA node comes in with an accumulator-input type pairing
31595/// of (nx)v2i64/(nx)v16i8, we cannot directly lower it to a (u|s)dot. We can
31596/// however still make use of the dot product instruction by instead
31597/// accumulating over two steps: (nx)v16i8 -> (nx)v4i32 -> (nx)v2i64.
31598/// If available, make use of the (U|S)ADDW(B|T) instructions, otherwise
31599/// the following pattern is emitted:
31600/// add(add(Acc, ext(EXTRACT_SUBVECTOR(N, 0)), ext(EXTRACT_SUBVECTOR(N,
31601/// NTy/2))))
31602SDValue
31603AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
31604 SelectionDAG &DAG) const {
31605 SDLoc DL(Op);
31606
31607 SDValue Acc = Op.getOperand(0);
31608 SDValue LHS = Op.getOperand(1);
31609 SDValue RHS = Op.getOperand(2);
31610 EVT ResultVT = Op.getValueType();
31611 EVT OrigResultVT = ResultVT;
31612 EVT OpVT = LHS.getValueType();
31613
31614 bool ConvertToScalable =
31615 ResultVT.isFixedLengthVector() &&
31616 useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true);
31617
31618 // We can handle this case natively by accumulating into a wider
31619 // zero-padded vector.
31620 if (!ConvertToScalable && ResultVT == MVT::v2i32 && OpVT == MVT::v16i8) {
31621 SDValue ZeroVec = DAG.getConstant(0, DL, MVT::v4i32);
31622 SDValue WideAcc = DAG.getInsertSubvector(DL, ZeroVec, Acc, 0);
31623 SDValue Wide =
31624 DAG.getNode(Op.getOpcode(), DL, MVT::v4i32, WideAcc, LHS, RHS);
31625 SDValue Reduced = DAG.getNode(AArch64ISD::ADDP, DL, MVT::v4i32, Wide, Wide);
31626 return DAG.getExtractSubvector(DL, MVT::v2i32, Reduced, 0);
31627 }
31628
31629 if (ConvertToScalable) {
31630 ResultVT = getContainerForFixedLengthVector(DAG, ResultVT);
31631 OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType());
31632 Acc = convertToScalableVector(DAG, ResultVT, Acc);
31633 LHS = convertToScalableVector(DAG, OpVT, LHS);
31634 RHS = convertToScalableVector(DAG, OpVT, RHS);
31635 Op = DAG.getNode(Op.getOpcode(), DL, ResultVT, {Acc, LHS, RHS});
31636 }
31637
31638 // Two-way and four-way partial reductions are supported by patterns.
31639 // We only need to handle the 8-way partial reduction.
31640 if (ResultVT.getScalarType() != MVT::i64 || OpVT.getScalarType() != MVT::i8)
31641 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Op)
31642 : Op;
31643
31644 EVT DotVT = ResultVT.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
31645 SDValue DotNode = DAG.getNode(Op.getOpcode(), DL, DotVT,
31646 DAG.getConstant(0, DL, DotVT), LHS, RHS);
31647
31648 SDValue Res;
31649 bool IsUnsigned = Op.getOpcode() == ISD::PARTIAL_REDUCE_UMLA;
31650 if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable()) {
31651 unsigned LoOpcode = IsUnsigned ? AArch64ISD::UADDWB : AArch64ISD::SADDWB;
31652 unsigned HiOpcode = IsUnsigned ? AArch64ISD::UADDWT : AArch64ISD::SADDWT;
31653 SDValue Lo = DAG.getNode(LoOpcode, DL, ResultVT, Acc, DotNode);
31654 Res = DAG.getNode(HiOpcode, DL, ResultVT, Lo, DotNode);
31655 } else {
31656 // Fold (nx)v4i32 into (nx)v2i64
31657 auto [DotNodeLo, DotNodeHi] = DAG.SplitVector(DotNode, DL);
31658 if (IsUnsigned) {
31659 DotNodeLo = DAG.getZExtOrTrunc(DotNodeLo, DL, ResultVT);
31660 DotNodeHi = DAG.getZExtOrTrunc(DotNodeHi, DL, ResultVT);
31661 } else {
31662 DotNodeLo = DAG.getSExtOrTrunc(DotNodeLo, DL, ResultVT);
31663 DotNodeHi = DAG.getSExtOrTrunc(DotNodeHi, DL, ResultVT);
31664 }
31665 auto Lo = DAG.getNode(ISD::ADD, DL, ResultVT, Acc, DotNodeLo);
31666 Res = DAG.getNode(ISD::ADD, DL, ResultVT, Lo, DotNodeHi);
31667 }
31668
31669 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Res)
31670 : Res;
31671}
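// Editorial sketch (not part of the upstream source): arithmetic reference for
// the 8-way i8 -> i64 partial reduction handled above. A dot product first
// folds groups of four i8 products into i32 lanes, which are then widened and
// accumulated pairwise into the i64 accumulator (the UADDWB/UADDWT lane
// ordering details are ignored here). Hypothetical helper, signed flavour only:
static std::vector<int64_t>
referencePartialReduce8(std::vector<int64_t> Acc, const std::vector<int8_t> &L,
                        const std::vector<int8_t> &R) {
  assert(L.size() == R.size() && L.size() == Acc.size() * 8);
  std::vector<int32_t> Dot(Acc.size() * 2, 0); // the (nx)v4i32 dot step
  for (size_t I = 0; I < L.size(); ++I)
    Dot[I / 4] += int32_t(L[I]) * int32_t(R[I]);
  for (size_t I = 0; I < Dot.size(); ++I) // the (nx)v2i64 accumulate step
    Acc[I / 2] += Dot[I];
  return Acc;
}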
31672
31673SDValue
31674AArch64TargetLowering::LowerGET_ACTIVE_LANE_MASK(SDValue Op,
31675 SelectionDAG &DAG) const {
31676 EVT VT = Op.getValueType();
31677 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
31678
31679 assert(Subtarget->isSVEorStreamingSVEAvailable() &&
31680 "Lowering fixed length get_active_lane_mask requires SVE!");
31681
31682 // There are no dedicated fixed-length instructions for GET_ACTIVE_LANE_MASK,
31683 // but we can use SVE when available.
31684
31685 SDLoc DL(Op);
31686 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31687 EVT WhileVT = ContainerVT.changeElementType(*DAG.getContext(), MVT::i1);
31688
31689 SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WhileVT,
31690 Op.getOperand(0), Op.getOperand(1));
31691 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
31692 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
31693 DAG.getVectorIdxConstant(0, DL));
31694}
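// Editorial sketch (not part of the upstream source): the node lowered above
// is a lane-wise unsigned compare, lane I being active iff Base + I < Limit
// (overflow subtleties ignored). Hypothetical scalar model:
static std::vector<bool> referenceActiveLaneMask(uint64_t Base, uint64_t Limit,
                                                 unsigned NumLanes) {
  std::vector<bool> Mask(NumLanes);
  for (unsigned I = 0; I < NumLanes; ++I)
    Mask[I] = Base + I < Limit;
  return Mask;
}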
31695
31696SDValue
31697AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
31698 SelectionDAG &DAG) const {
31699 EVT VT = Op.getValueType();
31700 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
31701
31702 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
31703 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
31704 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
31705
31706 SDLoc DL(Op);
31707 SDValue Val = Op.getOperand(0);
31708 EVT SrcVT = Val.getValueType();
31709 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
31710 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
31711
31712 if (VT.bitsGT(SrcVT)) {
31713 EVT CvtVT = ContainerDstVT.changeVectorElementType(
31714 *DAG.getContext(), ContainerSrcVT.getVectorElementType());
31715 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
31716
31717 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
31718 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
31719
31720 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
31721 Val = getSVESafeBitCast(CvtVT, Val, DAG);
31722 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
31723 DAG.getUNDEF(ContainerDstVT));
31724 return convertFromScalableVector(DAG, VT, Val);
31725 } else {
31726 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
31727 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
31728
31729 // Safe to use a larger than specified result since an fp_to_int where the
31730 // result doesn't fit into the destination is undefined.
31731 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
31732 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
31733 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
31734
31735 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
31736 }
31737}
31738
31740 ArrayRef<int> ShuffleMask, EVT VT,
31741 EVT ContainerVT, SelectionDAG &DAG) {
31742 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
31743 SDLoc DL(Op);
31744 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
31745 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
31746 bool IsSingleOp =
31747 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
31748
31749 if (!Subtarget.isNeonAvailable() && !MinSVESize)
31750 MinSVESize = 128;
31751
31752 // Bail out on the two-operand case if SVE2 is unavailable or not all index
31753 // values can be represented.
31754 if (!IsSingleOp && !Subtarget.hasSVE2())
31755 return SDValue();
31756
31757 EVT VTOp1 = Op.getOperand(0).getValueType();
31758 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
31759 unsigned IndexLen = MinSVESize / BitsPerElt;
31760 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
31761 uint64_t MaxOffset = maxUIntN(BitsPerElt);
31762 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
31763 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
31764 bool MinMaxEqual = (MinSVESize == MaxSVESize);
31765 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
31766 "Incorrectly legalised shuffle operation");
31767
31769 // If MinSVESize is not equal to MaxSVESize then we need to know which
31770 // TBL mask element needs adjustment.
31771 SmallVector<SDValue, 8> AddRuntimeVLMask;
31772
31773 // Bail out for 8-bit element types, because with a 2048-bit SVE register
31774 // size 8 bits are only sufficient to index into the first source vector.
31775 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
31776 return SDValue();
31777
31778 for (int Index : ShuffleMask) {
31779 // Handling poison index value.
31780 if (Index < 0)
31781 Index = 0;
31782 // If the mask refers to elements in the second operand, then we have to
31783 // offset the index by the number of elements in a vector. If this number
31784 // is not known at compile-time, we need to maintain a mask with 'VL' values
31785 // to add at runtime.
31786 if ((unsigned)Index >= ElementsPerVectorReg) {
31787 if (MinMaxEqual) {
31788 Index += IndexLen - ElementsPerVectorReg;
31789 } else {
31790 Index = Index - ElementsPerVectorReg;
31791 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
31792 }
31793 } else if (!MinMaxEqual)
31794 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
31795 // For 8-bit elements with 1024-bit SVE registers, where MaxOffset equals
31796 // 255, this might point to the last element of the second operand of the
31797 // shufflevector, so we reject this transform.
31798 if ((unsigned)Index >= MaxOffset)
31799 return SDValue();
31800 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
31801 }
31802
31803 // Pad the unused tail of the TBL mask with out-of-range indices, so those
31804 // lanes are zeroed; a zero index would instead duplicate the first lane for
31805 // the padding elements. Note that for i8 elements an out-of-range index can
31806 // still be a valid lane index with a 2048-bit vector register.
31807 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
31808 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
31809 if (!MinMaxEqual)
31810 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
31811 }
31812
31813 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
31814 SDValue VecMask =
31815 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
31816 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
31817
31818 SDValue Shuffle;
31819 if (IsSingleOp)
31820 Shuffle = DAG.getNode(
31821 ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
31822 DAG.getTargetConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32), Op1,
31823 SVEMask);
31824 else if (Subtarget.hasSVE2()) {
31825 if (!MinMaxEqual) {
31826 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
31827 SDValue VScale = (BitsPerElt == 64)
31828 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
31829 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
31830 SDValue VecMask =
31831 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
31832 SDValue MulByMask = DAG.getNode(
31833 ISD::MUL, DL, MaskType,
31834 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
31835 DAG.getBuildVector(MaskType, DL,
31836 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
31837 SDValue UpdatedVecMask =
31838 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
31839 SVEMask = convertToScalableVector(
31840 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
31841 }
31842 Shuffle = DAG.getNode(
31843 ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
31844 DAG.getTargetConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32), Op1,
31845 Op2, SVEMask);
31846 }
31847 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
31848 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
31849}
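// Editorial sketch (not part of the upstream source): what the mask rewriting
// above computes for the two-operand case when the register size is known
// exactly (MinSVESize == MaxSVESize). Indices into the second shuffle operand
// are rebased past the full register, and unused tail positions receive an
// out-of-range value so TBL zeroes those lanes. Hypothetical helper:
static std::vector<uint64_t>
referenceTBL2Mask(const std::vector<int> &ShuffleMask,
                  unsigned ElementsPerVectorReg, unsigned IndexLen,
                  uint64_t MaxOffset) {
  std::vector<uint64_t> TBLMask;
  for (int Index : ShuffleMask) {
    uint64_t Idx = Index < 0 ? 0 : uint64_t(Index); // poison -> lane 0
    if (Idx >= ElementsPerVectorReg)
      Idx += IndexLen - ElementsPerVectorReg; // rebase into the second register
    TBLMask.push_back(Idx);
  }
  while (TBLMask.size() < IndexLen)
    TBLMask.push_back(MaxOffset); // out of range => lane is zeroed
  return TBLMask;
}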
31850
31851SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
31852 SDValue Op, SelectionDAG &DAG) const {
31853 EVT VT = Op.getValueType();
31854 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
31855
31856 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
31857 auto ShuffleMask = SVN->getMask();
31858
31859 SDLoc DL(Op);
31860 SDValue Op1 = Op.getOperand(0);
31861 SDValue Op2 = Op.getOperand(1);
31862
31863 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31864 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
31865 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
31866
31867 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
31868 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
31869 return MVT::i32;
31870 return ScalarTy;
31871 };
31872
31873 if (SVN->isSplat()) {
31874 unsigned Lane = std::max(0, SVN->getSplatIndex());
31875 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
31876 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
31877 DAG.getConstant(Lane, DL, MVT::i64));
31878 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
31879 return convertFromScalableVector(DAG, VT, Op);
31880 }
31881
31882 bool ReverseEXT = false;
31883 unsigned Imm;
31884 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
31885 Imm == VT.getVectorNumElements() - 1) {
31886 if (ReverseEXT)
31887 std::swap(Op1, Op2);
31888 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
31889 SDValue Scalar = DAG.getNode(
31890 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
31891 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
31892 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
31893 return convertFromScalableVector(DAG, VT, Op);
31894 }
31895
31896 unsigned EltSize = VT.getScalarSizeInBits();
31897 for (unsigned BlockSize : {64U, 32U, 16U}) {
31898 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), BlockSize)) {
31899 unsigned RevOp;
31900 if (EltSize == 8)
31901 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
31902 else if (EltSize == 16)
31903 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
31904 else
31905 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
31906 EVT BlockedVT =
31907 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), BlockSize));
31908 SDValue Pg = getPredicateForVector(DAG, DL, BlockedVT);
31909 SDValue BlockedOp1 = DAG.getNode(ISD::BITCAST, DL, BlockedVT, Op1);
31910 SDValue BlockedRev = DAG.getNode(RevOp, DL, BlockedVT, Pg, BlockedOp1,
31911 DAG.getUNDEF(BlockedVT));
31912 SDValue Container =
31913 DAG.getNode(ISD::BITCAST, DL, ContainerVT, BlockedRev);
31914 return convertFromScalableVector(DAG, VT, Container);
31915 }
31916 }
31917
31918 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
31919 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
31920 SDValue Pg = getPredicateForVector(DAG, DL, VT);
31921 SDValue Revd = DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, ContainerVT,
31922 Pg, Op1, DAG.getUNDEF(ContainerVT));
31923 return convertFromScalableVector(DAG, VT, Revd);
31924 }
31925
31926 unsigned WhichResult;
31927 unsigned OperandOrder;
31928 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
31929 OperandOrder) &&
31930 WhichResult == 0) {
31931 SDValue ZIP = DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT,
31932 OperandOrder == 0 ? Op1 : Op2,
31933 OperandOrder == 0 ? Op2 : Op1);
31934 return convertFromScalableVector(DAG, VT, ZIP);
31935 }
31936
31937 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
31938 OperandOrder)) {
31939 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
31940 SDValue TRN =
31941 DAG.getNode(Opc, DL, ContainerVT, OperandOrder == 0 ? Op1 : Op2,
31942 OperandOrder == 0 ? Op2 : Op1);
31943 return convertFromScalableVector(DAG, VT, TRN);
31944 }
31945
31946 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
31947 return convertFromScalableVector(
31948 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
31949
31950 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
31951 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
31952 return convertFromScalableVector(
31953 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
31954 }
31955
31956 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
31957 // represents the same logical operation as performed by a ZIP instruction. In
31958 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
31959 // equivalent to an AArch64 instruction. There's the extra component of
31960 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
31961 // only operated on 64/128bit vector types that have a direct mapping to a
31962 // target register and so an exact mapping is implied.
31963 // However, when using SVE for fixed length vectors, most legal vector types
31964 // are actually sub-vectors of a larger SVE register. When mapping
31965 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
31966 // how the mask's indices translate. Specifically, when the mapping requires
31967 // an exact meaning for a specific vector index (e.g. Index X is the last
31968 // vector element in the register) then such mappings are often only safe when
31969 // the exact SVE register size is known. The main exception to this is when
31970 // indices are logically relative to the first element of either
31971 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
31972 // when converting from fixed-length to scalable vector types (i.e. the start
31973 // of a fixed length vector is always the start of a scalable vector).
31974 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
31975 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
31976 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
31977 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
31978 Op2.isUndef()) {
31979 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
31980 return convertFromScalableVector(DAG, VT, Op);
31981 }
31982
31983 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
31984 OperandOrder) &&
31985 WhichResult != 0) {
31986 SDValue ZIP = DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT,
31987 OperandOrder == 0 ? Op1 : Op2,
31988 OperandOrder == 0 ? Op2 : Op1);
31989 return convertFromScalableVector(DAG, VT, ZIP);
31990 }
31991
31992 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
31993 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
31994 return convertFromScalableVector(
31995 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
31996 }
31997
31998 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
31999 return convertFromScalableVector(
32000 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
32001
32002 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
32003 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
32004 return convertFromScalableVector(
32005 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
32006 }
32007
32008 if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) &&
32009 Subtarget->isSVEorStreamingSVEAvailable()) {
32010 assert(VT.getSizeInBits() % 128 == 0 &&
32011 "Unsupported SVE vector size");
32012
32013 unsigned Segments = VT.getSizeInBits() / 128;
32014 unsigned SegmentElts = VT.getVectorNumElements() / Segments;
32015 if (std::optional<unsigned> Lane =
32016 isDUPQMask(ShuffleMask, Segments, SegmentElts)) {
32017 SDValue IID = DAG.getTargetConstant(Intrinsic::aarch64_sve_dup_laneq,
32018 DL, MVT::i64);
32019 return convertFromScalableVector(
32020 DAG, VT,
32021 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
32022 {IID, Op1,
32023 DAG.getConstant(*Lane, DL, MVT::i64,
32024 /*isTarget=*/true)}));
32025 }
32026 }
32027 }
32028
32029 // Try to widen the shuffle before generating a possibly expensive SVE TBL.
32030 // This may allow the shuffle to be matched as something cheaper like ZIP1.
32031 if (SDValue WideOp = tryWidenMaskForShuffle(Op, DAG))
32032 return WideOp;
32033
32034 // Avoid producing TBL instruction if we don't know SVE register minimal size,
32035 // unless NEON is not available and we can assume minimal SVE register size is
32036 // 128-bits.
32037 if (MinSVESize || !Subtarget->isNeonAvailable())
32038 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
32039 DAG);
32040
32041 return SDValue();
32042}
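// Editorial sketch (not part of the upstream source): the ZIP1 mask shape the
// shuffle lowering above relies on, for the Op1, Op2 operand order only
// (isZIPMask also accepts the swapped order). Hypothetical predicate:
static bool referenceIsZip1Mask(const std::vector<int> &Mask, unsigned NumElts) {
  for (unsigned I = 0; I < NumElts; ++I) {
    int Expected = (I % 2 == 0) ? int(I / 2) : int(NumElts + I / 2);
    if (Mask[I] >= 0 && Mask[I] != Expected)
      return false; // negative entries are undef and match anything
  }
  return true;
}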
32043
32044SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
32045 SelectionDAG &DAG) const {
32046 SDLoc DL(Op);
32047 EVT InVT = Op.getValueType();
32048
32049 assert(VT.isScalableVector() && isTypeLegal(VT) &&
32050 InVT.isScalableVector() && isTypeLegal(InVT) &&
32051 "Only expect to cast between legal scalable vector types!");
32052 assert(VT.getVectorElementType() != MVT::i1 &&
32053 InVT.getVectorElementType() != MVT::i1 &&
32054 "For predicate bitcasts, use getSVEPredicateBitCast");
32055
32056 if (InVT == VT)
32057 return Op;
32058
32059 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
32060 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
32061
32062 // Safe bitcasting between unpacked vector types of different element counts
32063 // is currently unsupported because the following is missing the necessary
32064 // work to ensure the result's elements live where they're supposed to within
32065 // an SVE register.
32066 // 01234567
32067 // e.g. nxv2i32 = XX??XX??
32068 // nxv4f16 = X?X?X?X?
32069 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
32070 VT == PackedVT || InVT == PackedInVT) &&
32071 "Unexpected bitcast!");
32072
32073 // Pack input if required.
32074 if (InVT != PackedInVT)
32075 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
32076
32077 if (Subtarget->isLittleEndian() ||
32078 PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits())
32079 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
32080 else {
32081 EVT PackedVTAsInt = PackedVT.changeTypeToInteger();
32082 EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger();
32083
32084 // Simulate the effect of casting through memory.
32085 Op = DAG.getNode(ISD::BITCAST, DL, PackedInVTAsInt, Op);
32086 if (PackedInVTAsInt.getScalarSizeInBits() != 8)
32087 Op = DAG.getNode(ISD::BSWAP, DL, PackedInVTAsInt, Op);
32088 Op = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVTAsInt, Op);
32089 if (PackedVTAsInt.getScalarSizeInBits() != 8)
32090 Op = DAG.getNode(ISD::BSWAP, DL, PackedVTAsInt, Op);
32091 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
32092 }
32093
32094 // Unpack result if required.
32095 if (VT != PackedVT)
32096 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
32097
32098 return Op;
32099}
32100
32101 bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
32102 SDValue N) const {
32103 return ::isAllActivePredicate(DAG, N);
32104}
32105
32106 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
32107 return ::getPromotedVTForPredicate(VT);
32108}
32109
32110bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
32111 SDValue Op, const APInt &OriginalDemandedBits,
32112 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
32113 unsigned Depth) const {
32114
32115 unsigned Opc = Op.getOpcode();
32116 switch (Opc) {
32117 case AArch64ISD::VSHL: {
32118 // Match (VSHL (VLSHR Val X) X)
32119 SDValue ShiftL = Op;
32120 SDValue ShiftR = Op->getOperand(0);
32121 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
32122 return false;
32123
32124 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
32125 return false;
32126
32127 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
32128 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
32129
32130 // Other cases can be handled as well, but this is not
32131 // implemented.
32132 if (ShiftRBits != ShiftLBits)
32133 return false;
32134
32135 unsigned ScalarSize = Op.getScalarValueSizeInBits();
32136 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
32137
32138 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
32139 APInt UnusedBits = ~OriginalDemandedBits;
32140
32141 if ((ZeroBits & UnusedBits) != ZeroBits)
32142 return false;
32143
32144 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
32145 // used - simplify to just Val.
32146 return TLO.CombineTo(Op, ShiftR->getOperand(0));
32147 }
32148 case AArch64ISD::BICi: {
32149 // Fold BICi if all destination bits already known to be zeroed
32150 SDValue Op0 = Op.getOperand(0);
32151 KnownBits KnownOp0 =
32152 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
32153 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
32154 APInt BitsToClear =
32155 (Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
32156 .trunc(KnownOp0.getBitWidth());
32157 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
32158 if (BitsToClear.isSubsetOf(AlreadyZeroedBitsToClear))
32159 return TLO.CombineTo(Op, Op0);
32160
32161 Known = KnownOp0 & KnownBits::makeConstant(~BitsToClear);
32162 return false;
32163 }
32164 case ISD::INTRINSIC_WO_CHAIN: {
32165 std::optional<ElementCount> MaxCount = getMaxValueForSVECntIntrinsic(Op);
32166 if (!MaxCount)
32167 return false;
32168 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
32169 if (!MaxSVEVectorSizeInBits)
32170 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
32171 unsigned VscaleMax = MaxSVEVectorSizeInBits / 128;
32172 unsigned MaxValue = MaxCount->getKnownMinValue() * VscaleMax;
32173 // The SVE count intrinsics don't support the multiplier immediate so we
32174 // don't have to account for that here. The value returned may be slightly
32175 // over the true required bits, as this is based on the "ALL" pattern. The
32176 // other patterns are also exposed by these intrinsics, but they all
32177 // return a value that's strictly less than "ALL".
32178 unsigned RequiredBits = llvm::bit_width(MaxValue);
32179 unsigned BitWidth = Known.Zero.getBitWidth();
32180 if (RequiredBits < BitWidth)
32181 Known.Zero.setHighBits(BitWidth - RequiredBits);
32182 return false;
32183 }
32184 }
32185
32186 return TargetLowering::SimplifyDemandedBitsForTargetNode(
32187 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
32188}
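// Editorial sketch (not part of the upstream source): the VSHL case above
// drops a (x >> n) << n pair when none of the low n bits - the only bits the
// pair can change (it clears them) - are demanded. Hypothetical check on a
// 64-bit lane:
static bool referenceShiftPairRemovable(unsigned ShiftAmt,
                                        uint64_t DemandedBits) {
  assert(ShiftAmt < 64 && "invalid shift amount");
  uint64_t ZeroedBits = (uint64_t(1) << ShiftAmt) - 1; // low ShiftAmt bits
  return (ZeroedBits & DemandedBits) == 0; // no user reads the cleared bits
}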
32189
32190bool AArch64TargetLowering::canCreateUndefOrPoisonForTargetNode(
32191 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
32192 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
32193
32194 // TODO: Add more target nodes.
32195 switch (Op.getOpcode()) {
32196 case AArch64ISD::MOVI:
32197 case AArch64ISD::MOVIedit:
32198 case AArch64ISD::MOVImsl:
32199 case AArch64ISD::MOVIshift:
32200 case AArch64ISD::MVNImsl:
32201 case AArch64ISD::MVNIshift:
32202 case AArch64ISD::VASHR:
32203 case AArch64ISD::VLSHR:
32204 case AArch64ISD::VSHL:
32205 return false;
32206 }
32207 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
32208 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
32209}
32210
32211bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
32212 return Op.getOpcode() == AArch64ISD::DUP ||
32213 Op.getOpcode() == AArch64ISD::MOVI ||
32214 Op.getOpcode() == AArch64ISD::MOVIshift ||
32215 Op.getOpcode() == AArch64ISD::MOVImsl ||
32216 Op.getOpcode() == AArch64ISD::MOVIedit ||
32217 Op.getOpcode() == AArch64ISD::MVNIshift ||
32218 Op.getOpcode() == AArch64ISD::MVNImsl ||
32219 // Ignoring fneg(movi(0)), because if it is folded to FPConstant(-0.0),
32220 // ISel will select fmov(mov i64 0x8000000000000000), resulting in a
32221 // fmov from fpr to gpr, which is more expensive than fneg(movi(0))
32222 (Op.getOpcode() == ISD::FNEG &&
32223 Op.getOperand(0).getOpcode() == AArch64ISD::MOVIedit &&
32224 Op.getOperand(0).getConstantOperandVal(0) == 0) ||
32225 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
32226 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
32227 TargetLowering::isTargetCanonicalConstantNode(Op);
32228}
32229
32230 bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
32231 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
32232 Subtarget->hasComplxNum();
32233}
32234
32235 bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
32236 ComplexDeinterleavingOperation Operation, Type *Ty) const {
32237 auto *VTy = dyn_cast<VectorType>(Ty);
32238 if (!VTy)
32239 return false;
32240
32241 // If the vector is scalable, SVE is enabled, implying support for complex
32242 // numbers. Otherwise, we need to ensure complex number support is available.
32243 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
32244 return false;
32245
32246 auto *ScalarTy = VTy->getScalarType();
32247 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
32248
32249 // We can only process vectors that have a bit size of 128 or higher (64 bits
32250 // is also allowed when Neon is available). Additionally, these vectors must
32251 // have a power-of-2 size, as we later split them into the smallest supported
32252 // size and merge them back together after applying the complex operation.
32253 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
32254 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
32255 !llvm::isPowerOf2_32(VTyWidth))
32256 return false;
32257
32258 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
32259 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
32260
32261 if (Operation == ComplexDeinterleavingOperation::CDot)
32262 return ScalarWidth == 32 || ScalarWidth == 64;
32263 return 8 <= ScalarWidth && ScalarWidth <= 64;
32264 }
32265
32266 // CDot is not supported outside of scalable/sve scopes
32267 if (Operation == ComplexDeinterleavingOperation::CDot)
32268 return false;
32269
32270 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
32271 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
32272}
32273
32274 Value *AArch64TargetLowering::createComplexDeinterleavingIR(
32275 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
32276 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
32277 Value *Accumulator) const {
32278 VectorType *Ty = cast<VectorType>(InputA->getType());
32279 if (Accumulator == nullptr)
32280 Accumulator = Constant::getNullValue(Ty);
32281 bool IsScalable = Ty->isScalableTy();
32282 bool IsInt = Ty->getElementType()->isIntegerTy();
32283
32284 unsigned TyWidth =
32285 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
32286
32287 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
32288 "Vector type must be either 64 or a power of 2 that is at least 128");
32289
32290 if (TyWidth > 128) {
32291 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
32292 int AccStride = cast<VectorType>(Accumulator->getType())
32293 ->getElementCount()
32294 .getKnownMinValue() /
32295 2;
32296 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
32297 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, uint64_t(0));
32298 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, uint64_t(0));
32299 auto *UpperSplitA = B.CreateExtractVector(HalfTy, InputA, Stride);
32300 auto *UpperSplitB = B.CreateExtractVector(HalfTy, InputB, Stride);
32301 Value *LowerSplitAcc = nullptr;
32302 Value *UpperSplitAcc = nullptr;
32303 Type *FullTy = Ty;
32304 FullTy = Accumulator->getType();
32305 auto *HalfAccTy = VectorType::getHalfElementsVectorType(
32306 cast<VectorType>(Accumulator->getType()));
32307 LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, uint64_t(0));
32308 UpperSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, AccStride);
32309 auto *LowerSplitInt = createComplexDeinterleavingIR(
32310 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
32311 auto *UpperSplitInt = createComplexDeinterleavingIR(
32312 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
32313
32314 auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
32315 LowerSplitInt, uint64_t(0));
32316 return B.CreateInsertVector(FullTy, Result, UpperSplitInt, AccStride);
32317 }
32318
32319 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
32320 if (IsScalable) {
32321 if (IsInt)
32322 return B.CreateIntrinsic(
32323 Intrinsic::aarch64_sve_cmla_x, Ty,
32324 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
32325
32326 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
32327 return B.CreateIntrinsic(
32328 Intrinsic::aarch64_sve_fcmla, Ty,
32329 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
32330 }
32331
32332 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
32333 Intrinsic::aarch64_neon_vcmla_rot90,
32334 Intrinsic::aarch64_neon_vcmla_rot180,
32335 Intrinsic::aarch64_neon_vcmla_rot270};
32336
32337
32338 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
32339 {Accumulator, InputA, InputB});
32340 }
32341
32342 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
32343 if (IsScalable) {
32344 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
32345 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
32346 if (IsInt)
32347 return B.CreateIntrinsic(
32348 Intrinsic::aarch64_sve_cadd_x, Ty,
32349 {InputA, InputB, B.getInt32((int)Rotation * 90)});
32350
32351 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
32352 return B.CreateIntrinsic(
32353 Intrinsic::aarch64_sve_fcadd, Ty,
32354 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
32355 }
32356 return nullptr;
32357 }
32358
32359 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
32360 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
32361 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
32362 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
32363 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
32364
32365 if (IntId == Intrinsic::not_intrinsic)
32366 return nullptr;
32367
32368 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
32369 }
32370
32371 if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
32372 IsScalable) {
32373 return B.CreateIntrinsic(
32374 Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
32375 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
32376 }
32377
32378 return nullptr;
32379}
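// Editorial sketch (not part of the upstream source): the Rotation value handed
// to the CADD/FCADD intrinsics above selects a multiple of 90 degrees applied
// to the second operand before the addition. Hypothetical scalar model on one
// complex element, using (real, imaginary) pairs:
static std::pair<double, double>
referenceComplexAddRotated(std::pair<double, double> A,
                           std::pair<double, double> B, unsigned Rotation) {
  for (unsigned I = 0; I < Rotation % 4; ++I)
    B = {-B.second, B.first}; // multiplying by i rotates by 90 degrees
  return {A.first + B.first, A.second + B.second};
}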
32380
32381bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
32382 unsigned Opc = N->getOpcode();
32383 if (ISD::isExtOpcode(Opc)) {
32384 if (any_of(N->users(),
32385 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
32386 return false;
32387 }
32388 return true;
32389}
32390
32391 unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
32392 return Subtarget->getMinimumJumpTableEntries();
32393}
32394
32395 MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
32396 CallingConv::ID CC,
32397 EVT VT) const {
32398 bool NonUnitFixedLengthVector =
32399 VT.isFixedLengthVector() && VT.getVectorNumElements() > 1;
32400 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
32401 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
32402
32403 EVT VT1;
32404 MVT RegisterVT;
32405 unsigned NumIntermediates;
32406 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
32407 RegisterVT);
32408 return RegisterVT;
32409}
32410
32411 unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
32412 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
32413 bool NonUnitFixedLengthVector =
32414 VT.isFixedLengthVector() && VT.getVectorNumElements() > 1;
32415 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
32416 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
32417
32418 EVT VT1;
32419 MVT VT2;
32420 unsigned NumIntermediates;
32421 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
32422 NumIntermediates, VT2);
32423}
32424
32425 unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
32426 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
32427 unsigned &NumIntermediates, MVT &RegisterVT) const {
32428 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
32429 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
32430 if (!RegisterVT.isFixedLengthVector() ||
32431 RegisterVT.getFixedSizeInBits() <= 128)
32432 return NumRegs;
32433
32434 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
32435 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
32436 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
32437
32438 // A size mismatch here implies either type promotion or widening and would
32439 // have resulted in scalarisation if larger vectors had not been available.
32440 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
32441 EVT EltTy = VT.getVectorElementType();
32442 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
32443 if (!isTypeLegal(NewVT))
32444 NewVT = EltTy;
32445
32446 IntermediateVT = NewVT;
32447 NumIntermediates = VT.getVectorNumElements();
32448 RegisterVT = getRegisterType(Context, NewVT);
32449 return NumIntermediates;
32450 }
32451
32452 // SVE VLS support does not introduce a new ABI so we should use NEON sized
32453 // types for vector arguments and returns.
32454
32455 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
32456 NumIntermediates *= NumSubRegs;
32457 NumRegs *= NumSubRegs;
32458
32459 switch (RegisterVT.getVectorElementType().SimpleTy) {
32460 default:
32461 llvm_unreachable("unexpected element type for vector");
32462 case MVT::i8:
32463 IntermediateVT = RegisterVT = MVT::v16i8;
32464 break;
32465 case MVT::i16:
32466 IntermediateVT = RegisterVT = MVT::v8i16;
32467 break;
32468 case MVT::i32:
32469 IntermediateVT = RegisterVT = MVT::v4i32;
32470 break;
32471 case MVT::i64:
32472 IntermediateVT = RegisterVT = MVT::v2i64;
32473 break;
32474 case MVT::f16:
32475 IntermediateVT = RegisterVT = MVT::v8f16;
32476 break;
32477 case MVT::f32:
32478 IntermediateVT = RegisterVT = MVT::v4f32;
32479 break;
32480 case MVT::f64:
32481 IntermediateVT = RegisterVT = MVT::v2f64;
32482 break;
32483 case MVT::bf16:
32484 IntermediateVT = RegisterVT = MVT::v8bf16;
32485 break;
32486 }
32487
32488 return NumRegs;
32489}
32490
32491 bool AArch64TargetLowering::hasInlineStackProbe(
32492 const MachineFunction &MF) const {
32493 return !Subtarget->isTargetWindows() &&
32494 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
32495}
32496
32498 switch (Opc) {
32502 if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
32503 return true;
32504 }
32505
32507}
32508
32510 EVT VT) const {
32511 return Subtarget->hasCPA() && UseFEATCPACodegen;
32512}
unsigned const MachineRegisterInfo * MRI
static MCRegister MatchRegisterName(StringRef Name)
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, uint64_t &Imm)
static std::tuple< SDValue, SDValue > extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG)
static bool isIntImmediate(const SDNode *N, uint64_t &Imm)
isIntImmediate - This method tests to see if the node is a constant operand.
static SDValue trySVESplat64(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST, APInt &DefBits)
static SDValue tryLowerSmallVectorExtLoad(LoadSDNode *Load, SelectionDAG &DAG)
Helper function to optimize loads of extended small vectors.
static void CustomNonLegalBITCASTResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, EVT ExtendVT, EVT CastVT)
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG)
static SDValue performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG)
static std::optional< PredicateConstraint > parsePredicateConstraint(StringRef Constraint)
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static void analyzeCallOperands(const AArch64TargetLowering &TLI, const AArch64Subtarget *Subtarget, const TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
static SDValue performVectorDeinterleaveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static ScalableVectorType * getSVEContainerIRType(FixedVectorType *VTy)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG)
static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG)
static bool isZeroingInactiveLanes(SDValue Op)
static SDValue performPTestFirstCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG)
static SDValue tryCombineMULLWithUZP1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG)
static bool isZerosVector(const SDNode *N)
isZerosVector - Check whether SDNode N is a zero-filled vector.
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG)
Get rid of unnecessary NVCASTs (that don't change the type).
static const TargetRegisterClass * getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT)
static const MachineInstr * stripVRegCopies(const MachineRegisterInfo &MRI, Register Reg)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static bool isPredicateCCSettingOp(SDValue N)
static SDValue performSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
If the operand is a bitwise AND with a constant RHS, and the shift has a constant RHS and is the only...
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
bool isVectorizedBinOp(unsigned Opcode)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, ArrayRef< int > ShuffleMask, EVT VT, EVT ContainerVT, SelectionDAG &DAG)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static bool isMergePassthruOpcode(unsigned Opc)
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG, SDLoc DL, bool &IsMLA)
static SDValue performFADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static SDValue emitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &DL, SelectionDAG &DAG)
Emit vector comparison for floating-point values, producing a mask.
static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG)
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget, const AArch64TargetLowering &TLI)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static bool isCheapToExtend(const SDValue &N)
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static SDValue performExtractLastActiveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static bool shouldLowerTailCallStackArg(const MachineFunction &MF, const CCValAssign &VA, SDValue Arg, ISD::ArgFlagsTy Flags, int CallOffset)
Check whether a stack argument requires lowering in a tail call.
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static std::optional< ElementCount > getMaxValueForSVECntIntrinsic(SDValue Op)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT, SelectionDAG &DAG, unsigned &ShiftValue, SDValue &RShOperand)
static bool isExtendOrShiftOperand(SDValue N)
static bool isLanes1toNKnownZero(SDValue Op)
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, SelectionDAG &DAG)
static std::tuple< SDValue, SDValue > extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG)
static EVT getPackedSVEVectorVT(EVT VT)
static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerPtrAuthGlobalAddressStatically(SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC, SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG)
static SDValue performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performFlagSettingCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned GenericOpcode)
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT, unsigned Opcode, SelectionDAG &DAG)
static bool isCMP(SDValue Op)
return SDValue()
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget, const AtomicRMWInst *RMW)
static Function * getStructuredLoadFunction(Module *M, unsigned Factor, bool Scalable, Type *LDVTy, Type *PtrTy)
unsigned numberOfInstrToLoadImm(APInt C)
static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG)
static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG)
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SelectionDAG &DAG, bool UnpredOp=false, bool SwapOperands=false)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated)
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian)
static SDValue performANDSCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static bool callConvSupportsVarArgs(CallingConv::ID CC)
Return true if the call convention supports varargs Currently only those that pass varargs like the C...
static SDValue performBICiCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static const MCPhysReg GPRArgRegs[]
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static SDValue performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPassedInFPR(EVT VT)
static unsigned getIntrinsicID(const SDNode *N)
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG)
static bool IsSVECntIntrinsic(SDValue S)
static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG)
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL, const AArch64TargetLowering &TLI, const AArch64RegisterInfo &TRI, AArch64FunctionInfo &FuncInfo, SelectionDAG &DAG)
static bool isWideDUPMask(ArrayRef< int > M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp)
Check if a vector shuffle corresponds to a DUP instruction with a larger element width than the vect...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static cl::opt< bool > EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden, cl::desc("Combine ext and trunc to TBL"), cl::init(true))
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG)
static std::optional< std::pair< unsigned, const TargetRegisterClass * > > parseSVERegAsConstraint(StringRef Constraint)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG)
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue removeRedundantInsertVectorElt(SDNode *N)
static std::optional< AArch64CC::CondCode > getCSETCondCode(SDValue Op)
static bool isLane0KnownActive(SDValue Op)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isNegatedInteger(SDValue Op)
static SDValue performFirstTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
constexpr MVT CondCodeVT
Value type used for condition codes.
static bool isLoadOrMultipleLoads(SDValue B, SmallVector< LoadSDNode * > &Loads)
static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16)
static SDValue performSMINCombine(SDNode *N, SelectionDAG &DAG)
SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG)
static Function * getStructuredStoreFunction(Module *M, unsigned Factor, bool Scalable, Type *STVTy, Type *PtrTy)
static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performVectorShiftCombine(SDNode *N, const AArch64TargetLowering &TLI, TargetLowering::DAGCombinerInfo &DCI)
Optimize a vector shift instruction and its operand if shifted out bits are not used.
static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
unsigned getSignExtendedGatherOpcode(unsigned Opcode)
static bool isOrXorChain(SDValue N, unsigned &Num, SmallVector< std::pair< SDValue, SDValue >, 16 > &WorkList)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
bool isLegalCmpImmed(APInt C)
static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue lowerIntNeonIntrinsic(SDValue Op, unsigned Opcode, SelectionDAG &DAG, bool LastOperandIsImm=false)
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performCTPOPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &DL)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG)
Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern into sext/zext(buildvecto...
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static Value * createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *ZExtTy, FixedVectorType *DstTy, bool IsLittleEndian)
static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC, SDValue RHS={})
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
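A minimal sketch of the mapping such a helper performs, assuming the usual ISD::CondCode and AArch64CC::CondCode enums are in scope; only a few cases are shown, and the real function also special-cases certain constant right-hand sides (hence the RHS parameter).
// Sketch only, not the in-tree implementation.
static AArch64CC::CondCode toAArch64CC(ISD::CondCode CC) {
  switch (CC) {
  case ISD::SETEQ:  return AArch64CC::EQ;
  case ISD::SETNE:  return AArch64CC::NE;
  case ISD::SETLT:  return AArch64CC::LT;
  case ISD::SETGT:  return AArch64CC::GT;
  case ISD::SETULT: return AArch64CC::LO;
  case ISD::SETUGT: return AArch64CC::HI;
  default:          return AArch64CC::AL; // remaining cases omitted
  }
}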
static const MCPhysReg FPRArgRegs[]
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue performAddTruncShiftCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode, SelectionDAG &DAG)
static void replaceBoolVectorBitcast(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *ST)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static std::optional< ReducedGprConstraint > parseReducedGprConstraint(StringRef Constraint)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SMECallAttrs getSMECallAttrs(const Function &Caller, const RTLIB::RuntimeLibcallsInfo &RTLCI, const TargetLowering::CallLoweringInfo &CLI)
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG)
Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) making use of the vector SExt/ZE...
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
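An illustrative source pattern for this combine (hypothetical example): multiplying by a power of two before converting to integer is a fixed-point conversion, which AArch64 can encode directly in its FCVTZS/FCVTZU forms when the fold applies.
// 65536 == 2^16, so this is float -> Q16.16; when the combine fires it can
// become a single "fcvtzs w0, s0, #16".
int toFixedQ16(float X) {
  return (int)(X * 65536.0f);
}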
static EVT calculatePreExtendType(SDValue Extend)
Calculates what the pre-extend type is, based on the extension operation node provided by Extend.
static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtInReg(const SDValue &V)
static EVT getPromotedVTForPredicate(EVT VT)
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
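For reference, the scalar form of the idiom being matched (vector lanes behave the same way; hypothetical example):
#include <cstdint>
// X >> 31 (arithmetic) is all-ones for negative X and zero otherwise, so
// xor-ing with -1 really tests "X >= 0", which for vectors maps to a single
// CMGE against zero.
int32_t nonNegativeMask(int32_t X) {
  return ~(X >> 31);
}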
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Tries to replace scalar FP <-> INT conversions with SVE in streaming functions; this can help to redu...
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static Value * UseTlsOffset(IRBuilderBase &IRB, unsigned Offset)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 regis...
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST)
static bool isUnpackedType(EVT VT, SelectionDAG &DAG)
Returns true if the conceptual representation for VT does not map directly to its physical register r...
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG)
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue getCondCode(SelectionDAG &DAG, AArch64CC::CondCode CC)
Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG)
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG)
static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG, bool IsSigned, bool IsEqual)
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG)
static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, AtomicOrdering Ordering)
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
cl::opt< bool > EnableSVEGISel("aarch64-enable-gisel-sve", cl::Hidden, cl::desc("Enable / disable SVE scalable vectors in Global ISel"), cl::init(false))
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
static SDValue performSubWithBorrowCombine(SDNode *N, SelectionDAG &DAG)
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static bool canEmitConjunction(SelectionDAG &DAG, const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool &PreferFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG)
static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, ISD::CondCode CC, bool NoNaNs, const SDLoc &DL, SelectionDAG &DAG)
For SELECT_CC, when the true/false values are (-1, 0) and the compared values are scalars,...
static SDValue getZT0FrameIndex(MachineFrameInfo &MFI, AArch64FunctionInfo &FuncInfo, SelectionDAG &DAG)
static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG)
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performAddCombineForShiftedOperands(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static bool shouldBeAdjustedToZero(SDValue LHS, APInt C, ISD::CondCode &CC)
static bool isPackedPredicateType(EVT VT, SelectionDAG &DAG)
static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG)
static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static AArch64SME::ToggleCondition getSMToggleCondition(const SMECallAttrs &CallAttrs)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static SDValue optimizeBitTest(SDLoc DL, SDValue Op, SDValue Chain, SDValue Dest, unsigned Opcode, SelectionDAG &DAG)
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMaskedGatherScatterCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue isNVCastToHalfWidthElements(SDValue V)
static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N)
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
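The C-level shape of the expression this combine rewrites (illustrative only): a select between a value and the constant 1 feeding an add becomes a single conditional increment.
// (P ? C : 1) + B selects between (B + C) and (B + 1), i.e. CSINC(B + C, B, cc).
int addSel(int B, int C, bool P) {
  return (P ? C : 1) + B;
}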
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isEligibleForSmallVectorLoadOpt(LoadSDNode *LD, const AArch64Subtarget &Subtarget)
Helper function to check if a small vector load can be optimized.
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static SDValue performMulRdsvlCombine(SDNode *Mul, SelectionDAG &DAG)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static Value * createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *DstTy, bool IsLittleEndian)
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallBase &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, SelectionDAG &DAG, AArch64FunctionInfo *Info, SDLoc DL, SDValue Chain, bool IsSave)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const SDLoc DL)
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static cl::opt< bool > UseFEATCPACodegen("aarch64-use-featcpa-codegen", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in " "SelectionDAG for FEAT_CPA"), cl::init(false))
static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, unsigned NumElts, bool IsLittleEndian, SmallVectorImpl< int > &Mask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
Module.h This file contains the declarations for the Module class.
This defines the Use class.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
This file provides utility analysis objects describing memory locations.
This file defines ARC utility functions which are used by various parts of the compiler.
Contains matchers for matching SelectionDAG nodes and values.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallSet class.
This file defines less commonly used SmallVector utilities.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
This pass exposes codegen information to IR-level passes.
static constexpr int Concat[]
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getMaximumJumpTableSize() const
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
bool isStreamingCompatible() const
Returns true if the function has a streaming-compatible body.
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isStreaming() const
Returns true if the function has a streaming body.
unsigned getSVEVectorSizeInBits() const
unsigned getMaxSVEVectorSizeInBits() const
bool isCallingConvWin64(CallingConv::ID CC, bool IsVarArg) const
unsigned getMinSVEVectorSizeInBits() const
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, bool InsertVectorLengthCheck=false) const
If a change in streaming mode is required on entry to/return from a function call, it emits and return...
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
Value * getIRStackGuard(IRBuilderBase &IRB, const LibcallLoweringInfo &Libcalls) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB, const LibcallLoweringInfo &Libcalls) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
void insertSSPDeclarations(Module &M, const LibcallLoweringInfo &Libcalls) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the preferred common base offset.
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a stN intrinsic.
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
bool shouldOptimizeMulOverflowWithZeroHighBits(LLVMContext &Context, EVT VT) const override
bool shouldInsertTrailingSeqCstFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a seq_cst trailing fence without reducing the or...
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, IntrinsicInst *DI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this function.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is a legal icmp immediate, that is, the target has icmp instructi...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: 'sub y, (xor x, -1)' and 'add (add x, 1), y'. The variant with two add's is IR...
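A quick demonstration of the equivalence the hook is about, in plain C++ (unsigned arithmetic so the wrap-around is well defined):
// ~X == -X - 1 in two's complement, hence Y - ~X == Y + X + 1 == (X + 1) + Y.
unsigned viaSubOfNot(unsigned X, unsigned Y) { return Y - (X ^ ~0u); }
unsigned viaIncOfAdd(unsigned X, unsigned Y) { return (X + 1u) + Y; }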
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
void fixupPtrauthDiscriminator(MachineInstr &MI, MachineBasicBlock *BB, MachineOperand &IntDiscOp, MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const
Replace (0, vreg) discriminator components with the operands of blend or with (immediate,...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a ldN intrinsic.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool fallBackToDAGISel(const Instruction &Inst) const override
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
MachineBasicBlock * EmitCheckMatchingVL(MachineInstr &MI, MachineBasicBlock *MBB) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override
Return true if the @llvm.experimental.vector.match intrinsic should be expanded for vector type ‘VT’ ...
MachineBasicBlock * EmitEntryPStateSM(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
In AArch64, true if FEAT_CPA is present.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
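An illustrative instance of that pattern with C = 1 (hypothetical example): instead of AND-ing X with a shifted mask, shift X and test against the constant.
// Both return the same result for Y < 32; the second form is what the hook
// asks the combiner to prefer when it returns true.
bool testBitViaShiftedMask(unsigned X, unsigned Y) { return (X & (1u << Y)) != 0; }
bool testBitViaShiftedValue(unsigned X, unsigned Y) { return ((X >> Y) & 1u) != 0; }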
MachineBasicBlock * EmitAllocateSMESaveBuffer(MachineInstr &MI, MachineBasicBlock *BB) const
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
MachineBasicBlock * EmitAllocateZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const
const AArch64TargetMachine & getTM() const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool isOpSuitableForLDPSTP(const Instruction *I) const
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
MachineBasicBlock * EmitGetSMESaveSize(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo, const LibcallLoweringInfo *libcallLowering) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
bool lowerInterleaveIntrinsicToStore(Instruction *Store, Value *Mask, ArrayRef< Value * > InterleaveValues) const override
Lower an interleave intrinsic to a target specific store intrinsic.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
bool useNewSMEABILowering() const
Returns true if the new SME ABI lowering should be used.
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:213
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:644
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:230
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:424
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1549
static LLVM_ABI void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition APInt.cpp:1901
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1400
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1679
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:639
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1044
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1497
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
LLVM_ABI APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition APInt.cpp:1939
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition APInt.h:1167
LLVM_ABI APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition APInt.cpp:1946
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1648
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1052
unsigned logBase2() const
Definition APInt.h:1770
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:828
bool isMask(unsigned numBits) const
Definition APInt.h:489
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:335
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:996
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1258
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1238
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1571
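A small usage sketch tying together several of the APInt helpers listed above (the values are arbitrary illustrations):
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

void apintDemo() {
  APInt Hi = APInt::getHighBitsSet(32, 8); // top 8 bits set: 0xFF000000
  APInt Lo = APInt::getLowBitsSet(32, 8);  // bottom 8 bits set: 0x000000FF
  assert(Hi.popcount() == 8);              // number of set bits
  assert(!Lo.isSubsetOf(Hi));              // the two masks are disjoint
  assert(Lo.zext(64).getBitWidth() == 64); // zero-extend to a wider APInt
  assert(APInt(32, 64).isPowerOf2() && APInt(32, 64).logBase2() == 6);
}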
an instruction to allocate memory on the stack
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
@ Add
*p = old + v
@ FAdd
*p = old + v
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
bool isFloatingPointOperation() const
BinOp getOperation() const
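A tiny sketch of querying the operation kind listed above (the helper name is hypothetical):
#include "llvm/IR/Instructions.h"
using namespace llvm;

// FAdd, FMax, FMin, ... report true; integer ops such as Add, Sub, And, Xor
// report false.
bool isFPAtomicRMW(const AtomicRMWInst &RMW) {
  return RMW.isFloatingPointOperation();
}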
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const BlockAddress * getBlockAddress() const
Function * getFunction() const
Definition Constants.h:940
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
LLVM_ABI std::optional< std::pair< APInt, APInt > > isConstantSequence() const
If this BuildVector is constant and represents the numerical series "<a, a+n, a+2n,...
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
LLVM_ABI bool isConstant() const
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to callin...
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:214
bool isBigEndian() const
Definition DataLayout.h:215
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
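A usage sketch for the DataLayout queries above; the concrete numbers depend on the module's data layout string, and the function name is illustrative.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

void layoutDemo(const DataLayout &DL, LLVMContext &Ctx) {
  Type *V4I32 = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  TypeSize Size = DL.getTypeAllocSize(V4I32); // commonly 16 bytes
  Align Preferred = DL.getPrefTypeAlign(V4I32);
  bool LE = DL.isLittleEndian();
  (void)Size; (void)Preferred; (void)LE;
}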
A debug info location.
Definition DebugLoc.h:123
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
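A short sketch of the ElementCount factories above, which distinguish fixed-width from scalable (SVE-style) vector lengths:
#include "llvm/Support/TypeSize.h"
using namespace llvm;

void elementCountDemo() {
  ElementCount Fixed4 = ElementCount::getFixed(4);   // e.g. <4 x i32>
  ElementCount Scal4 = ElementCount::getScalable(4); // e.g. <vscale x 4 x i32>
  bool OneLane = ElementCount::getFixed(1).isScalar();
  (void)Fixed4; (void)Scal4; (void)OneLane;
}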
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
static FixedVectorType * getInteger(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
arg_iterator arg_end()
Definition Function.h:875
arg_iterator arg_begin()
Definition Function.h:866
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
const Argument * const_arg_iterator
Definition Function.h:73
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:730
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
bool hasExternalWeakLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:132
Type * getValueType() const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition IRBuilder.h:1939
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2222
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:201
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2479
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:605
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:552
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2762
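The IRBuilder helpers above come into play when lowering needs to synthesize IR, for example in the stack-protector and security-cookie hooks. A generic sketch, not this file's implementation; Builder, Base and Callee (a Value* naming the callee) are assumed to exist in the surrounding code:

// Compute an i8 offset from Base and pass the resulting pointer to a helper.
Value *Slot = Builder.CreateConstGEP1_32(Builder.getInt8Ty(), Base, 16);
FunctionType *FTy =
    FunctionType::get(Builder.getPtrTy(), {Builder.getPtrTy()}, /*isVarArg=*/false);
CallInst *Call = Builder.CreateCall(FTy, Callee, {Slot});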
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
Tracks which library functions to use for a particular subtarget.
LLVM_ABI CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const
Get the CallingConv that should be used for the specified libcall.
LLVM_ABI RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Return the lowering's selection of implementation call for Call.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
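The MVT helpers above are pure value-type arithmetic; a short sketch with arbitrary example values:

MVT VT = MVT::getVectorVT(MVT::getIntegerVT(32), 4);  // v4i32
if (VT.is128BitVector()) {
  MVT EltVT = VT.getVectorElementType();              // i32
  MVT HalfVT = VT.getHalfNumVectorElementsVT();       // v2i32
  uint64_t EltBits = VT.getScalarSizeInBits();        // 32
  (void)EltVT; (void)HalfVT; (void)EltBits;
}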
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MachineInstr * remove_instr(MachineInstr *I)
Remove the possibly bundled instruction from the instruction list without deleting it.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
bool hasScalableStackID(int ObjectIdx) const
bool isImmutableObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to an immutable object.
int getStackProtectorIndex() const
Return the index for the stack protector object.
LLVM_ABI int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
LLVM_ABI int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
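A hedged sketch of the common use of the MachineFrameInfo calls above when an argument arrives on the stack (MF, ArgSize and ArgOffset are placeholders for the surrounding lowering code):

MachineFrameInfo &MFI = MF.getFrameInfo();
int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, /*IsImmutable=*/true);
int64_t Off = MFI.getObjectOffset(FI);   // offset from the incoming stack pointer
(void)Off;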
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.

void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
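The MachineInstrBuilder methods above are normally reached through BuildMI. A generic example, where MBB, MI, TII, OpcodeToEmit, DestReg and SrcReg are placeholders rather than values used in this file:

// Emit "DestReg = OPCODE SrcReg, #0" at MI's position in MBB.
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(OpcodeToEmit), DestReg)
    .addReg(SrcReg)
    .addImm(0);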
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
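A brief sketch of allocating a MachineMemOperand with the flags above (MF and PtrInfo are assumed from context; the memory type and alignment are arbitrary examples):

MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo, MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile,
    LLT::scalar(64), Align(8));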
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
bool use_empty(Register RegNo) const
use_empty - Return true if there are no instructions using the specified register.
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition MapVector.h:56
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
const SDValue & getInc() const
const SDValue & getScale() const
const SDValue & getMask() const
const SDValue & getIntID() const
const SDValue & getIndex() const
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition Module.cpp:722
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:358
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
void dropFlags(unsigned Mask)
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAssert() const
Test if this node is an assert operation.
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
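The SDNode/SDValue queries above are the building blocks of every DAG combine in this file. A tiny, generic predicate as an illustration (not one of the file's actual combines):

// True if Val is (shl x, constant) and that shift has a single use.
static bool isSingleUseShlByConstant(SDValue Val) {
  return Val.getOpcode() == ISD::SHL && Val.hasOneUse() &&
         isa<ConstantSDNode>(Val.getOperand(1));
}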
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasAgnosticZAInterface() const
bool hasStreamingInterfaceOrBody() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasSharedZAInterface() const
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresEnablingZAAfterCall() const
bool requiresPreservingZT0() const
bool requiresDisablingZABeforeCall() const
bool requiresPreservingAllZAState() const
Class to represent scalable SIMD vectors.
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC)
LLVM_ABI Align getReducedAlign(EVT VT, bool UseABI)
In most cases this function returns the ABI alignment for a given type, except for illegal vector typ...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getNeutralElement(unsigned Opcode, const SDLoc &DL, EVT VT, SDNodeFlags Flags)
Get the (commutative) neutral element for the given opcode, if it exists.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags, bool AllowCommute=false)
Get the specified node if it's already available, or else return NULL.
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
LLVM_ABI SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType)
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI SDValue getTypeSize(const SDLoc &DL, EVT VT, TypeSize TS)
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
const LibcallLoweringInfo & getLibcalls() const
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
void addCalledGlobal(const SDNode *Node, const GlobalValue *GV, unsigned OpFlags)
Set CalledGlobal to be associated with Node.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getDeactivationSymbol(const GlobalValue *GV)
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
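SelectionDAG's node factories above all funnel through getNode and its typed convenience wrappers. A generic construction sketch with placeholder values not taken from this file (DAG, N and Ptr are assumed; the chain handling follows the usual load-then-store pattern):

SDLoc DL(N);
EVT VT = MVT::i64;
SDValue Chain = DAG.getEntryNode();
SDValue Load = DAG.getLoad(VT, DL, Chain, Ptr, MachinePointerInfo());
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, Load,
                          DAG.getConstant(1, DL, VT));
SDValue Store = DAG.getStore(Load.getValue(1), DL, Add, Ptr,
                             MachinePointerInfo(), Align(8));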
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
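The mask-classification helpers above operate on plain index arrays; for example (values chosen purely for illustration):

int Mask[] = {3, 2, 1, 0};
bool IsReverse = ShuffleVectorInst::isReverseMask(Mask, /*NumSrcElts=*/4);      // true
bool OneSource = ShuffleVectorInst::isSingleSourceMask(Mask, /*NumSrcElts=*/4); // true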
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:472
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition StringRef.h:573
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition StringRef.h:261
StringRef drop_front(size_t N=1) const
Return a StringRef equal to 'this' but with the first N elements dropped.
Definition StringRef.h:611
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition StringRef.h:696
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
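StringRef and StringSwitch are typically used here for parsing textual names such as inline-asm constraints. A small, self-contained example; the string and the Case entries are made up, not constraints actually handled by this file:

StringRef S = "{w19}";
if (S.starts_with("{") && S.ends_with("}")) {
  StringRef Body = S.drop_front().drop_back();               // "w19"
  unsigned RegNo = 0;
  bool Parsed = !Body.drop_front().getAsInteger(10, RegNo);  // RegNo == 19
  unsigned Kind = StringSwitch<unsigned>(Body).Case("w19", 1).Default(0);
  (void)Parsed; (void)Kind;
}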
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:413
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool shouldExpandBuildVectorWithShuffles(EVT, unsigned DefinedValues) const
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual Value * getIRStackGuard(IRBuilderBase &IRB, const LibcallLoweringInfo &Libcalls) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
virtual unsigned getMinimumJumpTableEntries() const
Return lower limit for number of blocks in a jump table.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual void insertSSPDeclarations(Module &M, const LibcallLoweringInfo &Libcalls) const
Inserts necessary declarations for SSP (stack protection) purpose.
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT, LegalizeAction Action)
Indicate how a PARTIAL_REDUCE_U/SMLA node with Acc type AccVT and Input type InputVT should be treate...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB, const LibcallLoweringInfo &Libcalls) const
Returns the target-specific address of the unsafe stack pointer.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
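These TargetLoweringBase hooks are what a target's TargetLowering constructor calls to describe its register classes and legal operations. A hedged, generic sketch with made-up choices, written as if inside such a constructor (MyTarget::GPR64RegClass is a hypothetical register class, and the real configuration in this file is far more extensive):

addRegisterClass(MVT::i64, &MyTarget::GPR64RegClass);     // hypothetical register class
setOperationAction(ISD::CTPOP, MVT::i64, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i8, Legal);
setBooleanContents(ZeroOrOneBooleanContent);
computeRegisterProperties(Subtarget.getRegisterInfo());   // TRI assumed available from the subtarget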
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool useLoadStackGuardNode(const Module &M) const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
unsigned getPointerSize(unsigned AS) const
Get the pointer size for this target.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the command line.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
LLVM_ABI InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
This class represents a truncation of integer types.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt128Ty(LLVMContext &C)
Definition Type.cpp:298
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
@ HalfTyID
16-bit floating point type
Definition Type.h:56
@ FloatTyID
32-bit floating point type
Definition Type.h:58
@ BFloatTyID
16-bit floating point type (7-bit significand)
Definition Type.h:57
@ DoubleTyID
64-bit floating point type
Definition Type.h:59
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:295
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:285
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
static LLVM_ABI Type * getBFloatTy(LLVMContext &C)
Definition Type.cpp:283
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:282
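The Type entries above are the IR-level type factories and queries this lowering code relies on. A minimal illustrative sketch (not part of this file) of how these accessors are typically used:
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/TypeSize.h"

static void describeTypes(llvm::LLVMContext &Ctx) {
  // Integer and floating-point types are uniqued per LLVMContext.
  llvm::Type *I64 = llvm::Type::getInt64Ty(Ctx);
  bool IsInt = I64->isIntegerTy();                              // true
  llvm::TypeSize Bits = I64->getPrimitiveSizeInBits();          // fixed 64 bits
  llvm::Type *F32 = llvm::Type::getFloatTy(Ctx);
  bool IsFP = F32->isFloatingPointTy();                         // true
  bool IsFloatID = F32->getTypeID() == llvm::Type::FloatTyID;   // true
  (void)IsInt; (void)Bits; (void)IsFP; (void)IsFloatID;
}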
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:207
unsigned getNumOperands() const
Definition User.h:229
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition Value.h:759
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void dump() const
Support for debugging, callable in GDB: V->dump()
Base class of all SIMD vector types.
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
A range adaptor for a pair of iterators.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isValidCBCond(AArch64CC::CondCode Code)
True if a given condition code can be used in a fused compare-and-branch instruction,...
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
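isLogicalImmediate and encodeLogicalImmediate above are the usual pair for handling AArch64 bitmask immediates. A hedged sketch (assuming the backend's MCTargetDesc/AArch64AddressingModes.h provides these declarations, as it does when building inside the AArch64 target):
#include "MCTargetDesc/AArch64AddressingModes.h"
#include <cstdint>

// Returns true and sets Encoding if Imm can be used directly as the immediate
// of an AND/ORR/EOR of the given register size (32 or 64).
static bool tryLogicalImmEncoding(uint64_t Imm, unsigned RegSize,
                                  uint64_t &Encoding) {
  if (!llvm::AArch64_AM::isLogicalImmediate(Imm, RegSize))
    return false;
  // The N:immr:imms encoding consumed by the logical-immediate forms.
  Encoding = llvm::AArch64_AM::encodeLogicalImmediate(Imm, RegSize);
  return true;
}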
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint64_t decodeAdvSIMDModImmType10(uint8_t Imm)
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static bool isSVELogicalImm(unsigned SizeInBits, uint64_t ImmVal, uint64_t &Encoding)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isSVECpyDupImm(int SizeInBits, int64_t Val, int32_t &Imm, int32_t &Shift)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo, const LibcallLoweringInfo *libcallLowering)
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1
Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ PreserveNone
Used for runtime calls that preserve no general registers.
Definition CallingConv.h:90
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNormalMaskedLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed masked load.
bool isNormalMaskedStore(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed masked store.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:818
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:511
@ PARTIAL_REDUCE_SMLA
PARTIAL_REDUCE_[U|S]MLA(Accumulator, Input1, Input2) The partial reduction nodes sign or zero extend ...
@ LOOP_DEPENDENCE_RAW_MASK
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:778
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:852
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ VECTOR_FIND_LAST_ACTIVE
Finds the index of the last active mask element. Operands: Mask.
@ FMODF
FMODF - Decomposes the operand into integral and fractional parts, each having the same type and sign...
@ FATAN2
FATAN2 - atan2, inspired by libm.
@ FSINCOSPI
FSINCOSPI - Compute both the sine and cosine times pi more accurately than FSINCOS(pi*x),...
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:471
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:879
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:746
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition ISDOpcodes.h:909
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:992
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:438
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:974
@ PARTIAL_REDUCE_UMLA
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:843
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:714
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:485
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:664
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
@ PARTIAL_REDUCE_FMLA
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ TRUNCATE_SSAT_U
Definition ISDOpcodes.h:872
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:826
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor to...
Definition ISDOpcodes.h:635
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition ISDOpcodes.h:690
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:795
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:671
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ GET_ACTIVE_LANE_MASK
GET_ACTIVE_LANE_MASK - this corresponds to the llvm.get.active.lane.mask intrinsic.
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:791
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:969
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:703
@ ATOMIC_LOAD_FMAXIMUM
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:764
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ PtrAuthGlobalAddress
A ptrauth constant.
Definition ISDOpcodes.h:100
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:470
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:139
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:849
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:810
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ ATOMIC_LOAD_FMINIMUM
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ VECTOR_SPLICE_LEFT
VECTOR_SPLICE_LEFT(VEC1, VEC2, IMM) - Shifts CONCAT_VECTORS(VEC1, VEC2) left by IMM elements and retu...
Definition ISDOpcodes.h:653
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition ISDOpcodes.h:898
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:887
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:726
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition ISDOpcodes.h:640
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:977
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:804
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:484
@ MGATHER
Masked gather and scatter - load and store operations for a vector of random addresses with additiona...
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:464
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:478
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:477
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:925
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:179
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:738
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:709
@ VECTOR_SPLICE_RIGHT
VECTOR_SPLICE_RIGHT(VEC1, VEC2, IMM) - Shifts CONCAT_VECTORS(VEC1, VEC2) right by IMM elements and re...
Definition ISDOpcodes.h:656
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:427
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:958
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition ISDOpcodes.h:698
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ CLEAR_CACHE
llvm.clear_cache intrinsic Operands: Input Chain, Start Address, End Address Outputs: Output Chain
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:920
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:996
@ EXPERIMENTAL_VECTOR_HISTOGRAM
Experimental vector histogram intrinsic Operands: Input Chain, Inc, Mask, Base, Index,...
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:458
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:944
@ VECREDUCE_FMINIMUM
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:855
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:832
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ PARTIAL_REDUCE_SUMLA
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor ...
Definition ISDOpcodes.h:624
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ TRUNCATE_SSAT_S
TRUNCATE_[SU]SAT_[SU] - Truncate for saturated operand [SU] located in middle, prefix for SAT means i...
Definition ISDOpcodes.h:870
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:721
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
@ TRUNCATE_USAT_U
Definition ISDOpcodes.h:874
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:338
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
@ LOOP_DEPENDENCE_WAR_MASK
The llvm.loop.dependence.
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LLVM_ABI NodeType getVecReduceBaseOpcode(unsigned VecReduceOpcode)
Get underlying scalar opcode for VECREDUCE opcode.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
CastInst_match< OpTy, UIToFPInst > m_UIToFP(const OpTy &Op)
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
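The m_* entries above come from the IR-level PatternMatch combinators. An illustrative sketch (not taken from this file) recognising add X, (zext Y) on a Value:
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

static bool isAddOfZExt(llvm::Value *V, llvm::Value *&X, llvm::Value *&Y) {
  using namespace llvm::PatternMatch;
  // X captures the other add operand; Y captures the value being zero-extended.
  return match(V, m_Add(m_Value(X), m_ZExt(m_Value(Y))));
}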
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
const unsigned VectorBits
Definition SystemZ.h:155
initializer< Ty > init(const Ty &Val)
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition ObjCARCUtil.h:43
bool attachedCallOpBundleNeedsMarker(const CallBase *CB)
This function determines whether the clang_arc_attachedcall should be emitted with or without the mar...
Definition ObjCARCUtil.h:58
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
bool isPackedVectorType(EVT SomeVT)
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:829
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1763
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition MathExtras.h:207
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
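isZIPMask above (together with the isUZPMask/isTRNMask/isREVMask entries further down) classifies shuffle masks for the AArch64 permute instructions. A sketch of its use; the declaration is copied from the entry above and is normally provided by the backend's perfect-shuffle header:
#include "llvm/ADT/ArrayRef.h"

// Declaration as listed above; provided by the AArch64 backend when building
// inside the target (alongside the perfect-shuffle tables).
namespace llvm {
bool isZIPMask(ArrayRef<int> M, unsigned NumElts, unsigned &WhichResultOut,
               unsigned &OperandOrderOut);
}

static bool isZip1InOrder(llvm::ArrayRef<int> Mask, unsigned NumElts) {
  unsigned WhichResult = 0, OperandOrder = 0;
  // E.g. <0, 8, 1, 9, 2, 10, 3, 11> with NumElts == 8 is zip1 with the
  // operands in their original order.
  return llvm::isZIPMask(Mask, NumElts, WhichResult, OperandOrder) &&
         WhichResult == 0 && OperandOrder == 0;
}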
LLVM_ABI void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition Utils.cpp:295
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:350
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition APFloat.h:1626
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or an FP constant.
auto map_to_vector(ContainerTy &&C, FuncTy &&F)
Map a range to a SmallVector with element types deduced from the mapping.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1595
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2163
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI void reportFatalInternalError(Error Err)
Report a fatal error that indicates a bug in LLVM.
Definition Error.cpp:173
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
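isShiftedMask_64, countr_zero and Log2_64 above are the typical ingredients for decomposing a contiguous bit run, for example when forming bitfield-extract style operations. A small illustrative sketch (not from this file):
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

// For Imm == 0x0000'0FF0 this yields Shift == 4 and Width == 8.
static bool decomposeShiftedMask(uint64_t Imm, unsigned &Shift,
                                 unsigned &Width) {
  if (!llvm::isShiftedMask_64(Imm))        // contiguous run of ones required
    return false;
  Shift = llvm::countr_zero(Imm);          // trailing zeros give the shift
  Width = llvm::Log2_64(Imm >> Shift) + 1; // length of the run of ones
  return true;
}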
unsigned M1(unsigned Val)
Definition VE.h:377
bool isReleaseOrStronger(AtomicOrdering AO)
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID)
Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
generic_gep_type_iterator<> gep_type_iterator
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:261
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
Definition ModRef.h:68
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
TargetTransformInfo TTI
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
CombineLevel
Definition DAGCombine.h:15
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI VectorType * getDeinterleavedVectorType(IntrinsicInst *DI)
Given a deinterleaveN intrinsic, return the (narrow) vector type of each factor.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
constexpr unsigned getDefRegState(bool B)
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2009
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
gep_type_iterator gep_type_begin(const User *GEP)
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2182
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2156
static const MachineMemOperand::Flags MOStridedAccess
@ Enabled
Convert any .debug_str_offsets tables to DWARF64 if needed.
Definition DWP.h:27
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N-bit number shifted left by S.
Definition MathExtras.h:198
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
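isNullConstant, isOneConstant, isAllOnesConstant and isConstOrConstSplat in this index are the standard SDValue constant predicates used throughout DAG combines. A hedged, combine-style sketch built on them (illustrative only; the helper name is hypothetical):
#include "llvm/CodeGen/SelectionDAG.h"

// True for 0, all-ones, or a splat vector of either (undef lanes allowed).
static bool isZeroOrAllOnes(llvm::SDValue V) {
  if (llvm::isNullConstant(V) || llvm::isAllOnesConstant(V))
    return true;
  if (llvm::ConstantSDNode *C =
          llvm::isConstOrConstSplat(V, /*AllowUndefs=*/true))
    return C->isZero() || C->isAllOnes();
  return false;
}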
bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static const unsigned PerfectShuffleTable[6561+1]
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0,...
@ Enable
Enable colors.
Definition WithColor.h:47
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
Helper structure to be able to read SetCC information.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
uint64_t getScalarStoreSize() const
Definition ValueTypes.h:402
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
EVT changeVectorElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition ValueTypes.h:444
bool isScalableVT() const
Return true if the type is a scalable type.
Definition ValueTypes.h:187
bool isFixedLengthVector() const
Definition ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has no fewer bits than VT.
Definition ValueTypes.h:292
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:314
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:164
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
static LLVM_ABI KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:309
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:324
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:360
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:138
static LLVM_ABI KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
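The KnownBits entries above support known-bits reasoning over DAG nodes. A short illustrative sketch combining them (the helper name is hypothetical):
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"

// Upper bound on the active bits of C << S, with C and S of equal bit width.
static unsigned maxActiveBitsOfShl(const llvm::APInt &C, const llvm::APInt &S) {
  llvm::KnownBits KC = llvm::KnownBits::makeConstant(C); // fully known value
  llvm::KnownBits KS = llvm::KnownBits::makeConstant(S); // fully known shift
  llvm::KnownBits Res = llvm::KnownBits::shl(KC, KS);    // known bits of C << S
  return Res.countMaxActiveBits();
}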
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
A simple container for information about the supported runtime calls.
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64