ARMISelLowering.cpp
1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
66#include "llvm/IR/Attributes.h"
67#include "llvm/IR/CallingConv.h"
68#include "llvm/IR/Constant.h"
69#include "llvm/IR/Constants.h"
70#include "llvm/IR/DataLayout.h"
71#include "llvm/IR/DebugLoc.h"
73#include "llvm/IR/Function.h"
74#include "llvm/IR/GlobalAlias.h"
75#include "llvm/IR/GlobalValue.h"
77#include "llvm/IR/IRBuilder.h"
78#include "llvm/IR/InlineAsm.h"
79#include "llvm/IR/Instruction.h"
82#include "llvm/IR/Intrinsics.h"
83#include "llvm/IR/IntrinsicsARM.h"
84#include "llvm/IR/Module.h"
85#include "llvm/IR/Type.h"
86#include "llvm/IR/User.h"
87#include "llvm/IR/Value.h"
88#include "llvm/MC/MCInstrDesc.h"
90#include "llvm/MC/MCSchedule.h"
97#include "llvm/Support/Debug.h"
105#include <algorithm>
106#include <cassert>
107#include <cstdint>
108#include <cstdlib>
109#include <iterator>
110#include <limits>
111#include <optional>
112#include <tuple>
113#include <utility>
114#include <vector>
115
116using namespace llvm;
117
118#define DEBUG_TYPE "arm-isel"
119
120STATISTIC(NumTailCalls, "Number of tail calls");
121STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
122STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
123STATISTIC(NumConstpoolPromoted,
124 "Number of constants with their storage promoted into constant pools");
125
126static cl::opt<bool>
127ARMInterworking("arm-interworking", cl::Hidden,
128 cl::desc("Enable / disable ARM interworking (for debugging only)"),
129 cl::init(true));
130
132 "arm-promote-constant", cl::Hidden,
133 cl::desc("Enable / disable promotion of unnamed_addr constants into "
134 "constant pools"),
135 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
137 "arm-promote-constant-max-size", cl::Hidden,
138 cl::desc("Maximum size of constant to promote into a constant pool"),
139 cl::init(64));
141 "arm-promote-constant-max-total", cl::Hidden,
142 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
143 cl::init(128));
144
146MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
147 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
148 cl::init(2));
149
151 "arm-max-base-updates-to-check", cl::Hidden,
152 cl::desc("Maximum number of base-updates to check generating postindex."),
153 cl::init(64));
154
155/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
156constexpr MVT FlagsVT = MVT::i32;
157
158// The APCS parameter registers.
159static const MCPhysReg GPRArgRegs[] = {
160 ARM::R0, ARM::R1, ARM::R2, ARM::R3
161};
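// Illustrative note (not in the original source): under the ARM APCS/AAPCS
// calling conventions the first four 32-bit integer arguments travel in
// r0-r3 and any remaining arguments are placed on the stack, which is why
// only these four registers are listed here. For a hypothetical
//   int f(int a, int b, int c, int d, int e);
// a..d are passed in r0..r3 and e is passed in the caller's outgoing stack area.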
162
164 SelectionDAG &DAG, const SDLoc &DL) {
166 assert(Arg.ArgVT.bitsLT(MVT::i32));
167 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
168 SDValue Ext =
170 MVT::i32, Trunc);
171 return Ext;
172}
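// Illustrative note: the helper above narrows a value back to its original
// argument type and then re-extends it to i32 (sign- or zero-extended as the
// argument's flags require). It is used by LowerCallResult below for CMSE
// non-secure calls, where sub-32-bit results coming back from a non-secure
// callee must be re-extended by the caller because the callee cannot be
// trusted to have done so.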
173
174void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
175 if (VT != PromotedLdStVT) {
176 setOperationAction(ISD::LOAD, VT, Promote);
177 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
178
179 setOperationAction(ISD::STORE, VT, Promote);
180 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
181 }
182
183 MVT ElemTy = VT.getVectorElementType();
184 if (ElemTy != MVT::f64)
188 if (ElemTy == MVT::i32) {
193 } else {
198 }
207 if (VT.isInteger()) {
211 }
212
213 // Neon does not support vector divide/remainder operations.
222
223 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
224 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
226 setOperationAction(Opcode, VT, Legal);
227 if (!VT.isFloatingPoint())
228 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
229 setOperationAction(Opcode, VT, Legal);
230}
231
232void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
233 addRegisterClass(VT, &ARM::DPRRegClass);
234 addTypeForNEON(VT, MVT::f64);
235}
236
237void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
238 addRegisterClass(VT, &ARM::DPairRegClass);
239 addTypeForNEON(VT, MVT::v2f64);
240}
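// Illustrative sketch of how the two helpers above are used: a 64-bit NEON
// type such as v8i8 is registered in the DPR class and, via addTypeForNEON,
// has its loads and stores promoted to f64, so memory accesses use a single
// D-register load/store. Likewise a 128-bit type such as v16i8 lives in the
// Q-register (DPair) class with its loads/stores promoted to v2f64.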
241
242void ARMTargetLowering::setAllExpand(MVT VT) {
243 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
245
246 // We support these really simple operations even on types where all
247 // the actual arithmetic has to be broken down into simpler
248 // operations or turned into library calls.
249 setOperationAction(ISD::BITCAST, VT, Legal);
250 setOperationAction(ISD::LOAD, VT, Legal);
251 setOperationAction(ISD::STORE, VT, Legal);
253}
254
255void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
256 LegalizeAction Action) {
257 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
258 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
259 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
260}
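// Usage sketch: addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal) (as called later
// in this file) marks EXTLOAD, ZEXTLOAD and SEXTLOAD from v4i8 to v4i32 as
// legal in one go, instead of three separate setLoadExtAction calls.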
261
262void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
263 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
264
265 for (auto VT : IntTypes) {
266 addRegisterClass(VT, &ARM::MQPRRegClass);
280 setOperationAction(ISD::MLOAD, VT, Custom);
281 setOperationAction(ISD::MSTORE, VT, Legal);
296
297 // No native support for these.
307
308 // Vector reductions
309 setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
310 setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
311 setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
312 setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
313 setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
314 setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
315 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
316 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
317 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
318
319 if (!HasMVEFP) {
324 } else {
327 }
328
329 // Pre and Post inc are supported on loads and stores
330 for (unsigned im = (unsigned)ISD::PRE_INC;
331 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
336 }
337 }
338
339 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
340 for (auto VT : FloatTypes) {
341 addRegisterClass(VT, &ARM::MQPRRegClass);
342 if (!HasMVEFP)
343 setAllExpand(VT);
344
345 // These are legal or custom whether we have MVE.fp or not
354 setOperationAction(ISD::MLOAD, VT, Custom);
355 setOperationAction(ISD::MSTORE, VT, Legal);
358
359 // Pre and Post inc are supported on loads and stores
360 for (unsigned im = (unsigned)ISD::PRE_INC;
361 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
366 }
367
368 if (HasMVEFP) {
369 setOperationAction(ISD::FMINNUM, VT, Legal);
370 setOperationAction(ISD::FMAXNUM, VT, Legal);
371 setOperationAction(ISD::FROUND, VT, Legal);
372 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
373 setOperationAction(ISD::FRINT, VT, Legal);
374 setOperationAction(ISD::FTRUNC, VT, Legal);
375 setOperationAction(ISD::FFLOOR, VT, Legal);
376 setOperationAction(ISD::FCEIL, VT, Legal);
377 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
378 setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
379 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
380 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
381
382 // No native support for these.
385 setOperationAction(ISD::FSQRT, VT, Expand);
386 setOperationAction(ISD::FSIN, VT, Expand);
387 setOperationAction(ISD::FCOS, VT, Expand);
388 setOperationAction(ISD::FTAN, VT, Expand);
389 setOperationAction(ISD::FPOW, VT, Expand);
390 setOperationAction(ISD::FLOG, VT, Expand);
391 setOperationAction(ISD::FLOG2, VT, Expand);
392 setOperationAction(ISD::FLOG10, VT, Expand);
393 setOperationAction(ISD::FEXP, VT, Expand);
394 setOperationAction(ISD::FEXP2, VT, Expand);
395 setOperationAction(ISD::FEXP10, VT, Expand);
396 setOperationAction(ISD::FNEARBYINT, VT, Expand);
397 }
398 }
399
 400 // Custom-expand vector reductions on smaller-than-legal types to prevent
 401 // false zero items from being added.
402 setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
403 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
404 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
405 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
406 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
407 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
408 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
409 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);
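 // Illustrative reasoning (one way to read the comment above, stated as an
 // assumption): if a v4f16 reduction were widened to the legal v8f16 type by
 // padding with zero lanes, those lanes would take part in the reduction;
 // e.g. for VECREDUCE_FMIN of a vector whose real elements are all positive,
 // a padded 0.0 lane would wrongly become the result. Custom lowering avoids
 // introducing such lanes.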
410
 411 // We 'support' these types up to the bitcast/load/store level, regardless of
 412 // MVE integer-only / float support. Only FP data processing on the FP
 413 // vector types is inhibited at the integer-only level.
414 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
415 for (auto VT : LongTypes) {
416 addRegisterClass(VT, &ARM::MQPRRegClass);
417 setAllExpand(VT);
423 }
425
426 // We can do bitwise operations on v2i64 vectors
427 setOperationAction(ISD::AND, MVT::v2i64, Legal);
428 setOperationAction(ISD::OR, MVT::v2i64, Legal);
429 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
430
 431 // It is legal to extload from v8i8 to v8i16, and from v4i8 or v4i16 to v4i32.
432 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
433 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
434 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
435
436 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
442
443 // Some truncating stores are legal too.
444 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
445 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
446 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
447
448 // Pre and Post inc on these are legal, given the correct extends
449 for (unsigned im = (unsigned)ISD::PRE_INC;
450 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
451 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
456 }
457 }
458
459 // Predicate types
460 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
461 for (auto VT : pTypes) {
462 addRegisterClass(VT, &ARM::VCCRRegClass);
471 setOperationAction(ISD::LOAD, VT, Custom);
472 setOperationAction(ISD::STORE, VT, Custom);
477
478 if (!HasMVEFP) {
483 }
484 }
488 setOperationAction(ISD::OR, MVT::v2i1, Expand);
494
503}
504
506 return static_cast<const ARMBaseTargetMachine &>(getTargetMachine());
507}
508
510 const ARMSubtarget &STI)
511 : TargetLowering(TM_), Subtarget(&STI),
512 RegInfo(Subtarget->getRegisterInfo()),
513 Itins(Subtarget->getInstrItineraryData()) {
514 const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
515
518
519 const Triple &TT = TM.getTargetTriple();
520
521 if (TT.isOSBinFormatMachO()) {
522 // Uses VFP for Thumb libfuncs if available.
523 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
524 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
525 // clang-format off
526 static const struct {
527 const RTLIB::Libcall Op;
528 const RTLIB::LibcallImpl Impl;
529 } LibraryCalls[] = {
530 // Single-precision floating-point arithmetic.
531 { RTLIB::ADD_F32, RTLIB::impl___addsf3vfp },
532 { RTLIB::SUB_F32, RTLIB::impl___subsf3vfp },
533 { RTLIB::MUL_F32, RTLIB::impl___mulsf3vfp },
534 { RTLIB::DIV_F32, RTLIB::impl___divsf3vfp },
535
536 // Double-precision floating-point arithmetic.
537 { RTLIB::ADD_F64, RTLIB::impl___adddf3vfp },
538 { RTLIB::SUB_F64, RTLIB::impl___subdf3vfp },
539 { RTLIB::MUL_F64, RTLIB::impl___muldf3vfp },
540 { RTLIB::DIV_F64, RTLIB::impl___divdf3vfp },
541
542 // Single-precision comparisons.
543 { RTLIB::OEQ_F32, RTLIB::impl___eqsf2vfp },
544 { RTLIB::UNE_F32, RTLIB::impl___nesf2vfp },
545 { RTLIB::OLT_F32, RTLIB::impl___ltsf2vfp },
546 { RTLIB::OLE_F32, RTLIB::impl___lesf2vfp },
547 { RTLIB::OGE_F32, RTLIB::impl___gesf2vfp },
548 { RTLIB::OGT_F32, RTLIB::impl___gtsf2vfp },
549 { RTLIB::UO_F32, RTLIB::impl___unordsf2vfp },
550
551 // Double-precision comparisons.
552 { RTLIB::OEQ_F64, RTLIB::impl___eqdf2vfp },
553 { RTLIB::UNE_F64, RTLIB::impl___nedf2vfp },
554 { RTLIB::OLT_F64, RTLIB::impl___ltdf2vfp },
555 { RTLIB::OLE_F64, RTLIB::impl___ledf2vfp },
556 { RTLIB::OGE_F64, RTLIB::impl___gedf2vfp },
557 { RTLIB::OGT_F64, RTLIB::impl___gtdf2vfp },
558 { RTLIB::UO_F64, RTLIB::impl___unorddf2vfp },
559
560 // Floating-point to integer conversions.
561 // i64 conversions are done via library routines even when generating VFP
562 // instructions, so use the same ones.
563 { RTLIB::FPTOSINT_F64_I32, RTLIB::impl___fixdfsivfp },
564 { RTLIB::FPTOUINT_F64_I32, RTLIB::impl___fixunsdfsivfp },
565 { RTLIB::FPTOSINT_F32_I32, RTLIB::impl___fixsfsivfp },
566 { RTLIB::FPTOUINT_F32_I32, RTLIB::impl___fixunssfsivfp },
567
568 // Conversions between floating types.
569 { RTLIB::FPROUND_F64_F32, RTLIB::impl___truncdfsf2vfp },
570 { RTLIB::FPEXT_F32_F64, RTLIB::impl___extendsfdf2vfp },
571
572 // Integer to floating-point conversions.
573 // i64 conversions are done via library routines even when generating VFP
574 // instructions, so use the same ones.
575 // FIXME: There appears to be some naming inconsistency in ARM libgcc:
576 // e.g., __floatunsidf vs. __floatunssidfvfp.
577 { RTLIB::SINTTOFP_I32_F64, RTLIB::impl___floatsidfvfp },
578 { RTLIB::UINTTOFP_I32_F64, RTLIB::impl___floatunssidfvfp },
579 { RTLIB::SINTTOFP_I32_F32, RTLIB::impl___floatsisfvfp },
580 { RTLIB::UINTTOFP_I32_F32, RTLIB::impl___floatunssisfvfp },
581 };
582 // clang-format on
583
584 for (const auto &LC : LibraryCalls)
585 setLibcallImpl(LC.Op, LC.Impl);
586 }
587 }
588
589 if (Subtarget->isThumb1Only())
590 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
591 else
592 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
593
594 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
595 Subtarget->hasFPRegs()) {
596 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
597 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
598
603
604 if (!Subtarget->hasVFP2Base())
605 setAllExpand(MVT::f32);
606 if (!Subtarget->hasFP64())
607 setAllExpand(MVT::f64);
608 }
609
610 if (Subtarget->hasFullFP16()) {
611 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
612 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
613 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
614
615 setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
616 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
617 }
618
619 if (Subtarget->hasBF16()) {
620 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
621 setAllExpand(MVT::bf16);
622 if (!Subtarget->hasFullFP16())
623 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
624 } else {
625 setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
626 setOperationAction(ISD::BF16_TO_FP, MVT::f64, Expand);
627 setOperationAction(ISD::FP_TO_BF16, MVT::f32, Custom);
628 setOperationAction(ISD::FP_TO_BF16, MVT::f64, Custom);
629 }
630
632 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
633 setTruncStoreAction(VT, InnerVT, Expand);
634 addAllExtLoads(VT, InnerVT, Expand);
635 }
636
639
641 }
642
643 if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
645
646 if (!Subtarget->hasV8_1MMainlineOps())
648
649 if (!Subtarget->isThumb1Only())
651
654
657
658 if (Subtarget->hasMVEIntegerOps())
659 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
660
661 // Combine low-overhead loop intrinsics so that we can lower i1 types.
662 if (Subtarget->hasLOB()) {
663 setTargetDAGCombine({ISD::BRCOND, ISD::BR_CC});
664 }
665
666 if (Subtarget->hasNEON()) {
667 addDRTypeForNEON(MVT::v2f32);
668 addDRTypeForNEON(MVT::v8i8);
669 addDRTypeForNEON(MVT::v4i16);
670 addDRTypeForNEON(MVT::v2i32);
671 addDRTypeForNEON(MVT::v1i64);
672
673 addQRTypeForNEON(MVT::v4f32);
674 addQRTypeForNEON(MVT::v2f64);
675 addQRTypeForNEON(MVT::v16i8);
676 addQRTypeForNEON(MVT::v8i16);
677 addQRTypeForNEON(MVT::v4i32);
678 addQRTypeForNEON(MVT::v2i64);
679
680 if (Subtarget->hasFullFP16()) {
681 addQRTypeForNEON(MVT::v8f16);
682 addDRTypeForNEON(MVT::v4f16);
683 }
684
685 if (Subtarget->hasBF16()) {
686 addQRTypeForNEON(MVT::v8bf16);
687 addDRTypeForNEON(MVT::v4bf16);
688 }
689 }
690
691 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
692 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
693 // none of Neon, MVE or VFP supports any arithmetic operations on it.
694 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
695 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
696 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
697 // FIXME: Code duplication: FDIV and FREM are expanded always, see
698 // ARMTargetLowering::addTypeForNEON method for details.
699 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
700 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
701 // FIXME: Create unittest.
 702 // In other words, find a case where "copysign" appears in the DAG with
 703 // vector operands.
705 // FIXME: Code duplication: SETCC has custom operation action, see
706 // ARMTargetLowering::addTypeForNEON method for details.
708 // FIXME: Create unittest for FNEG and for FABS.
709 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
710 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
711 setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
712 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
713 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
714 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
715 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
716 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
717 setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
718 setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
719 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
720 setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
721 setOperationAction(ISD::FEXP10, MVT::v2f64, Expand);
722 setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
723 setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
724 setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
725 setOperationAction(ISD::FROUNDEVEN, MVT::v2f64, Expand);
726 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
727 setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
728 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
729 }
730
731 if (Subtarget->hasNEON()) {
 732 // The same goes for v4f32. But keep in mind that vadd, vsub, vmul are natively
733 // supported for v4f32.
734 setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
735 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
736 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
737 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
738 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
739 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
740 setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
741 setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
742 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
743 setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
744 setOperationAction(ISD::FEXP10, MVT::v4f32, Expand);
745 setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
746 setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
747 setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
748 setOperationAction(ISD::FROUNDEVEN, MVT::v4f32, Expand);
749 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
750 setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
751
752 // Mark v2f32 intrinsics.
753 setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
754 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
755 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
756 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
757 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
758 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
759 setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
760 setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
761 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
762 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
763 setOperationAction(ISD::FEXP10, MVT::v2f32, Expand);
764 setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
765 setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
766 setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
767 setOperationAction(ISD::FROUNDEVEN, MVT::v2f32, Expand);
768 setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
769 setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
770
771 for (ISD::NodeType Op : {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
772 ISD::FRINT, ISD::FTRUNC, ISD::FROUNDEVEN}) {
773 setOperationAction(Op, MVT::v4f16, Expand);
774 setOperationAction(Op, MVT::v8f16, Expand);
775 }
776
777 // Neon does not support some operations on v1i64 and v2i64 types.
778 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
779 // Custom handling for some quad-vector types to detect VMULL.
780 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
781 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
782 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
783 // Custom handling for some vector types to avoid expensive expansions
784 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
786 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
 788 // Neon does not have single-instruction SINT_TO_FP and UINT_TO_FP with
 789 // a destination type that is wider than the source, nor does it have a
 790 // FP_TO_[SU]INT instruction with a narrower destination than the
 791 // source.
800
802 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
803
 804 // NEON does not have a single-instruction CTPOP for vectors with element
 805 // types wider than 8 bits. However, custom lowering can leverage the
806 // v8i8/v16i8 vcnt instruction.
813
814 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
815 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
816
 817 // NEON does not have a single-instruction CTTZ for vectors.
819 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
820 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
821 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
822
823 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
824 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
825 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
826 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
827
832
837
841 }
842
843 // NEON only has FMA instructions as of VFP4.
844 if (!Subtarget->hasVFP4Base()) {
845 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
846 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
847 }
848
850 ISD::FP_TO_UINT, ISD::FMUL, ISD::LOAD});
851
852 // It is legal to extload from v4i8 to v4i16 or v4i32.
853 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
854 MVT::v2i32}) {
859 }
860 }
861
862 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
863 MVT::v4i32}) {
864 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
865 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
866 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
867 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
868 }
869 }
870
871 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
877 ISD::INTRINSIC_VOID, ISD::VECREDUCE_ADD, ISD::ADD, ISD::BITCAST});
878 }
879 if (Subtarget->hasMVEIntegerOps()) {
881 ISD::FP_EXTEND, ISD::SELECT, ISD::SELECT_CC,
882 ISD::SETCC});
883 }
884 if (Subtarget->hasMVEFloatOps()) {
886 }
887
888 if (!Subtarget->hasFP64()) {
889 // When targeting a floating-point unit with only single-precision
890 // operations, f64 is legal for the few double-precision instructions which
 891 // are present. However, no double-precision operations other than moves,
892 // loads and stores are provided by the hardware.
901 setOperationAction(ISD::FNEG, MVT::f64, Expand);
902 setOperationAction(ISD::FABS, MVT::f64, Expand);
903 setOperationAction(ISD::FSQRT, MVT::f64, Expand);
904 setOperationAction(ISD::FSIN, MVT::f64, Expand);
905 setOperationAction(ISD::FCOS, MVT::f64, Expand);
906 setOperationAction(ISD::FPOW, MVT::f64, Expand);
907 setOperationAction(ISD::FLOG, MVT::f64, Expand);
908 setOperationAction(ISD::FLOG2, MVT::f64, Expand);
909 setOperationAction(ISD::FLOG10, MVT::f64, Expand);
910 setOperationAction(ISD::FEXP, MVT::f64, Expand);
911 setOperationAction(ISD::FEXP2, MVT::f64, Expand);
912 setOperationAction(ISD::FEXP10, MVT::f64, Expand);
913 setOperationAction(ISD::FCEIL, MVT::f64, Expand);
914 setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
915 setOperationAction(ISD::FRINT, MVT::f64, Expand);
916 setOperationAction(ISD::FROUNDEVEN, MVT::f64, Expand);
917 setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
918 setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
931 }
932
933 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
934 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
936 if (Subtarget->hasFullFP16()) {
939 }
940 }
941
942 if (!Subtarget->hasFP16()) {
943 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
945 }
946
947 computeRegisterProperties(Subtarget->getRegisterInfo());
948
949 // ARM does not have floating-point extending loads.
950 for (MVT VT : MVT::fp_valuetypes()) {
951 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
952 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
953 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
954 }
955
956 // ... or truncating stores
957 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
958 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
959 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
960 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
961 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
962
963 // ARM does not have i1 sign extending load.
964 for (MVT VT : MVT::integer_valuetypes())
966
967 // ARM supports all 4 flavors of integer indexed load / store.
968 if (!Subtarget->isThumb1Only()) {
969 for (unsigned im = (unsigned)ISD::PRE_INC;
971 setIndexedLoadAction(im, MVT::i1, Legal);
972 setIndexedLoadAction(im, MVT::i8, Legal);
973 setIndexedLoadAction(im, MVT::i16, Legal);
974 setIndexedLoadAction(im, MVT::i32, Legal);
975 setIndexedStoreAction(im, MVT::i1, Legal);
976 setIndexedStoreAction(im, MVT::i8, Legal);
977 setIndexedStoreAction(im, MVT::i16, Legal);
978 setIndexedStoreAction(im, MVT::i32, Legal);
979 }
980 } else {
981 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
984 }
985
990
993 if (Subtarget->hasDSP()) {
1002 }
1003 if (Subtarget->hasBaseDSP()) {
1006 }
1007
1008 // i64 operation support.
1011 if (Subtarget->isThumb1Only()) {
1014 }
1015 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1016 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1018
1026 setOperationAction(ISD::LOAD, MVT::i64, Custom);
1027 setOperationAction(ISD::STORE, MVT::i64, Custom);
1028
 1029 // MVE lowers 64-bit shifts to lsll and lsrl,
 1030 // assuming that ISD::SRL and SRA of i64 are already marked custom.
1031 if (Subtarget->hasMVEIntegerOps())
1033
1034 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1035 if (Subtarget->isThumb1Only()) {
1039 }
1040
1041 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1043
1044 // ARM does not have ROTL.
1049 }
1051 // TODO: These two should be set to LibCall, but this currently breaks
1052 // the Linux kernel build. See #101786.
1055 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1058 }
1059
1060 // @llvm.readcyclecounter requires the Performance Monitors extension.
1061 // Default to the 0 expansion on unsupported platforms.
1062 // FIXME: Technically there are older ARM CPUs that have
1063 // implementation-specific ways of obtaining this information.
1064 if (Subtarget->hasPerfMon())
1065 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
1066
1067 // Only ARMv6 has BSWAP.
1068 if (!Subtarget->hasV6Ops())
1070
1071 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1072 : Subtarget->hasDivideInARMMode();
1073 if (!hasDivide) {
1074 // These are expanded into libcalls if the cpu doesn't have HW divider.
1077 }
1078
1079 if (TT.isOSWindows() && !Subtarget->hasDivideInThumbMode()) {
1082
1085 }
1086
1089
1090 // Register based DivRem for AEABI (RTABI 4.2)
1091 if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
1092 TT.isTargetMuslAEABI() || TT.isOSWindows()) {
1095 HasStandaloneRem = false;
1096
1101 } else {
1104 }
1105
1110
1111 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1112 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1113
1114 // Use the default implementation.
1115 setOperationAction(ISD::VASTART, MVT::Other, Custom);
1116 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1117 setOperationAction(ISD::VACOPY, MVT::Other, Expand);
1118 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1119 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
1120 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
1121
1122 if (TT.isOSWindows())
1123 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
1124 else
1125 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
1126
1127 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1128 // the default expansion.
1129 InsertFencesForAtomic = false;
1130 if (Subtarget->hasAnyDataBarrier() &&
1131 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1132 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1133 // to ldrex/strex loops already.
1134 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
1135 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1136 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
1137
1138 // On v8, we have particularly efficient implementations of atomic fences
1139 // if they can be combined with nearby atomic loads and stores.
1140 if (!Subtarget->hasAcquireRelease() ||
1141 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1142 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1143 InsertFencesForAtomic = true;
1144 }
1145 } else {
1146 // If there's anything we can use as a barrier, go through custom lowering
1147 // for ATOMIC_FENCE.
1148 // If target has DMB in thumb, Fences can be inserted.
1149 if (Subtarget->hasDataBarrier())
1150 InsertFencesForAtomic = true;
1151
1152 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
1153 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1154
1155 // Set them all for libcall, which will force libcalls.
1156 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
1157 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
1158 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
1159 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
1160 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, LibCall);
1161 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
1162 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
1163 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, LibCall);
1164 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, LibCall);
1165 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, LibCall);
1166 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, LibCall);
1167 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, LibCall);
1168 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1169 // Unordered/Monotonic case.
1170 if (!InsertFencesForAtomic) {
1171 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1172 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
1173 }
1174 }
1175
1176 // Compute supported atomic widths.
1177 if (TT.isOSLinux() || (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1178 // For targets where __sync_* routines are reliably available, we use them
1179 // if necessary.
1180 //
1181 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1182 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1183 //
1184 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1185 // such targets should provide __sync_* routines, which use the ARM mode
1186 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1187 // encoding; see ARMISD::MEMBARRIER_MCR.)
1189 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1190 Subtarget->hasForced32BitAtomics()) {
1191 // Cortex-M (besides Cortex-M0) have 32-bit atomics.
1193 } else {
1194 // We can't assume anything about other targets; just use libatomic
1195 // routines.
1197 }
1198
1200
1201 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
1202
1203 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1204 if (!Subtarget->hasV6Ops()) {
1207 }
1209
1210 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1211 !Subtarget->isThumb1Only()) {
1212 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1213 // iff target supports vfp2.
1214 setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1216 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
1217 setOperationAction(ISD::GET_FPENV, MVT::i32, Legal);
1218 setOperationAction(ISD::SET_FPENV, MVT::i32, Legal);
1219 setOperationAction(ISD::RESET_FPENV, MVT::Other, Legal);
1220 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
1221 setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
1222 setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
1223 }
1224
1225 // We want to custom lower some of our intrinsics.
1230
1240 if (Subtarget->hasFullFP16()) {
1244 }
1245
1247
1248 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
1249 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
1250 if (Subtarget->hasFullFP16())
1251 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
1252 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
1253 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
1254 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1255
1256 // We don't support sin/cos/fmod/copysign/pow
1257 setOperationAction(ISD::FSIN, MVT::f64, Expand);
1258 setOperationAction(ISD::FSIN, MVT::f32, Expand);
1259 setOperationAction(ISD::FCOS, MVT::f32, Expand);
1260 setOperationAction(ISD::FCOS, MVT::f64, Expand);
1261 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1262 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1265 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1266 !Subtarget->isThumb1Only()) {
1269 }
1270 setOperationAction(ISD::FPOW, MVT::f64, Expand);
1271 setOperationAction(ISD::FPOW, MVT::f32, Expand);
1272
1273 if (!Subtarget->hasVFP4Base()) {
1276 }
1277
1278 // Various VFP goodness
1279 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1280 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1281 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1282 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1283 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1284 }
1285
1286 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1287 if (!Subtarget->hasFP16()) {
1288 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1289 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1290 }
1291
1292 // Strict floating-point comparisons need custom lowering.
1299 }
1300
1301 // Use __sincos_stret if available.
1302 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1303 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1304 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1305 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1306 }
1307
1308 // FP-ARMv8 implements a lot of rounding-like FP operations.
1309 if (Subtarget->hasFPARMv8Base()) {
1310 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1311 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1312 setOperationAction(ISD::FROUND, MVT::f32, Legal);
1313 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1314 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1315 setOperationAction(ISD::FRINT, MVT::f32, Legal);
1316 setOperationAction(ISD::FROUNDEVEN, MVT::f32, Legal);
1317 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
1318 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
1319 if (Subtarget->hasNEON()) {
1320 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1321 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1322 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1323 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1324 }
1325
1326 if (Subtarget->hasFP64()) {
1327 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1328 setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1329 setOperationAction(ISD::FROUND, MVT::f64, Legal);
1330 setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1331 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1332 setOperationAction(ISD::FRINT, MVT::f64, Legal);
1333 setOperationAction(ISD::FROUNDEVEN, MVT::f64, Legal);
1334 setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1335 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1336 }
1337 }
1338
 1339 // FP16 operations often need to be promoted to call library functions.
1340 if (Subtarget->hasFullFP16()) {
1343 setOperationAction(ISD::FSIN, MVT::f16, Promote);
1344 setOperationAction(ISD::FCOS, MVT::f16, Promote);
1345 setOperationAction(ISD::FTAN, MVT::f16, Promote);
1346 setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
1347 setOperationAction(ISD::FPOWI, MVT::f16, Promote);
1348 setOperationAction(ISD::FPOW, MVT::f16, Promote);
1349 setOperationAction(ISD::FEXP, MVT::f16, Promote);
1350 setOperationAction(ISD::FEXP2, MVT::f16, Promote);
1351 setOperationAction(ISD::FEXP10, MVT::f16, Promote);
1352 setOperationAction(ISD::FLOG, MVT::f16, Promote);
1353 setOperationAction(ISD::FLOG10, MVT::f16, Promote);
1354 setOperationAction(ISD::FLOG2, MVT::f16, Promote);
1355 setOperationAction(ISD::LRINT, MVT::f16, Expand);
1356
1357 setOperationAction(ISD::FROUND, MVT::f16, Legal);
1358 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
1359 setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
1360 setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
1361 setOperationAction(ISD::FRINT, MVT::f16, Legal);
1362 setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
1363 setOperationAction(ISD::FCEIL, MVT::f16, Legal);
1364 }
1365
1366 if (Subtarget->hasNEON()) {
1367 // vmin and vmax aren't available in a scalar form, so we can use
1368 // a NEON instruction with an undef lane instead.
1369 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
1370 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
1371 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
1372 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
1373 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
1374 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
1375 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
1376 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
1377
1378 if (Subtarget->hasV8Ops()) {
1379 setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
1380 setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
1381 setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
1382 setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
1383 setOperationAction(ISD::FROUNDEVEN, MVT::v2f32, Legal);
1384 setOperationAction(ISD::FROUNDEVEN, MVT::v4f32, Legal);
1385 setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
1386 setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
1387 setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
1388 setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
1389 setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
1390 setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
1391 }
1392
1393 if (Subtarget->hasFullFP16()) {
1394 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
1395 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
1396 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
1397 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
1398
1399 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
1400 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
1401 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
1402 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
1403
1404 setOperationAction(ISD::FFLOOR, MVT::v4f16, Legal);
1405 setOperationAction(ISD::FFLOOR, MVT::v8f16, Legal);
1406 setOperationAction(ISD::FROUND, MVT::v4f16, Legal);
1407 setOperationAction(ISD::FROUND, MVT::v8f16, Legal);
1408 setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Legal);
1409 setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Legal);
1410 setOperationAction(ISD::FCEIL, MVT::v4f16, Legal);
1411 setOperationAction(ISD::FCEIL, MVT::v8f16, Legal);
1412 setOperationAction(ISD::FTRUNC, MVT::v4f16, Legal);
1413 setOperationAction(ISD::FTRUNC, MVT::v8f16, Legal);
1414 setOperationAction(ISD::FRINT, MVT::v4f16, Legal);
1415 setOperationAction(ISD::FRINT, MVT::v8f16, Legal);
1416 }
1417 }
1418
1419 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1420 // it, but it's just a wrapper around ldexp.
1421 if (TT.isOSWindows()) {
1422 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1423 if (isOperationExpand(Op, MVT::f32))
1424 setOperationAction(Op, MVT::f32, Promote);
1425 }
1426
1427 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1428 // isn't legal.
1429 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1430 if (isOperationExpand(Op, MVT::f16))
1431 setOperationAction(Op, MVT::f16, Promote);
1432
1433 // We have target-specific dag combine patterns for the following nodes:
1434 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1437
1438 if (Subtarget->hasMVEIntegerOps())
1440
1441 if (Subtarget->hasV6Ops())
1443 if (Subtarget->isThumb1Only())
1445 // Attempt to lower smin/smax to ssat/usat
1446 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1447 Subtarget->isThumb2()) {
1449 }
1450
1452
1453 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1454 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1456 else
1458
1459 //// temporary - rewrite interface to use type
1462 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1464 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1466
1467 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1468 // are at least 4 bytes aligned.
1470
1471 // Prefer likely predicted branches to selects on out-of-order cores.
1472 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1473
1474 setPrefLoopAlignment(Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1476 Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1477
1478 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1479}
1480
1482 return Subtarget->useSoftFloat();
1483}
1484
1486 return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1487}
1488
1489// FIXME: It might make sense to define the representative register class as the
1490// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
 1491// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1492// SPR's representative would be DPR_VFP2. This should work well if register
1493// pressure tracking were modified such that a register use would increment the
 1494// pressure of the register class's representative and all of its super
1495// classes' representatives transitively. We have not implemented this because
1496// of the difficulty prior to coalescing of modeling operand register classes
1497// due to the common occurrence of cross class copies and subregister insertions
1498// and extractions.
1499std::pair<const TargetRegisterClass *, uint8_t>
1501 MVT VT) const {
1502 const TargetRegisterClass *RRC = nullptr;
1503 uint8_t Cost = 1;
1504 switch (VT.SimpleTy) {
1505 default:
1507 // Use DPR as representative register class for all floating point
 1508 // and vector types. Since there are 32 SPR registers and 32 DPR registers,
1509 // the cost is 1 for both f32 and f64.
1510 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1511 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1512 RRC = &ARM::DPRRegClass;
1513 // When NEON is used for SP, only half of the register file is available
1514 // because operations that define both SP and DP results will be constrained
1515 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1516 // coalescing by double-counting the SP regs. See the FIXME above.
1517 if (Subtarget->useNEONForSinglePrecisionFP())
1518 Cost = 2;
1519 break;
1520 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1521 case MVT::v4f32: case MVT::v2f64:
1522 RRC = &ARM::DPRRegClass;
1523 Cost = 2;
1524 break;
1525 case MVT::v4i64:
1526 RRC = &ARM::DPRRegClass;
1527 Cost = 4;
1528 break;
1529 case MVT::v8i64:
1530 RRC = &ARM::DPRRegClass;
1531 Cost = 8;
1532 break;
1533 }
1534 return std::make_pair(RRC, Cost);
1535}
1536
1537const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1538#define MAKE_CASE(V) \
1539 case V: \
1540 return #V;
1541 switch ((ARMISD::NodeType)Opcode) {
1543 break;
1746#undef MAKE_CASE
1747 }
1748 return nullptr;
1749}
1750
1752 EVT VT) const {
1753 if (!VT.isVector())
1754 return getPointerTy(DL);
1755
1756 // MVE has a predicate register.
1757 if ((Subtarget->hasMVEIntegerOps() &&
1758 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1759 VT == MVT::v16i8)) ||
1760 (Subtarget->hasMVEFloatOps() &&
1761 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1762 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1764}
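// Illustrative example: with MVE integer ops, a compare of two v4i32 vectors
// produced by the function above yields a v4i1 result carried in a predicate
// register, whereas scalar compares use the pointer-sized integer type
// returned at the top of the function.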
1765
1766/// getRegClassFor - Return the register class that should be used for the
1767/// specified value type.
1768const TargetRegisterClass *
1769ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1770 (void)isDivergent;
1771 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1772 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1773 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1774 // MVE Q registers.
1775 if (Subtarget->hasNEON()) {
1776 if (VT == MVT::v4i64)
1777 return &ARM::QQPRRegClass;
1778 if (VT == MVT::v8i64)
1779 return &ARM::QQQQPRRegClass;
1780 }
1781 if (Subtarget->hasMVEIntegerOps()) {
1782 if (VT == MVT::v4i64)
1783 return &ARM::MQQPRRegClass;
1784 if (VT == MVT::v8i64)
1785 return &ARM::MQQQQPRRegClass;
1786 }
1788}
1789
 1790// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1791// source/dest is aligned and the copy size is large enough. We therefore want
1792// to align such objects passed to memory intrinsics.
1794 Align &PrefAlign) const {
1795 if (!isa<MemIntrinsic>(CI))
1796 return false;
1797 MinSize = 8;
1798 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1799 // cycle faster than 4-byte aligned LDM.
1800 PrefAlign =
1801 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1802 return true;
1803}
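// Usage sketch (a hypothetical call, not from the original source): for
//   call void @llvm.memcpy.p0.p0.i32(ptr %d, ptr %s, i32 64, i1 false)
// the hook above reports MinSize = 8 and PrefAlign = 8 on v6-and-later
// non-M-class cores (Align(4) elsewhere), nudging the underlying objects to
// an alignment where LDM/STM-based copies are profitable.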
1804
1805// Create a fast isel object.
1806FastISel *
1808 const TargetLibraryInfo *libInfo) const {
1809 return ARM::createFastISel(funcInfo, libInfo);
1810}
1811
1813 unsigned NumVals = N->getNumValues();
1814 if (!NumVals)
1815 return Sched::RegPressure;
1816
1817 for (unsigned i = 0; i != NumVals; ++i) {
1818 EVT VT = N->getValueType(i);
1819 if (VT == MVT::Glue || VT == MVT::Other)
1820 continue;
1821 if (VT.isFloatingPoint() || VT.isVector())
1822 return Sched::ILP;
1823 }
1824
1825 if (!N->isMachineOpcode())
1826 return Sched::RegPressure;
1827
 1828 // Loads are scheduled for latency even if the instruction itinerary
 1829 // is not available.
1830 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1831 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1832
1833 if (MCID.getNumDefs() == 0)
1834 return Sched::RegPressure;
1835 if (!Itins->isEmpty() &&
1836 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1837 return Sched::ILP;
1838
1839 return Sched::RegPressure;
1840}
1841
1842//===----------------------------------------------------------------------===//
1843// Lowering Code
1844//===----------------------------------------------------------------------===//
1845
1846static bool isSRL16(const SDValue &Op) {
1847 if (Op.getOpcode() != ISD::SRL)
1848 return false;
1849 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1850 return Const->getZExtValue() == 16;
1851 return false;
1852}
1853
1854static bool isSRA16(const SDValue &Op) {
1855 if (Op.getOpcode() != ISD::SRA)
1856 return false;
1857 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1858 return Const->getZExtValue() == 16;
1859 return false;
1860}
1861
1862static bool isSHL16(const SDValue &Op) {
1863 if (Op.getOpcode() != ISD::SHL)
1864 return false;
1865 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1866 return Const->getZExtValue() == 16;
1867 return false;
1868}
1869
 1870// Check for a signed 16-bit value. We special-case SRA because it keeps
 1871// things simpler when also looking for SRAs that aren't sign-extending a
1872// smaller value. Without the check, we'd need to take extra care with
1873// checking order for some operations.
1874static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1875 if (isSRA16(Op))
1876 return isSHL16(Op.getOperand(0));
1877 return DAG.ComputeNumSignBits(Op) == 17;
1878}
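// Illustrative example: both of the following are treated as signed 16-bit
// values by the function above:
//   (sra (shl X, 16), 16)   -- an explicit sign-extension from 16 bits
//   any X with ComputeNumSignBits(X) == 17, i.e. a 32-bit value whose top 17
//   bits are copies of the sign bit and which therefore fits in 16 bits.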
1879
1880/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1882 switch (CC) {
1883 default: llvm_unreachable("Unknown condition code!");
1884 case ISD::SETNE: return ARMCC::NE;
1885 case ISD::SETEQ: return ARMCC::EQ;
1886 case ISD::SETGT: return ARMCC::GT;
1887 case ISD::SETGE: return ARMCC::GE;
1888 case ISD::SETLT: return ARMCC::LT;
1889 case ISD::SETLE: return ARMCC::LE;
1890 case ISD::SETUGT: return ARMCC::HI;
1891 case ISD::SETUGE: return ARMCC::HS;
1892 case ISD::SETULT: return ARMCC::LO;
1893 case ISD::SETULE: return ARMCC::LS;
1894 }
1895}
1896
1897/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1899 ARMCC::CondCodes &CondCode2) {
1900 CondCode2 = ARMCC::AL;
1901 switch (CC) {
1902 default: llvm_unreachable("Unknown FP condition!");
1903 case ISD::SETEQ:
1904 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1905 case ISD::SETGT:
1906 case ISD::SETOGT: CondCode = ARMCC::GT; break;
1907 case ISD::SETGE:
1908 case ISD::SETOGE: CondCode = ARMCC::GE; break;
1909 case ISD::SETOLT: CondCode = ARMCC::MI; break;
1910 case ISD::SETOLE: CondCode = ARMCC::LS; break;
1911 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1912 case ISD::SETO: CondCode = ARMCC::VC; break;
1913 case ISD::SETUO: CondCode = ARMCC::VS; break;
1914 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1915 case ISD::SETUGT: CondCode = ARMCC::HI; break;
1916 case ISD::SETUGE: CondCode = ARMCC::PL; break;
1917 case ISD::SETLT:
1918 case ISD::SETULT: CondCode = ARMCC::LT; break;
1919 case ISD::SETLE:
1920 case ISD::SETULE: CondCode = ARMCC::LE; break;
1921 case ISD::SETNE:
1922 case ISD::SETUNE: CondCode = ARMCC::NE; break;
1923 }
1924}
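// Illustrative example: an ordered not-equal comparison (SETONE) has no single
// ARM condition code, so the function above returns CondCode = ARMCC::MI with
// CondCode2 = ARMCC::GT, and the caller emits a test for either condition.
// CondCode2 stays ARMCC::AL whenever one condition code is enough.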
1925
1926//===----------------------------------------------------------------------===//
1927// Calling Convention Implementation
1928//===----------------------------------------------------------------------===//
1929
1930/// getEffectiveCallingConv - Get the effective calling convention, taking into
1931/// account presence of floating point hardware and calling convention
1932/// limitations, such as support for variadic functions.
1934ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1935 bool isVarArg) const {
1936 switch (CC) {
1937 default:
1938 report_fatal_error("Unsupported calling convention");
1941 case CallingConv::GHC:
1943 return CC;
1949 case CallingConv::Swift:
1952 case CallingConv::C:
1953 case CallingConv::Tail:
1954 if (!getTM().isAAPCS_ABI())
1955 return CallingConv::ARM_APCS;
1956 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1957 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1958 !isVarArg)
1960 else
1962 case CallingConv::Fast:
1964 if (!getTM().isAAPCS_ABI()) {
1965 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
1966 return CallingConv::Fast;
1967 return CallingConv::ARM_APCS;
1968 } else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
1969 !isVarArg)
1971 else
1973 }
1974}
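// Illustrative example (assuming the standard AAPCS rules): a non-variadic
// CallingConv::C call on an AAPCS target with FP registers and a hard-float
// ABI resolves to the VFP (hard-float) variant of AAPCS, while the same call
// made variadic falls back to the base AAPCS convention, since variadic
// arguments are always passed in integer registers or on the stack.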
1975
1977 bool isVarArg) const {
1978 return CCAssignFnForNode(CC, false, isVarArg);
1979}
1980
1982 bool isVarArg) const {
1983 return CCAssignFnForNode(CC, true, isVarArg);
1984}
1985
1986/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1987/// CallingConvention.
1988CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1989 bool Return,
1990 bool isVarArg) const {
1991 switch (getEffectiveCallingConv(CC, isVarArg)) {
1992 default:
1993 report_fatal_error("Unsupported calling convention");
1995 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1997 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1999 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
2000 case CallingConv::Fast:
2001 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
2002 case CallingConv::GHC:
2003 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
2005 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2007 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2009 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
2010 }
2011}
2012
2013SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2014 MVT LocVT, MVT ValVT, SDValue Val) const {
2015 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
2016 Val);
2017 if (Subtarget->hasFullFP16()) {
2018 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
2019 } else {
2020 Val = DAG.getNode(ISD::TRUNCATE, dl,
2021 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2022 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
2023 }
2024 return Val;
2025}
2026
2027SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2028 MVT LocVT, MVT ValVT,
2029 SDValue Val) const {
2030 if (Subtarget->hasFullFP16()) {
2031 Val = DAG.getNode(ARMISD::VMOVrh, dl,
2032 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2033 } else {
2034 Val = DAG.getNode(ISD::BITCAST, dl,
2035 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2036 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
2037 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2038 }
2039 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
2040}
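// Illustrative note: together, MoveToHPR and MoveFromHPR model the fact that
// an f16/bf16 value is carried in the low 16 bits of a 32-bit location. With
// full FP16 support, the VMOVhr/VMOVrh nodes move the value directly between
// a 32-bit integer and a half-precision register; without it, the same effect
// is achieved with bitcasts plus an i16 truncate / zero-extend.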
2041
2042/// LowerCallResult - Lower the result values of a call into the
2043/// appropriate copies out of appropriate physical registers.
2044SDValue ARMTargetLowering::LowerCallResult(
2045 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
2046 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2047 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2048 SDValue ThisVal, bool isCmseNSCall) const {
2049 // Assign locations to each value returned by this call.
2051 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2052 *DAG.getContext());
2053 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
2054
2055 // Copy all of the result registers out of their specified physreg.
2056 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2057 CCValAssign VA = RVLocs[i];
2058
2059 // Pass 'this' value directly from the argument to return value, to avoid
2060 // reg unit interference
2061 if (i == 0 && isThisReturn) {
2062 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
2063 "unexpected return calling convention register assignment");
2064 InVals.push_back(ThisVal);
2065 continue;
2066 }
2067
2068 SDValue Val;
2069 if (VA.needsCustom() &&
2070 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
2071 // Handle f64 or half of a v2f64.
2072 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2073 InGlue);
2074 Chain = Lo.getValue(1);
2075 InGlue = Lo.getValue(2);
2076 VA = RVLocs[++i]; // skip ahead to next loc
2077 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2078 InGlue);
2079 Chain = Hi.getValue(1);
2080 InGlue = Hi.getValue(2);
2081 if (!Subtarget->isLittle())
2082 std::swap (Lo, Hi);
2083 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2084
2085 if (VA.getLocVT() == MVT::v2f64) {
2086 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2087 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2088 DAG.getConstant(0, dl, MVT::i32));
2089
2090 VA = RVLocs[++i]; // skip ahead to next loc
2091 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2092 Chain = Lo.getValue(1);
2093 InGlue = Lo.getValue(2);
2094 VA = RVLocs[++i]; // skip ahead to next loc
2095 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2096 Chain = Hi.getValue(1);
2097 InGlue = Hi.getValue(2);
2098 if (!Subtarget->isLittle())
2099 std::swap(Lo, Hi);
2100 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2101 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2102 DAG.getConstant(1, dl, MVT::i32));
2103 }
2104 } else {
2105 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
2106 InGlue);
2107 Chain = Val.getValue(1);
2108 InGlue = Val.getValue(2);
2109 }
2110
2111 switch (VA.getLocInfo()) {
2112 default: llvm_unreachable("Unknown loc info!");
2113 case CCValAssign::Full: break;
2114 case CCValAssign::BCvt:
2115 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
2116 break;
2117 }
2118
2119 // f16 arguments have their size extended to 4 bytes and passed as if they
2120 // had been copied to the LSBs of a 32-bit register.
2121 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2122 if (VA.needsCustom() &&
2123 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
2124 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
2125
2126 // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
2127 // is less than 32 bits must be sign- or zero-extended after the call for
2128 // security reasons. Although the ABI mandates an extension done by the
2129 // callee, the latter cannot be trusted to follow the rules of the ABI.
2130 const ISD::InputArg &Arg = Ins[VA.getValNo()];
2131 if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
2132 VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
2133 Val = handleCMSEValue(Val, Arg, DAG, dl);
2134
2135 InVals.push_back(Val);
2136 }
2137
2138 return Chain;
2139}
2140
2141std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
2142 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
2143 bool IsTailCall, int SPDiff) const {
2144 SDValue DstAddr;
2145 MachinePointerInfo DstInfo;
2146 int32_t Offset = VA.getLocMemOffset();
2147 MachineFunction &MF = DAG.getMachineFunction();
2148
2149 if (IsTailCall) {
2150 Offset += SPDiff;
2151 auto PtrVT = getPointerTy(DAG.getDataLayout());
2152 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
2153 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
2154 DstAddr = DAG.getFrameIndex(FI, PtrVT);
2155 DstInfo =
2156 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
2157 } else {
2158 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
2159 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2160 StackPtr, PtrOff);
2161 DstInfo =
2162 MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset);
2163 }
2164
2165 return std::make_pair(DstAddr, DstInfo);
2166}
2167
2168// Returns the type of copying which is required to set up a byval argument to
2169// a tail-called function. This isn't needed for non-tail calls, because they
2170 // always need the equivalent of CopyOnce, but tail-calls sometimes need two copies to
2171// avoid clobbering another argument (CopyViaTemp), and sometimes can be
2172// optimised to zero copies when forwarding an argument from the caller's
2173// caller (NoCopy).
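// For example: forwarding our own incoming byval argument into the same
// stack slot needs no copy at all (NoCopy); a byval sourced from a local
// alloca or a global can be written straight into the argument area
// (CopyOnce); and a byval that currently lives in the incoming argument area
// at a different offset must first be staged in a temporary so the outgoing
// stores cannot clobber it (CopyViaTemp).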
2174ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall(
2175 SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
2176 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2177 ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
2178
2179 // Globals are always safe to copy from.
2180 if (isa<GlobalAddressSDNode>(Src) || isa<ExternalSymbolSDNode>(Src))
2181 return CopyOnce;
2182
2183 // Can only analyse frame index nodes, conservatively assume we need a
2184 // temporary.
2185 auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
2186 auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
2187 if (!SrcFrameIdxNode || !DstFrameIdxNode)
2188 return CopyViaTemp;
2189
2190 int SrcFI = SrcFrameIdxNode->getIndex();
2191 int DstFI = DstFrameIdxNode->getIndex();
2192 assert(MFI.isFixedObjectIndex(DstFI) &&
2193 "byval passed in non-fixed stack slot");
2194
2195 int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
2196 int64_t DstOffset = MFI.getObjectOffset(DstFI);
2197
2198 // If the source is in the local frame, then the copy to the argument memory
2199 // is always valid.
2200 bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
2201 if (!FixedSrc ||
2202 (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize()))
2203 return CopyOnce;
2204
2205 // In the case of byval arguments split between registers and the stack,
2206 // computeAddrForCallArg returns a FrameIndex which corresponds only to the
2207 // stack portion, but the Src SDValue will refer to the full value, including
2208 // the local stack memory that the register portion gets stored into. We only
2209 // need to compare them for equality, so normalise on the full value version.
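// For instance, a 16-byte byval passed as 8 bytes in registers plus 8 bytes
// on the stack has getByValSize() == 16 and an 8-byte DstFI object, so
// RegSize is 8 and DstOffset is moved down by 8 before being compared with
// SrcOffset.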
2210 uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI);
2211 DstOffset -= RegSize;
2212
2213 // If the value is already in the correct location, then no copying is
2214 // needed. If not, then we need to copy via a temporary.
2215 if (SrcOffset == DstOffset)
2216 return NoCopy;
2217 else
2218 return CopyViaTemp;
2219}
2220
2221void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2222 SDValue Chain, SDValue &Arg,
2223 RegsToPassVector &RegsToPass,
2224 CCValAssign &VA, CCValAssign &NextVA,
2225 SDValue &StackPtr,
2226 SmallVectorImpl<SDValue> &MemOpChains,
2227 bool IsTailCall,
2228 int SPDiff) const {
2229 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2230 DAG.getVTList(MVT::i32, MVT::i32), Arg);
2231 unsigned id = Subtarget->isLittle() ? 0 : 1;
2232 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2233
2234 if (NextVA.isRegLoc())
2235 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2236 else {
2237 assert(NextVA.isMemLoc());
2238 if (!StackPtr.getNode())
2239 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2240 getPointerTy(DAG.getDataLayout()));
2241
2242 SDValue DstAddr;
2243 MachinePointerInfo DstInfo;
2244 std::tie(DstAddr, DstInfo) =
2245 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
2246 MemOpChains.push_back(
2247 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2248 }
2249}
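// Illustrative note: VMOVRRD splits the f64 into two i32 halves; on a
// little-endian target result 0 is the low word and goes into VA's register,
// while big-endian swaps the halves, and whichever half is left over goes
// into NextVA's register or stack slot.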
2250
2251static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2252 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2253 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
2254 }
2255
2256/// LowerCall - Lowering a call into a callseq_start <-
2257 /// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
2258/// nodes.
2259SDValue
2260ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2261 SmallVectorImpl<SDValue> &InVals) const {
2262 SelectionDAG &DAG = CLI.DAG;
2263 SDLoc &dl = CLI.DL;
2264 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2265 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2266 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2267 SDValue Chain = CLI.Chain;
2268 SDValue Callee = CLI.Callee;
2269 bool &isTailCall = CLI.IsTailCall;
2270 CallingConv::ID CallConv = CLI.CallConv;
2271 bool doesNotRet = CLI.DoesNotReturn;
2272 bool isVarArg = CLI.IsVarArg;
2273 const CallBase *CB = CLI.CB;
2274
2275 MachineFunction &MF = DAG.getMachineFunction();
2276 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2277 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2278 MachineFunction::CallSiteInfo CSInfo;
2279 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2280 bool isThisReturn = false;
2281 bool isCmseNSCall = false;
2282 bool isSibCall = false;
2283 bool PreferIndirect = false;
2284 bool GuardWithBTI = false;
2285
2286 // Analyze operands of the call, assigning locations to each operand.
2287 SmallVector<CCValAssign, 16> ArgLocs;
2288 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2289 *DAG.getContext());
2290 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2291
2292 // Lower 'returns_twice' calls to a pseudo-instruction.
2293 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2294 !Subtarget->noBTIAtReturnTwice())
2295 GuardWithBTI = AFI->branchTargetEnforcement();
2296
2297 // Set type id for call site info.
2298 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
2299 CSInfo = MachineFunction::CallSiteInfo(*CB);
2300
2301 // Determine whether this is a non-secure function call.
2302 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2303 isCmseNSCall = true;
2304
2305 // Disable tail calls if they're not supported.
2306 if (!Subtarget->supportsTailCall())
2307 isTailCall = false;
2308
2309 // For both the non-secure calls and the returns from a CMSE entry function,
2310 // the function needs to do some extra work after the call, or before the
2311 // return, respectively, thus it cannot end with a tail call
2312 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2313 isTailCall = false;
2314
2315 if (isa<GlobalAddressSDNode>(Callee)) {
2316 // If we're optimizing for minimum size and the function is called three or
2317 // more times in this block, we can improve codesize by calling indirectly
2318 // as BLXr has a 16-bit encoding.
2319 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2320 if (CLI.CB) {
2321 auto *BB = CLI.CB->getParent();
2322 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2323 count_if(GV->users(), [&BB](const User *U) {
2324 return isa<Instruction>(U) &&
2325 cast<Instruction>(U)->getParent() == BB;
2326 }) > 2;
2327 }
2328 }
2329 if (isTailCall) {
2330 // Check if it's really possible to do a tail call.
2331 isTailCall =
2332 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2333
2334 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2335 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2336 isSibCall = true;
2337
2338 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2339 // detected sibcalls.
2340 if (isTailCall)
2341 ++NumTailCalls;
2342 }
2343
2344 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2345 report_fatal_error("failed to perform tail call elimination on a call "
2346 "site marked musttail");
2347
2348 // Get a count of how many bytes are to be pushed on the stack.
2349 unsigned NumBytes = CCInfo.getStackSize();
2350
2351 // SPDiff is the byte offset of the call's argument area from the callee's.
2352 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2353 // by this amount for a tail call. In a sibling call it must be 0 because the
2354 // caller will deallocate the entire stack and the callee still expects its
2355 // arguments to begin at SP+0. Completely unused for non-tail calls.
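// For example, if the caller received 16 bytes of stack arguments
// (NumReusableBytes == 16) but this tail call needs 24 bytes of argument
// space, SPDiff is -8 and FrameLowering must reserve 8 extra bytes; if the
// call needs only 8 bytes, SPDiff is +8 and the outgoing arguments are
// simply placed further into the caller's existing argument area.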
2356 int SPDiff = 0;
2357
2358 if (isTailCall && !isSibCall) {
2359 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2360 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2361
2362 // Since callee will pop argument stack as a tail call, we must keep the
2363 // popped size 16-byte aligned.
2364 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
2365 assert(StackAlign && "data layout string is missing stack alignment");
2366 NumBytes = alignTo(NumBytes, *StackAlign);
2367
2368 // SPDiff will be negative if this tail call requires more space than we
2369 // would automatically have in our incoming argument space. Positive if we
2370 // can actually shrink the stack.
2371 SPDiff = NumReusableBytes - NumBytes;
2372
2373 // If this call requires more stack than we have available from
2374 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2375 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2376 AFI->setArgRegsSaveSize(-SPDiff);
2377 }
2378
2379 if (isSibCall) {
2380 // For sibling tail calls, memory operands are available in our caller's stack.
2381 NumBytes = 0;
2382 } else {
2383 // Adjust the stack pointer for the new arguments...
2384 // These operations are automatically eliminated by the prolog/epilog pass
2385 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2386 }
2387
2388 SDValue StackPtr =
2389 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2390
2391 RegsToPassVector RegsToPass;
2392 SmallVector<SDValue, 8> MemOpChains;
2393
2394 // If we are doing a tail-call, any byval arguments will be written to stack
2395 // space which was used for incoming arguments. If any of the values being used
2396 // are incoming byval arguments to this function, then they might be
2397 // overwritten by the stores of the outgoing arguments. To avoid this, we
2398 // need to make a temporary copy of them in local stack space, then copy back
2399 // to the argument area.
2400 DenseMap<unsigned, SDValue> ByValTemporaries;
2401 SDValue ByValTempChain;
2402 if (isTailCall) {
2403 SmallVector<SDValue, 8> ByValCopyChains;
2404 for (const CCValAssign &VA : ArgLocs) {
2405 unsigned ArgIdx = VA.getValNo();
2406 SDValue Src = OutVals[ArgIdx];
2407 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2408
2409 if (!Flags.isByVal())
2410 continue;
2411
2412 SDValue Dst;
2413 MachinePointerInfo DstInfo;
2414 std::tie(Dst, DstInfo) =
2415 computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
2416 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2417
2418 if (Copy == NoCopy) {
2419 // If the argument is already at the correct offset on the stack
2420 // (because we are forwarding a byval argument from our caller), we
2421 // don't need any copying.
2422 continue;
2423 } else if (Copy == CopyOnce) {
2424 // If the argument is in our local stack frame, no other argument
2425 // preparation can clobber it, so we can copy it to the final location
2426 // later.
2427 ByValTemporaries[ArgIdx] = Src;
2428 } else {
2429 assert(Copy == CopyViaTemp && "unexpected enum value");
2430 // If we might be copying this argument from the outgoing argument
2431 // stack area, we need to copy via a temporary in the local stack
2432 // frame.
2433 int TempFrameIdx = MFI.CreateStackObject(
2434 Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2435 SDValue Temp =
2436 DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
2437
2438 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2439 SDValue AlignNode =
2440 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2441
2442 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2443 SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode};
2444 ByValCopyChains.push_back(
2445 DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops));
2446 ByValTemporaries[ArgIdx] = Temp;
2447 }
2448 }
2449 if (!ByValCopyChains.empty())
2450 ByValTempChain =
2451 DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
2452 }
2453
2454 // During a tail call, stores to the argument area must happen after all of
2455 // the function's incoming arguments have been loaded because they may alias.
2456 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2457 // there's no point in doing so repeatedly so this tracks whether that's
2458 // happened yet.
2459 bool AfterFormalArgLoads = false;
2460
2461 // Walk the register/memloc assignments, inserting copies/loads. In the case
2462 // of tail call optimization, arguments are handled later.
2463 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2464 i != e;
2465 ++i, ++realArgIdx) {
2466 CCValAssign &VA = ArgLocs[i];
2467 SDValue Arg = OutVals[realArgIdx];
2468 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2469 bool isByVal = Flags.isByVal();
2470
2471 // Promote the value if needed.
2472 switch (VA.getLocInfo()) {
2473 default: llvm_unreachable("Unknown loc info!");
2474 case CCValAssign::Full: break;
2475 case CCValAssign::SExt:
2476 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2477 break;
2478 case CCValAssign::ZExt:
2479 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2480 break;
2481 case CCValAssign::AExt:
2482 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2483 break;
2484 case CCValAssign::BCvt:
2485 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2486 break;
2487 }
2488
2489 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2490 Chain = DAG.getStackArgumentTokenFactor(Chain);
2491 if (ByValTempChain)
2492 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
2493 ByValTempChain);
2494 AfterFormalArgLoads = true;
2495 }
2496
2497 // f16 arguments have their size extended to 4 bytes and passed as if they
2498 // had been copied to the LSBs of a 32-bit register.
2499 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2500 if (VA.needsCustom() &&
2501 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2502 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2503 } else {
2504 // f16 arguments could have been extended prior to argument lowering.
2505 // Mask these arguments if this is a CMSE nonsecure call.
2506 auto ArgVT = Outs[realArgIdx].ArgVT;
2507 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2508 auto LocBits = VA.getLocVT().getSizeInBits();
2509 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2510 SDValue Mask =
2511 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2512 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2513 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2514 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2515 }
2516 }
2517
2518 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2519 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2520 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2521 DAG.getConstant(0, dl, MVT::i32));
2522 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2523 DAG.getConstant(1, dl, MVT::i32));
2524
2525 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2526 StackPtr, MemOpChains, isTailCall, SPDiff);
2527
2528 VA = ArgLocs[++i]; // skip ahead to next loc
2529 if (VA.isRegLoc()) {
2530 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2531 StackPtr, MemOpChains, isTailCall, SPDiff);
2532 } else {
2533 assert(VA.isMemLoc());
2534 SDValue DstAddr;
2535 MachinePointerInfo DstInfo;
2536 std::tie(DstAddr, DstInfo) =
2537 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2538 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2539 }
2540 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2541 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2542 StackPtr, MemOpChains, isTailCall, SPDiff);
2543 } else if (VA.isRegLoc()) {
2544 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2545 Outs[0].VT == MVT::i32) {
2546 assert(VA.getLocVT() == MVT::i32 &&
2547 "unexpected calling convention register assignment");
2548 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2549 "unexpected use of 'returned'");
2550 isThisReturn = true;
2551 }
2552 const TargetOptions &Options = DAG.getTarget().Options;
2553 if (Options.EmitCallSiteInfo)
2554 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2555 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2556 } else if (isByVal) {
2557 assert(VA.isMemLoc());
2558 unsigned offset = 0;
2559
2560 // True if this byval aggregate will be split between registers
2561 // and memory.
2562 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2563 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2564
2565 SDValue ByValSrc;
2566 bool NeedsStackCopy;
2567 if (auto It = ByValTemporaries.find(realArgIdx);
2568 It != ByValTemporaries.end()) {
2569 ByValSrc = It->second;
2570 NeedsStackCopy = true;
2571 } else {
2572 ByValSrc = Arg;
2573 NeedsStackCopy = !isTailCall;
2574 }
2575
2576 // If part of the argument is in registers, load them.
2577 if (CurByValIdx < ByValArgsCount) {
2578 unsigned RegBegin, RegEnd;
2579 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2580
2581 EVT PtrVT = getPointerTy(DAG.getDataLayout());
2582 unsigned int i, j;
2583 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2584 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2585 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const);
2586 SDValue Load =
2587 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2588 DAG.InferPtrAlign(AddArg));
2589 MemOpChains.push_back(Load.getValue(1));
2590 RegsToPass.push_back(std::make_pair(j, Load));
2591 }
2592
2593 // If the parameter size exceeds the register area, the "offset" value
2594 // helps us to calculate the stack slot for the remaining part properly.
2595 offset = RegEnd - RegBegin;
2596
2597 CCInfo.nextInRegsParam();
2598 }
2599
2600 // If the memory part of the argument isn't already in the correct place
2601 // (which can happen with tail calls), copy it into the argument area.
2602 if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) {
2603 auto PtrVT = getPointerTy(DAG.getDataLayout());
2604 SDValue Dst;
2605 MachinePointerInfo DstInfo;
2606 std::tie(Dst, DstInfo) =
2607 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2608 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2609 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset);
2610 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2611 MVT::i32);
2612 SDValue AlignNode =
2613 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2614
2615 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2616 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2617 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2618 Ops));
2619 }
2620 } else {
2621 assert(VA.isMemLoc());
2622 SDValue DstAddr;
2623 MachinePointerInfo DstInfo;
2624 std::tie(DstAddr, DstInfo) =
2625 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2626
2627 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2628 MemOpChains.push_back(Store);
2629 }
2630 }
2631
2632 if (!MemOpChains.empty())
2633 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2634
2635 // Build a sequence of copy-to-reg nodes chained together with token chain
2636 // and flag operands which copy the outgoing args into the appropriate regs.
2637 SDValue InGlue;
2638 for (const auto &[Reg, N] : RegsToPass) {
2639 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2640 InGlue = Chain.getValue(1);
2641 }
2642
2643 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2644 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2645 // node so that legalize doesn't hack it.
2646 bool isDirect = false;
2647
2648 const TargetMachine &TM = getTargetMachine();
2649 const GlobalValue *GVal = nullptr;
2650 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2651 GVal = G->getGlobal();
2652 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && Subtarget->isTargetMachO();
2653
2654 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2655 bool isLocalARMFunc = false;
2656 auto PtrVt = getPointerTy(DAG.getDataLayout());
2657
2658 if (Subtarget->genLongCalls()) {
2659 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2660 "long-calls codegen is not position independent!");
2661 // Handle a global address or an external symbol. If it's not one of
2662 // those, the target's already in a register, so we don't need to do
2663 // anything extra.
2664 if (isa<GlobalAddressSDNode>(Callee)) {
2665 if (Subtarget->genExecuteOnly()) {
2666 if (Subtarget->useMovt())
2667 ++NumMovwMovt;
2668 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2669 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2670 } else {
2671 // Create a constant pool entry for the callee address
2672 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2673 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2674 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2675
2676 // Get the address of the callee into a register
2677 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2678 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2679 Callee = DAG.getLoad(
2680 PtrVt, dl, DAG.getEntryNode(), Addr,
2681 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2682 }
2683 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2684 const char *Sym = S->getSymbol();
2685
2686 if (Subtarget->genExecuteOnly()) {
2687 if (Subtarget->useMovt())
2688 ++NumMovwMovt;
2689 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2690 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2691 } else {
2692 // Create a constant pool entry for the callee address
2693 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2694 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2695 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2696
2697 // Get the address of the callee into a register
2698 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2699 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2700 Callee = DAG.getLoad(
2701 PtrVt, dl, DAG.getEntryNode(), Addr,
2702 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2703 }
2704 }
2705 } else if (isa<GlobalAddressSDNode>(Callee)) {
2706 if (!PreferIndirect) {
2707 isDirect = true;
2708 bool isDef = GVal->isStrongDefinitionForLinker();
2709
2710 // ARM call to a local ARM function is predicable.
2711 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2712 // tBX takes a register source operand.
2713 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2714 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2715 Callee = DAG.getNode(
2716 ARMISD::WrapperPIC, dl, PtrVt,
2717 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2718 Callee = DAG.getLoad(
2719 PtrVt, dl, DAG.getEntryNode(), Callee,
2720 MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),
2721 MachineMemOperand::MODereferenceable |
2722 MachineMemOperand::MOInvariant);
2723 } else if (Subtarget->isTargetCOFF()) {
2724 assert(Subtarget->isTargetWindows() &&
2725 "Windows is the only supported COFF target");
2726 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2727 if (GVal->hasDLLImportStorageClass())
2728 TargetFlags = ARMII::MO_DLLIMPORT;
2729 else if (!TM.shouldAssumeDSOLocal(GVal))
2730 TargetFlags = ARMII::MO_COFFSTUB;
2731 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2732 TargetFlags);
2733 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2734 Callee =
2735 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2736 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2737 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2738 } else {
2739 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2740 }
2741 }
2742 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2743 isDirect = true;
2744 // tBX takes a register source operand.
2745 const char *Sym = S->getSymbol();
2746 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2747 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2748 ARMConstantPoolValue *CPV =
2749 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2750 ARMPCLabelIndex, 4);
2751 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2752 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2753 Callee = DAG.getLoad(
2754 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2755 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2756 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2757 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2758 } else {
2759 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2760 }
2761 }
2762
2763 if (isCmseNSCall) {
2764 assert(!isARMFunc && !isDirect &&
2765 "Cannot handle call to ARM function or direct call");
2766 if (NumBytes > 0) {
2767 DAG.getContext()->diagnose(
2768 DiagnosticInfoUnsupported(DAG.getMachineFunction().getFunction(),
2769 "call to non-secure function would require "
2770 "passing arguments on stack",
2771 dl.getDebugLoc()));
2772 }
2773 if (isStructRet) {
2774 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2775 DAG.getMachineFunction().getFunction(),
2776 "call to non-secure function would return value through pointer",
2777 dl.getDebugLoc()));
2778 }
2779 }
2780
2781 // FIXME: handle tail calls differently.
2782 unsigned CallOpc;
2783 if (Subtarget->isThumb()) {
2784 if (GuardWithBTI)
2785 CallOpc = ARMISD::t2CALL_BTI;
2786 else if (isCmseNSCall)
2787 CallOpc = ARMISD::tSECALL;
2788 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2789 CallOpc = ARMISD::CALL_NOLINK;
2790 else
2791 CallOpc = ARMISD::CALL;
2792 } else {
2793 if (!isDirect && !Subtarget->hasV5TOps())
2794 CallOpc = ARMISD::CALL_NOLINK;
2795 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2796 // Emit regular call when code size is the priority
2797 !Subtarget->hasMinSize())
2798 // "mov lr, pc; b _foo" to avoid confusing the RSP
2799 CallOpc = ARMISD::CALL_NOLINK;
2800 else
2801 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2802 }
2803
2804 // We don't usually want to end the call-sequence here because we would tidy
2805 // the frame up *after* the call, however in the ABI-changing tail-call case
2806 // we've carefully laid out the parameters so that when sp is reset they'll be
2807 // in the correct location.
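// That is, for a non-sibling (ABI-changing) tail call the CALLSEQ_END is
// emitted here, before the TC_RETURN node below, so the stack adjustment is
// undone before the branch rather than after a call that never returns here.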
2808 if (isTailCall && !isSibCall) {
2809 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2810 InGlue = Chain.getValue(1);
2811 }
2812
2813 std::vector<SDValue> Ops;
2814 Ops.push_back(Chain);
2815 Ops.push_back(Callee);
2816
2817 if (isTailCall) {
2818 Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32));
2819 }
2820
2821 // Add argument registers to the end of the list so that they are known live
2822 // into the call.
2823 for (const auto &[Reg, N] : RegsToPass)
2824 Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2825
2826 // Add a register mask operand representing the call-preserved registers.
2827 const uint32_t *Mask;
2828 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2829 if (isThisReturn) {
2830 // For 'this' returns, use the R0-preserving mask if applicable
2831 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2832 if (!Mask) {
2833 // Set isThisReturn to false if the calling convention is not one that
2834 // allows 'returned' to be modeled in this way, so LowerCallResult does
2835 // not try to pass 'this' straight through
2836 isThisReturn = false;
2837 Mask = ARI->getCallPreservedMask(MF, CallConv);
2838 }
2839 } else
2840 Mask = ARI->getCallPreservedMask(MF, CallConv);
2841
2842 assert(Mask && "Missing call preserved mask for calling convention");
2843 Ops.push_back(DAG.getRegisterMask(Mask));
2844
2845 if (InGlue.getNode())
2846 Ops.push_back(InGlue);
2847
2848 if (isTailCall) {
2849 MF.getFrameInfo().setHasTailCall();
2850 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops);
2851 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2852 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2853 return Ret;
2854 }
2855
2856 // Returns a chain and a flag for retval copy to use.
2857 Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops);
2858 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2859 InGlue = Chain.getValue(1);
2860 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2861
2862 // If we're guaranteeing tail-calls will be honoured, the callee must
2863 // pop its own argument stack on return. But this call is *not* a tail call so
2864 // we need to undo that after it returns to restore the status-quo.
2865 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2866 uint64_t CalleePopBytes =
2867 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1U;
2868
2869 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2870 if (!Ins.empty())
2871 InGlue = Chain.getValue(1);
2872
2873 // Handle result values, copying them out of physregs into vregs that we
2874 // return.
2875 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2876 InVals, isThisReturn,
2877 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2878}
2879
2880/// HandleByVal - Every parameter *after* a byval parameter is passed
2881/// on the stack. Remember the next parameter register to allocate,
2882 /// and then confiscate the rest of the parameter registers to ensure
2883/// this.
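/// Rough worked example: a 16-byte byval with 8-byte alignment whose first
/// free register is r1 wastes r1 for alignment, occupies r2-r3 (8 bytes of
/// "Excess"), and has Size reduced to the 8 bytes that still have to be
/// passed on the stack.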
2884void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2885 Align Alignment) const {
2886 // Byval (as with any stack) slots are always at least 4 byte aligned.
2887 Alignment = std::max(Alignment, Align(4));
2888
2889 MCRegister Reg = State->AllocateReg(GPRArgRegs);
2890 if (!Reg)
2891 return;
2892
2893 unsigned AlignInRegs = Alignment.value() / 4;
2894 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2895 for (unsigned i = 0; i < Waste; ++i)
2896 Reg = State->AllocateReg(GPRArgRegs);
2897
2898 if (!Reg)
2899 return;
2900
2901 unsigned Excess = 4 * (ARM::R4 - Reg);
2902
2903 // Special case: when NSAA != SP and the parameter size is greater than the
2904 // size of all remaining GPRs, we cannot split the parameter and must send
2905 // it entirely to the stack. We also set NCRN to R4 so that all remaining
2906 // registers are wasted.
2907 const unsigned NSAAOffset = State->getStackSize();
2908 if (NSAAOffset != 0 && Size > Excess) {
2909 while (State->AllocateReg(GPRArgRegs))
2910 ;
2911 return;
2912 }
2913
2914 // The first register for the byval parameter is the first register that
2915 // wasn't allocated before this method call, i.e. "reg".
2916 // If the parameter is small enough to be saved in the range [reg, r4), the
2917 // end (one past the last) register is reg + param-size-in-regs;
2918 // otherwise the parameter is split between registers and the stack, and
2919 // the end register is r4 in that case.
2920 unsigned ByValRegBegin = Reg;
2921 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2922 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2923 // Note: the first register was already allocated at the beginning of this
2924 // function, so allocate the remaining registers we need.
2925 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2926 State->AllocateReg(GPRArgRegs);
2927 // A byval parameter that is split between registers and memory needs its
2928 // size truncated here.
2929 // In the case where the entire structure fits in registers, we set the
2930 // size in memory to zero.
2931 Size = std::max<int>(Size - Excess, 0);
2932}
2933
2934/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2935/// for tail call optimization. Targets which want to do tail call
2936/// optimization should implement this function. Note that this function also
2937/// processes musttail calls, so when this function returns false on a valid
2938/// musttail call, a fatal backend error occurs.
2939bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2940 TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
2941 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
2942 CallingConv::ID CalleeCC = CLI.CallConv;
2943 SDValue Callee = CLI.Callee;
2944 bool isVarArg = CLI.IsVarArg;
2945 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2946 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2947 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2948 const SelectionDAG &DAG = CLI.DAG;
2949 MachineFunction &MF = DAG.getMachineFunction();
2950 const Function &CallerF = MF.getFunction();
2951 CallingConv::ID CallerCC = CallerF.getCallingConv();
2952
2953 assert(Subtarget->supportsTailCall());
2954
2955 // Indirect tail-calls require a register to hold the target address. That
2956 // register must be:
2957 // * Allocatable (i.e. r0-r7 if the target is Thumb1).
2958 // * Not callee-saved, so must be one of r0-r3 or r12.
2959 // * Not used to hold an argument to the tail-called function, which might be
2960 // in r0-r3.
2961 // * Not used to hold the return address authentication code, which is in r12
2962 // if enabled.
2963 // Sometimes, no register matches all of these conditions, so we can't do a
2964 // tail-call.
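// For example, a Thumb1 indirect tail call whose arguments occupy all of
// r0-r3 leaves no allocatable register for the target address (r12 is not
// usable there), so the tail call is rejected.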
2965 if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
2966 SmallSet<MCPhysReg, 5> AddressRegisters = {ARM::R0, ARM::R1, ARM::R2,
2967 ARM::R3};
2968 if (!(Subtarget->isThumb1Only() ||
2969 MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)))
2970 AddressRegisters.insert(ARM::R12);
2971 for (const CCValAssign &AL : ArgLocs)
2972 if (AL.isRegLoc())
2973 AddressRegisters.erase(AL.getLocReg());
2974 if (AddressRegisters.empty()) {
2975 LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n");
2976 return false;
2977 }
2978 }
2979
2980 // Look for obvious safe cases to perform tail call optimization that do not
2981 // require ABI changes. This is what gcc calls sibcall.
2982
2983 // Exception-handling functions need a special set of instructions to indicate
2984 // a return to the hardware. Tail-calling another function would probably
2985 // break this.
2986 if (CallerF.hasFnAttribute("interrupt")) {
2987 LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n");
2988 return false;
2989 }
2990
2991 if (canGuaranteeTCO(CalleeCC,
2992 getTargetMachine().Options.GuaranteedTailCallOpt)) {
2993 LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false")
2994 << " (guaranteed tail-call CC)\n");
2995 return CalleeCC == CallerCC;
2996 }
2997
2998 // Also avoid sibcall optimization if either caller or callee uses struct
2999 // return semantics.
3000 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
3001 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
3002 if (isCalleeStructRet != isCallerStructRet) {
3003 LLVM_DEBUG(dbgs() << "false (struct-ret)\n");
3004 return false;
3005 }
3006
3007 // Externally-defined functions with weak linkage should not be
3008 // tail-called on ARM when the OS does not support dynamic
3009 // pre-emption of symbols, as the AAELF spec requires normal calls
3010 // to undefined weak functions to be replaced with a NOP or jump to the
3011 // next instruction. The behaviour of branch instructions in this
3012 // situation (as used for tail calls) is implementation-defined, so we
3013 // cannot rely on the linker replacing the tail call with a return.
3014 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3015 const GlobalValue *GV = G->getGlobal();
3016 const Triple &TT = getTargetMachine().getTargetTriple();
3017 if (GV->hasExternalWeakLinkage() &&
3018 (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
3019 TT.isOSBinFormatMachO())) {
3020 LLVM_DEBUG(dbgs() << "false (external weak linkage)\n");
3021 return false;
3022 }
3023 }
3024
3025 // Check that the call results are passed in the same way.
3026 LLVMContext &C = *DAG.getContext();
3027 if (!CCState::resultsCompatible(
3028 getEffectiveCallingConv(CalleeCC, isVarArg),
3029 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
3030 CCAssignFnForReturn(CalleeCC, isVarArg),
3031 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) {
3032 LLVM_DEBUG(dbgs() << "false (incompatible results)\n");
3033 return false;
3034 }
3035 // The callee has to preserve all registers the caller needs to preserve.
3036 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3037 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3038 if (CalleeCC != CallerCC) {
3039 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3040 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) {
3041 LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n");
3042 return false;
3043 }
3044 }
3045
3046 // If Caller's vararg argument has been split between registers and stack, do
3047 // not perform tail call, since part of the argument is in caller's local
3048 // frame.
3049 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
3050 if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
3051 LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
3052 return false;
3053 }
3054
3055 // If the callee takes no arguments then go on to check the results of the
3056 // call.
3057 const MachineRegisterInfo &MRI = MF.getRegInfo();
3058 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
3059 LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
3060 return false;
3061 }
3062
3063 // If the stack arguments for this call do not fit into our own save area then
3064 // the call cannot be made tail.
3065 if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
3066 return false;
3067
3068 LLVM_DEBUG(dbgs() << "true\n");
3069 return true;
3070}
3071
3072bool
3073ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
3074 MachineFunction &MF, bool isVarArg,
3075 const SmallVectorImpl<ISD::OutputArg> &Outs,
3076 LLVMContext &Context, const Type *RetTy) const {
3077 SmallVector<CCValAssign, 16> RVLocs;
3078 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3079 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3080}
3081
3082 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
3083 const SDLoc &DL, SelectionDAG &DAG) {
3084 const MachineFunction &MF = DAG.getMachineFunction();
3085 const Function &F = MF.getFunction();
3086
3087 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
3088
3089 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
3090 // version of the "preferred return address". These offsets affect the return
3091 // instruction if this is a return from PL1 without hypervisor extensions.
3092 // IRQ/FIQ: +4 "subs pc, lr, #4"
3093 // SWI: 0 "subs pc, lr, #0"
3094 // ABORT: +4 "subs pc, lr, #4"
3095 // UNDEF: +4/+2 "subs pc, lr, #0"
3096 // UNDEF varies depending on where the exception came from ARM or Thumb
3097 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
3098
3099 int64_t LROffset;
3100 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
3101 IntKind == "ABORT")
3102 LROffset = 4;
3103 else if (IntKind == "SWI" || IntKind == "UNDEF")
3104 LROffset = 0;
3105 else
3106 report_fatal_error("Unsupported interrupt attribute. If present, value "
3107 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
3108
3109 RetOps.insert(RetOps.begin() + 1,
3110 DAG.getConstant(LROffset, DL, MVT::i32, false));
3111
3112 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
3113}
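// For instance, on a non-M-class core a function marked "interrupt"="IRQ"
// returns with "subs pc, lr, #4", while "interrupt"="SWI" returns with
// "subs pc, lr, #0".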
3114
3115SDValue
3116ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3117 bool isVarArg,
3118 const SmallVectorImpl<ISD::OutputArg> &Outs,
3119 const SmallVectorImpl<SDValue> &OutVals,
3120 const SDLoc &dl, SelectionDAG &DAG) const {
3121 // CCValAssign - represent the assignment of the return value to a location.
3122 SmallVector<CCValAssign, 16> RVLocs;
3123
3124 // CCState - Info about the registers and stack slots.
3125 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3126 *DAG.getContext());
3127
3128 // Analyze outgoing return values.
3129 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3130
3131 SDValue Glue;
3132 SmallVector<SDValue, 4> RetOps;
3133 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3134 bool isLittleEndian = Subtarget->isLittle();
3135
3136 MachineFunction &MF = DAG.getMachineFunction();
3137 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3138 AFI->setReturnRegsCount(RVLocs.size());
3139
3140 // Report error if cmse entry function returns structure through first ptr arg.
3141 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
3142 // Note: using an empty SDLoc(), as the first line of the function is a
3143 // better place to report than the last line.
3144 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3146 "secure entry function would return value through pointer",
3147 SDLoc().getDebugLoc()));
3148 }
3149
3150 // Copy the result values into the output registers.
3151 for (unsigned i = 0, realRVLocIdx = 0;
3152 i != RVLocs.size();
3153 ++i, ++realRVLocIdx) {
3154 CCValAssign &VA = RVLocs[i];
3155 assert(VA.isRegLoc() && "Can only return in registers!");
3156
3157 SDValue Arg = OutVals[realRVLocIdx];
3158 bool ReturnF16 = false;
3159
3160 if (Subtarget->hasFullFP16() && getTM().isTargetHardFloat()) {
3161 // Half-precision return values can be returned like this:
3162 //
3163 // t11 f16 = fadd ...
3164 // t12: i16 = bitcast t11
3165 // t13: i32 = zero_extend t12
3166 // t14: f32 = bitcast t13 <~~~~~~~ Arg
3167 //
3168 // to avoid code generation for bitcasts, we simply set Arg to the node
3169 // that produces the f16 value, t11 in this case.
3170 //
3171 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3172 SDValue ZE = Arg.getOperand(0);
3173 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3174 SDValue BC = ZE.getOperand(0);
3175 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3176 Arg = BC.getOperand(0);
3177 ReturnF16 = true;
3178 }
3179 }
3180 }
3181 }
3182
3183 switch (VA.getLocInfo()) {
3184 default: llvm_unreachable("Unknown loc info!");
3185 case CCValAssign::Full: break;
3186 case CCValAssign::BCvt:
3187 if (!ReturnF16)
3188 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3189 break;
3190 }
3191
3192 // Mask f16 arguments if this is a CMSE nonsecure entry.
3193 auto RetVT = Outs[realRVLocIdx].ArgVT;
3194 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3195 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3196 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
3197 } else {
3198 auto LocBits = VA.getLocVT().getSizeInBits();
3199 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
3200 SDValue Mask =
3201 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
3202 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
3203 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
3204 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3205 }
3206 }
3207
3208 if (VA.needsCustom() &&
3209 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3210 if (VA.getLocVT() == MVT::v2f64) {
3211 // Extract the first half and return it in two registers.
3212 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3213 DAG.getConstant(0, dl, MVT::i32));
3214 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3215 DAG.getVTList(MVT::i32, MVT::i32), Half);
3216
3217 Chain =
3218 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3219 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3220 Glue = Chain.getValue(1);
3221 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3222 VA = RVLocs[++i]; // skip ahead to next loc
3223 Chain =
3224 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3225 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3226 Glue = Chain.getValue(1);
3227 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3228 VA = RVLocs[++i]; // skip ahead to next loc
3229
3230 // Extract the 2nd half and fall through to handle it as an f64 value.
3231 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3232 DAG.getConstant(1, dl, MVT::i32));
3233 }
3234 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3235 // available.
3236 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3237 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3238 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3239 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3240 Glue = Chain.getValue(1);
3241 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3242 VA = RVLocs[++i]; // skip ahead to next loc
3243 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3244 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3245 } else
3246 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3247
3248 // Glue all the emitted copies together so that they cannot be
3249 // scheduled apart or interleaved with unrelated code.
3250 Glue = Chain.getValue(1);
3251 RetOps.push_back(DAG.getRegister(
3252 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3253 }
3254 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3255 const MCPhysReg *I =
3256 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3257 if (I) {
3258 for (; *I; ++I) {
3259 if (ARM::GPRRegClass.contains(*I))
3260 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3261 else if (ARM::DPRRegClass.contains(*I))
3262 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3263 else
3264 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3265 }
3266 }
3267
3268 // Update chain and glue.
3269 RetOps[0] = Chain;
3270 if (Glue.getNode())
3271 RetOps.push_back(Glue);
3272
3273 // CPUs which aren't M-class use a special sequence to return from
3274 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3275 // though we use "subs pc, lr, #N").
3276 //
3277 // M-class CPUs actually use a normal return sequence with a special
3278 // (hardware-provided) value in LR, so the normal code path works.
3279 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3280 !Subtarget->isMClass()) {
3281 if (Subtarget->isThumb1Only())
3282 report_fatal_error("interrupt attribute is not supported in Thumb1");
3283 return LowerInterruptReturn(RetOps, dl, DAG);
3284 }
3285
3286 ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_GLUE
3287 : ARMISD::RET_GLUE;
3288 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3289}
3290
3291bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3292 if (N->getNumValues() != 1)
3293 return false;
3294 if (!N->hasNUsesOfValue(1, 0))
3295 return false;
3296
3297 SDValue TCChain = Chain;
3298 SDNode *Copy = *N->user_begin();
3299 if (Copy->getOpcode() == ISD::CopyToReg) {
3300 // If the copy has a glue operand, we conservatively assume it isn't safe to
3301 // perform a tail call.
3302 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3303 return false;
3304 TCChain = Copy->getOperand(0);
3305 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3306 SDNode *VMov = Copy;
3307 // f64 returned in a pair of GPRs.
3308 SmallPtrSet<SDNode*, 2> Copies;
3309 for (SDNode *U : VMov->users()) {
3310 if (U->getOpcode() != ISD::CopyToReg)
3311 return false;
3312 Copies.insert(U);
3313 }
3314 if (Copies.size() > 2)
3315 return false;
3316
3317 for (SDNode *U : VMov->users()) {
3318 SDValue UseChain = U->getOperand(0);
3319 if (Copies.count(UseChain.getNode()))
3320 // Second CopyToReg
3321 Copy = U;
3322 else {
3323 // We are at the top of this chain.
3324 // If the copy has a glue operand, we conservatively assume it
3325 // isn't safe to perform a tail call.
3326 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3327 return false;
3328 // First CopyToReg
3329 TCChain = UseChain;
3330 }
3331 }
3332 } else if (Copy->getOpcode() == ISD::BITCAST) {
3333 // f32 returned in a single GPR.
3334 if (!Copy->hasOneUse())
3335 return false;
3336 Copy = *Copy->user_begin();
3337 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3338 return false;
3339 // If the copy has a glue operand, we conservatively assume it isn't safe to
3340 // perform a tail call.
3341 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3342 return false;
3343 TCChain = Copy->getOperand(0);
3344 } else {
3345 return false;
3346 }
3347
3348 bool HasRet = false;
3349 for (const SDNode *U : Copy->users()) {
3350 if (U->getOpcode() != ARMISD::RET_GLUE &&
3351 U->getOpcode() != ARMISD::INTRET_GLUE)
3352 return false;
3353 HasRet = true;
3354 }
3355
3356 if (!HasRet)
3357 return false;
3358
3359 Chain = TCChain;
3360 return true;
3361}
3362
3363bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3364 if (!Subtarget->supportsTailCall())
3365 return false;
3366
3367 if (!CI->isTailCall())
3368 return false;
3369
3370 return true;
3371}
3372
3373 // Trying to write a 64-bit value, so we need to split it into two 32-bit
3374 // values first, and pass the low and high parts through.
3375 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
3376 SDLoc DL(Op);
3377 SDValue WriteValue = Op->getOperand(2);
3378
3379 // This function is only supposed to be called for i64 type argument.
3380 assert(WriteValue.getValueType() == MVT::i64
3381 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3382
3383 SDValue Lo, Hi;
3384 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3385 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3386 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3387}
3388
3389// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3390// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3391// one of the above mentioned nodes. It has to be wrapped because otherwise
3392// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3393// be used to form addressing mode. These wrapped nodes will be selected
3394// into MOVi.
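// For example, a plain ConstantPool node for a float literal becomes
// ARMISD::Wrapper(TargetConstantPool), which instruction selection can then
// fold into a pc-relative constant-pool load or an immediate move.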
3395SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3396 SelectionDAG &DAG) const {
3397 EVT PtrVT = Op.getValueType();
3398 // FIXME there is no actual debug info here
3399 SDLoc dl(Op);
3400 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3401 SDValue Res;
3402
3403 // When generating execute-only code Constant Pools must be promoted to the
3404 // global data section. It's a bit ugly that we can't share them across basic
3405 // blocks, but this way we guarantee that execute-only behaves correct with
3406 // position-independent addressing modes.
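// For example, under execute-only a floating-point literal ends up as an
// internal constant global named <private-prefix>CP<function number>_<uid>
// in the data section, and its address is then materialised like any other
// global (e.g. with MOVW/MOVT) rather than via a literal pool load.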
3407 if (Subtarget->genExecuteOnly()) {
3408 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3409 auto *T = CP->getType();
3410 auto C = const_cast<Constant*>(CP->getConstVal());
3411 auto M = DAG.getMachineFunction().getFunction().getParent();
3412 auto GV = new GlobalVariable(
3413 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3414 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
3415 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3416 Twine(AFI->createPICLabelUId())
3417 );
3418 SDValue GA = DAG.getTargetGlobalAddress(GV,
3419 dl, PtrVT);
3420 return LowerGlobalAddress(GA, DAG);
3421 }
3422
3423 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3424 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3425 Align CPAlign = CP->getAlign();
3426 if (Subtarget->isThumb1Only())
3427 CPAlign = std::max(CPAlign, Align(4));
3428 if (CP->isMachineConstantPoolEntry())
3429 Res =
3430 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3431 else
3432 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3433 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3434}
3435
3436 unsigned ARMTargetLowering::getJumpTableEncoding() const {
3437 // If we don't have a 32-bit pc-relative branch instruction then the jump
3438 // table consists of block addresses. Usually this is inline, but for
3439 // execute-only it must be placed out-of-line.
3440 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3441 return MachineJumpTableInfo::EK_BlockAddress;
3442 return MachineJumpTableInfo::EK_Inline;
3443}
3444
3445SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3446 SelectionDAG &DAG) const {
3447 MachineFunction &MF = DAG.getMachineFunction();
3448 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3449 unsigned ARMPCLabelIndex = 0;
3450 SDLoc DL(Op);
3451 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3452 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3453 SDValue CPAddr;
3454 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3455 if (!IsPositionIndependent) {
3456 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3457 } else {
3458 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3459 ARMPCLabelIndex = AFI->createPICLabelUId();
3460 ARMConstantPoolValue *CPV =
3461 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3462 ARMCP::CPBlockAddress, PCAdj);
3463 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3464 }
3465 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3466 SDValue Result = DAG.getLoad(
3467 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3468 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3469 if (!IsPositionIndependent)
3470 return Result;
3471 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3472 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3473}
3474
3475/// Convert a TLS address reference into the correct sequence of loads
3476/// and calls to compute the variable's address for Darwin, and return an
3477/// SDValue containing the final node.
3478
3479/// Darwin only has one TLS scheme which must be capable of dealing with the
3480/// fully general situation, in the worst case. This means:
3481/// + "extern __thread" declaration.
3482/// + Defined in a possibly unknown dynamic library.
3483///
3484/// The general system is that each __thread variable has a [3 x i32] descriptor
3485/// which contains information used by the runtime to calculate the address. The
3486/// only part of this the compiler needs to know about is the first word, which
3487/// contains a function pointer that must be called with the address of the
3488/// entire descriptor in "r0".
3489///
3490/// Since this descriptor may be in a different unit, in general access must
3491/// proceed along the usual ARM rules. A common sequence to produce is:
3492///
3493/// movw rT1, :lower16:_var$non_lazy_ptr
3494/// movt rT1, :upper16:_var$non_lazy_ptr
3495/// ldr r0, [rT1]
3496/// ldr rT2, [r0]
3497/// blx rT2
3498/// [...address now in r0...]
3499SDValue
3500ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3501 SelectionDAG &DAG) const {
3502 assert(Subtarget->isTargetDarwin() &&
3503 "This function expects a Darwin target");
3504 SDLoc DL(Op);
3505
3506 // The first step is to get the address of the actual global symbol. This is
3507 // where the TLS descriptor lives.
3508 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3509
3510 // The first entry in the descriptor is a function pointer that we must call
3511 // to obtain the address of the variable.
3512 SDValue Chain = DAG.getEntryNode();
3513 SDValue FuncTLVGet = DAG.getLoad(
3514 MVT::i32, DL, Chain, DescAddr,
3518 Chain = FuncTLVGet.getValue(1);
3519
3520 MachineFunction &F = DAG.getMachineFunction();
3521 MachineFrameInfo &MFI = F.getFrameInfo();
3522 MFI.setAdjustsStack(true);
3523
3524 // TLS calls preserve all registers except those that absolutely must be
3525 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3526 // silly).
3527 auto TRI =
3529 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3530 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3531
3532 // Finally, we can make the call. This is just a degenerate version of a
3533 // normal ARM call node: r0 takes the address of the descriptor, and the
3534 // call returns the address of the variable in this thread.
3535 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3536 Chain =
3537 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3538 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3539 DAG.getRegisterMask(Mask), Chain.getValue(1));
3540 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3541}
3542
3543SDValue
3544ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3545 SelectionDAG &DAG) const {
3546 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3547
3548 SDValue Chain = DAG.getEntryNode();
3549 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3550 SDLoc DL(Op);
3551
3552 // Load the current TEB (thread environment block)
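// (The MRC below reads coprocessor 15, c13, c0, 2 -- the user read/write
// thread ID register TPIDRURW -- which Windows uses to hold the TEB pointer.)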
3553 SDValue Ops[] = {Chain,
3554 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3555 DAG.getTargetConstant(15, DL, MVT::i32),
3556 DAG.getTargetConstant(0, DL, MVT::i32),
3557 DAG.getTargetConstant(13, DL, MVT::i32),
3558 DAG.getTargetConstant(0, DL, MVT::i32),
3559 DAG.getTargetConstant(2, DL, MVT::i32)};
3560 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3561 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3562
3563 SDValue TEB = CurrentTEB.getValue(0);
3564 Chain = CurrentTEB.getValue(1);
3565
3566 // Load the ThreadLocalStoragePointer from the TEB
3567 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3568 SDValue TLSArray =
3569 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3570 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3571
3572 // The pointer to the thread's TLS data area is found at an offset of
3573 // (TLS index * 4) into the TLSArray.
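// In effect the sequence below computes
//   ThreadLocalBase = *(*(TEB + 0x2c) + _tls_index * 4)
// and the final address is ThreadLocalBase plus the variable's SECREL offset.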
3574
3575 // Load the TLS index from the C runtime
3576 SDValue TLSIndex =
3577 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3578 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3579 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3580
3581 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3582 DAG.getConstant(2, DL, MVT::i32));
3583 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3584 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3585 MachinePointerInfo());
3586
3587 // Get the offset of the start of the .tls section (section base)
3588 const auto *GA = cast<GlobalAddressSDNode>(Op);
3589 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3590 SDValue Offset = DAG.getLoad(
3591 PtrVT, DL, Chain,
3592 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3593 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3595
3596 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3597}
3598
3599// Lower ISD::GlobalTLSAddress using the "general dynamic" model
3600SDValue
3601ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3602 SelectionDAG &DAG) const {
3603 SDLoc dl(GA);
3604 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3605 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3606 MachineFunction &MF = DAG.getMachineFunction();
3607 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3608 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3609 ARMConstantPoolValue *CPV =
3610 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3611 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3612 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3613 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3614 Argument = DAG.getLoad(
3615 PtrVT, dl, DAG.getEntryNode(), Argument,
3617 SDValue Chain = Argument.getValue(1);
3618
3619 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3620 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3621
3622 // call __tls_get_addr.
3624 Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext()));
3625
3626 // FIXME: is there useful debug info available here?
3627 TargetLowering::CallLoweringInfo CLI(DAG);
3628 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3630 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3631
3632 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3633 return CallResult.first;
3634}
3635
3636// Lower ISD::GlobalTLSAddress using the "initial exec" or
3637// "local exec" model.
3638SDValue
3639ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3640 SelectionDAG &DAG,
3641 TLSModel::Model model) const {
3642 const GlobalValue *GV = GA->getGlobal();
3643 SDLoc dl(GA);
3645 SDValue Chain = DAG.getEntryNode();
3646 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3647 // Get the Thread Pointer
3649
3650 if (model == TLSModel::InitialExec) {
3651 MachineFunction &MF = DAG.getMachineFunction();
3652 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3653 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3654 // Initial exec model.
3655 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3656 ARMConstantPoolValue *CPV =
3657 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3659 true);
3660 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3661 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3662 Offset = DAG.getLoad(
3663 PtrVT, dl, Chain, Offset,
3665 Chain = Offset.getValue(1);
3666
3667 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3668 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3669
3670 Offset = DAG.getLoad(
3671 PtrVT, dl, Chain, Offset,
3673 } else {
3674 // local exec model
3675 assert(model == TLSModel::LocalExec);
3676 ARMConstantPoolValue *CPV =
3678 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3679 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3680 Offset = DAG.getLoad(
3681 PtrVT, dl, Chain, Offset,
3683 }
3684
3685 // The address of the thread-local variable is the sum of the thread
3686 // pointer and the variable's offset.
3687 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3688}
3689
3690SDValue
3691ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3692 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3693 if (DAG.getTarget().useEmulatedTLS())
3694 return LowerToTLSEmulatedModel(GA, DAG);
3695
3696 if (Subtarget->isTargetDarwin())
3697 return LowerGlobalTLSAddressDarwin(Op, DAG);
3698
3699 if (Subtarget->isTargetWindows())
3700 return LowerGlobalTLSAddressWindows(Op, DAG);
3701
3702 // TODO: implement the "local dynamic" model
3703 assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3705
3706 switch (model) {
3709 return LowerToTLSGeneralDynamicModel(GA, DAG);
3712 return LowerToTLSExecModels(GA, DAG, model);
3713 }
3714 llvm_unreachable("bogus TLS model");
3715}
3716
3717/// Return true if all users of V are within function F, looking through
3718/// ConstantExprs.
3719static bool allUsersAreInFunction(const Value *V, const Function *F) {
3720 SmallVector<const User*,4> Worklist(V->users());
3721 while (!Worklist.empty()) {
3722 auto *U = Worklist.pop_back_val();
3723 if (isa<ConstantExpr>(U)) {
3724 append_range(Worklist, U->users());
3725 continue;
3726 }
3727
3728 auto *I = dyn_cast<Instruction>(U);
3729 if (!I || I->getParent()->getParent() != F)
3730 return false;
3731 }
3732 return true;
3733}
3734
3736 const GlobalValue *GV, SelectionDAG &DAG,
3737 EVT PtrVT, const SDLoc &dl) {
3738 // If we're creating a pool entry for a constant global with unnamed address,
3739 // and the global is small enough, we can emit it inline into the constant pool
3740 // to save ourselves an indirection.
3741 //
3742 // This is a win if the constant is only used in one function (so it doesn't
3743 // need to be duplicated) or duplicating the constant wouldn't increase code
3744 // size (implying the constant is no larger than 4 bytes).
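// For example (purely illustrative), a small internal string such as
//   @.str = private unnamed_addr constant [4 x i8] c"abc\00", align 1
// that is only referenced from this function can be emitted directly into the
// function's constant pool instead of being materialized as a separate global.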
3745 const Function &F = DAG.getMachineFunction().getFunction();
3746
3747 // We rely on this decision to inline being idempotent and unrelated to the
3748 // use-site. We know that if we inline a variable at one use site, we'll
3749 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3750 // doesn't know about this optimization, so bail out if it's enabled;
3751 // otherwise we could decide to inline here (and thus never emit the GV)
3752 // while fast-isel-generated code still requires the GV.
3755 return SDValue();
3756
3757 auto *GVar = dyn_cast<GlobalVariable>(GV);
3758 if (!GVar || !GVar->hasInitializer() ||
3759 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3760 !GVar->hasLocalLinkage())
3761 return SDValue();
3762
3763 // If we inline a value that contains relocations, we move the relocations
3764 // from .data to .text. This is not allowed in position-independent code.
3765 auto *Init = GVar->getInitializer();
3766 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3767 Init->needsDynamicRelocation())
3768 return SDValue();
3769
3770 // The constant islands pass can only really deal with alignment requests
3771 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3772 // any type with an alignment requirement greater than 4 bytes. We also
3773 // can only promote constants that are multiples of 4 bytes in size or
3774 // are paddable to a multiple of 4. Currently we only try to pad constants
3775 // that are strings, for simplicity.
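// For example, a 5-byte string initializer gets 3 bytes of zero padding below,
// giving an 8-byte (multiple-of-4) constant pool entry.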
3776 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3777 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3778 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
3779 unsigned RequiredPadding = 4 - (Size % 4);
3780 bool PaddingPossible =
3781 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3782 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3783 Size == 0)
3784 return SDValue();
3785
3786 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3788 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3789
3790 // We can't bloat the constant pool too much, else the ConstantIslands pass
3791 // may fail to converge. If we haven't promoted this global yet (it may have
3792 // multiple uses), and promoting it would increase the constant pool size (Sz
3793 // > 4), ensure we have space to do so up to MaxTotal.
3794 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3795 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3797 return SDValue();
3798
3799 // This is only valid if all users are in a single function; we can't clone
3800 // the constant in general. The LLVM IR unnamed_addr allows merging
3801 // constants, but not cloning them.
3802 //
3803 // We could potentially allow cloning if we could prove all uses of the
3804 // constant in the current function don't care about the address, like
3805 // printf format strings. But that isn't implemented for now.
3806 if (!allUsersAreInFunction(GVar, &F))
3807 return SDValue();
3808
3809 // We're going to inline this global. Pad it out if needed.
3810 if (RequiredPadding != 4) {
3811 StringRef S = CDAInit->getAsString();
3812
3814 std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3815 while (RequiredPadding--)
3816 V.push_back(0);
3818 }
3819
3820 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3821 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
3822 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3825 PaddedSize - 4);
3826 }
3827 ++NumConstpoolPromoted;
3828 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3829}
3830
3832 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3833 if (!(GV = GA->getAliaseeObject()))
3834 return false;
3835 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3836 return V->isConstant();
3837 return isa<Function>(GV);
3838}
3839
3840SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3841 SelectionDAG &DAG) const {
3842 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3843 default: llvm_unreachable("unknown object format");
3844 case Triple::COFF:
3845 return LowerGlobalAddressWindows(Op, DAG);
3846 case Triple::ELF:
3847 return LowerGlobalAddressELF(Op, DAG);
3848 case Triple::MachO:
3849 return LowerGlobalAddressDarwin(Op, DAG);
3850 }
3851}
3852
3853SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3854 SelectionDAG &DAG) const {
3855 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3856 SDLoc dl(Op);
3857 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3858 bool IsRO = isReadOnly(GV);
3859
3860 // Call promoteToConstantPool only if we are not generating an execute-only (XO) text section.
3861 if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
3862 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3863 return V;
3864
3865 if (isPositionIndependent()) {
3867 GV, dl, PtrVT, 0, GV->isDSOLocal() ? 0 : ARMII::MO_GOT);
3868 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3869 if (!GV->isDSOLocal())
3870 Result =
3871 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3873 return Result;
3874 } else if (Subtarget->isROPI() && IsRO) {
3875 // PC-relative.
3876 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3877 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3878 return Result;
3879 } else if (Subtarget->isRWPI() && !IsRO) {
3880 // SB-relative.
3881 SDValue RelAddr;
3882 if (Subtarget->useMovt()) {
3883 ++NumMovwMovt;
3884 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3885 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3886 } else { // use literal pool for address constant
3887 ARMConstantPoolValue *CPV =
3889 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3890 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3891 RelAddr = DAG.getLoad(
3892 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3894 }
3895 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3896 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3897 return Result;
3898 }
3899
3900 // If we have T2 ops, we can materialize the address directly via movt/movw
3901 // pair. This is always cheaper. If we need to generate execute-only code, and we
3902 // only have Thumb1 available, we can't use a constant pool and are forced to
3903 // use immediate relocations.
3904 if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
3905 if (Subtarget->useMovt())
3906 ++NumMovwMovt;
3907 // FIXME: Once remat is capable of dealing with instructions with register
3908 // operands, expand this into two nodes.
3909 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3910 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3911 } else {
3912 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
3913 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3914 return DAG.getLoad(
3915 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3917 }
3918}
3919
3920SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3921 SelectionDAG &DAG) const {
3922 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3923 "ROPI/RWPI not currently supported for Darwin");
3924 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3925 SDLoc dl(Op);
3926 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3927
3928 if (Subtarget->useMovt())
3929 ++NumMovwMovt;
3930
3931 // FIXME: Once remat is capable of dealing with instructions with register
3932 // operands, expand this into multiple nodes
3933 unsigned Wrapper =
3935
3936 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3937 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3938
3939 if (Subtarget->isGVIndirectSymbol(GV))
3940 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3942 return Result;
3943}
3944
3945SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3946 SelectionDAG &DAG) const {
3947 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3948 assert(Subtarget->useMovt() &&
3949 "Windows on ARM expects to use movw/movt");
3950 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3951 "ROPI/RWPI not currently supported for Windows");
3952
3953 const TargetMachine &TM = getTargetMachine();
3954 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3955 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3956 if (GV->hasDLLImportStorageClass())
3957 TargetFlags = ARMII::MO_DLLIMPORT;
3958 else if (!TM.shouldAssumeDSOLocal(GV))
3959 TargetFlags = ARMII::MO_COFFSTUB;
3960 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3962 SDLoc DL(Op);
3963
3964 ++NumMovwMovt;
3965
3966 // FIXME: Once remat is capable of dealing with instructions with register
3967 // operands, expand this into two nodes.
3968 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3969 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3970 TargetFlags));
3971 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3972 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3974 return Result;
3975}
3976
3977SDValue
3978ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3979 SDLoc dl(Op);
3980 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3981 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3982 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3983 Op.getOperand(1), Val);
3984}
3985
3986SDValue
3987ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3988 SDLoc dl(Op);
3989 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3990 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3991}
3992
3993SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3994 SelectionDAG &DAG) const {
3995 SDLoc dl(Op);
3996 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3997 Op.getOperand(0));
3998}
3999
4000SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
4001 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
4002 unsigned IntNo =
4003 Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
4004 switch (IntNo) {
4005 default:
4006 return SDValue(); // Don't custom lower most intrinsics.
4007 case Intrinsic::arm_gnu_eabi_mcount: {
4008 MachineFunction &MF = DAG.getMachineFunction();
4009 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4010 SDLoc dl(Op);
4011 SDValue Chain = Op.getOperand(0);
4012 // call "\01__gnu_mcount_nc"
4013 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
4014 const uint32_t *Mask =
4016 assert(Mask && "Missing call preserved mask for calling convention");
4017 // Mark LR an implicit live-in.
4018 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
4019 SDValue ReturnAddress =
4020 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
4021 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
4022 SDValue Callee =
4023 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
4025 if (Subtarget->isThumb())
4026 return SDValue(
4027 DAG.getMachineNode(
4028 ARM::tBL_PUSHLR, dl, ResultTys,
4029 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
4030 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
4031 0);
4032 return SDValue(
4033 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
4034 {ReturnAddress, Callee, RegisterMask, Chain}),
4035 0);
4036 }
4037 }
4038}
4039
4040SDValue
4041ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
4042 const ARMSubtarget *Subtarget) const {
4043 unsigned IntNo = Op.getConstantOperandVal(0);
4044 SDLoc dl(Op);
4045 switch (IntNo) {
4046 default: return SDValue(); // Don't custom lower most intrinsics.
4047 case Intrinsic::thread_pointer: {
4048 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4049 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
4050 }
4051 case Intrinsic::arm_cls: {
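// cls(x) is computed as ctlz(((x ^ (x >> 31)) << 1) | 1): xoring with the
// sign mask turns leading sign bits into leading zeros, and the "<< 1 | 1"
// accounts for the sign bit itself while keeping the CTLZ input non-zero.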
4052 const SDValue &Operand = Op.getOperand(1);
4053 const EVT VTy = Op.getValueType();
4054 SDValue SRA =
4055 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
4056 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
4057 SDValue SHL =
4058 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
4059 SDValue OR =
4060 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
4061 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
4062 return Result;
4063 }
4064 case Intrinsic::arm_cls64: {
4065 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
4066 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
4067 const SDValue &Operand = Op.getOperand(1);
4068 const EVT VTy = Op.getValueType();
4069 SDValue Lo, Hi;
4070 std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VTy, VTy);
4071 SDValue Constant0 = DAG.getConstant(0, dl, VTy);
4072 SDValue Constant1 = DAG.getConstant(1, dl, VTy);
4073 SDValue Constant31 = DAG.getConstant(31, dl, VTy);
4074 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
4075 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
4076 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
4077 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
4078 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
4079 SDValue CheckLo =
4080 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
4081 SDValue HiIsZero =
4082 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
4083 SDValue AdjustedLo =
4084 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
4085 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
4086 SDValue Result =
4087 DAG.getSelect(dl, VTy, CheckLo,
4088 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
4089 return Result;
4090 }
4091 case Intrinsic::eh_sjlj_lsda: {
4092 MachineFunction &MF = DAG.getMachineFunction();
4093 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4094 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
4095 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4096 SDValue CPAddr;
4097 bool IsPositionIndependent = isPositionIndependent();
4098 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
4099 ARMConstantPoolValue *CPV =
4100 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
4101 ARMCP::CPLSDA, PCAdj);
4102 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
4103 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
4104 SDValue Result = DAG.getLoad(
4105 PtrVT, dl, DAG.getEntryNode(), CPAddr,
4107
4108 if (IsPositionIndependent) {
4109 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
4110 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
4111 }
4112 return Result;
4113 }
4114 case Intrinsic::arm_neon_vabs:
4115 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
4116 Op.getOperand(1));
4117 case Intrinsic::arm_neon_vabds:
4118 if (Op.getValueType().isInteger())
4119 return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
4120 Op.getOperand(1), Op.getOperand(2));
4121 return SDValue();
4122 case Intrinsic::arm_neon_vabdu:
4123 return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
4124 Op.getOperand(1), Op.getOperand(2));
4125 case Intrinsic::arm_neon_vmulls:
4126 case Intrinsic::arm_neon_vmullu: {
4127 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
4128 ? ARMISD::VMULLs : ARMISD::VMULLu;
4129 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4130 Op.getOperand(1), Op.getOperand(2));
4131 }
4132 case Intrinsic::arm_neon_vminnm:
4133 case Intrinsic::arm_neon_vmaxnm: {
4134 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
4135 ? ISD::FMINNUM : ISD::FMAXNUM;
4136 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4137 Op.getOperand(1), Op.getOperand(2));
4138 }
4139 case Intrinsic::arm_neon_vminu:
4140 case Intrinsic::arm_neon_vmaxu: {
4141 if (Op.getValueType().isFloatingPoint())
4142 return SDValue();
4143 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
4144 ? ISD::UMIN : ISD::UMAX;
4145 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4146 Op.getOperand(1), Op.getOperand(2));
4147 }
4148 case Intrinsic::arm_neon_vmins:
4149 case Intrinsic::arm_neon_vmaxs: {
4150 // v{min,max}s is overloaded between signed integers and floats.
4151 if (!Op.getValueType().isFloatingPoint()) {
4152 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4153 ? ISD::SMIN : ISD::SMAX;
4154 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4155 Op.getOperand(1), Op.getOperand(2));
4156 }
4157 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4158 ? ISD::FMINIMUM : ISD::FMAXIMUM;
4159 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4160 Op.getOperand(1), Op.getOperand(2));
4161 }
4162 case Intrinsic::arm_neon_vtbl1:
4163 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
4164 Op.getOperand(1), Op.getOperand(2));
4165 case Intrinsic::arm_neon_vtbl2:
4166 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
4167 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4168 case Intrinsic::arm_mve_pred_i2v:
4169 case Intrinsic::arm_mve_pred_v2i:
4170 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
4171 Op.getOperand(1));
4172 case Intrinsic::arm_mve_vreinterpretq:
4173 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
4174 Op.getOperand(1));
4175 case Intrinsic::arm_mve_lsll:
4176 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
4177 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4178 case Intrinsic::arm_mve_asrl:
4179 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
4180 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4181 }
4182}
4183
4185 const ARMSubtarget *Subtarget) {
4186 SDLoc dl(Op);
4187 auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
4188 if (SSID == SyncScope::SingleThread)
4189 return Op;
4190
4191 if (!Subtarget->hasDataBarrier()) {
4192 // Some ARMv6 CPUs can support data barriers with an mcr instruction.
4193 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
4194 // here.
4195 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
4196 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
4197 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
4198 DAG.getConstant(0, dl, MVT::i32));
4199 }
4200
4201 AtomicOrdering Ord =
4202 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
4204 if (Subtarget->isMClass()) {
4205 // Only a full system barrier exists in the M-class architectures.
4207 } else if (Subtarget->preferISHSTBarriers() &&
4208 Ord == AtomicOrdering::Release) {
4209 // Swift happens to implement ISHST barriers in a way that's compatible with
4210 // Release semantics but weaker than ISH so we'd be fools not to use
4211 // it. Beware: other processors probably don't!
4213 }
4214
4215 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4216 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4217 DAG.getConstant(Domain, dl, MVT::i32));
4218}
4219
4221 const ARMSubtarget *Subtarget) {
4222 // Pre-v5TE ARM and Thumb1 do not have preload instructions.
4223 if (!(Subtarget->isThumb2() ||
4224 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4225 // Just preserve the chain.
4226 return Op.getOperand(0);
4227
4228 SDLoc dl(Op);
4229 unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
4230 if (!isRead &&
4231 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4232 // ARMv7 with MP extension has PLDW.
4233 return Op.getOperand(0);
4234
4235 unsigned isData = Op.getConstantOperandVal(4);
4236 if (Subtarget->isThumb()) {
4237 // Invert the bits.
4238 isRead = ~isRead & 1;
4239 isData = ~isData & 1;
4240 }
4241
4242 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4243 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4244 DAG.getConstant(isData, dl, MVT::i32));
4245}
4246
4249 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4250
4251 // vastart just stores the address of the VarArgsFrameIndex slot into the
4252 // memory location argument.
4253 SDLoc dl(Op);
4255 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
4256 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4257 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4258 MachinePointerInfo(SV));
4259}
4260
4261SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4262 CCValAssign &NextVA,
4263 SDValue &Root,
4264 SelectionDAG &DAG,
4265 const SDLoc &dl) const {
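// An f64 formal argument arrives as two i32 halves, either in two GPRs or in
// a GPR plus a stack slot; reassemble it with VMOVDRR, swapping the halves on
// big-endian targets.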
4266 MachineFunction &MF = DAG.getMachineFunction();
4267 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4268
4269 const TargetRegisterClass *RC;
4270 if (AFI->isThumb1OnlyFunction())
4271 RC = &ARM::tGPRRegClass;
4272 else
4273 RC = &ARM::GPRRegClass;
4274
4275 // Transform the arguments stored in physical registers into virtual ones.
4276 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4277 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4278
4279 SDValue ArgValue2;
4280 if (NextVA.isMemLoc()) {
4281 MachineFrameInfo &MFI = MF.getFrameInfo();
4282 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4283
4284 // Create load node to retrieve arguments from the stack.
4285 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4286 ArgValue2 = DAG.getLoad(
4287 MVT::i32, dl, Root, FIN,
4289 } else {
4290 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4291 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4292 }
4293 if (!Subtarget->isLittle())
4294 std::swap (ArgValue, ArgValue2);
4295 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4296}
4297
4298// The remaining GPRs hold either the beginning of variable-argument
4299// data, or the beginning of an aggregate passed by value (usually
4300// byval). Either way, we allocate stack slots adjacent to the data
4301// provided by our caller, and store the unallocated registers there.
4302// If this is a variadic function, the va_list pointer will begin with
4303// these values; otherwise, this reassembles a (byval) structure that
4304// was split between registers and memory.
4305 // Return: The frame index the registers were stored into.
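// For example, a 12-byte byval argument split as r2+r3 plus 4 bytes of caller
// stack gets r2 and r3 stored at offsets -8 and -4 relative to the caller's
// stack data, reassembling one contiguous 12-byte object.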
4306int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4307 const SDLoc &dl, SDValue &Chain,
4308 const Value *OrigArg,
4309 unsigned InRegsParamRecordIdx,
4310 int ArgOffset, unsigned ArgSize) const {
4311 // Currently, two use cases are possible:
4312 // Case #1. Non-varargs function, and we meet the first byval parameter.
4313 // Set up the first unallocated register as the first byval register and
4314 // eat all remaining registers
4315 // (these two actions are performed by the HandleByVal method).
4316 // Then, here, we initialize the stack frame with
4317 // "store-reg" instructions.
4318 // Case #2. Varargs function that doesn't contain byval parameters.
4319 // The same: eat all remaining unallocated registers and
4320 // initialize the stack frame.
4321
4322 MachineFunction &MF = DAG.getMachineFunction();
4323 MachineFrameInfo &MFI = MF.getFrameInfo();
4324 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4325 unsigned RBegin, REnd;
4326 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4327 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4328 } else {
4329 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4330 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4331 REnd = ARM::R4;
4332 }
4333
4334 if (REnd != RBegin)
4335 ArgOffset = -4 * (ARM::R4 - RBegin);
4336
4337 auto PtrVT = getPointerTy(DAG.getDataLayout());
4338 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4339 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4340
4342 const TargetRegisterClass *RC =
4343 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4344
4345 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4346 Register VReg = MF.addLiveIn(Reg, RC);
4347 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4348 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4349 MachinePointerInfo(OrigArg, 4 * i));
4350 MemOps.push_back(Store);
4351 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4352 }
4353
4354 if (!MemOps.empty())
4355 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4356 return FrameIndex;
4357}
4358
4359 // Set up the stack frame that the va_list pointer will start from.
4360void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4361 const SDLoc &dl, SDValue &Chain,
4362 unsigned ArgOffset,
4363 unsigned TotalArgRegsSaveSize,
4364 bool ForceMutable) const {
4365 MachineFunction &MF = DAG.getMachineFunction();
4366 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4367
4368 // Try to store any remaining integer argument regs
4369 // to their spots on the stack so that they may be loaded by dereferencing
4370 // the result of va_next.
4371 // If there are no regs to be stored, just point the address past the last
4372 // argument passed via the stack.
4373 int FrameIndex = StoreByValRegs(
4374 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4375 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4376 AFI->setVarArgsFrameIndex(FrameIndex);
4377}
4378
4379bool ARMTargetLowering::splitValueIntoRegisterParts(
4380 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4381 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
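// An f16/bf16 value passed or returned in an f32 register lives in the low 16
// bits of that register: bitcast to i16, any-extend to i32, bitcast to f32.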
4382 EVT ValueVT = Val.getValueType();
4383 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4384 unsigned ValueBits = ValueVT.getSizeInBits();
4385 unsigned PartBits = PartVT.getSizeInBits();
4386 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4387 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4388 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4389 Parts[0] = Val;
4390 return true;
4391 }
4392 return false;
4393}
4394
4395SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4396 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4397 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4398 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4399 unsigned ValueBits = ValueVT.getSizeInBits();
4400 unsigned PartBits = PartVT.getSizeInBits();
4401 SDValue Val = Parts[0];
4402
4403 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4404 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4405 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4406 return Val;
4407 }
4408 return SDValue();
4409}
4410
4411SDValue ARMTargetLowering::LowerFormalArguments(
4412 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4413 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4414 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4415 MachineFunction &MF = DAG.getMachineFunction();
4416 MachineFrameInfo &MFI = MF.getFrameInfo();
4417
4418 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4419
4420 // Assign locations to all of the incoming arguments.
4422 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4423 *DAG.getContext());
4424 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4425
4427 unsigned CurArgIdx = 0;
4428
4429 // Initially ArgRegsSaveSize is zero.
4430 // Then we increase this value each time we meet byval parameter.
4431 // We also increase this value in case of varargs function.
4432 AFI->setArgRegsSaveSize(0);
4433
4434 // Calculate the amount of stack space that we need to allocate to store
4435 // byval and variadic arguments that are passed in registers.
4436 // We need to know this before we allocate the first byval or variadic
4437 // argument, as they will be allocated a stack slot below the CFA (Canonical
4438 // Frame Address, the stack pointer at entry to the function).
4439 unsigned ArgRegBegin = ARM::R4;
4440 for (const CCValAssign &VA : ArgLocs) {
4441 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4442 break;
4443
4444 unsigned Index = VA.getValNo();
4445 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4446 if (!Flags.isByVal())
4447 continue;
4448
4449 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4450 unsigned RBegin, REnd;
4451 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4452 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4453
4454 CCInfo.nextInRegsParam();
4455 }
4456 CCInfo.rewindByValRegsInfo();
4457
4458 int lastInsIndex = -1;
4459 if (isVarArg && MFI.hasVAStart()) {
4460 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4461 if (RegIdx != std::size(GPRArgRegs))
4462 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4463 }
4464
4465 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4466 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4467 auto PtrVT = getPointerTy(DAG.getDataLayout());
4468
4469 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4470 CCValAssign &VA = ArgLocs[i];
4471 if (Ins[VA.getValNo()].isOrigArg()) {
4472 std::advance(CurOrigArg,
4473 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4474 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4475 }
4476 // Arguments stored in registers.
4477 if (VA.isRegLoc()) {
4478 EVT RegVT = VA.getLocVT();
4479 SDValue ArgValue;
4480
4481 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4482 // f64 and vector types are split up into multiple registers or
4483 // combinations of registers and stack slots.
4484 SDValue ArgValue1 =
4485 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4486 VA = ArgLocs[++i]; // skip ahead to next loc
4487 SDValue ArgValue2;
4488 if (VA.isMemLoc()) {
4489 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4490 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4491 ArgValue2 = DAG.getLoad(
4492 MVT::f64, dl, Chain, FIN,
4494 } else {
4495 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4496 }
4497 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4498 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4499 ArgValue1, DAG.getIntPtrConstant(0, dl));
4500 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4501 ArgValue2, DAG.getIntPtrConstant(1, dl));
4502 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4503 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4504 } else {
4505 const TargetRegisterClass *RC;
4506
4507 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4508 RC = &ARM::HPRRegClass;
4509 else if (RegVT == MVT::f32)
4510 RC = &ARM::SPRRegClass;
4511 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4512 RegVT == MVT::v4bf16)
4513 RC = &ARM::DPRRegClass;
4514 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4515 RegVT == MVT::v8bf16)
4516 RC = &ARM::QPRRegClass;
4517 else if (RegVT == MVT::i32)
4518 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4519 : &ARM::GPRRegClass;
4520 else
4521 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4522
4523 // Transform the arguments in physical registers into virtual ones.
4524 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4525 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4526
4527 // If this value is passed in r0 and has the returned attribute (e.g.
4528 // C++ 'structors), record this fact for later use.
4529 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4530 AFI->setPreservesR0();
4531 }
4532 }
4533
4534 // If this is an 8 or 16-bit value, it is really passed promoted
4535 // to 32 bits. Insert an assert[sz]ext to capture this, then
4536 // truncate to the right size.
4537 switch (VA.getLocInfo()) {
4538 default: llvm_unreachable("Unknown loc info!");
4539 case CCValAssign::Full: break;
4540 case CCValAssign::BCvt:
4541 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4542 break;
4543 }
4544
4545 // f16 arguments have their size extended to 4 bytes and passed as if they
4546 // had been copied to the LSBs of a 32-bit register.
4547 // To achieve that, the value is passed extended to i32 (soft-float ABI) or to f32 (hard-float ABI).
4548 if (VA.needsCustom() &&
4549 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4550 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4551
4552 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4553 // less than 32 bits must be sign- or zero-extended in the callee for
4554 // security reasons. Although the ABI mandates an extension done by the
4555 // caller, the latter cannot be trusted to follow the rules of the ABI.
4556 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4557 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4558 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4559 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4560
4561 InVals.push_back(ArgValue);
4562 } else { // VA.isRegLoc()
4563 // Only arguments passed on the stack should make it here.
4564 assert(VA.isMemLoc());
4565 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4566
4567 int index = VA.getValNo();
4568
4569 // Some Ins[] entries become multiple ArgLoc[] entries.
4570 // Process them only once.
4571 if (index != lastInsIndex)
4572 {
4573 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4574 // FIXME: For now, all byval parameter objects are marked mutable.
4575 // This can be changed with more analysis.
4576 // In case of tail call optimization, mark all arguments mutable, since
4577 // they could be overwritten by the lowering of arguments in case of
4578 // a tail call.
4579 if (Flags.isByVal()) {
4580 assert(Ins[index].isOrigArg() &&
4581 "Byval arguments cannot be implicit");
4582 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4583
4584 int FrameIndex = StoreByValRegs(
4585 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4586 VA.getLocMemOffset(), Flags.getByValSize());
4587 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4588 CCInfo.nextInRegsParam();
4589 } else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 ||
4590 VA.getValVT() == MVT::bf16)) {
4591 // f16 and bf16 values are passed in the least-significant half of
4592 // a 4-byte stack slot. This is done as if the extension was done
4593 // in a 32-bit register, so the actual bytes used for the value
4594 // differ between little and big endian.
4595 assert(VA.getLocVT().getSizeInBits() == 32);
4596 unsigned FIOffset = VA.getLocMemOffset();
4597 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
4598 FIOffset, true);
4599
4600 SDValue Addr = DAG.getFrameIndex(FI, PtrVT);
4601 if (DAG.getDataLayout().isBigEndian())
4602 Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2));
4603
4604 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr,
4606 DAG.getMachineFunction(), FI)));
4607
4608 } else {
4609 unsigned FIOffset = VA.getLocMemOffset();
4610 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4611 FIOffset, true);
4612
4613 // Create load nodes to retrieve arguments from the stack.
4614 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4615 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4617 DAG.getMachineFunction(), FI)));
4618 }
4619 lastInsIndex = index;
4620 }
4621 }
4622 }
4623
4624 // varargs
4625 if (isVarArg && MFI.hasVAStart()) {
4626 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4627 TotalArgRegsSaveSize);
4628 if (AFI->isCmseNSEntryFunction()) {
4629 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4631 "secure entry function must not be variadic", dl.getDebugLoc()));
4632 }
4633 }
4634
4635 unsigned StackArgSize = CCInfo.getStackSize();
4636 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4637 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4638 // The only way to guarantee a tail call is if the callee restores its
4639 // argument area, but it must also keep the stack aligned when doing so.
4640 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
4641 assert(StackAlign && "data layout string is missing stack alignment");
4642 StackArgSize = alignTo(StackArgSize, *StackAlign);
4643
4644 AFI->setArgumentStackToRestore(StackArgSize);
4645 }
4646 AFI->setArgumentStackSize(StackArgSize);
4647
4648 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4649 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4651 "secure entry function requires arguments on stack", dl.getDebugLoc()));
4652 }
4653
4654 return Chain;
4655}
4656
4657/// isFloatingPointZero - Return true if this is +0.0.
4660 return CFP->getValueAPF().isPosZero();
4661 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4662 // Maybe this has already been legalized into the constant pool?
4663 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4664 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4666 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4667 return CFP->getValueAPF().isPosZero();
4668 }
4669 } else if (Op->getOpcode() == ISD::BITCAST &&
4670 Op->getValueType(0) == MVT::f64) {
4671 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4672 // created by LowerConstantFP().
4673 SDValue BitcastOp = Op->getOperand(0);
4674 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4675 isNullConstant(BitcastOp->getOperand(0)))
4676 return true;
4677 }
4678 return false;
4679}
4680
4681 /// Returns the appropriate ARM CMP (cmp) and corresponding condition code for
4682/// the given operands.
4683SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4684 SDValue &ARMcc, SelectionDAG &DAG,
4685 const SDLoc &dl) const {
4686 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4687 unsigned C = RHSC->getZExtValue();
4688 if (!isLegalICmpImmediate((int32_t)C)) {
4689 // Constant does not fit, try adjusting it by one.
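// For example, on Thumb1 (where cmp immediates are 0..255) "x < 256" cannot
// be encoded directly, but the equivalent "x <= 255" can.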
4690 switch (CC) {
4691 default: break;
4692 case ISD::SETLT:
4693 case ISD::SETGE:
4694 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4695 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4696 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4697 }
4698 break;
4699 case ISD::SETULT:
4700 case ISD::SETUGE:
4701 if (C != 0 && isLegalICmpImmediate(C-1)) {
4702 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4703 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4704 }
4705 break;
4706 case ISD::SETLE:
4707 case ISD::SETGT:
4708 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4709 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4710 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4711 }
4712 break;
4713 case ISD::SETULE:
4714 case ISD::SETUGT:
4715 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4716 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4717 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4718 }
4719 break;
4720 }
4721 }
4722 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4724 // In ARM and Thumb-2, the compare instructions can shift their second
4725 // operand.
4727 std::swap(LHS, RHS);
4728 }
4729
4730 // Thumb1 has very limited immediate modes, so turning an "and" into a
4731 // shift can save multiple instructions.
4732 //
4733 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4734 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4735 // own. If it's the operand to an unsigned comparison with an immediate,
4736 // we can eliminate one of the shifts: we transform
4737 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4738 //
4739 // We avoid transforming cases which aren't profitable due to encoding
4740 // details:
4741 //
4742 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4743 // would not; in that case, we're essentially trading one immediate load for
4744 // another.
4745 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4746 // 3. C2 is zero; we have other code for this special case.
4747 //
4748 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4749 // instruction, since the AND is always one instruction anyway, but we could
4750 // use narrow instructions in some cases.
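// For example, "(x & 0x1ff) == 0x100" (0x100 does not fit in a Thumb1 8-bit
// cmp immediate) becomes "(x << 23) == (0x100 << 23)".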
4751 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4752 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4753 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4754 !isSignedIntSetCC(CC)) {
4755 unsigned Mask = LHS.getConstantOperandVal(1);
4756 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4757 uint64_t RHSV = RHSC->getZExtValue();
4758 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4759 unsigned ShiftBits = llvm::countl_zero(Mask);
4760 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4761 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4762 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4763 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4764 }
4765 }
4766 }
4767
4768 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4769 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4770 // way a cmp would.
4771 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4772 // some tweaks to the heuristics for the previous and->shift transform.
4773 // FIXME: Optimize cases where the LHS isn't a shift.
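// For example, "(x << 2) > 0x80000000U" becomes "lsls rT, x, #3", with the
// result of the comparison taken from the HI condition (C set and Z clear).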
4774 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4775 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4776 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4777 LHS.getConstantOperandVal(1) < 31) {
4778 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4779 SDValue Shift =
4780 DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, FlagsVT),
4781 LHS.getOperand(0), DAG.getConstant(ShiftAmt, dl, MVT::i32));
4782 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4783 return Shift.getValue(1);
4784 }
4785
4787
4788 // If the RHS is a constant zero then the V (overflow) flag will never be
4789 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4790 // simpler for other passes (like the peephole optimiser) to deal with.
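// (A compare against 0 can never overflow, so V is always clear; GE, i.e.
// N == V, therefore collapses to PL (N clear) and LT to MI (N set).)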
4791 if (isNullConstant(RHS)) {
4792 switch (CondCode) {
4793 default: break;
4794 case ARMCC::GE:
4796 break;
4797 case ARMCC::LT:
4799 break;
4800 }
4801 }
4802
4803 ARMISD::NodeType CompareType;
4804 switch (CondCode) {
4805 default:
4806 CompareType = ARMISD::CMP;
4807 break;
4808 case ARMCC::EQ:
4809 case ARMCC::NE:
4810 // Uses only Z Flag
4811 CompareType = ARMISD::CMPZ;
4812 break;
4813 }
4814 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4815 return DAG.getNode(CompareType, dl, FlagsVT, LHS, RHS);
4816}
4817
4818 /// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4819SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4820 SelectionDAG &DAG, const SDLoc &dl,
4821 bool Signaling) const {
4822 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4823 SDValue Flags;
4825 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, dl, FlagsVT,
4826 LHS, RHS);
4827 else
4828 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, dl,
4829 FlagsVT, LHS);
4830 return DAG.getNode(ARMISD::FMSTAT, dl, FlagsVT, Flags);
4831}
4832
4833// This function returns three things: the arithmetic computation itself
4834// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4835// comparison and the condition code define the case in which the arithmetic
4836// computation *does not* overflow.
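// For example, for ISD::SADDO we return Value = ADD(LHS, RHS), OverflowCmp =
// CMP(Value, LHS) and ARMcc = VC: the subtraction Value - LHS overflows
// exactly when the original addition did, so "V clear" means no overflow.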
4837std::pair<SDValue, SDValue>
4838ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4839 SDValue &ARMcc) const {
4840 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4841
4842 SDValue Value, OverflowCmp;
4843 SDValue LHS = Op.getOperand(0);
4844 SDValue RHS = Op.getOperand(1);
4845 SDLoc dl(Op);
4846
4847 // FIXME: We are currently always generating CMPs because we don't support
4848 // generating CMN through the backend. This is not as good as the natural
4849 // CMP case because it causes a register dependency and cannot be folded
4850 // later.
4851
4852 switch (Op.getOpcode()) {
4853 default:
4854 llvm_unreachable("Unknown overflow instruction!");
4855 case ISD::SADDO:
4856 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4857 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4858 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4859 break;
4860 case ISD::UADDO:
4861 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4862 // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4863 // We do not use it in the USUBO case as Value may not be used.
4864 Value = DAG.getNode(ARMISD::ADDC, dl,
4865 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4866 .getValue(0);
4867 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4868 break;
4869 case ISD::SSUBO:
4870 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4871 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4872 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4873 break;
4874 case ISD::USUBO:
4875 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4876 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4877 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4878 break;
4879 case ISD::UMULO:
4880 // We generate a UMUL_LOHI and then check if the high word is 0.
4881 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4882 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4883 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4884 LHS, RHS);
4885 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4886 DAG.getConstant(0, dl, MVT::i32));
4887 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4888 break;
4889 case ISD::SMULO:
4890 // We generate a SMUL_LOHI and then check if all the bits of the high word
4891 // are the same as the sign bit of the low word.
4892 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4893 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4894 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4895 LHS, RHS);
4896 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4897 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4898 Value.getValue(0),
4899 DAG.getConstant(31, dl, MVT::i32)));
4900 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4901 break;
4902 } // switch (...)
4903
4904 return std::make_pair(Value, OverflowCmp);
4905}
4906
4907SDValue
4908ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
4909 // Let legalize expand this if it isn't a legal type yet.
4910 if (!isTypeLegal(Op.getValueType()))
4911 return SDValue();
4912
4913 SDValue Value, OverflowCmp;
4914 SDValue ARMcc;
4915 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4916 SDLoc dl(Op);
4917 // We use 0 and 1 as false and true values.
4918 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4919 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4920 EVT VT = Op.getValueType();
4921
4922 SDValue Overflow =
4923 DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, ARMcc, OverflowCmp);
4924
4925 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4926 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4927}
4928
4930 SelectionDAG &DAG) {
4931 SDLoc DL(BoolCarry);
4932 EVT CarryVT = BoolCarry.getValueType();
4933
4934 // This converts the boolean value carry into the carry flag by doing
4935 // ARMISD::SUBC Carry, 1
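// (If the boolean is 1, 1 - 1 does not borrow and the ARM C flag is set; if
// it is 0, 0 - 1 borrows and the C flag is clear.)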
4936 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4937 DAG.getVTList(CarryVT, MVT::i32),
4938 BoolCarry, DAG.getConstant(1, DL, CarryVT));
4939 return Carry.getValue(1);
4940}
4941
4942static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
4943 SelectionDAG &DAG) {
4944 SDLoc DL(Flags);
4945
4946 // Now convert the carry flag into a boolean carry. We do this
4947 // using ARMISD::ADDE 0, 0, Carry
4948 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4949 DAG.getConstant(0, DL, MVT::i32),
4950 DAG.getConstant(0, DL, MVT::i32), Flags);
4951}
4952
4953SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
4954 SelectionDAG &DAG) const {
4955 // Let legalize expand this if it isn't a legal type yet.
4956 if (!isTypeLegal(Op.getValueType()))
4957 return SDValue();
4958
4959 SDValue LHS = Op.getOperand(0);
4960 SDValue RHS = Op.getOperand(1);
4961 SDLoc dl(Op);
4962
4963 EVT VT = Op.getValueType();
4964 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4965 SDValue Value;
4966 SDValue Overflow;
4967 switch (Op.getOpcode()) {
4968 default:
4969 llvm_unreachable("Unknown overflow instruction!");
4970 case ISD::UADDO:
4971 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4972 // Convert the carry flag into a boolean value.
4973 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4974 break;
4975 case ISD::USUBO: {
4976 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4977 // Convert the carry flag into a boolean value.
4978 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4979 // ARMISD::SUBC produces a carry of 0 when a borrow occurs, but USUBO's
4980 // overflow bit must be 1 in that case, so compute 1 - C.
4981 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4982 DAG.getConstant(1, dl, MVT::i32), Overflow);
4983 break;
4984 }
4985 }
4986
4987 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4988}
4989
4990static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG,
4991 const ARMSubtarget *Subtarget) {
4992 EVT VT = Op.getValueType();
4993 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
4994 return SDValue();
4995 if (!VT.isSimple())
4996 return SDValue();
4997
4998 unsigned NewOpcode;
4999 switch (VT.getSimpleVT().SimpleTy) {
5000 default:
5001 return SDValue();
5002 case MVT::i8:
5003 switch (Op->getOpcode()) {
5004 case ISD::UADDSAT:
5005 NewOpcode = ARMISD::UQADD8b;
5006 break;
5007 case ISD::SADDSAT:
5008 NewOpcode = ARMISD::QADD8b;
5009 break;
5010 case ISD::USUBSAT:
5011 NewOpcode = ARMISD::UQSUB8b;
5012 break;
5013 case ISD::SSUBSAT:
5014 NewOpcode = ARMISD::QSUB8b;
5015 break;
5016 }
5017 break;
5018 case MVT::i16:
5019 switch (Op->getOpcode()) {
5020 case ISD::UADDSAT:
5021 NewOpcode = ARMISD::UQADD16b;
5022 break;
5023 case ISD::SADDSAT:
5024 NewOpcode = ARMISD::QADD16b;
5025 break;
5026 case ISD::USUBSAT:
5027 NewOpcode = ARMISD::UQSUB16b;
5028 break;
5029 case ISD::SSUBSAT:
5030 NewOpcode = ARMISD::QSUB16b;
5031 break;
5032 }
5033 break;
5034 }
5035
5036 SDLoc dl(Op);
5037 SDValue Add =
5038 DAG.getNode(NewOpcode, dl, MVT::i32,
5039 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
5040 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
5041 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
5042}
5043
5044SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
5045 SDValue Cond = Op.getOperand(0);
5046 SDValue SelectTrue = Op.getOperand(1);
5047 SDValue SelectFalse = Op.getOperand(2);
5048 SDLoc dl(Op);
5049 unsigned Opc = Cond.getOpcode();
5050
5051 if (Cond.getResNo() == 1 &&
5052 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5053 Opc == ISD::USUBO)) {
5054 if (!isTypeLegal(Cond->getValueType(0)))
5055 return SDValue();
5056
5057 SDValue Value, OverflowCmp;
5058 SDValue ARMcc;
5059 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5060 EVT VT = Op.getValueType();
5061
5062 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, OverflowCmp, DAG);
5063 }
5064
5065 // Convert:
5066 //
5067 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
5068 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
5069 //
5070 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
5071 const ConstantSDNode *CMOVTrue =
5072 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
5073 const ConstantSDNode *CMOVFalse =
5074 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
5075
5076 if (CMOVTrue && CMOVFalse) {
5077 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
5078 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
5079
5080 SDValue True;
5081 SDValue False;
5082 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
5083 True = SelectTrue;
5084 False = SelectFalse;
5085 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
5086 True = SelectFalse;
5087 False = SelectTrue;
5088 }
5089
5090 if (True.getNode() && False.getNode())
5091 return getCMOV(dl, Op.getValueType(), True, False, Cond.getOperand(2),
5092 Cond.getOperand(3), DAG);
5093 }
5094 }
5095
5096 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
5097 // undefined bits before doing a full-word comparison with zero.
5098 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
5099 DAG.getConstant(1, dl, Cond.getValueType()));
5100
5101 return DAG.getSelectCC(dl, Cond,
5102 DAG.getConstant(0, dl, Cond.getValueType()),
5103 SelectTrue, SelectFalse, ISD::SETNE);
5104}
5105
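// Worked example (editorial addition): for ISD::SETULT the logic below first
// picks GT and a compare-operand swap (the opcode contains 'less'); because
// ULT is also unordered, it then undoes the compare swap, swaps the VSEL
// operands instead, and relaxes GT to GE. With the VSEL operands swapped, the
// original true value is chosen exactly when the GE test fails, i.e. when
// a < b or the operands are unordered, which is SETULT.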
5106static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
5107 bool &swpCmpOps, bool &swpVselOps) {
5108 // Start by selecting the GE condition code for opcodes that return true for
5109 // 'equality'
5110 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
5111 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
5112 CondCode = ARMCC::GE;
5113
5114 // and GT for opcodes that return false for 'equality'.
5115 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
5116 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
5117 CondCode = ARMCC::GT;
5118
5119 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
5120 // to swap the compare operands.
5121 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
5122 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
5123 swpCmpOps = true;
5124
5125 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
5126 // If we have an unordered opcode, we need to swap the operands to the VSEL
5127 // instruction (effectively negating the condition).
5128 //
5129 // This also has the effect of swapping which one of 'less' or 'greater'
5130 // returns true, so we also swap the compare operands. It also switches
5131 // whether we return true for 'equality', so we compensate by picking the
5132 // opposite condition code to our original choice.
5133 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
5134 CC == ISD::SETUGT) {
5135 swpCmpOps = !swpCmpOps;
5136 swpVselOps = !swpVselOps;
5137 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
5138 }
5139
5140 // 'ordered' is 'anything but unordered', so use the VS condition code and
5141 // swap the VSEL operands.
5142 if (CC == ISD::SETO) {
5143 CondCode = ARMCC::VS;
5144 swpVselOps = true;
5145 }
5146
5147 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
5148 // code and swap the VSEL operands. Also do this if we don't care about the
5149 // unordered case.
5150 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
5151 CondCode = ARMCC::EQ;
5152 swpVselOps = true;
5153 }
5154}
5155
5156SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
5157 SDValue TrueVal, SDValue ARMcc,
5158 SDValue Flags, SelectionDAG &DAG) const {
5159 if (!Subtarget->hasFP64() && VT == MVT::f64) {
5160 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5161 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
5162 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5163 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
5164
5165 SDValue TrueLow = TrueVal.getValue(0);
5166 SDValue TrueHigh = TrueVal.getValue(1);
5167 SDValue FalseLow = FalseVal.getValue(0);
5168 SDValue FalseHigh = FalseVal.getValue(1);
5169
5170 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
5171 ARMcc, Flags);
5172 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
5173 ARMcc, Flags);
5174
5175 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
5176 }
5177 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, Flags);
5178}
5179
5180static bool isGTorGE(ISD::CondCode CC) {
5181 return CC == ISD::SETGT || CC == ISD::SETGE;
5182}
5183
5184static bool isLTorLE(ISD::CondCode CC) {
5185 return CC == ISD::SETLT || CC == ISD::SETLE;
5186}
5187
5188// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5189// All of these conditions (and their <= and >= counterparts) will do:
5190// x < k ? k : x
5191// x > k ? x : k
5192// k < x ? x : k
5193// k > x ? k : x
5194static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5195 const SDValue TrueVal, const SDValue FalseVal,
5196 const ISD::CondCode CC, const SDValue K) {
5197 return (isGTorGE(CC) &&
5198 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5199 (isLTorLE(CC) &&
5200 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5201}
5202
5203// Check if two chained conditionals could be converted into SSAT or USAT.
5204//
5205// SSAT can replace a set of two conditional selectors that bound a number to an
5206// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
5207//
5208// x < -k ? -k : (x > k ? k : x)
5209// x < -k ? -k : (x < k ? x : k)
5210// x > -k ? (x > k ? k : x) : -k
5211// x < k ? (x < -k ? -k : x) : k
5212// etc.
5213//
5214// LLVM canonicalizes these to either a min(max()) or a max(min())
5215// pattern. This function tries to match one of these and will return a SSAT
5216// node if successful.
5217//
5218// USAT works similarly to SSAT, but bounds the value to the interval [0, k] where k + 1
5219// is a power of 2.
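// For instance (illustrative): clamping x to [-128, 127] matches the SSAT
// form because ~127 == -128 and 127 + 1 is a power of 2, while clamping x to
// [0, 255] matches the USAT form.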
5220static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
5221 EVT VT = Op.getValueType();
5222 SDValue V1 = Op.getOperand(0);
5223 SDValue K1 = Op.getOperand(1);
5224 SDValue TrueVal1 = Op.getOperand(2);
5225 SDValue FalseVal1 = Op.getOperand(3);
5226 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5227
5228 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
5229 if (Op2.getOpcode() != ISD::SELECT_CC)
5230 return SDValue();
5231
5232 SDValue V2 = Op2.getOperand(0);
5233 SDValue K2 = Op2.getOperand(1);
5234 SDValue TrueVal2 = Op2.getOperand(2);
5235 SDValue FalseVal2 = Op2.getOperand(3);
5236 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
5237
5238 SDValue V1Tmp = V1;
5239 SDValue V2Tmp = V2;
5240
5241 // Check that the registers and the constants match a max(min()) or min(max())
5242 // pattern
5243 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5244 K2 != FalseVal2 ||
5245 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5246 return SDValue();
5247
5248 // Check that the constant in the lower-bound check is
5249 // the opposite of the constant in the upper-bound check
5250 // in 1's complement.
5251 if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
5252 return SDValue();
5253
5254 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5255 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5256 int64_t PosVal = std::max(Val1, Val2);
5257 int64_t NegVal = std::min(Val1, Val2);
5258
5259 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5260 !isPowerOf2_64(PosVal + 1))
5261 return SDValue();
5262
5263 // Handle the difference between USAT (unsigned) and SSAT (signed)
5264 // saturation
5265 // At this point, PosVal is guaranteed to be positive
5266 uint64_t K = PosVal;
5267 SDLoc dl(Op);
5268 if (Val1 == ~Val2)
5269 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5270 DAG.getConstant(llvm::countr_one(K), dl, VT));
5271 if (NegVal == 0)
5272 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5273 DAG.getConstant(llvm::countr_one(K), dl, VT));
5274
5275 return SDValue();
5276}
5277
5278// Check if a condition of the type x < k ? k : x can be converted into a
5279// bit operation instead of conditional moves.
5280// Currently this is allowed given:
5281// - The conditions and values match up
5282// - k is 0 or -1 (all ones)
5283// This function will not check the last condition; that's up to the caller.
5284// It returns true if the transformation can be made, and in such case
5285// returns x in V, and k in SatK.
5286static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
5287 SDValue &SatK)
5288{
5289 SDValue LHS = Op.getOperand(0);
5290 SDValue RHS = Op.getOperand(1);
5291 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5292 SDValue TrueVal = Op.getOperand(2);
5293 SDValue FalseVal = Op.getOperand(3);
5294
5295 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
5296 ? &RHS
5297 : nullptr;
5298
5299 // No constant operation in comparison, early out
5300 if (!K)
5301 return false;
5302
5303 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5304 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5305 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5306
5307 // If the constant on left and right side, or variable on left and right,
5308 // does not match, early out
5309 if (*K != KTmp || V != VTmp)
5310 return false;
5311
5312 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5313 SatK = *K;
5314 return true;
5315 }
5316
5317 return false;
5318}
5319
5320bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5321 if (VT == MVT::f32)
5322 return !Subtarget->hasVFP2Base();
5323 if (VT == MVT::f64)
5324 return !Subtarget->hasFP64();
5325 if (VT == MVT::f16)
5326 return !Subtarget->hasFullFP16();
5327 return false;
5328}
5329
5330SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5331 EVT VT = Op.getValueType();
5332 SDLoc dl(Op);
5333
5334 // Try to convert two saturating conditional selects into a single SSAT
5335 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5336 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5337 return SatValue;
5338
5339 // Try to convert expressions of the form x < k ? k : x (and similar forms)
5340 // into more efficient bit operations, which is possible when k is 0 or -1
5341 // On ARM and Thumb-2 which have flexible operand 2 this will result in
5342 // single instructions. On Thumb the shift and the bit operation will be two
5343 // instructions.
5344 // Only allow this transformation on full-width (32-bit) operations
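// For example (illustrative): "x < 0 ? 0 : x" becomes x & ~(x >> 31) and
// "x < -1 ? -1 : x" becomes x | (x >> 31), where ">>" is an arithmetic shift.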
5345 SDValue LowerSatConstant;
5346 SDValue SatValue;
5347 if (VT == MVT::i32 &&
5348 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
5349 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5350 DAG.getConstant(31, dl, VT));
5351 if (isNullConstant(LowerSatConstant)) {
5352 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5353 DAG.getAllOnesConstant(dl, VT));
5354 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5355 } else if (isAllOnesConstant(LowerSatConstant))
5356 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5357 }
5358
5359 SDValue LHS = Op.getOperand(0);
5360 SDValue RHS = Op.getOperand(1);
5361 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5362 SDValue TrueVal = Op.getOperand(2);
5363 SDValue FalseVal = Op.getOperand(3);
5364 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5365 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5366 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
5367 if (Op.getValueType().isInteger()) {
5368
5369 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
5370 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
5371 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
5372 // Both require fewer instructions than compare and conditional select.
5373 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TrueVal && RHSC &&
5374 RHSC->isZero() && CFVal && CFVal->isZero() &&
5375 LHS.getValueType() == RHS.getValueType()) {
5376 EVT VT = LHS.getValueType();
5377 SDValue Shift =
5378 DAG.getNode(ISD::SRA, dl, VT, LHS,
5379 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
5380
5381 if (CC == ISD::SETGT)
5382 Shift = DAG.getNOT(dl, Shift, VT);
5383
5384 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
5385 }
5386 }
5387
5388 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
5389 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
5390 unsigned TVal = CTVal->getZExtValue();
5391 unsigned FVal = CFVal->getZExtValue();
5392 unsigned Opcode = 0;
5393
5394 if (TVal == ~FVal) {
5395 Opcode = ARMISD::CSINV;
5396 } else if (TVal == ~FVal + 1) {
5397 Opcode = ARMISD::CSNEG;
5398 } else if (TVal + 1 == FVal) {
5399 Opcode = ARMISD::CSINC;
5400 } else if (TVal == FVal + 1) {
5401 Opcode = ARMISD::CSINC;
5402 std::swap(TrueVal, FalseVal);
5403 std::swap(TVal, FVal);
5404 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5405 }
5406
5407 if (Opcode) {
5408 // If one of the constants is cheaper than another, materialise the
5409 // cheaper one and let the csel generate the other.
5410 if (Opcode != ARMISD::CSINC &&
5411 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5412 std::swap(TrueVal, FalseVal);
5413 std::swap(TVal, FVal);
5414 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5415 }
5416
5417 // Attempt to use ZR checking TVal is 0, possibly inverting the condition
5418 // to get there. CSINC is not invertible like the other two (~(~a) == a,
5419 // -(-a) == a, but (a+1)+1 != a).
5420 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5421 std::swap(TrueVal, FalseVal);
5422 std::swap(TVal, FVal);
5423 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5424 }
5425
5426 // Drops F's value because we can get it by inverting/negating TVal.
5427 FalseVal = TrueVal;
5428
5429 SDValue ARMcc;
5430 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5431 EVT VT = TrueVal.getValueType();
5432 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5433 }
5434 }
5435
5436 if (isUnsupportedFloatingType(LHS.getValueType())) {
5437 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5438
5439 // If softenSetCCOperands only returned one value, we should compare it to
5440 // zero.
5441 if (!RHS.getNode()) {
5442 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5443 CC = ISD::SETNE;
5444 }
5445 }
5446
5447 if (LHS.getValueType() == MVT::i32) {
5448 // Try to generate VSEL on ARMv8.
5449 // The VSEL instruction can't use all the usual ARM condition
5450 // codes: it only has two bits to select the condition code, so it's
5451 // constrained to use only GE, GT, VS and EQ.
5452 //
5453 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5454 // swap the operands of the previous compare instruction (effectively
5455 // inverting the compare condition, swapping 'less' and 'greater') and
5456 // sometimes need to swap the operands to the VSEL (which inverts the
5457 // condition in the sense of firing whenever the previous condition didn't)
5458 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5459 TrueVal.getValueType() == MVT::f32 ||
5460 TrueVal.getValueType() == MVT::f64)) {
5461 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5462 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5463 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5464 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5465 std::swap(TrueVal, FalseVal);
5466 }
5467 }
5468
5469 SDValue ARMcc;
5470 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5471 // Choose GE over PL, which vsel does not support
5472 if (ARMcc->getAsZExtVal() == ARMCC::PL)
5473 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5474 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5475 }
5476
5477 ARMCC::CondCodes CondCode, CondCode2;
5478 FPCCToARMCC(CC, CondCode, CondCode2);
5479
5480 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5481 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5482 // must use VSEL (limited condition codes), due to not having conditional f16
5483 // moves.
5484 if (Subtarget->hasFPARMv8Base() &&
5485 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5486 (TrueVal.getValueType() == MVT::f16 ||
5487 TrueVal.getValueType() == MVT::f32 ||
5488 TrueVal.getValueType() == MVT::f64)) {
5489 bool swpCmpOps = false;
5490 bool swpVselOps = false;
5491 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5492
5493 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5494 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5495 if (swpCmpOps)
5496 std::swap(LHS, RHS);
5497 if (swpVselOps)
5498 std::swap(TrueVal, FalseVal);
5499 }
5500 }
5501
5502 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5503 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5504 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5505 if (CondCode2 != ARMCC::AL) {
5506 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5507 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, Cmp, DAG);
5508 }
5509 return Result;
5510}
5511
5512/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5513/// to morph to an integer compare sequence.
5514static bool canChangeToInt(SDValue Op, bool &SeenZero,
5515 const ARMSubtarget *Subtarget) {
5516 SDNode *N = Op.getNode();
5517 if (!N->hasOneUse())
5518 // Otherwise it requires moving the value from fp to integer registers.
5519 return false;
5520 if (!N->getNumValues())
5521 return false;
5522 EVT VT = Op.getValueType();
5523 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5524 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5525 // vmrs are very slow, e.g. cortex-a8.
5526 return false;
5527
5528 if (isFloatingPointZero(Op)) {
5529 SeenZero = true;
5530 return true;
5531 }
5532 return ISD::isNormalLoad(N);
5533}
5534
5535static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
5536 if (isFloatingPointZero(Op))
5537 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5538
5539 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
5540 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5541 Ld->getPointerInfo(), Ld->getAlign(),
5542 Ld->getMemOperand()->getFlags());
5543
5544 llvm_unreachable("Unknown VFP cmp argument!");
5545}
5546
5547static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
5548 SDValue &RetVal1, SDValue &RetVal2) {
5549 SDLoc dl(Op);
5550
5551 if (isFloatingPointZero(Op)) {
5552 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5553 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5554 return;
5555 }
5556
5557 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5558 SDValue Ptr = Ld->getBasePtr();
5559 RetVal1 =
5560 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5561 Ld->getAlign(), Ld->getMemOperand()->getFlags());
5562
5563 EVT PtrType = Ptr.getValueType();
5564 SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5565 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5566 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5567 Ld->getPointerInfo().getWithOffset(4),
5568 commonAlignment(Ld->getAlign(), 4),
5569 Ld->getMemOperand()->getFlags());
5570 return;
5571 }
5572
5573 llvm_unreachable("Unknown VFP cmp argument!");
5574}
5575
5576/// OptimizeVFPBrcond - With nnan and without daz, it's legal to optimize some
5577/// f32 and even f64 comparisons to integer ones.
5578SDValue
5579ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5580 SDValue Chain = Op.getOperand(0);
5581 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5582 SDValue LHS = Op.getOperand(2);
5583 SDValue RHS = Op.getOperand(3);
5584 SDValue Dest = Op.getOperand(4);
5585 SDLoc dl(Op);
5586
5587 bool LHSSeenZero = false;
5588 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5589 bool RHSSeenZero = false;
5590 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5591 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5592 // If unsafe fp math optimization is enabled and there are no other uses of
5593 // the CMP operands, and the condition code is EQ or NE, we can optimize it
5594 // to an integer comparison.
5595 if (CC == ISD::SETOEQ)
5596 CC = ISD::SETEQ;
5597 else if (CC == ISD::SETUNE)
5598 CC = ISD::SETNE;
5599
5600 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5601 SDValue ARMcc;
5602 if (LHS.getValueType() == MVT::f32) {
5603 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5604 bitcastf32Toi32(LHS, DAG), Mask);
5605 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5606 bitcastf32Toi32(RHS, DAG), Mask);
5607 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5608 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5609 Cmp);
5610 }
5611
5612 SDValue LHS1, LHS2;
5613 SDValue RHS1, RHS2;
5614 expandf64Toi32(LHS, DAG, LHS1, LHS2);
5615 expandf64Toi32(RHS, DAG, RHS1, RHS2);
5616 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5617 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5618 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5619 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5620 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5621 return DAG.getNode(ARMISD::BCC_i64, dl, MVT::Other, Ops);
5622 }
5623
5624 return SDValue();
5625}
5626
5627// Generate CMP + CMOV for integer abs.
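// For example (sketch): abs(x) is emitted as
//   flags  = ARMISD::CMP x, 0
//   result = ARMISD::CMOV x, -x, MI, flags   ; take -x when x is negative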
5628SDValue ARMTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5629 SDLoc DL(Op);
5630
5631 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, MVT::i32);
5632
5633 // Generate CMP & CMOV.
5634 SDValue Cmp = DAG.getNode(ARMISD::CMP, DL, FlagsVT, Op.getOperand(0),
5635 DAG.getConstant(0, DL, MVT::i32));
5636 return DAG.getNode(ARMISD::CMOV, DL, MVT::i32, Op.getOperand(0), Neg,
5637 DAG.getConstant(ARMCC::MI, DL, MVT::i32), Cmp);
5638}
5639
5640SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5641 SDValue Chain = Op.getOperand(0);
5642 SDValue Cond = Op.getOperand(1);
5643 SDValue Dest = Op.getOperand(2);
5644 SDLoc dl(Op);
5645
5646 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5647 // instruction.
5648 unsigned Opc = Cond.getOpcode();
5649 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5650 !Subtarget->isThumb1Only();
5651 if (Cond.getResNo() == 1 &&
5652 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5653 Opc == ISD::USUBO || OptimizeMul)) {
5654 // Only lower legal XALUO ops.
5655 if (!isTypeLegal(Cond->getValueType(0)))
5656 return SDValue();
5657
5658 // The actual operation with overflow check.
5659 SDValue Value, OverflowCmp;
5660 SDValue ARMcc;
5661 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5662
5663 // Reverse the condition code.
5664 ARMCC::CondCodes CondCode =
5665 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5666 CondCode = ARMCC::getOppositeCondition(CondCode);
5667 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5668
5669 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5670 OverflowCmp);
5671 }
5672
5673 return SDValue();
5674}
5675
5676SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5677 SDValue Chain = Op.getOperand(0);
5678 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5679 SDValue LHS = Op.getOperand(2);
5680 SDValue RHS = Op.getOperand(3);
5681 SDValue Dest = Op.getOperand(4);
5682 SDLoc dl(Op);
5683
5684 if (isUnsupportedFloatingType(LHS.getValueType())) {
5685 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5686
5687 // If softenSetCCOperands only returned one value, we should compare it to
5688 // zero.
5689 if (!RHS.getNode()) {
5690 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5691 CC = ISD::SETNE;
5692 }
5693 }
5694
5695 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5696 // instruction.
5697 unsigned Opc = LHS.getOpcode();
5698 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5699 !Subtarget->isThumb1Only();
5700 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5701 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5702 Opc == ISD::USUBO || OptimizeMul) &&
5703 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5704 // Only lower legal XALUO ops.
5705 if (!isTypeLegal(LHS->getValueType(0)))
5706 return SDValue();
5707
5708 // The actual operation with overflow check.
5709 SDValue Value, OverflowCmp;
5710 SDValue ARMcc;
5711 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5712
5713 if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5714 // Reverse the condition code.
5715 ARMCC::CondCodes CondCode =
5716 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5717 CondCode = ARMCC::getOppositeCondition(CondCode);
5718 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5719 }
5720
5721 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5722 OverflowCmp);
5723 }
5724
5725 if (LHS.getValueType() == MVT::i32) {
5726 SDValue ARMcc;
5727 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5728 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
5729 }
5730
5731 SDNodeFlags Flags = Op->getFlags();
5732 if (Flags.hasNoNaNs() &&
5733 DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
5734 DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE() &&
5735 (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
5736 CC == ISD::SETUNE)) {
5737 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5738 return Result;
5739 }
5740
5741 ARMCC::CondCodes CondCode, CondCode2;
5742 FPCCToARMCC(CC, CondCode, CondCode2);
5743
5744 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5745 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5746 SDValue Ops[] = {Chain, Dest, ARMcc, Cmp};
5747 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5748 if (CondCode2 != ARMCC::AL) {
5749 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5750 SDValue Ops[] = {Res, Dest, ARMcc, Cmp};
5751 Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5752 }
5753 return Res;
5754}
5755
5756SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5757 SDValue Chain = Op.getOperand(0);
5758 SDValue Table = Op.getOperand(1);
5759 SDValue Index = Op.getOperand(2);
5760 SDLoc dl(Op);
5761
5762 EVT PTy = getPointerTy(DAG.getDataLayout());
5763 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5764 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5765 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5766 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5767 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5768 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5769 // Thumb2 and ARMv8-M use a two-level jump. That is, the code jumps into the jump
5770 // table, which does another jump to the destination. This also makes it easier
5771 // to translate it to TBB / TBH later (Thumb2 only).
5772 // FIXME: This might not work if the function is extremely large.
5773 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5774 Addr, Op.getOperand(2), JTI);
5775 }
5776 if (isPositionIndependent() || Subtarget->isROPI()) {
5777 Addr =
5778 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5779 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5780 Chain = Addr.getValue(1);
5781 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5782 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5783 } else {
5784 Addr =
5785 DAG.getLoad(PTy, dl, Chain, Addr,
5786 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5787 Chain = Addr.getValue(1);
5788 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5789 }
5790}
5791
5792static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
5793 EVT VT = Op.getValueType();
5794 SDLoc dl(Op);
5795
5796 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5797 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5798 return Op;
5799 return DAG.UnrollVectorOp(Op.getNode());
5800 }
5801
5802 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5803
5804 EVT NewTy;
5805 const EVT OpTy = Op.getOperand(0).getValueType();
5806 if (OpTy == MVT::v4f32)
5807 NewTy = MVT::v4i32;
5808 else if (OpTy == MVT::v4f16 && HasFullFP16)
5809 NewTy = MVT::v4i16;
5810 else if (OpTy == MVT::v8f16 && HasFullFP16)
5811 NewTy = MVT::v8i16;
5812 else
5813 llvm_unreachable("Invalid type for custom lowering!");
5814
5815 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5816 return DAG.UnrollVectorOp(Op.getNode());
5817
5818 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5819 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5820}
5821
5822SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5823 EVT VT = Op.getValueType();
5824 if (VT.isVector())
5825 return LowerVectorFP_TO_INT(Op, DAG);
5826
5827 bool IsStrict = Op->isStrictFPOpcode();
5828 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5829
5830 if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5831 RTLIB::Libcall LC;
5832 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5833 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5834 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5835 Op.getValueType());
5836 else
5837 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5838 Op.getValueType());
5839 SDLoc Loc(Op);
5840 MakeLibCallOptions CallOptions;
5841 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
5842 SDValue Result;
5843 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5844 CallOptions, Loc, Chain);
5845 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5846 }
5847
5848 // FIXME: Remove this when we have strict fp instruction selection patterns
5849 if (IsStrict) {
5850 SDLoc Loc(Op);
5851 SDValue Result =
5852 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
5853 : ISD::FP_TO_UINT,
5854 Loc, Op.getValueType(), SrcVal);
5855 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5856 }
5857
5858 return Op;
5859}
5860
5861static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
5862 const ARMSubtarget *Subtarget) {
5863 EVT VT = Op.getValueType();
5864 EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5865 EVT FromVT = Op.getOperand(0).getValueType();
5866
5867 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
5868 return Op;
5869 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
5870 Subtarget->hasFP64())
5871 return Op;
5872 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
5873 Subtarget->hasFullFP16())
5874 return Op;
5875 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
5876 Subtarget->hasMVEFloatOps())
5877 return Op;
5878 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
5879 Subtarget->hasMVEFloatOps())
5880 return Op;
5881
5882 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
5883 return SDValue();
5884
5885 SDLoc DL(Op);
5886 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
5887 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
5888 SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
5889 DAG.getValueType(VT.getScalarType()));
5890 SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
5891 DAG.getConstant((1 << BW) - 1, DL, VT));
5892 if (IsSigned)
5893 Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
5894 DAG.getSignedConstant(-(1 << BW), DL, VT));
5895 return Max;
5896}
5897
5898static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5899 EVT VT = Op.getValueType();
5900 SDLoc dl(Op);
5901
5902 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5903 if (VT.getVectorElementType() == MVT::f32)
5904 return Op;
5905 return DAG.UnrollVectorOp(Op.getNode());
5906 }
5907
5908 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5909 Op.getOperand(0).getValueType() == MVT::v8i16) &&
5910 "Invalid type for custom lowering!");
5911
5912 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5913
5914 EVT DestVecType;
5915 if (VT == MVT::v4f32)
5916 DestVecType = MVT::v4i32;
5917 else if (VT == MVT::v4f16 && HasFullFP16)
5918 DestVecType = MVT::v4i16;
5919 else if (VT == MVT::v8f16 && HasFullFP16)
5920 DestVecType = MVT::v8i16;
5921 else
5922 return DAG.UnrollVectorOp(Op.getNode());
5923
5924 unsigned CastOpc;
5925 unsigned Opc;
5926 switch (Op.getOpcode()) {
5927 default: llvm_unreachable("Invalid opcode!");
5928 case ISD::SINT_TO_FP:
5929 CastOpc = ISD::SIGN_EXTEND;
5930 Opc = ISD::SINT_TO_FP;
5931 break;
5932 case ISD::UINT_TO_FP:
5933 CastOpc = ISD::ZERO_EXTEND;
5934 Opc = ISD::UINT_TO_FP;
5935 break;
5936 }
5937
5938 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5939 return DAG.getNode(Opc, dl, VT, Op);
5940}
5941
5942SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5943 EVT VT = Op.getValueType();
5944 if (VT.isVector())
5945 return LowerVectorINT_TO_FP(Op, DAG);
5946 if (isUnsupportedFloatingType(VT)) {
5947 RTLIB::Libcall LC;
5948 if (Op.getOpcode() == ISD::SINT_TO_FP)
5949 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5950 Op.getValueType());
5951 else
5952 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5953 Op.getValueType());
5954 MakeLibCallOptions CallOptions;
5955 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5956 CallOptions, SDLoc(Op)).first;
5957 }
5958
5959 return Op;
5960}
5961
5962SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5963 // Implement fcopysign with a fabs and a conditional fneg.
5964 SDValue Tmp0 = Op.getOperand(0);
5965 SDValue Tmp1 = Op.getOperand(1);
5966 SDLoc dl(Op);
5967 EVT VT = Op.getValueType();
5968 EVT SrcVT = Tmp1.getValueType();
5969 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
5970 Tmp0.getOpcode() == ARMISD::VMOVDRR;
5971 bool UseNEON = !InGPR && Subtarget->hasNEON();
5972
5973 if (UseNEON) {
5974 // Use VBSL to copy the sign bit.
5975 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
5976 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
5977 DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
5978 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
5979 if (VT == MVT::f64)
5980 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5981 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
5982 DAG.getConstant(32, dl, MVT::i32));
5983 else /*if (VT == MVT::f32)*/
5984 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
5985 if (SrcVT == MVT::f32) {
5986 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
5987 if (VT == MVT::f64)
5988 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5989 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
5990 DAG.getConstant(32, dl, MVT::i32));
5991 } else if (VT == MVT::f32)
5992 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
5993 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
5994 DAG.getConstant(32, dl, MVT::i32));
5995 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
5996 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
5997
5998 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
5999 dl, MVT::i32);
6000 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
6001 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
6002 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
6003
6004 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
6005 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
6006 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
6007 if (VT == MVT::f32) {
6008 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
6009 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
6010 DAG.getConstant(0, dl, MVT::i32));
6011 } else {
6012 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
6013 }
6014
6015 return Res;
6016 }
6017
6018 // Bitcast operand 1 to i32.
6019 if (SrcVT == MVT::f64)
6020 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6021 Tmp1).getValue(1);
6022 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
6023
6024 // Or in the signbit with integer operations.
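// For example (sketch): for f32, copysign(x, y) becomes
//   (bitcast(x) & 0x7fffffff) | (bitcast(y) & 0x80000000)
// i.e. x's magnitude combined with y's sign bit.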
6025 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
6026 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
6027 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
6028 if (VT == MVT::f32) {
6029 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
6030 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
6031 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
6032 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
6033 }
6034
6035 // f64: Or the high part with signbit and then combine two parts.
6036 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6037 Tmp0);
6038 SDValue Lo = Tmp0.getValue(0);
6039 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
6040 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
6041 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
6042}
6043
6044SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
6045 MachineFunction &MF = DAG.getMachineFunction();
6046 MachineFrameInfo &MFI = MF.getFrameInfo();
6047 MFI.setReturnAddressIsTaken(true);
6048
6049 EVT VT = Op.getValueType();
6050 SDLoc dl(Op);
6051 unsigned Depth = Op.getConstantOperandVal(0);
6052 if (Depth) {
6053 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6054 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
6055 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
6056 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
6057 MachinePointerInfo());
6058 }
6059
6060 // Return LR, which contains the return address. Mark it an implicit live-in.
6061 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
6062 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
6063}
6064
6065SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
6066 const ARMBaseRegisterInfo &ARI =
6067 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
6068 MachineFunction &MF = DAG.getMachineFunction();
6069 MachineFrameInfo &MFI = MF.getFrameInfo();
6070 MFI.setFrameAddressIsTaken(true);
6071
6072 EVT VT = Op.getValueType();
6073 SDLoc dl(Op); // FIXME probably not meaningful
6074 unsigned Depth = Op.getConstantOperandVal(0);
6075 Register FrameReg = ARI.getFrameRegister(MF);
6076 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
6077 while (Depth--)
6078 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
6079 MachinePointerInfo());
6080 return FrameAddr;
6081}
6082
6083// FIXME? Maybe this could be a TableGen attribute on some registers and
6084// this table could be generated automatically from RegInfo.
6085Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
6086 const MachineFunction &MF) const {
6087 return StringSwitch<Register>(RegName)
6088 .Case("sp", ARM::SP)
6089 .Default(Register());
6090}
6091
6092// The result is a 64-bit value, so split it into two 32-bit values and return
6093// them as a pair of values.
6094static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
6095 SelectionDAG &DAG) {
6096 SDLoc DL(N);
6097
6098 // This function is only supposed to be called for i64 type destination.
6099 assert(N->getValueType(0) == MVT::i64
6100 && "ExpandREAD_REGISTER called for non-i64 type result.");
6101
6102 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
6103 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
6104 N->getOperand(0),
6105 N->getOperand(1));
6106
6107 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
6108 Read.getValue(1)));
6109 Results.push_back(Read.getValue(2)); // Chain
6110}
6111
6112/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
6113/// When \p DstVT, the destination type of \p BC, is on the vector
6114/// register bank and the source of bitcast, \p Op, operates on the same bank,
6115/// it might be possible to combine them, such that everything stays on the
6116/// vector register bank.
6117/// \return The node that would replace \p BC, if the combine
6118/// is possible.
6119static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
6120 SelectionDAG &DAG) {
6121 SDValue Op = BC->getOperand(0);
6122 EVT DstVT = BC->getValueType(0);
6123
6124 // The only vector instruction that can produce a scalar (remember,
6125 // since the bitcast was about to be turned into VMOVDRR, the source
6126 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
6127 // Moreover, we can do this combine only if there is one use.
6128 // Finally, if the destination type is not a vector, there is not
6129 // much point in forcing everything on the vector bank.
6130 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6131 !Op.hasOneUse())
6132 return SDValue();
6133
6134 // If the index is not constant, we will introduce an additional
6135 // multiply that will stick.
6136 // Give up in that case.
6137 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6138 if (!Index)
6139 return SDValue();
6140 unsigned DstNumElt = DstVT.getVectorNumElements();
6141
6142 // Compute the new index.
6143 const APInt &APIntIndex = Index->getAPIntValue();
6144 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
6145 NewIndex *= APIntIndex;
6146 // Check if the new constant index fits into i32.
6147 if (NewIndex.getBitWidth() > 32)
6148 return SDValue();
6149
6150 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
6151 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
6152 SDLoc dl(Op);
6153 SDValue ExtractSrc = Op.getOperand(0);
6154 EVT VecVT = EVT::getVectorVT(
6155 *DAG.getContext(), DstVT.getScalarType(),
6156 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
6157 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
6158 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
6159 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
6160}
6161
6162/// ExpandBITCAST - If the target supports VFP, this function is called to
6163/// expand a bit convert where either the source or destination type is i64 to
6164/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
6165/// operand type is illegal (e.g., v2f32 for a target that doesn't support
6166/// vectors), since the legalizer won't know what to do with that.
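/// For example (illustrative): "f64 = bitcast i64 x" is expanded to
/// ARMISD::VMOVDRR(lo32(x), hi32(x)), and "i64 = bitcast f64 y" to a
/// BUILD_PAIR of the two i32 results of ARMISD::VMOVRRD(y).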
6167SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
6168 const ARMSubtarget *Subtarget) const {
6169 SDLoc dl(N);
6170 SDValue Op = N->getOperand(0);
6171
6172 // This function is only supposed to be called for i16 and i64 types, either
6173 // as the source or destination of the bit convert.
6174 EVT SrcVT = Op.getValueType();
6175 EVT DstVT = N->getValueType(0);
6176
6177 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
6178 (DstVT == MVT::f16 || DstVT == MVT::bf16))
6179 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
6180 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
6181
6182 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
6183 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
6184 if (Subtarget->hasFullFP16() && !Subtarget->hasBF16())
6185 Op = DAG.getBitcast(MVT::f16, Op);
6186 return DAG.getNode(
6187 ISD::TRUNCATE, SDLoc(N), DstVT,
6188 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
6189 }
6190
6191 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
6192 return SDValue();
6193
6194 // Turn i64->f64 into VMOVDRR.
6195 if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
6196 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
6197 // if we can combine the bitcast with its source.
6198 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
6199 return Val;
6200 SDValue Lo, Hi;
6201 std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
6202 return DAG.getNode(ISD::BITCAST, dl, DstVT,
6203 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
6204 }
6205
6206 // Turn f64->i64 into VMOVRRD.
6207 if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
6208 SDValue Cvt;
6209 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
6210 SrcVT.getVectorNumElements() > 1)
6211 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6212 DAG.getVTList(MVT::i32, MVT::i32),
6213 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6214 else
6215 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6216 DAG.getVTList(MVT::i32, MVT::i32), Op);
6217 // Merge the pieces into a single i64 value.
6218 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6219 }
6220
6221 return SDValue();
6222}
6223
6224/// getZeroVector - Returns a vector of specified type with all zero elements.
6225/// Zero vectors are used to represent vector negation and in those cases
6226/// will be implemented with the NEON VNEG instruction. However, VNEG does
6227/// not support i64 elements, so sometimes the zero vectors will need to be
6228/// explicitly constructed. Regardless, use a canonical VMOV to create the
6229/// zero vector.
6230static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6231 assert(VT.isVector() && "Expected a vector type");
6232 // The canonical modified immediate encoding of a zero vector is....0!
6233 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6234 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6235 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6236 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6237}
6238
6239/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
6240/// i32 values and takes a 2 x i32 value to shift plus a shift amount.
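/// For SRA_PARTS, for example (illustrative):
///   Lo = ShAmt < 32 ? (ShOpLo >>u ShAmt) | (ShOpHi << (32 - ShAmt))
///                   : ShOpHi >>s (ShAmt - 32)
///   Hi = ShAmt < 32 ? ShOpHi >>s ShAmt : ShOpHi >>s 31
/// with both selects emitted as ARMISD::CMOV on the flags of (ShAmt - 32).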
6241SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6242 SelectionDAG &DAG) const {
6243 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6244 EVT VT = Op.getValueType();
6245 unsigned VTBits = VT.getSizeInBits();
6246 SDLoc dl(Op);
6247 SDValue ShOpLo = Op.getOperand(0);
6248 SDValue ShOpHi = Op.getOperand(1);
6249 SDValue ShAmt = Op.getOperand(2);
6250 SDValue ARMcc;
6251 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6252
6253 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6254
6255 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6256 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6257 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6258 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6259 DAG.getConstant(VTBits, dl, MVT::i32));
6260 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6261 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6262 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6263 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6264 ISD::SETGE, ARMcc, DAG, dl);
6265 SDValue Lo =
6266 DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CmpLo);
6267
6268 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6269 SDValue HiBigShift = Opc == ISD::SRA
6270 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6271 DAG.getConstant(VTBits - 1, dl, VT))
6272 : DAG.getConstant(0, dl, VT);
6273 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6274 ISD::SETGE, ARMcc, DAG, dl);
6275 SDValue Hi =
6276 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6277
6278 SDValue Ops[2] = { Lo, Hi };
6279 return DAG.getMergeValues(Ops, dl);
6280}
6281
6282/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6283/// i32 values and takes a 2 x i32 value to shift plus a shift amount.
6284SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6285 SelectionDAG &DAG) const {
6286 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6287 EVT VT = Op.getValueType();
6288 unsigned VTBits = VT.getSizeInBits();
6289 SDLoc dl(Op);
6290 SDValue ShOpLo = Op.getOperand(0);
6291 SDValue ShOpHi = Op.getOperand(1);
6292 SDValue ShAmt = Op.getOperand(2);
6293 SDValue ARMcc;
6294
6295 assert(Op.getOpcode() == ISD::SHL_PARTS);
6296 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6297 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6298 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6299 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6300 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6301
6302 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6303 DAG.getConstant(VTBits, dl, MVT::i32));
6304 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6305 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6306 ISD::SETGE, ARMcc, DAG, dl);
6307 SDValue Hi =
6308 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6309
6310 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6311 ISD::SETGE, ARMcc, DAG, dl);
6312 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6313 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6314 DAG.getConstant(0, dl, VT), ARMcc, CmpLo);
6315
6316 SDValue Ops[2] = { Lo, Hi };
6317 return DAG.getMergeValues(Ops, dl);
6318}
6319
6320SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6321 SelectionDAG &DAG) const {
6322 // The rounding mode is in bits 23:22 of the FPSCR.
6323 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6324 // The formula we use to implement this is (((FPSCR + (1 << 22)) >> 22) & 3)
6325 // so that the shift + and get folded into a bitfield extract.
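// For example (illustrative): FPSCR[23:22] == 3 (round-towards-zero) maps to
// ((3 + 1) & 3) == 0, the FLT_ROUNDS encoding for round-towards-zero.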
6326 SDLoc dl(Op);
6327 SDValue Chain = Op.getOperand(0);
6328 SDValue Ops[] = {Chain,
6329 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6330
6331 SDValue FPSCR =
6332 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6333 Chain = FPSCR.getValue(1);
6334 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6335 DAG.getConstant(1U << 22, dl, MVT::i32));
6336 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6337 DAG.getConstant(22, dl, MVT::i32));
6338 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6339 DAG.getConstant(3, dl, MVT::i32));
6340 return DAG.getMergeValues({And, Chain}, dl);
6341}
6342
6343SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6344 SelectionDAG &DAG) const {
6345 SDLoc DL(Op);
6346 SDValue Chain = Op->getOperand(0);
6347 SDValue RMValue = Op->getOperand(1);
6348
6349 // The rounding mode is in bits 23:22 of the FPSCR.
6350 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6351 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6352 // (((arg - 1) & 3) << 22).
6353 //
6354 // It is expected that the argument of llvm.set.rounding is within the
6355 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is
6356 // the responsibility of the code that generates llvm.set.rounding to ensure this
6357 // condition.
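// For example (illustrative): an argument of 0 (round-towards-zero) yields
// ((0 - 1) & 3) == 3, which is the ARM encoding of the RZ rounding mode.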
6358
6359 // Calculate new value of FPSCR[23:22].
6360 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6361 DAG.getConstant(1, DL, MVT::i32));
6362 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6363 DAG.getConstant(0x3, DL, MVT::i32));
6364 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6365 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6366
6367 // Get current value of FPSCR.
6368 SDValue Ops[] = {Chain,
6369 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6370 SDValue FPSCR =
6371 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6372 Chain = FPSCR.getValue(1);
6373 FPSCR = FPSCR.getValue(0);
6374
6375 // Put new rounding mode into FPSCR[23:22].
6376 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6377 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6378 DAG.getConstant(RMMask, DL, MVT::i32));
6379 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6380 SDValue Ops2[] = {
6381 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6382 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6383}
6384
6385SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6386 SelectionDAG &DAG) const {
6387 SDLoc DL(Op);
6388 SDValue Chain = Op->getOperand(0);
6389 SDValue Mode = Op->getOperand(1);
6390
6391 // Generate nodes to build:
6392 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6393 SDValue Ops[] = {Chain,
6394 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6395 SDValue FPSCR =
6396 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6397 Chain = FPSCR.getValue(1);
6398 FPSCR = FPSCR.getValue(0);
6399
6400 SDValue FPSCRMasked =
6401 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6402 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6403 SDValue InputMasked =
6404 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6405 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6406 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6407
6408 SDValue Ops2[] = {
6409 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6410 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6411}
6412
6413SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
6414 SelectionDAG &DAG) const {
6415 SDLoc DL(Op);
6416 SDValue Chain = Op->getOperand(0);
6417
6418 // To get the default FP mode all control bits are cleared:
6419 // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
6420 SDValue Ops[] = {Chain,
6421 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6422 SDValue FPSCR =
6423 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6424 Chain = FPSCR.getValue(1);
6425 FPSCR = FPSCR.getValue(0);
6426
6427 SDValue FPSCRMasked = DAG.getNode(
6428 ISD::AND, DL, MVT::i32, FPSCR,
6429 DAG.getConstant(ARM::FPStatusBits | ARM::FPReservedBits, DL, MVT::i32));
6430 SDValue Ops2[] = {Chain,
6431 DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
6432 FPSCRMasked};
6433 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6434}
6435
6436static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
6437 const ARMSubtarget *ST) {
6438 SDLoc dl(N);
6439 EVT VT = N->getValueType(0);
6440 if (VT.isVector() && ST->hasNEON()) {
6441
6442 // Compute the least significant set bit: LSB = X & -X
6443 SDValue X = N->getOperand(0);
6444 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6445 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
6446
6447 EVT ElemTy = VT.getVectorElementType();
6448
6449 if (ElemTy == MVT::i8) {
6450 // Compute with: cttz(x) = ctpop(lsb - 1)
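// For example (illustrative): x = 0b0110100 gives lsb = 0b0000100,
// lsb - 1 = 0b0000011, and ctpop(lsb - 1) = 2 = cttz(x).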
6451 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6452 DAG.getTargetConstant(1, dl, ElemTy));
6453 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6454 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6455 }
6456
6457 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6458 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6459 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
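// For example (i16 lanes): x = 0x0008 gives lsb = 0x0008, ctlz(lsb) = 12
// and (16 - 1) - 12 = 3 = cttz(x).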
6460 unsigned NumBits = ElemTy.getSizeInBits();
6461 SDValue WidthMinus1 =
6462 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6463 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6464 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6465 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6466 }
6467
6468 // Compute with: cttz(x) = ctpop(lsb - 1)
6469
6470 // Compute LSB - 1.
6471 SDValue Bits;
6472 if (ElemTy == MVT::i64) {
6473 // Load constant 0xffff'ffff'ffff'ffff to register.
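// 0x1eff corresponds to OpCmode = 0x1e, Imm = 0xff (cf. the 64-bit case of
// isVMOVModifiedImm below), where each Imm bit marks an all-ones byte, so
// every i64 lane becomes 0xffff'ffff'ffff'ffff.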
6474 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6475 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6476 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6477 } else {
6478 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6479 DAG.getTargetConstant(1, dl, ElemTy));
6480 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6481 }
6482 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6483 }
6484
6485 if (!ST->hasV6T2Ops())
6486 return SDValue();
6487
6488 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6489 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6490}
6491
6492static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
6493 const ARMSubtarget *ST) {
6494 EVT VT = N->getValueType(0);
6495 SDLoc DL(N);
6496
6497 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6498 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6499 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6500 "Unexpected type for custom ctpop lowering");
6501
6502 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6503 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6504 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6505 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6506
6507 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
6508 unsigned EltSize = 8;
6509 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6510 while (EltSize != VT.getScalarSizeInBits()) {
6511 SmallVector<SDValue, 8> Ops;
6512 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6513 TLI.getPointerTy(DAG.getDataLayout())));
6514 Ops.push_back(Res);
6515
6516 EltSize *= 2;
6517 NumElts /= 2;
6518 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6519 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6520 }
6521
6522 return Res;
6523}
6524
6525/// getVShiftImm - Check if this is a valid build_vector for the immediate
6526/// operand of a vector shift operation, where all the elements of the
6527/// build_vector must have the same constant integer value.
6528static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6529 // Ignore bit_converts.
6530 while (Op.getOpcode() == ISD::BITCAST)
6531 Op = Op.getOperand(0);
6532 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
6533 APInt SplatBits, SplatUndef;
6534 unsigned SplatBitSize;
6535 bool HasAnyUndefs;
6536 if (!BVN ||
6537 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6538 ElementBits) ||
6539 SplatBitSize > ElementBits)
6540 return false;
6541 Cnt = SplatBits.getSExtValue();
6542 return true;
6543}
6544
6545/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6546/// operand of a vector shift left operation. That value must be in the range:
6547/// 0 <= Value < ElementBits for a left shift; or
6548/// 0 <= Value <= ElementBits for a long left shift.
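/// For example, with v8i16 (ElementBits = 16) an ordinary shift-left
/// immediate must lie in [0, 15], while a long (widening) shift additionally
/// permits 16.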
6549static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6550 assert(VT.isVector() && "vector shift count is not a vector type");
6551 int64_t ElementBits = VT.getScalarSizeInBits();
6552 if (!getVShiftImm(Op, ElementBits, Cnt))
6553 return false;
6554 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6555}
6556
6557/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6558/// operand of a vector shift right operation. For a shift opcode, the value
6559/// is positive, but for an intrinsic the count must be negative. The
6560/// absolute value must be in the range:
6561/// 1 <= |Value| <= ElementBits for a right shift; or
6562/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6563static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6564 int64_t &Cnt) {
6565 assert(VT.isVector() && "vector shift count is not a vector type");
6566 int64_t ElementBits = VT.getScalarSizeInBits();
6567 if (!getVShiftImm(Op, ElementBits, Cnt))
6568 return false;
6569 if (!isIntrinsic)
6570 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6571 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6572 Cnt = -Cnt;
6573 return true;
6574 }
6575 return false;
6576}
6577
6578static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
6579 const ARMSubtarget *ST) {
6580 EVT VT = N->getValueType(0);
6581 SDLoc dl(N);
6582 int64_t Cnt;
6583
6584 if (!VT.isVector())
6585 return SDValue();
6586
6587 // We essentially have two forms here. Shift by an immediate and shift by a
6588 // vector register (there is also a shift by a GPR, but that is just handled
6589 // with a tablegen pattern). We cannot easily match shift by an immediate in
6590 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6591 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6592 // signed or unsigned, and a negative shift indicates a shift right).
6593 if (N->getOpcode() == ISD::SHL) {
6594 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6595 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6596 DAG.getConstant(Cnt, dl, MVT::i32));
6597 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6598 N->getOperand(1));
6599 }
6600
6601 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6602 "unexpected vector shift opcode");
6603
6604 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6605 unsigned VShiftOpc =
6606 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6607 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6608 DAG.getConstant(Cnt, dl, MVT::i32));
6609 }
6610
6611 // We don't have dedicated operations for other right shifts, so emulate them
6612 // with a shift left by a negated amount.
6613 EVT ShiftVT = N->getOperand(1).getValueType();
6614 SDValue NegatedCount = DAG.getNode(
6615 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6616 unsigned VShiftOpc =
6617 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6618 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6619}
6620
6621static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
6622 const ARMSubtarget *ST) {
6623 EVT VT = N->getValueType(0);
6624 SDLoc dl(N);
6625
6626 // We can get here for a node like i32 = ISD::SHL i32, i64
6627 if (VT != MVT::i64)
6628 return SDValue();
6629
6630 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6631 N->getOpcode() == ISD::SHL) &&
6632 "Unknown shift to lower!");
6633
6634 unsigned ShOpc = N->getOpcode();
6635 if (ST->hasMVEIntegerOps()) {
6636 SDValue ShAmt = N->getOperand(1);
6637 unsigned ShPartsOpc = ARMISD::LSLL;
6639 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
6640 // If the shift amount is a constant that is zero or at least 32, or is a
6641 // non-constant wider than 64 bits, then use the default expansion.
6642 if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
6643 (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
6644 return SDValue();
6645
6646 // Extract the lower 32 bits of the shift amount if it's not an i32
6647 if (ShAmt->getValueType(0) != MVT::i32)
6648 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6649
6650 if (ShOpc == ISD::SRL) {
6651 if (!Con)
6652 // There is no t2LSRLr instruction so negate and perform an lsll if the
6653 // shift amount is in a register, emulating a right shift.
6654 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6655 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6656 else
6657 // Else generate an lsrl on the immediate shift amount
6658 ShPartsOpc = ARMISD::LSRL;
6659 } else if (ShOpc == ISD::SRA)
6660 ShPartsOpc = ARMISD::ASRL;
6661
6662 // Split Lower/Upper 32 bits of the destination/source
6663 SDValue Lo, Hi;
6664 std::tie(Lo, Hi) =
6665 DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6666 // Generate the shift operation as computed above
6667 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6668 ShAmt);
6669 // The upper 32 bits come from the second return value of lsll
6670 Hi = SDValue(Lo.getNode(), 1);
6671 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6672 }
6673
6674 // We only lower SRA and SRL by 1 here; all others use generic lowering.
6675 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6676 return SDValue();
6677
6678 // If we are in thumb mode, we don't have RRX.
6679 if (ST->isThumb1Only())
6680 return SDValue();
6681
6682 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
6683 SDValue Lo, Hi;
6684 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6685
6686 // First, build a LSRS1/ASRS1 op, which shifts the top part by one and
6687 // captures the shifted out bit into a carry flag.
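// For example, lowering (srl i64 %x, 1):
//   Hi' = LSRS1 Hi   ; Hi >> 1, carry flag := old bit 0 of Hi
//   Lo' = RRX Lo     ; Lo >> 1 with the carry rotated into bit 31
// so bit 0 of the original Hi correctly becomes bit 31 of the new Lo.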
6688 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::LSRS1 : ARMISD::ASRS1;
6689 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, FlagsVT), Hi);
6690
6691 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6692 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6693
6694 // Merge the pieces into a single i64 value.
6695 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6696}
6697
6698static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
6699 const ARMSubtarget *ST) {
6700 bool Invert = false;
6701 bool Swap = false;
6702 unsigned Opc = ARMCC::AL;
6703
6704 SDValue Op0 = Op.getOperand(0);
6705 SDValue Op1 = Op.getOperand(1);
6706 SDValue CC = Op.getOperand(2);
6707 EVT VT = Op.getValueType();
6708 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6709 SDLoc dl(Op);
6710
6711 EVT CmpVT;
6712 if (ST->hasNEON())
6713 CmpVT = VT.changeVectorElementTypeToInteger();
6714 else {
6715 assert(ST->hasMVEIntegerOps() &&
6716 "No hardware support for integer vector comparison!");
6717
6718 if (Op.getValueType().getVectorElementType() != MVT::i1)
6719 return SDValue();
6720
6721 // Make sure we expand floating point setcc to scalar if we do not have
6722 // mve.fp, so that we can handle them from there.
6723 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6724 return SDValue();
6725
6726 CmpVT = VT;
6727 }
6728
6729 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6730 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6731 // Special-case integer 64-bit equality comparisons. They aren't legal,
6732 // but they can be lowered with a few vector instructions.
6733 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6734 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6735 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6736 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6737 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6738 DAG.getCondCode(ISD::SETEQ));
6739 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6740 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6741 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6742 if (SetCCOpcode == ISD::SETNE)
6743 Merged = DAG.getNOT(dl, Merged, CmpVT);
6744 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6745 return Merged;
6746 }
6747
6748 if (CmpVT.getVectorElementType() == MVT::i64)
6749 // 64-bit comparisons are not legal in general.
6750 return SDValue();
6751
6752 if (Op1.getValueType().isFloatingPoint()) {
6753 switch (SetCCOpcode) {
6754 default: llvm_unreachable("Illegal FP comparison");
6755 case ISD::SETUNE:
6756 case ISD::SETNE:
6757 if (ST->hasMVEFloatOps()) {
6758 Opc = ARMCC::NE; break;
6759 } else {
6760 Invert = true; [[fallthrough]];
6761 }
6762 case ISD::SETOEQ:
6763 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6764 case ISD::SETOLT:
6765 case ISD::SETLT: Swap = true; [[fallthrough]];
6766 case ISD::SETOGT:
6767 case ISD::SETGT: Opc = ARMCC::GT; break;
6768 case ISD::SETOLE:
6769 case ISD::SETLE: Swap = true; [[fallthrough]];
6770 case ISD::SETOGE:
6771 case ISD::SETGE: Opc = ARMCC::GE; break;
6772 case ISD::SETUGE: Swap = true; [[fallthrough]];
6773 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6774 case ISD::SETUGT: Swap = true; [[fallthrough]];
6775 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6776 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6777 case ISD::SETONE: {
6778 // Expand this to (OLT | OGT).
6779 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6780 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6781 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6782 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6783 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6784 if (Invert)
6785 Result = DAG.getNOT(dl, Result, VT);
6786 return Result;
6787 }
6788 case ISD::SETUO: Invert = true; [[fallthrough]];
6789 case ISD::SETO: {
6790 // Expand this to (OLT | OGE).
6791 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6792 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6793 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6794 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6795 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6796 if (Invert)
6797 Result = DAG.getNOT(dl, Result, VT);
6798 return Result;
6799 }
6800 }
6801 } else {
6802 // Integer comparisons.
6803 switch (SetCCOpcode) {
6804 default: llvm_unreachable("Illegal integer comparison");
6805 case ISD::SETNE:
6806 if (ST->hasMVEIntegerOps()) {
6807 Opc = ARMCC::NE; break;
6808 } else {
6809 Invert = true; [[fallthrough]];
6810 }
6811 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6812 case ISD::SETLT: Swap = true; [[fallthrough]];
6813 case ISD::SETGT: Opc = ARMCC::GT; break;
6814 case ISD::SETLE: Swap = true; [[fallthrough]];
6815 case ISD::SETGE: Opc = ARMCC::GE; break;
6816 case ISD::SETULT: Swap = true; [[fallthrough]];
6817 case ISD::SETUGT: Opc = ARMCC::HI; break;
6818 case ISD::SETULE: Swap = true; [[fallthrough]];
6819 case ISD::SETUGE: Opc = ARMCC::HS; break;
6820 }
6821
6822 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6823 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6824 SDValue AndOp;
6825 if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6826 AndOp = Op0;
6827 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6828 AndOp = Op1;
6829
6830 // Ignore bitconvert.
6831 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6832 AndOp = AndOp.getOperand(0);
6833
6834 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6835 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6836 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6837 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6838 if (!Invert)
6839 Result = DAG.getNOT(dl, Result, VT);
6840 return Result;
6841 }
6842 }
6843 }
6844
6845 if (Swap)
6846 std::swap(Op0, Op1);
6847
6848 // If one of the operands is a constant vector zero, attempt to fold the
6849 // comparison to a specialized compare-against-zero form.
6850 if (ISD::isBuildVectorAllZeros(Op0.getNode()) &&
6851 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6852 Opc == ARMCC::NE)) {
6853 if (Opc == ARMCC::GE)
6854 Opc = ARMCC::LE;
6855 else if (Opc == ARMCC::GT)
6856 Opc = ARMCC::LT;
6857 std::swap(Op0, Op1);
6858 }
6859
6860 SDValue Result;
6861 if (ISD::isBuildVectorAllZeros(Op1.getNode()) &&
6862 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6863 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6864 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6865 DAG.getConstant(Opc, dl, MVT::i32));
6866 else
6867 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6868 DAG.getConstant(Opc, dl, MVT::i32));
6869
6870 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6871
6872 if (Invert)
6873 Result = DAG.getNOT(dl, Result, VT);
6874
6875 return Result;
6876}
6877
6878static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
6879 SDValue LHS = Op.getOperand(0);
6880 SDValue RHS = Op.getOperand(1);
6881 SDValue Carry = Op.getOperand(2);
6882 SDValue Cond = Op.getOperand(3);
6883 SDLoc DL(Op);
6884
6885 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6886
6887 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6888 // have to invert the carry first.
6889 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6890 DAG.getConstant(1, DL, MVT::i32), Carry);
6891 // This converts the boolean value carry into the carry flag.
6892 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6893
6894 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6895 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6896
6897 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6898 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6899 SDValue ARMcc = DAG.getConstant(
6900 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6901 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6902 Cmp.getValue(1));
6903}
6904
6905/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6906/// valid vector constant for a NEON or MVE instruction with a "modified
6907/// immediate" operand (e.g., VMOV). If so, return the encoded value.
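/// For example, a 32-bit splat of 0x00004300 matches the "only the second
/// byte is nonzero" case below (Op=x, Cmode=001x), so it is encoded with
/// OpCmode = 0x2 and Imm = 0x43.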
6908static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6909 unsigned SplatBitSize, SelectionDAG &DAG,
6910 const SDLoc &dl, EVT &VT, EVT VectorVT,
6911 VMOVModImmType type) {
6912 unsigned OpCmode, Imm;
6913 bool is128Bits = VectorVT.is128BitVector();
6914
6915 // SplatBitSize is set to the smallest size that splats the vector, so a
6916 // zero vector will always have SplatBitSize == 8. However, NEON modified
6917 // immediate instructions other than VMOV do not support the 8-bit encoding
6918 // of a zero vector, and the default encoding of zero is supposed to be the
6919 // 32-bit version.
6920 if (SplatBits == 0)
6921 SplatBitSize = 32;
6922
6923 switch (SplatBitSize) {
6924 case 8:
6925 if (type != VMOVModImm)
6926 return SDValue();
6927 // Any 1-byte value is OK. Op=0, Cmode=1110.
6928 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6929 OpCmode = 0xe;
6930 Imm = SplatBits;
6931 VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6932 break;
6933
6934 case 16:
6935 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6936 VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6937 if ((SplatBits & ~0xff) == 0) {
6938 // Value = 0x00nn: Op=x, Cmode=100x.
6939 OpCmode = 0x8;
6940 Imm = SplatBits;
6941 break;
6942 }
6943 if ((SplatBits & ~0xff00) == 0) {
6944 // Value = 0xnn00: Op=x, Cmode=101x.
6945 OpCmode = 0xa;
6946 Imm = SplatBits >> 8;
6947 break;
6948 }
6949 return SDValue();
6950
6951 case 32:
6952 // NEON's 32-bit VMOV supports splat values where:
6953 // * only one byte is nonzero, or
6954 // * the least significant byte is 0xff and the second byte is nonzero, or
6955 // * the least significant 2 bytes are 0xff and the third is nonzero.
6956 VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
6957 if ((SplatBits & ~0xff) == 0) {
6958 // Value = 0x000000nn: Op=x, Cmode=000x.
6959 OpCmode = 0;
6960 Imm = SplatBits;
6961 break;
6962 }
6963 if ((SplatBits & ~0xff00) == 0) {
6964 // Value = 0x0000nn00: Op=x, Cmode=001x.
6965 OpCmode = 0x2;
6966 Imm = SplatBits >> 8;
6967 break;
6968 }
6969 if ((SplatBits & ~0xff0000) == 0) {
6970 // Value = 0x00nn0000: Op=x, Cmode=010x.
6971 OpCmode = 0x4;
6972 Imm = SplatBits >> 16;
6973 break;
6974 }
6975 if ((SplatBits & ~0xff000000) == 0) {
6976 // Value = 0xnn000000: Op=x, Cmode=011x.
6977 OpCmode = 0x6;
6978 Imm = SplatBits >> 24;
6979 break;
6980 }
6981
6982 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6983 if (type == OtherModImm) return SDValue();
6984
6985 if ((SplatBits & ~0xffff) == 0 &&
6986 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
6987 // Value = 0x0000nnff: Op=x, Cmode=1100.
6988 OpCmode = 0xc;
6989 Imm = SplatBits >> 8;
6990 break;
6991 }
6992
6993 // cmode == 0b1101 is not supported for MVE VMVN
6994 if (type == MVEVMVNModImm)
6995 return SDValue();
6996
6997 if ((SplatBits & ~0xffffff) == 0 &&
6998 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
6999 // Value = 0x00nnffff: Op=x, Cmode=1101.
7000 OpCmode = 0xd;
7001 Imm = SplatBits >> 16;
7002 break;
7003 }
7004
7005 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
7006 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
7007 // VMOV.I32. A (very) minor optimization would be to replicate the value
7008 // and fall through here to test for a valid 64-bit splat. But, then the
7009 // caller would also need to check and handle the change in size.
7010 return SDValue();
7011
7012 case 64: {
7013 if (type != VMOVModImm)
7014 return SDValue();
7015 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
7016 uint64_t BitMask = 0xff;
7017 unsigned ImmMask = 1;
7018 Imm = 0;
7019 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
7020 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
7021 Imm |= ImmMask;
7022 } else if ((SplatBits & BitMask) != 0) {
7023 return SDValue();
7024 }
7025 BitMask <<= 8;
7026 ImmMask <<= 1;
7027 }
7028
7029 // Op=1, Cmode=1110.
7030 OpCmode = 0x1e;
7031 VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
7032 break;
7033 }
7034
7035 default:
7036 llvm_unreachable("unexpected size for isVMOVModifiedImm");
7037 }
7038
7039 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
7040 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
7041}
7042
7043SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
7044 const ARMSubtarget *ST) const {
7045 EVT VT = Op.getValueType();
7046 bool IsDouble = (VT == MVT::f64);
7047 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
7048 const APFloat &FPVal = CFP->getValueAPF();
7049
7050 // Prevent floating-point constants from using literal loads
7051 // when execute-only is enabled.
7052 if (ST->genExecuteOnly()) {
7053 // We shouldn't trigger this for v6m execute-only
7054 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
7055 "Unexpected architecture");
7056
7057 // If we can represent the constant as an immediate, don't lower it
7058 if (isFPImmLegal(FPVal, VT))
7059 return Op;
7060 // Otherwise, construct as integer, and move to float register
7061 APInt INTVal = FPVal.bitcastToAPInt();
7062 SDLoc DL(CFP);
7063 switch (VT.getSimpleVT().SimpleTy) {
7064 default:
7065 llvm_unreachable("Unknown floating point type!");
7066 break;
7067 case MVT::f64: {
7068 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
7069 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
7070 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
7071 }
7072 case MVT::f32:
7073 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
7074 DAG.getConstant(INTVal, DL, MVT::i32));
7075 }
7076 }
7077
7078 if (!ST->hasVFP3Base())
7079 return SDValue();
7080
7081 // Use the default (constant pool) lowering for double constants when we have
7082 // an SP-only FPU
7083 if (IsDouble && !Subtarget->hasFP64())
7084 return SDValue();
7085
7086 // Try splatting with a VMOV.f32...
7087 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
7088
7089 if (ImmVal != -1) {
7090 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
7091 // We have code in place to select a valid ConstantFP already, no need to
7092 // do any mangling.
7093 return Op;
7094 }
7095
7096 // It's a float and we are trying to use NEON operations where
7097 // possible. Lower it to a splat followed by an extract.
7098 SDLoc DL(Op);
7099 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
7100 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
7101 NewVal);
7102 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
7103 DAG.getConstant(0, DL, MVT::i32));
7104 }
7105
7106 // The rest of our options are NEON only, make sure that's allowed before
7107 // proceeding..
7108 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
7109 return SDValue();
7110
7111 EVT VMovVT;
7112 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
7113
7114 // It wouldn't really be worth bothering for doubles except for one very
7115 // important value, which does happen to match: 0.0. So make sure we don't do
7116 // anything stupid.
7117 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
7118 return SDValue();
7119
7120 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
7121 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
7122 VMovVT, VT, VMOVModImm);
7123 if (NewVal != SDValue()) {
7124 SDLoc DL(Op);
7125 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
7126 NewVal);
7127 if (IsDouble)
7128 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7129
7130 // It's a float: cast and extract a vector element.
7131 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7132 VecConstant);
7133 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7134 DAG.getConstant(0, DL, MVT::i32));
7135 }
7136
7137 // Finally, try a VMVN.i32
7138 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
7139 VT, VMVNModImm);
7140 if (NewVal != SDValue()) {
7141 SDLoc DL(Op);
7142 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
7143
7144 if (IsDouble)
7145 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7146
7147 // It's a float: cast and extract a vector element.
7148 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7149 VecConstant);
7150 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7151 DAG.getConstant(0, DL, MVT::i32));
7152 }
7153
7154 return SDValue();
7155}
7156
7157 // Check if a VEXT instruction can handle the shuffle mask when the
7158 // vector sources of the shuffle are the same.
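// For example, for v8i8 the single-source mask <2, 3, 4, 5, 6, 7, 0, 1> is a
// VEXT of the vector with itself and Imm = 2; the expected indices simply
// wrap around the end of the vector.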
7159static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7160 unsigned NumElts = VT.getVectorNumElements();
7161
7162 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7163 if (M[0] < 0)
7164 return false;
7165
7166 Imm = M[0];
7167
7168 // If this is a VEXT shuffle, the immediate value is the index of the first
7169 // element. The other shuffle indices must be the successive elements after
7170 // the first one.
7171 unsigned ExpectedElt = Imm;
7172 for (unsigned i = 1; i < NumElts; ++i) {
7173 // Increment the expected index. If it wraps around, just follow it
7174 // back to index zero and keep going.
7175 ++ExpectedElt;
7176 if (ExpectedElt == NumElts)
7177 ExpectedElt = 0;
7178
7179 if (M[i] < 0) continue; // ignore UNDEF indices
7180 if (ExpectedElt != static_cast<unsigned>(M[i]))
7181 return false;
7182 }
7183
7184 return true;
7185}
7186
7187static bool isVEXTMask(ArrayRef<int> M, EVT VT,
7188 bool &ReverseVEXT, unsigned &Imm) {
7189 unsigned NumElts = VT.getVectorNumElements();
7190 ReverseVEXT = false;
7191
7192 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7193 if (M[0] < 0)
7194 return false;
7195
7196 Imm = M[0];
7197
7198 // If this is a VEXT shuffle, the immediate value is the index of the first
7199 // element. The other shuffle indices must be the successive elements after
7200 // the first one.
7201 unsigned ExpectedElt = Imm;
7202 for (unsigned i = 1; i < NumElts; ++i) {
7203 // Increment the expected index. If it wraps around, it may still be
7204 // a VEXT but the source vectors must be swapped.
7205 ExpectedElt += 1;
7206 if (ExpectedElt == NumElts * 2) {
7207 ExpectedElt = 0;
7208 ReverseVEXT = true;
7209 }
7210
7211 if (M[i] < 0) continue; // ignore UNDEF indices
7212 if (ExpectedElt != static_cast<unsigned>(M[i]))
7213 return false;
7214 }
7215
7216 // Adjust the index value if the source operands will be swapped.
7217 if (ReverseVEXT)
7218 Imm -= NumElts;
7219
7220 return true;
7221}
7222
7223static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7224 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7225 // range, then 0 is placed into the resulting vector. So pretty much any mask
7226 // of 8 elements can work here.
7227 return VT == MVT::v8i8 && M.size() == 8;
7228}
7229
7230static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7231 unsigned Index) {
7232 if (Mask.size() == Elements * 2)
7233 return Index / Elements;
7234 return Mask[Index] == 0 ? 0 : 1;
7235}
7236
7237// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7238// checking that pairs of elements in the shuffle mask represent the same index
7239// in each vector, incrementing the expected index by 2 at each step.
7240// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7241// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7242// v2={e,f,g,h}
7243// WhichResult gives the offset for each element in the mask based on which
7244// of the two results it belongs to.
7245//
7246// The transpose can be represented either as:
7247// result1 = shufflevector v1, v2, result1_shuffle_mask
7248// result2 = shufflevector v1, v2, result2_shuffle_mask
7249// where v1/v2 and the shuffle masks have the same number of elements
7250// (here WhichResult (see below) indicates which result is being checked)
7251//
7252// or as:
7253// results = shufflevector v1, v2, shuffle_mask
7254// where both results are returned in one vector and the shuffle mask has twice
7255// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
7256// want to check the low half and high half of the shuffle mask as if it were
7257// the other case
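// For example, for v4i32 the combined (double-length) form of the transpose
// is the mask <0, 4, 2, 6, 1, 5, 3, 7>: the low half is result1's mask, the
// high half is result2's mask, and WhichResult is reported as 0.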
7258static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7259 unsigned EltSz = VT.getScalarSizeInBits();
7260 if (EltSz == 64)
7261 return false;
7262
7263 unsigned NumElts = VT.getVectorNumElements();
7264 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7265 return false;
7266
7267 // If the mask is twice as long as the input vector then we need to check the
7268 // upper and lower parts of the mask with a matching value for WhichResult
7269 // FIXME: A mask with only even values will be rejected in case the first
7270 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7271 // M[0] is used to determine WhichResult
7272 for (unsigned i = 0; i < M.size(); i += NumElts) {
7273 WhichResult = SelectPairHalf(NumElts, M, i);
7274 for (unsigned j = 0; j < NumElts; j += 2) {
7275 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7276 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7277 return false;
7278 }
7279 }
7280
7281 if (M.size() == NumElts*2)
7282 WhichResult = 0;
7283
7284 return true;
7285}
7286
7287/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7288/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7289/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7290static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7291 unsigned EltSz = VT.getScalarSizeInBits();
7292 if (EltSz == 64)
7293 return false;
7294
7295 unsigned NumElts = VT.getVectorNumElements();
7296 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7297 return false;
7298
7299 for (unsigned i = 0; i < M.size(); i += NumElts) {
7300 WhichResult = SelectPairHalf(NumElts, M, i);
7301 for (unsigned j = 0; j < NumElts; j += 2) {
7302 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7303 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7304 return false;
7305 }
7306 }
7307
7308 if (M.size() == NumElts*2)
7309 WhichResult = 0;
7310
7311 return true;
7312}
7313
7314// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7315// that the mask elements are either all even and in steps of size 2 or all odd
7316// and in steps of size 2.
7317// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7318// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7319// v2={e,f,g,h}
7320 // Requires checks similar to those of isVTRNMask with respect to how the
7321 // results are returned.
7322static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7323 unsigned EltSz = VT.getScalarSizeInBits();
7324 if (EltSz == 64)
7325 return false;
7326
7327 unsigned NumElts = VT.getVectorNumElements();
7328 if (M.size() != NumElts && M.size() != NumElts*2)
7329 return false;
7330
7331 for (unsigned i = 0; i < M.size(); i += NumElts) {
7332 WhichResult = SelectPairHalf(NumElts, M, i);
7333 for (unsigned j = 0; j < NumElts; ++j) {
7334 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7335 return false;
7336 }
7337 }
7338
7339 if (M.size() == NumElts*2)
7340 WhichResult = 0;
7341
7342 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7343 if (VT.is64BitVector() && EltSz == 32)
7344 return false;
7345
7346 return true;
7347}
7348
7349/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7350/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7351/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
7352static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7353 unsigned EltSz = VT.getScalarSizeInBits();
7354 if (EltSz == 64)
7355 return false;
7356
7357 unsigned NumElts = VT.getVectorNumElements();
7358 if (M.size() != NumElts && M.size() != NumElts*2)
7359 return false;
7360
7361 unsigned Half = NumElts / 2;
7362 for (unsigned i = 0; i < M.size(); i += NumElts) {
7363 WhichResult = SelectPairHalf(NumElts, M, i);
7364 for (unsigned j = 0; j < NumElts; j += Half) {
7365 unsigned Idx = WhichResult;
7366 for (unsigned k = 0; k < Half; ++k) {
7367 int MIdx = M[i + j + k];
7368 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7369 return false;
7370 Idx += 2;
7371 }
7372 }
7373 }
7374
7375 if (M.size() == NumElts*2)
7376 WhichResult = 0;
7377
7378 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7379 if (VT.is64BitVector() && EltSz == 32)
7380 return false;
7381
7382 return true;
7383}
7384
7385// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7386// that pairs of elements of the shufflemask represent the same index in each
7387// vector incrementing sequentially through the vectors.
7388// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7389// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7390// v2={e,f,g,h}
7391 // Requires checks similar to those of isVTRNMask with respect to how the
7392 // results are returned.
7393static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7394 unsigned EltSz = VT.getScalarSizeInBits();
7395 if (EltSz == 64)
7396 return false;
7397
7398 unsigned NumElts = VT.getVectorNumElements();
7399 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7400 return false;
7401
7402 for (unsigned i = 0; i < M.size(); i += NumElts) {
7403 WhichResult = SelectPairHalf(NumElts, M, i);
7404 unsigned Idx = WhichResult * NumElts / 2;
7405 for (unsigned j = 0; j < NumElts; j += 2) {
7406 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7407 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7408 return false;
7409 Idx += 1;
7410 }
7411 }
7412
7413 if (M.size() == NumElts*2)
7414 WhichResult = 0;
7415
7416 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7417 if (VT.is64BitVector() && EltSz == 32)
7418 return false;
7419
7420 return true;
7421}
7422
7423/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7424/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7425/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7426static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7427 unsigned EltSz = VT.getScalarSizeInBits();
7428 if (EltSz == 64)
7429 return false;
7430
7431 unsigned NumElts = VT.getVectorNumElements();
7432 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7433 return false;
7434
7435 for (unsigned i = 0; i < M.size(); i += NumElts) {
7436 WhichResult = SelectPairHalf(NumElts, M, i);
7437 unsigned Idx = WhichResult * NumElts / 2;
7438 for (unsigned j = 0; j < NumElts; j += 2) {
7439 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7440 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7441 return false;
7442 Idx += 1;
7443 }
7444 }
7445
7446 if (M.size() == NumElts*2)
7447 WhichResult = 0;
7448
7449 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7450 if (VT.is64BitVector() && EltSz == 32)
7451 return false;
7452
7453 return true;
7454}
7455
7456/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7457/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7458static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7459 unsigned &WhichResult,
7460 bool &isV_UNDEF) {
7461 isV_UNDEF = false;
7462 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7463 return ARMISD::VTRN;
7464 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7465 return ARMISD::VUZP;
7466 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7467 return ARMISD::VZIP;
7468
7469 isV_UNDEF = true;
7470 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7471 return ARMISD::VTRN;
7472 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7473 return ARMISD::VUZP;
7474 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7475 return ARMISD::VZIP;
7476
7477 return 0;
7478}
7479
7480/// \return true if this is a reverse operation on a vector.
7481static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7482 unsigned NumElts = VT.getVectorNumElements();
7483 // Make sure the mask has the right size.
7484 if (NumElts != M.size())
7485 return false;
7486
7487 // Look for <15, ..., 3, -1, 1, 0>.
7488 for (unsigned i = 0; i != NumElts; ++i)
7489 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7490 return false;
7491
7492 return true;
7493}
7494
7495static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7496 unsigned NumElts = VT.getVectorNumElements();
7497 // Make sure the mask has the right size.
7498 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7499 return false;
7500
7501 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7502 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7503 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7504 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7505 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7506 int Ofs = Top ? 1 : 0;
7507 int Upper = SingleSource ? 0 : NumElts;
7508 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7509 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7510 return false;
7511 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7512 return false;
7513 }
7514 return true;
7515}
7516
7517static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7518 unsigned NumElts = VT.getVectorNumElements();
7519 // Make sure the mask has the right size.
7520 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7521 return false;
7522
7523 // If Top
7524 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7525 // This inserts Input2 into Input1
7526 // else if not Top
7527 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7528 // This inserts Input1 into Input2
7529 unsigned Offset = Top ? 0 : 1;
7530 unsigned N = SingleSource ? 0 : NumElts;
7531 for (unsigned i = 0; i < NumElts; i += 2) {
7532 if (M[i] >= 0 && M[i] != (int)i)
7533 return false;
7534 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7535 return false;
7536 }
7537
7538 return true;
7539}
7540
7541static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7542 unsigned NumElts = ToVT.getVectorNumElements();
7543 if (NumElts != M.size())
7544 return false;
7545
7546 // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
7547 // looking for patterns of:
7548 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7549 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7550
7551 unsigned Off0 = rev ? NumElts / 2 : 0;
7552 unsigned Off1 = rev ? 0 : NumElts / 2;
7553 for (unsigned i = 0; i < NumElts; i += 2) {
7554 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7555 return false;
7556 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7557 return false;
7558 }
7559
7560 return true;
7561}
7562
7563// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7564// from a pair of inputs. For example:
7565// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7566// FP_ROUND(EXTRACT_ELT(Y, 0),
7567// FP_ROUND(EXTRACT_ELT(X, 1),
7568// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
7569static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
7570 const ARMSubtarget *ST) {
7571 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7572 if (!ST->hasMVEFloatOps())
7573 return SDValue();
7574
7575 SDLoc dl(BV);
7576 EVT VT = BV.getValueType();
7577 if (VT != MVT::v8f16)
7578 return SDValue();
7579
7580 // We are looking for a buildvector of fptrunc elements, where all the
7581 // elements are extracted in an interleaved fashion from two sources. Check
7582 // that the first two items are plausible and extract some info from them
7583 // (they are checked properly in the loop below).
7584 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7585 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7586 BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
7587 return SDValue();
7588 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7589 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7590 BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
7591 return SDValue();
7592 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7593 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7594 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7595 return SDValue();
7596
7597 // Check all the values in the BuildVector line up with our expectations.
7598 for (unsigned i = 1; i < 4; i++) {
7599 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7600 return Trunc.getOpcode() == ISD::FP_ROUND &&
7601 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7602 Trunc.getOperand(0).getOperand(0) == Op &&
7603 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7604 };
7605 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7606 return SDValue();
7607 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7608 return SDValue();
7609 }
7610
7611 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7612 DAG.getConstant(0, dl, MVT::i32));
7613 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7614 DAG.getConstant(1, dl, MVT::i32));
7615}
7616
7617// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7618// from a single input on alternating lanes. For example:
7619// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7620// FP_ROUND(EXTRACT_ELT(X, 2),
7621// FP_ROUND(EXTRACT_ELT(X, 4), ...)
7622static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
7623 const ARMSubtarget *ST) {
7624 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7625 if (!ST->hasMVEFloatOps())
7626 return SDValue();
7627
7628 SDLoc dl(BV);
7629 EVT VT = BV.getValueType();
7630 if (VT != MVT::v4f32)
7631 return SDValue();
7632
7633 // We are looking for a buildvector of fpext elements, where all the
7634 // elements are alternating lanes from a single source. For example <0,2,4,6>
7635 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7636 // info from them (they are checked properly in the loop below).
7637 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7638 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7639 return SDValue();
7640 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7641 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
7642 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7643 return SDValue();
7644
7645 // Check all the values in the BuildVector line up with our expectations.
7646 for (unsigned i = 1; i < 4; i++) {
7647 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7648 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7649 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7650 Trunc.getOperand(0).getOperand(0) == Op &&
7651 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7652 };
7653 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7654 return SDValue();
7655 }
7656
7657 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7658 DAG.getConstant(Offset, dl, MVT::i32));
7659}
7660
7661// If N is an integer constant that can be moved into a register in one
7662// instruction, return an SDValue of such a constant (will become a MOV
7663// instruction). Otherwise return null.
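// For example, in ARM mode 0x000003FC (0xff rotated right by 30 bits) is a
// valid so_imm and can be materialized with a single MOV, and a value whose
// bitwise complement is a so_imm can use a single MVN; in Thumb1 mode only
// values in [0, 255], or values whose complement is in that range, qualify.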
7664static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
7665 const ARMSubtarget *ST, const SDLoc &dl) {
7666 uint64_t Val;
7667 if (!isa<ConstantSDNode>(N))
7668 return SDValue();
7669 Val = N->getAsZExtVal();
7670
7671 if (ST->isThumb1Only()) {
7672 if (Val <= 255 || ~Val <= 255)
7673 return DAG.getConstant(Val, dl, MVT::i32);
7674 } else {
7675 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7676 return DAG.getConstant(Val, dl, MVT::i32);
7677 }
7678 return SDValue();
7679}
7680
7681static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
7682 const ARMSubtarget *ST) {
7683 SDLoc dl(Op);
7684 EVT VT = Op.getValueType();
7685
7686 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7687
7688 unsigned NumElts = VT.getVectorNumElements();
7689 unsigned BoolMask;
7690 unsigned BitsPerBool;
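// An MVE predicate register holds 16 bits, so each of the NumElts lanes is
// represented by 16 / NumElts identical bits; BoolMask is that group of bits
// for lane 0. For example, for v4i1 each lane occupies 4 predicate bits and
// lane i maps to bits [4*i+3 : 4*i].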
7691 if (NumElts == 2) {
7692 BitsPerBool = 8;
7693 BoolMask = 0xff;
7694 } else if (NumElts == 4) {
7695 BitsPerBool = 4;
7696 BoolMask = 0xf;
7697 } else if (NumElts == 8) {
7698 BitsPerBool = 2;
7699 BoolMask = 0x3;
7700 } else if (NumElts == 16) {
7701 BitsPerBool = 1;
7702 BoolMask = 0x1;
7703 } else
7704 return SDValue();
7705
7706 // If this is a single value copied into all lanes (a splat), we can just sign
7707 // extend that single value
7708 SDValue FirstOp = Op.getOperand(0);
7709 if (!isa<ConstantSDNode>(FirstOp) &&
7710 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7711 return U.get().isUndef() || U.get() == FirstOp;
7712 })) {
7713 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7714 DAG.getValueType(MVT::i1));
7715 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7716 }
7717
7718 // First create base with bits set where known
7719 unsigned Bits32 = 0;
7720 for (unsigned i = 0; i < NumElts; ++i) {
7721 SDValue V = Op.getOperand(i);
7722 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7723 continue;
7724 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7725 if (BitSet)
7726 Bits32 |= BoolMask << (i * BitsPerBool);
7727 }
7728
7729 // Add in unknown nodes
7730 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7731 DAG.getConstant(Bits32, dl, MVT::i32));
7732 for (unsigned i = 0; i < NumElts; ++i) {
7733 SDValue V = Op.getOperand(i);
7734 if (isa<ConstantSDNode>(V) || V.isUndef())
7735 continue;
7736 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7737 DAG.getConstant(i, dl, MVT::i32));
7738 }
7739
7740 return Base;
7741}
7742
7743static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
7744 const ARMSubtarget *ST) {
7745 if (!ST->hasMVEIntegerOps())
7746 return SDValue();
7747
7748 // We are looking for a buildvector where each element is Op[0] + i*N
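// For example, <x, x+2, x+4, x+6> with i32 elements becomes a VIDUP with
// base x and step N = 2, instead of a chain of per-lane inserts.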
7749 EVT VT = Op.getValueType();
7750 SDValue Op0 = Op.getOperand(0);
7751 unsigned NumElts = VT.getVectorNumElements();
7752
7753 // Get the increment value from operand 1
7754 SDValue Op1 = Op.getOperand(1);
7755 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7756 !isa<ConstantSDNode>(Op1.getOperand(1)))
7757 return SDValue();
7758 unsigned N = Op1.getConstantOperandVal(1);
7759 if (N != 1 && N != 2 && N != 4 && N != 8)
7760 return SDValue();
7761
7762 // Check that each other operand matches
7763 for (unsigned I = 2; I < NumElts; I++) {
7764 SDValue OpI = Op.getOperand(I);
7765 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7766 !isa<ConstantSDNode>(OpI.getOperand(1)) ||
7767 OpI.getConstantOperandVal(1) != I * N)
7768 return SDValue();
7769 }
7770
7771 SDLoc DL(Op);
7772 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7773 DAG.getConstant(N, DL, MVT::i32));
7774}
7775
7776// Returns true if the operation N can be treated as qr instruction variant at
7777// operand Op.
7778static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7779 switch (N->getOpcode()) {
7780 case ISD::ADD:
7781 case ISD::MUL:
7782 case ISD::SADDSAT:
7783 case ISD::UADDSAT:
7784 case ISD::AVGFLOORS:
7785 case ISD::AVGFLOORU:
7786 return true;
7787 case ISD::SUB:
7788 case ISD::SSUBSAT:
7789 case ISD::USUBSAT:
7790 return N->getOperand(1).getNode() == Op;
7791 case ISD::INTRINSIC_WO_CHAIN:
7792 switch (N->getConstantOperandVal(0)) {
7793 case Intrinsic::arm_mve_add_predicated:
7794 case Intrinsic::arm_mve_mul_predicated:
7795 case Intrinsic::arm_mve_qadd_predicated:
7796 case Intrinsic::arm_mve_vhadd:
7797 case Intrinsic::arm_mve_hadd_predicated:
7798 case Intrinsic::arm_mve_vqdmulh:
7799 case Intrinsic::arm_mve_qdmulh_predicated:
7800 case Intrinsic::arm_mve_vqrdmulh:
7801 case Intrinsic::arm_mve_qrdmulh_predicated:
7802 case Intrinsic::arm_mve_vqdmull:
7803 case Intrinsic::arm_mve_vqdmull_predicated:
7804 return true;
7805 case Intrinsic::arm_mve_sub_predicated:
7806 case Intrinsic::arm_mve_qsub_predicated:
7807 case Intrinsic::arm_mve_vhsub:
7808 case Intrinsic::arm_mve_hsub_predicated:
7809 return N->getOperand(2).getNode() == Op;
7810 default:
7811 return false;
7812 }
7813 default:
7814 return false;
7815 }
7816}
7817
7818// If this is a case we can't handle, return null and let the default
7819// expansion code take care of it.
7820SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7821 const ARMSubtarget *ST) const {
7822 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7823 SDLoc dl(Op);
7824 EVT VT = Op.getValueType();
7825
7826 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7827 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7828
7829 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7830 return R;
7831
7832 APInt SplatBits, SplatUndef;
7833 unsigned SplatBitSize;
7834 bool HasAnyUndefs;
7835 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7836 if (SplatUndef.isAllOnes())
7837 return DAG.getUNDEF(VT);
7838
7839 // If all the users of this constant splat are qr instruction variants,
7840 // generate a vdup of the constant.
7841 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7842 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7843 all_of(BVN->users(),
7844 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7845 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7846 : SplatBitSize == 16 ? MVT::v8i16
7847 : MVT::v16i8;
7848 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7849 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7850 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7851 }
7852
7853 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7854 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7855 // Check if an immediate VMOV works.
7856 EVT VmovVT;
7857 SDValue Val =
7858 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7859 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7860
7861 if (Val.getNode()) {
7862 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7863 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7864 }
7865
7866 // Try an immediate VMVN.
7867 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7868 Val = isVMOVModifiedImm(
7869 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7870 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7871 if (Val.getNode()) {
7872 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7873 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7874 }
7875
7876 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7877 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7878 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7879 if (ImmVal != -1) {
7880 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7881 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7882 }
7883 }
7884
7885 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7886 // type.
7887 if (ST->hasMVEIntegerOps() &&
7888 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7889 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7890 : SplatBitSize == 16 ? MVT::v8i16
7891 : MVT::v16i8;
7892 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7893 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7894 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7895 }
7896 }
7897 }
7898
7899 // Scan through the operands to see if only one value is used.
7900 //
7901 // As an optimisation, even if more than one value is used it may be more
7902 // profitable to splat with one value and then change some lanes.
7903 //
7904 // Heuristically we decide to do this if the vector has a "dominant" value,
7905 // defined as splatted to more than half of the lanes.
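// For example, <a, a, a, b> has dominant value a (3 of 4 lanes), so it is
// lowered as a VDUP of a followed by a single INSERT_VECTOR_ELT of b into
// lane 3.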
7906 unsigned NumElts = VT.getVectorNumElements();
7907 bool isOnlyLowElement = true;
7908 bool usesOnlyOneValue = true;
7909 bool hasDominantValue = false;
7910 bool isConstant = true;
7911
7912 // Map of the number of times a particular SDValue appears in the
7913 // element list.
7914 DenseMap<SDValue, unsigned> ValueCounts;
7915 SDValue Value;
7916 for (unsigned i = 0; i < NumElts; ++i) {
7917 SDValue V = Op.getOperand(i);
7918 if (V.isUndef())
7919 continue;
7920 if (i > 0)
7921 isOnlyLowElement = false;
7922 if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V))
7923 isConstant = false;
7924
7925 unsigned &Count = ValueCounts[V];
7926
7927 // Is this value dominant? (takes up more than half of the lanes)
7928 if (++Count > (NumElts / 2)) {
7929 hasDominantValue = true;
7930 Value = V;
7931 }
7932 }
7933 if (ValueCounts.size() != 1)
7934 usesOnlyOneValue = false;
7935 if (!Value.getNode() && !ValueCounts.empty())
7936 Value = ValueCounts.begin()->first;
7937
7938 if (ValueCounts.empty())
7939 return DAG.getUNDEF(VT);
7940
7941 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7942 // Keep going if we are hitting this case.
7943 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
7944 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7945
7946 unsigned EltSize = VT.getScalarSizeInBits();
7947
7948 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7949 // i32 and try again.
7950 if (hasDominantValue && EltSize <= 32) {
7951 if (!isConstant) {
7952 SDValue N;
7953
7954 // If we are VDUPing a value that comes directly from a vector, that will
7955 // cause an unnecessary move to and from a GPR, where instead we could
7956 // just use VDUPLANE. We can only do this if the lane being extracted
7957 // is at a constant index, as the VDUP from lane instructions only have
7958 // constant-index forms.
7959 ConstantSDNode *constIndex;
7960 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7961 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7962 // We need to create a new undef vector to use for the VDUPLANE if the
7963 // size of the vector from which we get the value is different than the
7964 // size of the vector that we need to create. We will insert the element
7965 // such that the register coalescer will remove unnecessary copies.
7966 if (VT != Value->getOperand(0).getValueType()) {
7967 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7968 VT.getVectorNumElements();
7969 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7970 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7971 Value, DAG.getConstant(index, dl, MVT::i32)),
7972 DAG.getConstant(index, dl, MVT::i32));
7973 } else
7974 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7975 Value->getOperand(0), Value->getOperand(1));
7976 } else
7977 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7978
7979 if (!usesOnlyOneValue) {
7980 // The dominant value was splatted as 'N', but we now have to insert
7981 // all differing elements.
7982 for (unsigned I = 0; I < NumElts; ++I) {
7983 if (Op.getOperand(I) == Value)
7984 continue;
7985 SmallVector<SDValue, 3> Ops;
7986 Ops.push_back(N);
7987 Ops.push_back(Op.getOperand(I));
7988 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7989 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7990 }
7991 }
7992 return N;
7993 }
7994 if (VT.getVectorElementType().isFloatingPoint()) {
7995 SmallVector<SDValue, 8> Ops;
7996 MVT FVT = VT.getVectorElementType().getSimpleVT();
7997 assert(FVT == MVT::f32 || FVT == MVT::f16);
7998 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
7999 for (unsigned i = 0; i < NumElts; ++i)
8000 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
8001 Op.getOperand(i)));
8002 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
8003 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
8004 Val = LowerBUILD_VECTOR(Val, DAG, ST);
8005 if (Val.getNode())
8006 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8007 }
8008 if (usesOnlyOneValue) {
8009 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
8010 if (isConstant && Val.getNode())
8011 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
8012 }
8013 }
8014
8015 // If all elements are constants and the case above didn't get hit, fall back
8016 // to the default expansion, which will generate a load from the constant
8017 // pool.
8018 if (isConstant)
8019 return SDValue();
8020
8021 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
8022 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
8023 // length <= 2.
8024 if (NumElts >= 4)
8025 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
8026 return shuffle;
8027
8028 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
8029 // VCVT's
8030 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
8031 return VCVT;
8032 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
8033 return VCVT;
8034
8035 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
8036 // If we haven't found an efficient lowering, try splitting a 128-bit vector
8037 // into two 64-bit vectors; we might discover a better way to lower it.
8038 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
8039 EVT ExtVT = VT.getVectorElementType();
8040 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
8041 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
8042 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
8043 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
8044 SDValue Upper =
8045 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
8046 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
8047 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
8048 if (Lower && Upper)
8049 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
8050 }
8051
8052 // Vectors with 32- or 64-bit elements can be built by directly assigning
8053 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
8054 // will be legalized.
8055 if (EltSize >= 32) {
8056 // Do the expansion with floating-point types, since that is what the VFP
8057 // registers are defined to use, and since i64 is not legal.
8058 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8059 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8060 SmallVector<SDValue, 8> Ops;
8061 for (unsigned i = 0; i < NumElts; ++i)
8062 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
8063 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8064 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8065 }
8066
8067 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
8068 // know the default expansion would otherwise fall back on something even
8069 // worse. For a vector with one or two non-undef values, that's
8070 // scalar_to_vector for the elements followed by a shuffle (provided the
8071 // shuffle is valid for the target) and materialization element by element
8072 // on the stack followed by a load for everything else.
8073 if (!isConstant && !usesOnlyOneValue) {
8074 SDValue Vec = DAG.getUNDEF(VT);
8075 for (unsigned i = 0 ; i < NumElts; ++i) {
8076 SDValue V = Op.getOperand(i);
8077 if (V.isUndef())
8078 continue;
8079 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
8080 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
8081 }
8082 return Vec;
8083 }
8084
8085 return SDValue();
8086}
8087
8088// Gather data to see if the operation can be modelled as a
8089// shuffle in combination with VEXTs.
8090SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
8091 SelectionDAG &DAG) const {
8092 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
8093 SDLoc dl(Op);
8094 EVT VT = Op.getValueType();
8095 unsigned NumElts = VT.getVectorNumElements();
8096
8097 struct ShuffleSourceInfo {
8098 SDValue Vec;
8099 unsigned MinElt = std::numeric_limits<unsigned>::max();
8100 unsigned MaxElt = 0;
8101
8102 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
8103 // be compatible with the shuffle we intend to construct. As a result
8104 // ShuffleVec will be some sliding window into the original Vec.
8105 SDValue ShuffleVec;
8106
8107 // Code should guarantee that element i in Vec starts at element
8108 // "WindowBase + i * WindowScale" in ShuffleVec.
8109 int WindowBase = 0;
8110 int WindowScale = 1;
8111
8112 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
8113
8114 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
8115 };
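// Illustrative example: building a v4i16 from lanes 2..5 of a single v8i16
// source keeps WindowScale == 1 and, once the VEXT step below has shifted
// the window down, WindowBase == -2, so source lane 2 maps to shuffle lane 0.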
8116
8117 // First gather all vectors used as an immediate source for this BUILD_VECTOR
8118 // node.
8119 SmallVector<ShuffleSourceInfo, 2> Sources;
8120 for (unsigned i = 0; i < NumElts; ++i) {
8121 SDValue V = Op.getOperand(i);
8122 if (V.isUndef())
8123 continue;
8124 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
8125 // A shuffle can only come from building a vector from various
8126 // elements of other vectors.
8127 return SDValue();
8128 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
8129 // Furthermore, shuffles require a constant mask, whereas extractelts
8130 // accept variable indices.
8131 return SDValue();
8132 }
8133
8134 // Add this element source to the list if it's not already there.
8135 SDValue SourceVec = V.getOperand(0);
8136 auto Source = llvm::find(Sources, SourceVec);
8137 if (Source == Sources.end())
8138 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
8139
8140 // Update the minimum and maximum lane number seen.
8141 unsigned EltNo = V.getConstantOperandVal(1);
8142 Source->MinElt = std::min(Source->MinElt, EltNo);
8143 Source->MaxElt = std::max(Source->MaxElt, EltNo);
8144 }
8145
8146 // Currently only do something sane when at most two source vectors
8147 // are involved.
8148 if (Sources.size() > 2)
8149 return SDValue();
8150
8151 // Find out the smallest element size among result and two sources, and use
8152 // it as element size to build the shuffle_vector.
8153 EVT SmallestEltTy = VT.getVectorElementType();
8154 for (auto &Source : Sources) {
8155 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8156 if (SrcEltTy.bitsLT(SmallestEltTy))
8157 SmallestEltTy = SrcEltTy;
8158 }
8159 unsigned ResMultiplier =
8160 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
8161 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
8162 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
8163
8164 // If the source vector is too wide or too narrow, we may nevertheless be able
8165 // to construct a compatible shuffle either by concatenating it with UNDEF or
8166 // extracting a suitable range of elements.
8167 for (auto &Src : Sources) {
8168 EVT SrcVT = Src.ShuffleVec.getValueType();
8169
8170 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8171 uint64_t VTSize = VT.getFixedSizeInBits();
8172 if (SrcVTSize == VTSize)
8173 continue;
8174
8175 // This stage of the search produces a source with the same element type as
8176 // the original, but with a total width matching the BUILD_VECTOR output.
8177 EVT EltVT = SrcVT.getVectorElementType();
8178 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8179 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8180
8181 if (SrcVTSize < VTSize) {
8182 if (2 * SrcVTSize != VTSize)
8183 return SDValue();
8184 // We can pad out the smaller vector for free, so if it's part of a
8185 // shuffle...
8186 Src.ShuffleVec =
8187 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8188 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8189 continue;
8190 }
8191
8192 if (SrcVTSize != 2 * VTSize)
8193 return SDValue();
8194
8195 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8196 // Span too large for a VEXT to cope
8197 return SDValue();
8198 }
8199
8200 if (Src.MinElt >= NumSrcElts) {
8201 // The extraction can just take the second half
8202 Src.ShuffleVec =
8203 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8204 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8205 Src.WindowBase = -NumSrcElts;
8206 } else if (Src.MaxElt < NumSrcElts) {
8207 // The extraction can just take the first half
8208 Src.ShuffleVec =
8209 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8210 DAG.getConstant(0, dl, MVT::i32));
8211 } else {
8212 // An actual VEXT is needed
8213 SDValue VEXTSrc1 =
8214 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8215 DAG.getConstant(0, dl, MVT::i32));
8216 SDValue VEXTSrc2 =
8217 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8218 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8219
8220 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8221 VEXTSrc2,
8222 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8223 Src.WindowBase = -Src.MinElt;
8224 }
8225 }
8226
8227 // Another possible incompatibility occurs from the vector element types. We
8228 // can fix this by bitcasting the source vectors to the same type we intend
8229 // for the shuffle.
8230 for (auto &Src : Sources) {
8231 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8232 if (SrcEltTy == SmallestEltTy)
8233 continue;
8234 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8235 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
8236 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8237 Src.WindowBase *= Src.WindowScale;
8238 }
8239
8240 // Final check before we try to actually produce a shuffle.
8241 LLVM_DEBUG({
8242 for (auto Src : Sources)
8243 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
8244 });
8245
8246 // The stars all align, our next step is to produce the mask for the shuffle.
8247 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8248 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8249 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8250 SDValue Entry = Op.getOperand(i);
8251 if (Entry.isUndef())
8252 continue;
8253
8254 auto Src = llvm::find(Sources, Entry.getOperand(0));
8255 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8256
8257 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8258 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8259 // segment.
8260 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8261 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8262 VT.getScalarSizeInBits());
8263 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8264
8265 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8266 // starting at the appropriate offset.
8267 int *LaneMask = &Mask[i * ResMultiplier];
8268
8269 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8270 ExtractBase += NumElts * (Src - Sources.begin());
8271 for (int j = 0; j < LanesDefined; ++j)
8272 LaneMask[j] = ExtractBase + j;
8273 }
8274
8275
8276 // We can't handle more than two sources. This should have already
8277 // been checked before this point.
8278 assert(Sources.size() <= 2 && "Too many sources!");
8279
8280 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8281 for (unsigned i = 0; i < Sources.size(); ++i)
8282 ShuffleOps[i] = Sources[i].ShuffleVec;
8283
8284 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8285 ShuffleOps[1], Mask, DAG);
8286 if (!Shuffle)
8287 return SDValue();
8288 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8289}
8290
8291 enum ShuffleOpCodes {
8292 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8293 OP_VREV,
8294 OP_VDUP0,
8295 OP_VDUP1,
8296 OP_VDUP2,
8297 OP_VDUP3,
8298 OP_VEXT1,
8299 OP_VEXT2,
8300 OP_VEXT3,
8301 OP_VUZPL, // VUZP, left result
8302 OP_VUZPR, // VUZP, right result
8303 OP_VZIPL, // VZIP, left result
8304 OP_VZIPR, // VZIP, right result
8305 OP_VTRNL, // VTRN, left result
8306 OP_VTRNR // VTRN, right result
8307};
8308
8309static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8310 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8311 switch (OpNum) {
8312 case OP_COPY:
8313 case OP_VREV:
8314 case OP_VDUP0:
8315 case OP_VDUP1:
8316 case OP_VDUP2:
8317 case OP_VDUP3:
8318 return true;
8319 }
8320 return false;
8321}
8322
8323/// isShuffleMaskLegal - Targets can use this to indicate that they only
8324/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8325/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8326 /// are assumed to be legal.
8327 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
8328 if (VT.getVectorNumElements() == 4 &&
8329 (VT.is128BitVector() || VT.is64BitVector())) {
8330 unsigned PFIndexes[4];
8331 for (unsigned i = 0; i != 4; ++i) {
8332 if (M[i] < 0)
8333 PFIndexes[i] = 8;
8334 else
8335 PFIndexes[i] = M[i];
8336 }
8337
8338 // Compute the index in the perfect shuffle table.
8339 unsigned PFTableIndex =
8340 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
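// Worked example (illustrative): for the mask <0,4,1,5> the per-lane indexes
// are {0,4,1,5} (undef lanes would be encoded as 8), so PFTableIndex is
// 0*729 + 4*81 + 1*9 + 5 = 338.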
8341 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8342 unsigned Cost = (PFEntry >> 30);
8343
8344 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8345 return true;
8346 }
8347
8348 bool ReverseVEXT, isV_UNDEF;
8349 unsigned Imm, WhichResult;
8350
8351 unsigned EltSize = VT.getScalarSizeInBits();
8352 if (EltSize >= 32 ||
8354 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8355 isVREVMask(M, VT, 64) ||
8356 isVREVMask(M, VT, 32) ||
8357 isVREVMask(M, VT, 16))
8358 return true;
8359 else if (Subtarget->hasNEON() &&
8360 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8361 isVTBLMask(M, VT) ||
8362 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8363 return true;
8364 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8365 isReverseMask(M, VT))
8366 return true;
8367 else if (Subtarget->hasMVEIntegerOps() &&
8368 (isVMOVNMask(M, VT, true, false) ||
8369 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8370 return true;
8371 else if (Subtarget->hasMVEIntegerOps() &&
8372 (isTruncMask(M, VT, false, false) ||
8373 isTruncMask(M, VT, false, true) ||
8374 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8375 return true;
8376 else
8377 return false;
8378}
8379
8380/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8381 /// the specified operations to build the shuffle.
8382 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
8383 SDValue RHS, SelectionDAG &DAG,
8384 const SDLoc &dl) {
8385 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8386 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8387 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
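// Each 32-bit table entry packs: bits [31:30] the cost, bits [29:26] the
// opcode (one of the OP_* values above), bits [25:13] the LHS entry id and
// bits [12:0] the RHS entry id, which is what the shifts above decode.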
8388
8389 if (OpNum == OP_COPY) {
8390 if (LHSID == (1*9+2)*9+3) return LHS;
8391 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8392 return RHS;
8393 }
8394
8395 SDValue OpLHS, OpRHS;
8396 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8397 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8398 EVT VT = OpLHS.getValueType();
8399
8400 switch (OpNum) {
8401 default: llvm_unreachable("Unknown shuffle opcode!");
8402 case OP_VREV:
8403 // VREV divides the vector in half and swaps within the half.
8404 if (VT.getScalarSizeInBits() == 32)
8405 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8406 // vrev <4 x i16> -> VREV32
8407 if (VT.getScalarSizeInBits() == 16)
8408 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8409 // vrev <4 x i8> -> VREV16
8410 assert(VT.getScalarSizeInBits() == 8);
8411 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8412 case OP_VDUP0:
8413 case OP_VDUP1:
8414 case OP_VDUP2:
8415 case OP_VDUP3:
8416 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8417 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8418 case OP_VEXT1:
8419 case OP_VEXT2:
8420 case OP_VEXT3:
8421 return DAG.getNode(ARMISD::VEXT, dl, VT,
8422 OpLHS, OpRHS,
8423 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8424 case OP_VUZPL:
8425 case OP_VUZPR:
8426 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8427 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8428 case OP_VZIPL:
8429 case OP_VZIPR:
8430 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8431 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8432 case OP_VTRNL:
8433 case OP_VTRNR:
8434 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8435 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8436 }
8437}
8438
8439 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
8440 ArrayRef<int> ShuffleMask,
8441 SelectionDAG &DAG) {
8442 // Check to see if we can use the VTBL instruction.
8443 SDValue V1 = Op.getOperand(0);
8444 SDValue V2 = Op.getOperand(1);
8445 SDLoc DL(Op);
8446
8447 SmallVector<SDValue, 8> VTBLMask;
8448 for (int I : ShuffleMask)
8449 VTBLMask.push_back(DAG.getSignedConstant(I, DL, MVT::i32));
8450
8451 if (V2.getNode()->isUndef())
8452 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8453 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8454
8455 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8456 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8457}
8458
8459 static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
8460 SDLoc DL(Op);
8461 EVT VT = Op.getValueType();
8462
8463 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8464 "Expect an v8i16/v16i8 type");
8465 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8466 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8467 // extract the first 8 bytes into the top double word and the last 8 bytes
8468 // into the bottom double word, through a new vector shuffle that will be
8469 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
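// E.g. for v8i16 the mask built below is <4,5,6,7,0,1,2,3>, which swaps the
// two double words of the VREV64 result (illustrative).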
8470 std::vector<int> NewMask;
8471 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8472 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8473 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8474 NewMask.push_back(i);
8475 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8476}
8477
8478 static EVT getVectorTyFromPredicateVector(EVT VT) {
8479 switch (VT.getSimpleVT().SimpleTy) {
8480 case MVT::v2i1:
8481 return MVT::v2f64;
8482 case MVT::v4i1:
8483 return MVT::v4i32;
8484 case MVT::v8i1:
8485 return MVT::v8i16;
8486 case MVT::v16i1:
8487 return MVT::v16i8;
8488 default:
8489 llvm_unreachable("Unexpected vector predicate type");
8490 }
8491}
8492
8493 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
8494 SelectionDAG &DAG) {
8495 // Converting from boolean predicates to integers involves creating a vector
8496 // of all ones or all zeroes and selecting the lanes based upon the real
8497 // predicate.
8498 SDValue AllOnes =
8499 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8500 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8501
8502 SDValue AllZeroes =
8503 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8504 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8505
8506 // Get full vector type from predicate type
8507 EVT NewVT = getVectorTyFromPredicateVector(VT);
8508
8509 SDValue RecastV1;
8510 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
8511 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8512 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8513 // since we know in hardware the sizes are really the same.
8514 if (VT != MVT::v16i1)
8515 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8516 else
8517 RecastV1 = Pred;
8518
8519 // Select either all ones or zeroes depending upon the real predicate bits.
8520 SDValue PredAsVector =
8521 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8522
8523 // Recast our new predicate-as-integer v16i8 vector into something
8524 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8525 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8526}
8527
8528 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
8529 const ARMSubtarget *ST) {
8530 EVT VT = Op.getValueType();
8531 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8532 ArrayRef<int> ShuffleMask = SVN->getMask();
8533
8534 assert(ST->hasMVEIntegerOps() &&
8535 "No support for vector shuffle of boolean predicates");
8536
8537 SDValue V1 = Op.getOperand(0);
8538 SDValue V2 = Op.getOperand(1);
8539 SDLoc dl(Op);
8540 if (isReverseMask(ShuffleMask, VT)) {
8541 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8542 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8543 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8544 DAG.getConstant(16, dl, MVT::i32));
8545 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8546 }
8547
8548 // Until we can come up with optimised cases for every single vector
8549 // shuffle in existence we have chosen the least painful strategy. This is
8550 // to essentially promote the boolean predicate to an 8-bit integer, where
8551 // each predicate represents a byte. Then we fall back on a normal integer
8552 // vector shuffle and convert the result back into a predicate vector. In
8553 // many cases the generated code might be even better than scalar code
8554 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8555 // fields in a register into 8 other arbitrary 2-bit fields!
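// For example (illustrative): a v8i1 shuffle is performed as a v8i16 shuffle
// of 0x0000/0xFFFF lanes, and the VCMPZ against zero at the end recreates
// the v8i1 predicate.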
8556 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8557 EVT NewVT = PredAsVector1.getValueType();
8558 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8559 : PromoteMVEPredVector(dl, V2, VT, DAG);
8560 assert(PredAsVector2.getValueType() == NewVT &&
8561 "Expected identical vector type in expanded i1 shuffle!");
8562
8563 // Do the shuffle!
8564 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8565 PredAsVector2, ShuffleMask);
8566
8567 // Now return the result of comparing the shuffled vector with zero,
8568 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8569 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8570 if (VT == MVT::v2i1) {
8571 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8572 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8573 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8574 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8575 }
8576 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8577 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8578}
8579
8580 static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
8581 ArrayRef<int> ShuffleMask,
8582 SelectionDAG &DAG) {
8583 // Attempt to lower the vector shuffle using as many whole register movs as
8584 // possible. This is useful for types smaller than 32 bits, which would
8585 // often otherwise become a series of GPR movs.
8586 SDLoc dl(Op);
8587 EVT VT = Op.getValueType();
8588 if (VT.getScalarSizeInBits() >= 32)
8589 return SDValue();
8590
8591 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8592 "Unexpected vector type");
8593 int NumElts = VT.getVectorNumElements();
8594 int QuarterSize = NumElts / 4;
8595 // The four final parts of the vector, as i32's
8596 SDValue Parts[4];
8597
8598 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8599 // <u,u,u,u>), returning the vmov lane index
8600 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8601 // Detect which mov lane this would be from the first non-undef element.
8602 int MovIdx = -1;
8603 for (int i = 0; i < Length; i++) {
8604 if (ShuffleMask[Start + i] >= 0) {
8605 if (ShuffleMask[Start + i] % Length != i)
8606 return -1;
8607 MovIdx = ShuffleMask[Start + i] / Length;
8608 break;
8609 }
8610 }
8611 // If all items are undef, leave this for other combines
8612 if (MovIdx == -1)
8613 return -1;
8614 // Check the remaining values are the correct part of the same mov
8615 for (int i = 1; i < Length; i++) {
8616 if (ShuffleMask[Start + i] >= 0 &&
8617 (ShuffleMask[Start + i] / Length != MovIdx ||
8618 ShuffleMask[Start + i] % Length != i))
8619 return -1;
8620 }
8621 return MovIdx;
8622 };
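// Illustrative example: for a v16i8 shuffle, a first-quarter mask of
// <4,5,6,7> is whole-lane: it is 32-bit lane 1 of the first input, so
// Part 0 becomes a single f32 extract below.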
8623
8624 for (int Part = 0; Part < 4; ++Part) {
8625 // Does this part look like a mov
8626 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8627 if (Elt != -1) {
8628 SDValue Input = Op->getOperand(0);
8629 if (Elt >= 4) {
8630 Input = Op->getOperand(1);
8631 Elt -= 4;
8632 }
8633 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8634 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8635 DAG.getConstant(Elt, dl, MVT::i32));
8636 }
8637 }
8638
8639 // Nothing interesting found, just return
8640 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8641 return SDValue();
8642
8643 // The other parts need to be built with the old shuffle vector, cast to a
8644 // v4i32 and extract_vector_elts
8645 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8646 SmallVector<int, 16> NewShuffleMask;
8647 for (int Part = 0; Part < 4; ++Part)
8648 for (int i = 0; i < QuarterSize; i++)
8649 NewShuffleMask.push_back(
8650 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8651 SDValue NewShuffle = DAG.getVectorShuffle(
8652 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8653 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8654
8655 for (int Part = 0; Part < 4; ++Part)
8656 if (!Parts[Part])
8657 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8658 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8659 }
8660 // Build a vector out of the various parts and bitcast it back to the original
8661 // type.
8662 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8663 return DAG.getBitcast(VT, NewVec);
8664}
8665
8666 static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
8667 ArrayRef<int> ShuffleMask,
8668 SelectionDAG &DAG) {
8669 SDValue V1 = Op.getOperand(0);
8670 SDValue V2 = Op.getOperand(1);
8671 EVT VT = Op.getValueType();
8672 unsigned NumElts = VT.getVectorNumElements();
8673
8674 // A one-off identity mask is one that is mostly an identity mask from a
8675 // single source but contains a single element out-of-place, either from a
8676 // different vector or from another position in the same vector. As opposed to
8677 // lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert
8678 // pair directly.
8679 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8680 int &OffElement) {
8681 OffElement = -1;
8682 int NonUndef = 0;
8683 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8684 if (Mask[i] == -1)
8685 continue;
8686 NonUndef++;
8687 if (Mask[i] != i + BaseOffset) {
8688 if (OffElement == -1)
8689 OffElement = i;
8690 else
8691 return false;
8692 }
8693 }
8694 return NonUndef > 2 && OffElement != -1;
8695 };
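// Illustrative example: the v4i32 mask <0,1,2,7> is an identity of V1 except
// for element 3, so it is lowered below as
// insert(V1, extract(V2, 3), 3) instead of a full shuffle.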
8696 int OffElement;
8697 SDValue VInput;
8698 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8699 VInput = V1;
8700 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8701 VInput = V2;
8702 else
8703 return SDValue();
8704
8705 SDLoc dl(Op);
8706 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8707 ? MVT::i32
8708 : VT.getScalarType();
8709 SDValue Elt = DAG.getNode(
8710 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8711 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8712 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8713 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8714 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8715}
8716
8717 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
8718 const ARMSubtarget *ST) {
8719 SDValue V1 = Op.getOperand(0);
8720 SDValue V2 = Op.getOperand(1);
8721 SDLoc dl(Op);
8722 EVT VT = Op.getValueType();
8723 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8724 unsigned EltSize = VT.getScalarSizeInBits();
8725
8726 if (ST->hasMVEIntegerOps() && EltSize == 1)
8727 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8728
8729 // Convert shuffles that are directly supported on NEON to target-specific
8730 // DAG nodes, instead of keeping them as shuffles and matching them again
8731 // during code selection. This is more efficient and avoids the possibility
8732 // of inconsistencies between legalization and selection.
8733 // FIXME: floating-point vectors should be canonicalized to integer vectors
8734 // of the same size so that they get CSEd properly.
8735 ArrayRef<int> ShuffleMask = SVN->getMask();
8736
8737 if (EltSize <= 32) {
8738 if (SVN->isSplat()) {
8739 int Lane = SVN->getSplatIndex();
8740 // If this is undef splat, generate it via "just" vdup, if possible.
8741 if (Lane == -1) Lane = 0;
8742
8743 // Test if V1 is a SCALAR_TO_VECTOR.
8744 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8745 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8746 }
8747 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8748 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8749 // reaches it).
8750 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8751 !isa<ConstantSDNode>(V1.getOperand(0))) {
8752 bool IsScalarToVector = true;
8753 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8754 if (!V1.getOperand(i).isUndef()) {
8755 IsScalarToVector = false;
8756 break;
8757 }
8758 if (IsScalarToVector)
8759 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8760 }
8761 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8762 DAG.getConstant(Lane, dl, MVT::i32));
8763 }
8764
8765 bool ReverseVEXT = false;
8766 unsigned Imm = 0;
8767 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8768 if (ReverseVEXT)
8769 std::swap(V1, V2);
8770 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8771 DAG.getConstant(Imm, dl, MVT::i32));
8772 }
8773
8774 if (isVREVMask(ShuffleMask, VT, 64))
8775 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8776 if (isVREVMask(ShuffleMask, VT, 32))
8777 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8778 if (isVREVMask(ShuffleMask, VT, 16))
8779 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8780
8781 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8782 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8783 DAG.getConstant(Imm, dl, MVT::i32));
8784 }
8785
8786 // Check for Neon shuffles that modify both input vectors in place.
8787 // If both results are used, i.e., if there are two shuffles with the same
8788 // source operands and with masks corresponding to both results of one of
8789 // these operations, DAG memoization will ensure that a single node is
8790 // used for both shuffles.
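// E.g. for v4i32 the mask <0,4,1,5> matches the first result of VZIP and
// <2,6,3,7> the second; both shuffles then share a single two-result node
// (illustrative).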
8791 unsigned WhichResult = 0;
8792 bool isV_UNDEF = false;
8793 if (ST->hasNEON()) {
8794 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8795 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8796 if (isV_UNDEF)
8797 V2 = V1;
8798 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8799 .getValue(WhichResult);
8800 }
8801 }
8802 if (ST->hasMVEIntegerOps()) {
8803 if (isVMOVNMask(ShuffleMask, VT, false, false))
8804 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8805 DAG.getConstant(0, dl, MVT::i32));
8806 if (isVMOVNMask(ShuffleMask, VT, true, false))
8807 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8808 DAG.getConstant(1, dl, MVT::i32));
8809 if (isVMOVNMask(ShuffleMask, VT, true, true))
8810 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8811 DAG.getConstant(1, dl, MVT::i32));
8812 }
8813
8814 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8815 // shuffles that produce a result larger than their operands with:
8816 // shuffle(concat(v1, undef), concat(v2, undef))
8817 // ->
8818 // shuffle(concat(v1, v2), undef)
8819 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8820 //
8821 // This is useful in the general case, but there are special cases where
8822 // native shuffles produce larger results: the two-result ops.
8823 //
8824 // Look through the concat when lowering them:
8825 // shuffle(concat(v1, v2), undef)
8826 // ->
8827 // concat(VZIP(v1, v2):0, :1)
8828 //
8829 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8830 SDValue SubV1 = V1->getOperand(0);
8831 SDValue SubV2 = V1->getOperand(1);
8832 EVT SubVT = SubV1.getValueType();
8833
8834 // We expect these to have been canonicalized to -1.
8835 assert(llvm::all_of(ShuffleMask, [&](int i) {
8836 return i < (int)VT.getVectorNumElements();
8837 }) && "Unexpected shuffle index into UNDEF operand!");
8838
8839 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8840 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8841 if (isV_UNDEF)
8842 SubV2 = SubV1;
8843 assert((WhichResult == 0) &&
8844 "In-place shuffle of concat can only have one result!");
8845 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8846 SubV1, SubV2);
8847 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8848 Res.getValue(1));
8849 }
8850 }
8851 }
8852
8853 if (ST->hasMVEIntegerOps() && EltSize <= 32) {
8854 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8855 return V;
8856
8857 for (bool Top : {false, true}) {
8858 for (bool SingleSource : {false, true}) {
8859 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8860 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8861 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8862 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8863 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8864 SingleSource ? V1 : V2);
8865 if (Top) {
8866 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8867 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8868 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8869 }
8870 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8871 }
8872 }
8873 }
8874 }
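// Illustrative example: the v8i16 mask <0,2,4,6,8,10,12,14> selects the low
// half of every i32 lane of the two inputs, so it is matched by isTruncMask
// above and emitted as an MVETRUNC of the two v4i32-cast operands.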
8875
8876 // If the shuffle is not directly supported and it has 4 elements, use
8877 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8878 unsigned NumElts = VT.getVectorNumElements();
8879 if (NumElts == 4) {
8880 unsigned PFIndexes[4];
8881 for (unsigned i = 0; i != 4; ++i) {
8882 if (ShuffleMask[i] < 0)
8883 PFIndexes[i] = 8;
8884 else
8885 PFIndexes[i] = ShuffleMask[i];
8886 }
8887
8888 // Compute the index in the perfect shuffle table.
8889 unsigned PFTableIndex =
8890 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8891 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8892 unsigned Cost = (PFEntry >> 30);
8893
8894 if (Cost <= 4) {
8895 if (ST->hasNEON())
8896 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8897 else if (isLegalMVEShuffleOp(PFEntry)) {
8898 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8899 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8900 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8901 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8902 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8903 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8904 }
8905 }
8906 }
8907
8908 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8909 if (EltSize >= 32) {
8910 // Do the expansion with floating-point types, since that is what the VFP
8911 // registers are defined to use, and since i64 is not legal.
8912 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8913 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8914 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8915 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
8916 SmallVector<SDValue, 8> Ops;
8917 for (unsigned i = 0; i < NumElts; ++i) {
8918 if (ShuffleMask[i] < 0)
8919 Ops.push_back(DAG.getUNDEF(EltVT));
8920 else
8921 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8922 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8923 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8924 dl, MVT::i32)));
8925 }
8926 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8927 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8928 }
8929
8930 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8931 isReverseMask(ShuffleMask, VT))
8932 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
8933
8934 if (ST->hasNEON() && VT == MVT::v8i8)
8935 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8936 return NewOp;
8937
8938 if (ST->hasMVEIntegerOps())
8939 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8940 return NewOp;
8941
8942 return SDValue();
8943}
8944
8945 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8946 const ARMSubtarget *ST) {
8947 EVT VecVT = Op.getOperand(0).getValueType();
8948 SDLoc dl(Op);
8949
8950 assert(ST->hasMVEIntegerOps() &&
8951 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8952
8953 SDValue Conv =
8954 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8955 unsigned Lane = Op.getConstantOperandVal(2);
8956 unsigned LaneWidth =
8957 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8958 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
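// Illustrative example: for a v4i1 predicate LaneWidth is 4, so inserting
// lane 2 rewrites the 4-bit field at bits [11:8] of the 16-bit predicate
// value via the BFI below.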
8959 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8960 Op.getOperand(1), DAG.getValueType(MVT::i1));
8961 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8962 DAG.getConstant(~Mask, dl, MVT::i32));
8963 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8964}
8965
8966SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8967 SelectionDAG &DAG) const {
8968 // INSERT_VECTOR_ELT is legal only for immediate indexes.
8969 SDValue Lane = Op.getOperand(2);
8970 if (!isa<ConstantSDNode>(Lane))
8971 return SDValue();
8972
8973 SDValue Elt = Op.getOperand(1);
8974 EVT EltVT = Elt.getValueType();
8975
8976 if (Subtarget->hasMVEIntegerOps() &&
8977 Op.getValueType().getScalarSizeInBits() == 1)
8978 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
8979
8980 if (getTypeAction(*DAG.getContext(), EltVT) ==
8981 TargetLowering::TypePromoteInteger) {
8982 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
8983 // but the type system will try to do that if we don't intervene.
8984 // Reinterpret any such vector-element insertion as one with the
8985 // corresponding integer types.
8986
8987 SDLoc dl(Op);
8988
8989 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8990 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8991 TargetLowering::TypePromoteInteger);
8992
8993 SDValue VecIn = Op.getOperand(0);
8994 EVT VecVT = VecIn.getValueType();
8995 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8996 VecVT.getVectorNumElements());
8997
8998 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
8999 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
9000 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
9001 IVecIn, IElt, Lane);
9002 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
9003 }
9004
9005 return Op;
9006}
9007
9008 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
9009 const ARMSubtarget *ST) {
9010 EVT VecVT = Op.getOperand(0).getValueType();
9011 SDLoc dl(Op);
9012
9013 assert(ST->hasMVEIntegerOps() &&
9014 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
9015
9016 SDValue Conv =
9017 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
9018 unsigned Lane = Op.getConstantOperandVal(1);
9019 unsigned LaneWidth =
9020 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
9021 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
9022 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
9023 return Shift;
9024}
9025
9026 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
9027 const ARMSubtarget *ST) {
9028 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
9029 SDValue Lane = Op.getOperand(1);
9030 if (!isa<ConstantSDNode>(Lane))
9031 return SDValue();
9032
9033 SDValue Vec = Op.getOperand(0);
9034 EVT VT = Vec.getValueType();
9035
9036 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9037 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
9038
9039 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
9040 SDLoc dl(Op);
9041 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
9042 }
9043
9044 return Op;
9045}
9046
9047 static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
9048 const ARMSubtarget *ST) {
9049 SDLoc dl(Op);
9050 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
9051 "Unexpected custom CONCAT_VECTORS lowering");
9052 assert(isPowerOf2_32(Op.getNumOperands()) &&
9053 "Unexpected custom CONCAT_VECTORS lowering");
9054 assert(ST->hasMVEIntegerOps() &&
9055 "CONCAT_VECTORS lowering only supported for MVE");
9056
9057 auto ConcatPair = [&](SDValue V1, SDValue V2) {
9058 EVT Op1VT = V1.getValueType();
9059 EVT Op2VT = V2.getValueType();
9060 assert(Op1VT == Op2VT && "Operand types don't match!");
9061 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
9062 "Unexpected i1 concat operations!");
9063 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
9064
9065 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9066 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
9067
9068 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
9069 // promoted to v8i16, etc.
9070 MVT ElType =
9072 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
9073
9074 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
9075 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
9076 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
9077 // ConcatVT.
9078 SDValue ConVec =
9079 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
9080 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9081 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9082 }
9083
9084 // Extract the vector elements from Op1 and Op2 one by one and truncate them
9085 // to be the right size for the destination. For example, if Op1 is v4i1
9086 // then the promoted vector is v4i32. The result of concatenation gives a
9087 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
9088 // needs truncating to i16 and inserting in the result.
9089 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
9090 EVT NewVT = NewV.getValueType();
9091 EVT ConcatVT = ConVec.getValueType();
9092 unsigned ExtScale = 1;
9093 if (NewVT == MVT::v2f64) {
9094 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
9095 ExtScale = 2;
9096 }
9097 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
9098 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
9099 DAG.getIntPtrConstant(i * ExtScale, dl));
9100 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
9101 DAG.getConstant(j, dl, MVT::i32));
9102 }
9103 return ConVec;
9104 };
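// Illustrative walk-through: concatenating two v2i1 predicates promotes each
// to v2f64, recasts to v4i32, copies i32 lanes 0 and 2 of each into a v4i32,
// and the final VCMPZ against zero rebuilds the v4i1 result.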
9105 unsigned j = 0;
9106 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
9107 ConVec = ExtractInto(NewV1, ConVec, j);
9108 ConVec = ExtractInto(NewV2, ConVec, j);
9109
9110 // Now return the result of comparing the subvector with zero, which will
9111 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9112 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9113 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9114 };
9115
9116 // Concat each pair of subvectors and pack into the lower half of the array.
9117 SmallVector<SDValue> ConcatOps(Op->ops());
9118 while (ConcatOps.size() > 1) {
9119 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
9120 SDValue V1 = ConcatOps[I];
9121 SDValue V2 = ConcatOps[I + 1];
9122 ConcatOps[I / 2] = ConcatPair(V1, V2);
9123 }
9124 ConcatOps.resize(ConcatOps.size() / 2);
9125 }
9126 return ConcatOps[0];
9127}
9128
9129 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9130 const ARMSubtarget *ST) {
9131 EVT VT = Op->getValueType(0);
9132 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9133 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
9134
9135 // The only time a CONCAT_VECTORS operation can have legal types is when
9136 // two 64-bit vectors are concatenated to a 128-bit vector.
9137 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
9138 "unexpected CONCAT_VECTORS");
9139 SDLoc dl(Op);
9140 SDValue Val = DAG.getUNDEF(MVT::v2f64);
9141 SDValue Op0 = Op.getOperand(0);
9142 SDValue Op1 = Op.getOperand(1);
9143 if (!Op0.isUndef())
9144 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9145 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
9146 DAG.getIntPtrConstant(0, dl));
9147 if (!Op1.isUndef())
9148 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9149 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
9150 DAG.getIntPtrConstant(1, dl));
9151 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
9152}
9153
9154 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
9155 const ARMSubtarget *ST) {
9156 SDValue V1 = Op.getOperand(0);
9157 SDValue V2 = Op.getOperand(1);
9158 SDLoc dl(Op);
9159 EVT VT = Op.getValueType();
9160 EVT Op1VT = V1.getValueType();
9161 unsigned NumElts = VT.getVectorNumElements();
9162 unsigned Index = V2->getAsZExtVal();
9163
9164 assert(VT.getScalarSizeInBits() == 1 &&
9165 "Unexpected custom EXTRACT_SUBVECTOR lowering");
9166 assert(ST->hasMVEIntegerOps() &&
9167 "EXTRACT_SUBVECTOR lowering only supported for MVE");
9168
9169 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9170
9171 // We now have Op1 promoted to a vector of integers, where v8i1 gets
9172 // promoted to v8i16, etc.
9173
9175
9176 if (NumElts == 2) {
9177 EVT SubVT = MVT::v4i32;
9178 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9179 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
9180 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9181 DAG.getIntPtrConstant(i, dl));
9182 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9183 DAG.getConstant(j, dl, MVT::i32));
9184 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9185 DAG.getConstant(j + 1, dl, MVT::i32));
9186 }
9187 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
9188 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9189 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
9190 }
9191
9192 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
9193 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9194 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
9195 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9196 DAG.getIntPtrConstant(i, dl));
9197 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9198 DAG.getConstant(j, dl, MVT::i32));
9199 }
9200
9201 // Now return the result of comparing the subvector with zero,
9202 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9203 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
9204 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9205}
9206
9207 // Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
9208 static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
9209 const ARMSubtarget *ST) {
9210 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
9211 EVT VT = N->getValueType(0);
9212 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9213 "Expected a vector i1 type!");
9214 SDValue Op = N->getOperand(0);
9215 EVT FromVT = Op.getValueType();
9216 SDLoc DL(N);
9217
9218 SDValue And =
9219 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9220 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9221 DAG.getCondCode(ISD::SETNE));
9222}
9223
9224 static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
9225 const ARMSubtarget *Subtarget) {
9226 if (!Subtarget->hasMVEIntegerOps())
9227 return SDValue();
9228
9229 EVT ToVT = N->getValueType(0);
9230 if (ToVT.getScalarType() == MVT::i1)
9231 return LowerTruncatei1(N, DAG, Subtarget);
9232
9233 // MVE does not have a single instruction to perform the truncation of a v4i32
9234 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9235 // Most of the instructions in MVE follow the 'Beats' system, where moving
9236 // values from different lanes is usually something that the instructions
9237 // avoid.
9238 //
9239 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9240 // which take the top/bottom half of a larger lane and extend it (or do the
9241 // opposite, truncating into the top/bottom lane from a larger lane). Note
9242 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9243 // bottom 16bits from each vector lane. This works really well with T/B
9244 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9245 // to move order.
9246 //
9247 // But truncates and sext/zext are always going to be fairly common from llvm.
9248 // We have several options for how to deal with them:
9249 // - Wherever possible combine them into an instruction that makes them
9250 // "free". This includes loads/stores, which can perform the trunc as part
9251 // of the memory operation. Or certain shuffles that can be turned into
9252 // VMOVN/VMOVL.
9253 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9254 // trunc(mul(sext(a), sext(b))) may become
9255 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9256 // this case can use VMULL). This is performed in the
9257 // MVELaneInterleavingPass.
9258 // - Otherwise we have an option. By default we would expand the
9259 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9260 // registers. One for each vector lane in the vector. This can obviously be
9261 // very expensive.
9262 // - The other option is to use the fact that loads/stores can extend/truncate
9263 // to turn a trunc into two truncating stack stores and a stack reload. This
9264 // becomes 3 back-to-back memory operations, but at least that is less than
9265 // all the insert/extracts.
9266 //
9267 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9268 // are either optimized where they can be, or eventually lowered into stack
9269 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9270 // too early, where other instructions would be better, and stops us from
9271 // having to reconstruct multiple buildvector shuffles into loads/stores.
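// E.g. an ISD::TRUNCATE from v8i32 to v8i16 is split below into two v4i32
// halves and emitted as a single MVETRUNC node (illustrative).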
9272 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9273 return SDValue();
9274 EVT FromVT = N->getOperand(0).getValueType();
9275 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9276 return SDValue();
9277
9278 SDValue Lo, Hi;
9279 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9280 SDLoc DL(N);
9281 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9282}
9283
9285 const ARMSubtarget *Subtarget) {
9286 if (!Subtarget->hasMVEIntegerOps())
9287 return SDValue();
9288
9289 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9290
9291 EVT ToVT = N->getValueType(0);
9292 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9293 return SDValue();
9294 SDValue Op = N->getOperand(0);
9295 EVT FromVT = Op.getValueType();
9296 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9297 return SDValue();
9298
9299 SDLoc DL(N);
9300 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
9301 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9302 ExtVT = MVT::v8i16;
9303
9304 unsigned Opcode =
9305 N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
9306 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9307 SDValue Ext1 = Ext.getValue(1);
9308
9309 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9310 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9311 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9312 }
9313
9314 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9315}
9316
9317/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9318/// element has been zero/sign-extended, depending on the isSigned parameter,
9319 /// from an integer type half its size.
9320 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
9321 bool isSigned) {
9322 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9323 EVT VT = N->getValueType(0);
9324 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9325 SDNode *BVN = N->getOperand(0).getNode();
9326 if (BVN->getValueType(0) != MVT::v4i32 ||
9327 BVN->getOpcode() != ISD::BUILD_VECTOR)
9328 return false;
9329 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9330 unsigned HiElt = 1 - LoElt;
9331 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
9332 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
9333 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
9334 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
9335 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9336 return false;
9337 if (isSigned) {
9338 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9339 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9340 return true;
9341 } else {
9342 if (Hi0->isZero() && Hi1->isZero())
9343 return true;
9344 }
9345 return false;
9346 }
9347
9348 if (N->getOpcode() != ISD::BUILD_VECTOR)
9349 return false;
9350
9351 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9352 SDNode *Elt = N->getOperand(i).getNode();
9353 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
9354 unsigned EltSize = VT.getScalarSizeInBits();
9355 unsigned HalfSize = EltSize / 2;
9356 if (isSigned) {
9357 if (!isIntN(HalfSize, C->getSExtValue()))
9358 return false;
9359 } else {
9360 if (!isUIntN(HalfSize, C->getZExtValue()))
9361 return false;
9362 }
9363 continue;
9364 }
9365 return false;
9366 }
9367
9368 return true;
9369}
9370
9371/// isSignExtended - Check if a node is a vector value that is sign-extended
9372 /// or a constant BUILD_VECTOR with sign-extended elements.
9373 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
9374 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9375 return true;
9376 if (isExtendedBUILD_VECTOR(N, DAG, true))
9377 return true;
9378 return false;
9379}
9380
9381/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9382 /// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
9383 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
9384 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
9385 ISD::isZEXTLoad(N))
9386 return true;
9387 if (isExtendedBUILD_VECTOR(N, DAG, false))
9388 return true;
9389 return false;
9390}
9391
9392static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9393 if (OrigVT.getSizeInBits() >= 64)
9394 return OrigVT;
9395
9396 assert(OrigVT.isSimple() && "Expecting a simple value type");
9397
9398 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9399 switch (OrigSimpleTy) {
9400 default: llvm_unreachable("Unexpected Vector Type");
9401 case MVT::v2i8:
9402 case MVT::v2i16:
9403 return MVT::v2i32;
9404 case MVT::v4i8:
9405 return MVT::v4i16;
9406 }
9407}
9408
9409/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9410/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9411 /// We insert the required extension here to get the vector to fill a D register.
9412 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
9413 const EVT &OrigTy,
9414 const EVT &ExtTy,
9415 unsigned ExtOpcode) {
9416 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9417 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9418 // 64-bits we need to insert a new extension so that it will be 64-bits.
9419 assert(ExtTy.is128BitVector() && "Unexpected extension size");
9420 if (OrigTy.getSizeInBits() >= 64)
9421 return N;
9422
9423 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9424 EVT NewVT = getExtensionTo64Bits(OrigTy);
9425
9426 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9427}
9428
9429/// SkipLoadExtensionForVMULL - return a load of the original vector size that
9430/// does not do any sign/zero extension. If the original vector is less
9431/// than 64 bits, an appropriate extension will be added after the load to
9432/// reach a total size of 64 bits. We have to add the extension separately
9433 /// because ARM does not have a sign/zero extending load for vectors.
9434 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG) {
9435 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9436
9437 // The load already has the right type.
9438 if (ExtendedTy == LD->getMemoryVT())
9439 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9440 LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
9441 LD->getMemOperand()->getFlags());
9442
9443 // We need to create a zextload/sextload. We cannot just create a load
9444 // followed by a zext/sext node because LowerMUL is also run during normal
9445 // operation legalization where we can't create illegal types.
9446 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9447 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9448 LD->getMemoryVT(), LD->getAlign(),
9449 LD->getMemOperand()->getFlags());
9450}
9451
9452/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9453/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9454/// the unextended value. The unextended vector should be 64 bits so that it can
9455/// be used as an operand to a VMULL instruction. If the original vector size
9456 /// before extension is less than 64 bits we add an extension to resize
9457 /// the vector to 64 bits.
9458 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
9459 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9460 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9461 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9462 N->getOperand(0)->getValueType(0),
9463 N->getValueType(0),
9464 N->getOpcode());
9465
9466 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9467 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9468 "Expected extending load");
9469
9470 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9471 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9472 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9473 SDValue extLoad =
9474 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
9475 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9476
9477 return newLoad;
9478 }
9479
9480 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9481 // have been legalized as a BITCAST from v4i32.
9482 if (N->getOpcode() == ISD::BITCAST) {
9483 SDNode *BVN = N->getOperand(0).getNode();
9484 assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
9485 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
9486 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9487 return DAG.getBuildVector(
9488 MVT::v2i32, SDLoc(N),
9489 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9490 }
9491 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9492 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9493 EVT VT = N->getValueType(0);
9494 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9495 unsigned NumElts = VT.getVectorNumElements();
9496 MVT TruncVT = MVT::getIntegerVT(EltSize);
9497 SmallVector<SDValue, 8> Ops;
9498 SDLoc dl(N);
9499 for (unsigned i = 0; i != NumElts; ++i) {
9500 const APInt &CInt = N->getConstantOperandAPInt(i);
9501 // Element types smaller than 32 bits are not legal, so use i32 elements.
9502 // The values are implicitly truncated so sext vs. zext doesn't matter.
9503 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9504 }
9505 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
9506}
9507
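// Illustrative only (not from this file): the extension-stripping helpers above
// exist so that a widening multiply can be matched as a single NEON VMULL. A
// minimal sketch of the target pattern with <arm_neon.h> intrinsics, assuming a
// NEON-capable build:
#include <arm_neon.h>
int16x8_t widening_mul_s8(int8x8_t a, int8x8_t b) {
  return vmull_s8(a, b); // one VMULL.S8 instead of two extends plus a VMUL
}
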
9508static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9509 unsigned Opcode = N->getOpcode();
9510 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9511 SDNode *N0 = N->getOperand(0).getNode();
9512 SDNode *N1 = N->getOperand(1).getNode();
9513 return N0->hasOneUse() && N1->hasOneUse() &&
9514 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9515 }
9516 return false;
9517}
9518
9519static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9520 unsigned Opcode = N->getOpcode();
9521 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9522 SDNode *N0 = N->getOperand(0).getNode();
9523 SDNode *N1 = N->getOperand(1).getNode();
9524 return N0->hasOneUse() && N1->hasOneUse() &&
9525 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9526 }
9527 return false;
9528}
9529
9530static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
9531 // Multiplications are only custom-lowered for 128-bit vectors so that
9532 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
9533 EVT VT = Op.getValueType();
9534 assert(VT.is128BitVector() && VT.isInteger() &&
9535 "unexpected type for custom-lowering ISD::MUL");
9536 SDNode *N0 = Op.getOperand(0).getNode();
9537 SDNode *N1 = Op.getOperand(1).getNode();
9538 unsigned NewOpc = 0;
9539 bool isMLA = false;
9540 bool isN0SExt = isSignExtended(N0, DAG);
9541 bool isN1SExt = isSignExtended(N1, DAG);
9542 if (isN0SExt && isN1SExt)
9543 NewOpc = ARMISD::VMULLs;
9544 else {
9545 bool isN0ZExt = isZeroExtended(N0, DAG);
9546 bool isN1ZExt = isZeroExtended(N1, DAG);
9547 if (isN0ZExt && isN1ZExt)
9548 NewOpc = ARMISD::VMULLu;
9549 else if (isN1SExt || isN1ZExt) {
9550 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9551 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
9552 if (isN1SExt && isAddSubSExt(N0, DAG)) {
9553 NewOpc = ARMISD::VMULLs;
9554 isMLA = true;
9555 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
9556 NewOpc = ARMISD::VMULLu;
9557 isMLA = true;
9558 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
9559 std::swap(N0, N1);
9560 NewOpc = ARMISD::VMULLu;
9561 isMLA = true;
9562 }
9563 }
9564
9565 if (!NewOpc) {
9566 if (VT == MVT::v2i64)
9567 // Fall through to expand this. It is not legal.
9568 return SDValue();
9569 else
9570 // Other vector multiplications are legal.
9571 return Op;
9572 }
9573 }
9574
9575 // Legalize to a VMULL instruction.
9576 SDLoc DL(Op);
9577 SDValue Op0;
9578 SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
9579 if (!isMLA) {
9580 Op0 = SkipExtensionForVMULL(N0, DAG);
9581 assert(Op0.getValueType().is64BitVector() &&
9582 Op1.getValueType().is64BitVector() &&
9583 "unexpected types for extended operands to VMULL");
9584 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
9585 }
9586
9587 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
9588 // isel lowering to take advantage of no-stall back to back vmul + vmla.
9589 // vmull q0, d4, d6
9590 // vmlal q0, d5, d6
9591 // is faster than
9592 // vaddl q0, d4, d5
9593 // vmovl q1, d6
9594 // vmul q0, q0, q1
9595 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
9596 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
9597 EVT Op1VT = Op1.getValueType();
9598 return DAG.getNode(N0->getOpcode(), DL, VT,
9599 DAG.getNode(NewOpc, DL, VT,
9600 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
9601 DAG.getNode(NewOpc, DL, VT,
9602 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
9603}
9604
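// Illustrative only: the isMLA path above rewrites (sext(a) + sext(b)) * sext(c)
// into a widening multiply followed by a widening multiply-accumulate. A minimal
// sketch with <arm_neon.h> intrinsics (assumed NEON target, names hypothetical):
#include <arm_neon.h>
int32x4_t mla_split(int16x4_t a, int16x4_t b, int16x4_t c) {
  int32x4_t acc = vmull_s16(a, c);   // vmull.s16
  return vmlal_s16(acc, b, c);       // vmlal.s16, no vaddl/vmovl needed
}
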
9605static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
9606 SelectionDAG &DAG) {
9607 // TODO: Should this propagate fast-math-flags?
9608
9609 // Convert to float
9610 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9611 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9612 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9613 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9614 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9615 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9616 // Get reciprocal estimate.
9617 // float4 recip = vrecpeq_f32(yf);
9618 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9619 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9620 Y);
9621 // Because char has a smaller range than uchar, we can actually get away
9622 // without any Newton steps. This requires that we use a weird bias
9623 // of 0xb000, however (again, this has been exhaustively tested).
9624 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
9625 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
9626 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9627 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9628 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9629 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9630 // Convert back to short.
9631 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9632 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9633 return X;
9634}
9635
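// Illustrative only: the "+ 0xb000" above (and the later "+ 0x89" / "+ 2"
// adjustments) work by bumping the raw bit pattern of a positive float, which
// moves it upward by that many ulps. A sketch of the mechanism (hypothetical
// helper, assumes positive finite inputs):
#include <cstdint>
#include <cstring>
static float nudgeUpByUlps(float f, uint32_t ulps) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits)); // reinterpret, as the BITCAST nodes do
  bits += ulps;                         // each increment is one ulp for positive f
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}
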
9636static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
9637 SelectionDAG &DAG) {
9638 // TODO: Should this propagate fast-math-flags?
9639
9640 SDValue N2;
9641 // Convert to float.
9642 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9643 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9644 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9645 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9646 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9647 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9648
9649 // Use reciprocal estimate and one refinement step.
9650 // float4 recip = vrecpeq_f32(yf);
9651 // recip *= vrecpsq_f32(yf, recip);
9652 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9653 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9654 N1);
9655 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9656 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9657 N1, N2);
9658 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9659 // Because short has a smaller range than ushort, we can actually get away
9660 // with only a single Newton step. This requires that we use a weird bias
9661 // of 0x89, however (again, this has been exhaustively tested).
9662 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9663 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9664 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9665 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9666 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9667 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9668 // Convert back to integer and return.
9669 // return vmovn_s32(vcvt_s32_f32(result));
9670 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9671 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9672 return N0;
9673}
9674
9675static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
9676 const ARMSubtarget *ST) {
9677 EVT VT = Op.getValueType();
9678 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9679 "unexpected type for custom-lowering ISD::SDIV");
9680
9681 SDLoc dl(Op);
9682 SDValue N0 = Op.getOperand(0);
9683 SDValue N1 = Op.getOperand(1);
9684 SDValue N2, N3;
9685
9686 if (VT == MVT::v8i8) {
9687 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9688 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9689
9690 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9691 DAG.getIntPtrConstant(4, dl));
9692 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9693 DAG.getIntPtrConstant(4, dl));
9694 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9695 DAG.getIntPtrConstant(0, dl));
9696 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9697 DAG.getIntPtrConstant(0, dl));
9698
9699 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9700 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9701
9702 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9703 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9704
9705 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9706 return N0;
9707 }
9708 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9709}
9710
9711static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
9712 const ARMSubtarget *ST) {
9713 // TODO: Should this propagate fast-math-flags?
9714 EVT VT = Op.getValueType();
9715 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9716 "unexpected type for custom-lowering ISD::UDIV");
9717
9718 SDLoc dl(Op);
9719 SDValue N0 = Op.getOperand(0);
9720 SDValue N1 = Op.getOperand(1);
9721 SDValue N2, N3;
9722
9723 if (VT == MVT::v8i8) {
9724 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9725 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9726
9727 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9728 DAG.getIntPtrConstant(4, dl));
9729 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9730 DAG.getIntPtrConstant(4, dl));
9731 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9732 DAG.getIntPtrConstant(0, dl));
9733 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9734 DAG.getIntPtrConstant(0, dl));
9735
9736 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9737 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9738
9739 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9740 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9741
9742 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9743 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9744 MVT::i32),
9745 N0);
9746 return N0;
9747 }
9748
9749 // v4i16 udiv ... Convert to float.
9750 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9751 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9752 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9753 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9754 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9755 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9756
9757 // Use reciprocal estimate and two refinement steps.
9758 // float4 recip = vrecpeq_f32(yf);
9759 // recip *= vrecpsq_f32(yf, recip);
9760 // recip *= vrecpsq_f32(yf, recip);
9761 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9762 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9763 BN1);
9764 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9765 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9766 BN1, N2);
9767 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9768 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9769 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9770 BN1, N2);
9771 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9772 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9773 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9774 // and that it will never cause us to return an answer too large).
9775 // float4 result = as_float4(as_int4(xf*recip) + 2);
9776 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9777 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9778 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9779 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9780 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9781 // Convert back to integer and return.
9782 // return vmovn_u32(vcvt_s32_f32(result));
9783 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9784 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9785 return N0;
9786}
9787
9788static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
9789 SDNode *N = Op.getNode();
9790 EVT VT = N->getValueType(0);
9791 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9792
9793 SDValue Carry = Op.getOperand(2);
9794
9795 SDLoc DL(Op);
9796
9797 SDValue Result;
9798 if (Op.getOpcode() == ISD::UADDO_CARRY) {
9799 // This converts the boolean value carry into the carry flag.
9800 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9801
9802 // Do the addition proper using the carry flag we wanted.
9803 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
9804 Op.getOperand(1), Carry);
9805
9806 // Now convert the carry flag into a boolean value.
9807 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9808 } else {
9809 // ARMISD::SUBE expects a carry, not a borrow like ISD::USUBO_CARRY, so we
9810 // have to invert the incoming borrow first.
9811 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9812 DAG.getConstant(1, DL, MVT::i32), Carry);
9813 // This converts the boolean value carry into the carry flag.
9814 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9815
9816 // Do the subtraction proper using the carry flag we wanted.
9817 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
9818 Op.getOperand(1), Carry);
9819
9820 // Now convert the carry flag into a boolean value.
9821 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9822 // But the carry returned by ARMISD::SUBE is not a borrow as expected
9823 // by ISD::USUBO_CARRY, so compute 1 - C.
9824 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9825 DAG.getConstant(1, DL, MVT::i32), Carry);
9826 }
9827
9828 // Return both values.
9829 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
9830}
9831
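// Illustrative only: ARM's SBC treats carry == 1 as "no borrow", so the boolean
// borrow used by ISD::USUBO_CARRY is converted with c = 1 - b on the way in and
// back again on the way out, which is what the two subtract-from-1 nodes above
// materialize. A scalar sketch (hypothetical helper):
#include <cstdint>
static uint32_t subWithBorrow(uint32_t x, uint32_t y, uint32_t borrowIn,
                              uint32_t *borrowOut) {
  uint32_t carryIn = 1u - borrowIn;                      // ISD borrow -> ARM carry
  uint64_t wide = (uint64_t)x - (uint64_t)y - (uint64_t)(1u - carryIn);
  uint32_t carryOut = ((wide >> 63) & 1) ? 0u : 1u;      // ARM carry: 1 means no borrow
  *borrowOut = 1u - carryOut;                            // ARM carry -> ISD borrow
  return (uint32_t)wide;
}
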
9832SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
9833 assert(Subtarget->isTargetDarwin());
9834
9835 // For iOS, we want to call an alternative entry point: __sincos_stret,
9836 // whose return values are passed via sret.
9837 SDLoc dl(Op);
9838 SDValue Arg = Op.getOperand(0);
9839 EVT ArgVT = Arg.getValueType();
9840 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
9841 auto PtrVT = getPointerTy(DAG.getDataLayout());
9842
9843 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9844
9845 // Pair of floats / doubles used to pass the result.
9846 Type *RetTy = StructType::get(ArgTy, ArgTy);
9847 auto &DL = DAG.getDataLayout();
9848
9849 ArgListTy Args;
9850 bool ShouldUseSRet = getTM().isAPCS_ABI();
9851 SDValue SRet;
9852 if (ShouldUseSRet) {
9853 // Create stack object for sret.
9854 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
9855 const Align StackAlign = DL.getPrefTypeAlign(RetTy);
9856 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
9857 SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL));
9858
9860 Entry.IsSExt = false;
9861 Entry.IsZExt = false;
9862 Entry.IsSRet = true;
9863 Args.push_back(Entry);
9864 RetTy = Type::getVoidTy(*DAG.getContext());
9865 }
9866
9867 Args.emplace_back(Arg, ArgTy);
9868
9869 RTLIB::Libcall LC =
9870 (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
9871 const char *LibcallName = getLibcallName(LC);
9872 CallingConv::ID CC = getLibcallCallingConv(LC);
9873 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
9874
9875 TargetLowering::CallLoweringInfo CLI(DAG);
9876 CLI.setDebugLoc(dl)
9877 .setChain(DAG.getEntryNode())
9878 .setCallee(CC, RetTy, Callee, std::move(Args))
9879 .setDiscardResult(ShouldUseSRet);
9880 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
9881
9882 if (!ShouldUseSRet)
9883 return CallResult.first;
9884
9885 SDValue LoadSin =
9886 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
9887
9888 // Address of cos field.
9889 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
9890 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
9891 SDValue LoadCos =
9892 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
9893
9894 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
9895 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
9896 LoadSin.getValue(0), LoadCos.getValue(0));
9897}
9898
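// Illustrative only: when the sret path above is taken, the callee fills a
// two-field struct and the two loads read sin at offset 0 and cos immediately
// after it (offset ArgVT.getStoreSize()). Sketch of the assumed layout for the
// f32 case (hypothetical type, not part of this file):
struct SinCosRetF32 {
  float SinVal; // read by LoadSin at offset 0
  float CosVal; // read by LoadCos at offset sizeof(float)
};
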
9899SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
9900 bool Signed,
9901 SDValue &Chain) const {
9902 EVT VT = Op.getValueType();
9903 assert((VT == MVT::i32 || VT == MVT::i64) &&
9904 "unexpected type for custom lowering DIV");
9905 SDLoc dl(Op);
9906
9907 const auto &DL = DAG.getDataLayout();
9908 RTLIB::Libcall LC;
9909 if (Signed)
9910 LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
9911 else
9912 LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;
9913
9914 const char *Name = getLibcallName(LC);
9915 SDValue ES = DAG.getExternalSymbol(Name, getPointerTy(DL));
9916
9917 ArgListTy Args;
9918
9919 for (auto AI : {1, 0}) {
9920 SDValue Operand = Op.getOperand(AI);
9921 Args.emplace_back(Operand,
9922 Operand.getValueType().getTypeForEVT(*DAG.getContext()));
9923 }
9924
9925 CallLoweringInfo CLI(DAG);
9926 CLI.setDebugLoc(dl)
9927 .setChain(Chain)
9929 ES, std::move(Args));
9930
9931 return LowerCallTo(CLI).first;
9932}
9933
9934// This is a code size optimisation: return the original SDIV node to
9935// DAGCombiner when we don't want to expand SDIV into a sequence of
9936// instructions, and an empty node otherwise which will cause the
9937// SDIV to be expanded in DAGCombine.
9938SDValue
9939ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9940 SelectionDAG &DAG,
9941 SmallVectorImpl<SDNode *> &Created) const {
9942 // TODO: Support SREM
9943 if (N->getOpcode() != ISD::SDIV)
9944 return SDValue();
9945
9946 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
9947 const bool MinSize = ST.hasMinSize();
9948 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9949 : ST.hasDivideInARMMode();
9950
9951 // Don't touch vector types; rewriting this may lead to scalarizing
9952 // the int divs.
9953 if (N->getOperand(0).getValueType().isVector())
9954 return SDValue();
9955
9956 // Bail if MinSize is not set; for both ARM and Thumb mode we also need
9957 // hwdiv support for this to be really profitable.
9958 if (!(MinSize && HasDivide))
9959 return SDValue();
9960
9961 // ARM mode is a bit simpler than Thumb: we can handle large power
9962 // of 2 immediates with 1 mov instruction; no further checks required,
9963 // just return the sdiv node.
9964 if (!ST.isThumb())
9965 return SDValue(N, 0);
9966
9967 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9968 // and thus lose the code size benefits of a MOVS that requires only 2 bytes.
9969 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9970 // but as it's doing exactly this, it's not worth the trouble to get TTI.
9971 if (Divisor.sgt(128))
9972 return SDValue();
9973
9974 return SDValue(N, 0);
9975}
9976
9977SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9978 bool Signed) const {
9979 assert(Op.getValueType() == MVT::i32 &&
9980 "unexpected type for custom lowering DIV");
9981 SDLoc dl(Op);
9982
9983 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9984 DAG.getEntryNode(), Op.getOperand(1));
9985
9986 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9987}
9988
9989static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
9990 SDLoc DL(N);
9991 SDValue Op = N->getOperand(1);
9992 if (N->getValueType(0) == MVT::i32)
9993 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
9994 SDValue Lo, Hi;
9995 std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
9996 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
9997 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
9998}
9999
10000void ARMTargetLowering::ExpandDIV_Windows(
10001 SDValue Op, SelectionDAG &DAG, bool Signed,
10002 SmallVectorImpl<SDValue> &Results) const {
10003 const auto &DL = DAG.getDataLayout();
10004
10005 assert(Op.getValueType() == MVT::i64 &&
10006 "unexpected type for custom lowering DIV");
10007 SDLoc dl(Op);
10008
10009 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
10010
10011 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
10012
10013 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
10014 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
10015 DAG.getConstant(32, dl, getPointerTy(DL)));
10016 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
10017
10018 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
10019}
10020
10021static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
10022 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
10023 EVT MemVT = LD->getMemoryVT();
10024 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10025 MemVT == MVT::v16i1) &&
10026 "Expected a predicate type!");
10027 assert(MemVT == Op.getValueType());
10028 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
10029 "Expected a non-extending load");
10030 assert(LD->isUnindexed() && "Expected an unindexed load");
10031
10032 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16-bit
10033 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
10034 // need to make sure that the 8/4/2 bits are actually loaded into the correct
10035 // place, which means loading the value and then shuffling it into
10036 // the bottom bits of the predicate.
10037 // Equally, a VLDR for a v16i1 will actually load 32 bits (so would be
10038 // incorrect for BE).
10039 // Speaking of BE, apparently the rest of LLVM assumes a reverse order relative
10040 // to a natural VMSR(load), so the value needs to be reversed.
10041
10042 SDLoc dl(Op);
10043 SDValue Load = DAG.getExtLoad(
10044 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
10045 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
10046 LD->getMemOperand());
10047 SDValue Val = Load;
10048 if (DAG.getDataLayout().isBigEndian())
10049 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
10050 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
10051 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
10052 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
10053 if (MemVT != MVT::v16i1)
10054 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
10055 DAG.getConstant(0, dl, MVT::i32));
10056 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
10057}
10058
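// Illustrative only: a sketch of the bit spreading described above, assuming the
// MVE convention of one predicate bit per byte, so a v4i1 lane owns four of the
// sixteen P0 bits. Hypothetical helper, little-endian view:
#include <cstdint>
static uint16_t expandV4I1ToP0(unsigned mask4) {
  uint16_t p0 = 0;
  for (unsigned lane = 0; lane < 4; ++lane)
    if (mask4 & (1u << lane))
      p0 |= (uint16_t)0xF << (4 * lane); // 4 bytes per i32 lane -> 4 P0 bits
  return p0;
}
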
10059void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
10060 SelectionDAG &DAG) const {
10061 LoadSDNode *LD = cast<LoadSDNode>(N);
10062 EVT MemVT = LD->getMemoryVT();
10063 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
10064
10065 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10066 !Subtarget->isThumb1Only() && LD->isVolatile() &&
10067 LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10068 SDLoc dl(N);
10069 SDValue Result = DAG.getMemIntrinsicNode(
10070 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
10071 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
10072 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
10073 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
10074 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
10075 Results.append({Pair, Result.getValue(2)});
10076 }
10077}
10078
10079static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
10080 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10081 EVT MemVT = ST->getMemoryVT();
10082 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10083 MemVT == MVT::v16i1) &&
10084 "Expected a predicate type!");
10085 assert(MemVT == ST->getValue().getValueType());
10086 assert(!ST->isTruncatingStore() && "Expected a non-extending store");
10087 assert(ST->isUnindexed() && "Expected an unindexed store");
10088
10089 // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with
10090 // top bits unset and a scalar store.
10091 SDLoc dl(Op);
10092 SDValue Build = ST->getValue();
10093 if (MemVT != MVT::v16i1) {
10094 SmallVector<SDValue, 16> Ops;
10095 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
10096 unsigned Elt = DAG.getDataLayout().isBigEndian()
10097 ? MemVT.getVectorNumElements() - I - 1
10098 : I;
10099 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
10100 DAG.getConstant(Elt, dl, MVT::i32)));
10101 }
10102 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
10103 Ops.push_back(DAG.getUNDEF(MVT::i32));
10104 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
10105 }
10106 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
10107 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
10108 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
10109 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
10110 DAG.getConstant(16, dl, MVT::i32));
10111 return DAG.getTruncStore(
10112 ST->getChain(), dl, GRP, ST->getBasePtr(),
10113 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
10114 ST->getMemOperand());
10115}
10116
10117static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
10118 const ARMSubtarget *Subtarget) {
10119 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10120 EVT MemVT = ST->getMemoryVT();
10121 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
10122
10123 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10124 !Subtarget->isThumb1Only() && ST->isVolatile() &&
10125 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10126 SDNode *N = Op.getNode();
10127 SDLoc dl(N);
10128
10129 SDValue Lo = DAG.getNode(
10130 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10131 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
10132 MVT::i32));
10133 SDValue Hi = DAG.getNode(
10134 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10135 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
10136 MVT::i32));
10137
10138 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
10139 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
10140 MemVT, ST->getMemOperand());
10141 } else if (Subtarget->hasMVEIntegerOps() &&
10142 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10143 MemVT == MVT::v16i1))) {
10144 return LowerPredicateStore(Op, DAG);
10145 }
10146
10147 return SDValue();
10148}
10149
10150static bool isZeroVector(SDValue N) {
10151 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
10152 (N->getOpcode() == ARMISD::VMOVIMM &&
10153 isNullConstant(N->getOperand(0))));
10154}
10155
10156static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
10157 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
10158 MVT VT = Op.getSimpleValueType();
10159 SDValue Mask = N->getMask();
10160 SDValue PassThru = N->getPassThru();
10161 SDLoc dl(Op);
10162
10163 if (isZeroVector(PassThru))
10164 return Op;
10165
10166 // MVE Masked loads use zero as the passthru value. Here we convert undef to
10167 // zero too, and other values are lowered to a select.
10168 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
10169 DAG.getTargetConstant(0, dl, MVT::i32));
10170 SDValue NewLoad = DAG.getMaskedLoad(
10171 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
10172 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
10173 N->getExtensionType(), N->isExpandingLoad());
10174 SDValue Combo = NewLoad;
10175 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
10176 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
10177 isZeroVector(PassThru->getOperand(0));
10178 if (!PassThru.isUndef() && !PassThruIsCastZero)
10179 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
10180 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
10181}
10182
10183static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
10184 const ARMSubtarget *ST) {
10185 if (!ST->hasMVEIntegerOps())
10186 return SDValue();
10187
10188 SDLoc dl(Op);
10189 unsigned BaseOpcode = 0;
10190 switch (Op->getOpcode()) {
10191 default: llvm_unreachable("Expected VECREDUCE opcode");
10192 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
10193 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
10194 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
10195 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
10196 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
10197 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
10198 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
10199 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
10200 }
10201
10202 SDValue Op0 = Op->getOperand(0);
10203 EVT VT = Op0.getValueType();
10204 EVT EltVT = VT.getVectorElementType();
10205 unsigned NumElts = VT.getVectorNumElements();
10206 unsigned NumActiveLanes = NumElts;
10207
10208 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10209 NumActiveLanes == 2) &&
10210 "Only expected a power 2 vector size");
10211
10212 // Use BaseOpcode(X, Rev(X)) until 4 elements remain. Going down to 4 vector
10213 // elements allows us to easily extract vector elements from the lanes.
10214 while (NumActiveLanes > 4) {
10215 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
10216 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
10217 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
10218 NumActiveLanes /= 2;
10219 }
10220
10221 SDValue Res;
10222 if (NumActiveLanes == 4) {
10223 // The remaining 4 elements are combined sequentially.
10224 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10225 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
10226 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10227 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
10228 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10229 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
10230 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10231 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
10232 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10233 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
10234 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
10235 } else {
10236 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10237 DAG.getConstant(0, dl, MVT::i32));
10238 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10239 DAG.getConstant(1, dl, MVT::i32));
10240 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10241 }
10242
10243 // Result type may be wider than element type.
10244 if (EltVT != Op->getValueType(0))
10245 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
10246 return Res;
10247}
10248
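// Illustrative only: a scalar model of the log2-step reduction above. The real
// lowering pairs lanes via VREV16/VREV32 rather than a full reversal, but for an
// associative, commutative op any pairing combines the same set of lanes.
// Hypothetical helper; assumes a power-of-2 element count:
#include <cstddef>
#include <vector>
static int reduceAddModel(std::vector<int> v) {
  for (size_t n = v.size(); n > 1; n /= 2)     // halve the active lane count
    for (size_t i = 0; i < n / 2; ++i)
      v[i] = v[i] + v[n - 1 - i];              // one "combine with partner" step
  return v[0];
}
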
10249static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
10250 const ARMSubtarget *ST) {
10251 if (!ST->hasMVEFloatOps())
10252 return SDValue();
10253 return LowerVecReduce(Op, DAG, ST);
10254}
10255
10256static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG,
10257 const ARMSubtarget *ST) {
10258 if (!ST->hasNEON())
10259 return SDValue();
10260
10261 SDLoc dl(Op);
10262 SDValue Op0 = Op->getOperand(0);
10263 EVT VT = Op0.getValueType();
10264 EVT EltVT = VT.getVectorElementType();
10265
10266 unsigned PairwiseIntrinsic = 0;
10267 switch (Op->getOpcode()) {
10268 default:
10269 llvm_unreachable("Expected VECREDUCE opcode");
10270 case ISD::VECREDUCE_UMIN:
10271 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10272 break;
10273 case ISD::VECREDUCE_UMAX:
10274 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10275 break;
10276 case ISD::VECREDUCE_SMIN:
10277 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10278 break;
10279 case ISD::VECREDUCE_SMAX:
10280 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10281 break;
10282 }
10283 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10284
10285 unsigned NumElts = VT.getVectorNumElements();
10286 unsigned NumActiveLanes = NumElts;
10287
10288 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10289 NumActiveLanes == 2) &&
10290 "Only expected a power 2 vector size");
10291
10292 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10293 if (VT.is128BitVector()) {
10294 SDValue Lo, Hi;
10295 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10296 VT = Lo.getValueType();
10297 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10298 NumActiveLanes /= 2;
10299 }
10300
10301 // Use pairwise reductions until one lane remains
10302 while (NumActiveLanes > 1) {
10303 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10304 NumActiveLanes /= 2;
10305 }
10306
10307 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10308 DAG.getConstant(0, dl, MVT::i32));
10309
10310 // Result type may be wider than element type.
10311 if (EltVT != Op.getValueType()) {
10312 unsigned Extend = 0;
10313 switch (Op->getOpcode()) {
10314 default:
10315 llvm_unreachable("Expected VECREDUCE opcode");
10316 case ISD::VECREDUCE_UMIN:
10317 case ISD::VECREDUCE_UMAX:
10318 Extend = ISD::ZERO_EXTEND;
10319 break;
10320 case ISD::VECREDUCE_SMIN:
10321 case ISD::VECREDUCE_SMAX:
10322 Extend = ISD::SIGN_EXTEND;
10323 break;
10324 }
10325 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10326 }
10327 return Res;
10328}
10329
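// Illustrative only: the same pairwise idea expressed with <arm_neon.h>
// intrinsics (assumed NEON target); each vpmin halves the number of meaningful
// lanes until lane 0 holds the reduction:
#include <arm_neon.h>
uint8_t reduceMinU8x8(uint8x8_t v) {
  v = vpmin_u8(v, v); // 8 -> 4 meaningful lanes
  v = vpmin_u8(v, v); // 4 -> 2
  v = vpmin_u8(v, v); // 2 -> 1
  return vget_lane_u8(v, 0);
}
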
10330static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
10331 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10332 // Acquire/Release load/store is not legal for targets without a dmb or
10333 // equivalent available.
10334 return SDValue();
10335
10336 // Monotonic load/store is legal for all targets.
10337 return Op;
10338}
10339
10340static void ReplaceREADCYCLECOUNTER(SDNode *N,
10341 SmallVectorImpl<SDValue> &Results,
10342 SelectionDAG &DAG,
10343 const ARMSubtarget *Subtarget) {
10344 SDLoc DL(N);
10345 // Under Power Management extensions, the cycle-count is:
10346 // mrc p15, #0, <Rt>, c9, c13, #0
10347 SDValue Ops[] = { N->getOperand(0), // Chain
10348 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10349 DAG.getTargetConstant(15, DL, MVT::i32),
10350 DAG.getTargetConstant(0, DL, MVT::i32),
10351 DAG.getTargetConstant(9, DL, MVT::i32),
10352 DAG.getTargetConstant(13, DL, MVT::i32),
10353 DAG.getTargetConstant(0, DL, MVT::i32)
10354 };
10355
10356 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10357 DAG.getVTList(MVT::i32, MVT::Other), Ops);
10358 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10359 DAG.getConstant(0, DL, MVT::i32)));
10360 Results.push_back(Cycles32.getValue(1));
10361}
10362
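// Illustrative only: the MRC operands built above (p15, #0, c9, c13, #0) read
// the PMCCNTR cycle counter. A roughly equivalent GCC-style inline-assembly
// sketch for ARM mode (assumes the counter is enabled and accessible):
#include <cstdint>
static inline uint32_t readCycleCounter32() {
  uint32_t cycles;
  __asm__ volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(cycles));
  return cycles;
}
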
10363static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0,
10364 SDValue V1) {
10365 SDLoc dl(V0.getNode());
10366 SDValue RegClass =
10367 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10368 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10369 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10370 const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
10371 return SDValue(
10372 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10373}
10374
10375static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V) {
10376 SDLoc dl(V.getNode());
10377 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10378 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10379 if (isBigEndian)
10380 std::swap(VLo, VHi);
10381 return createGPRPairNode2xi32(DAG, VLo, VHi);
10382}
10383
10384static void ReplaceCMP_SWAP_64Results(SDNode *N,
10385 SmallVectorImpl<SDValue> &Results,
10386 SelectionDAG &DAG) {
10387 assert(N->getValueType(0) == MVT::i64 &&
10388 "AtomicCmpSwap on types less than 64 should be legal");
10389 SDValue Ops[] = {
10390 createGPRPairNode2xi32(DAG, N->getOperand(1),
10391 DAG.getUNDEF(MVT::i32)), // pointer, temp
10392 createGPRPairNodei64(DAG, N->getOperand(2)), // expected
10393 createGPRPairNodei64(DAG, N->getOperand(3)), // new
10394 N->getOperand(0), // chain in
10395 };
10396 SDNode *CmpSwap = DAG.getMachineNode(
10397 ARM::CMP_SWAP_64, SDLoc(N),
10398 DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);
10399
10400 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10401 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10402
10403 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10404
10405 SDValue Lo =
10406 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10407 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10408 SDValue Hi =
10409 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10410 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10411 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10412 Results.push_back(SDValue(CmpSwap, 2));
10413}
10414
10415SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10416 SDLoc dl(Op);
10417 EVT VT = Op.getValueType();
10418 SDValue Chain = Op.getOperand(0);
10419 SDValue LHS = Op.getOperand(1);
10420 SDValue RHS = Op.getOperand(2);
10421 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10422 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10423
10424 // If we don't have instructions of this float type then soften to a libcall
10425 // and use SETCC instead.
10426 if (isUnsupportedFloatingType(LHS.getValueType())) {
10427 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
10428 Chain, IsSignaling);
10429 if (!RHS.getNode()) {
10430 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10431 CC = ISD::SETNE;
10432 }
10433 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10434 DAG.getCondCode(CC));
10435 return DAG.getMergeValues({Result, Chain}, dl);
10436 }
10437
10438 ARMCC::CondCodes CondCode, CondCode2;
10439 FPCCToARMCC(CC, CondCode, CondCode2);
10440
10441 SDValue True = DAG.getConstant(1, dl, VT);
10442 SDValue False = DAG.getConstant(0, dl, VT);
10443 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10444 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10445 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, Cmp, DAG);
10446 if (CondCode2 != ARMCC::AL) {
10447 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10448 Result = getCMOV(dl, VT, Result, True, ARMcc, Cmp, DAG);
10449 }
10450 return DAG.getMergeValues({Result, Chain}, dl);
10451}
10452
10453SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10454 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10455
10456 EVT VT = getPointerTy(DAG.getDataLayout());
10457 int FI = MFI.CreateFixedObject(4, 0, false);
10458 return DAG.getFrameIndex(FI, VT);
10459}
10460
10461SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
10462 SelectionDAG &DAG) const {
10463 SDLoc DL(Op);
10464 MakeLibCallOptions CallOptions;
10465 MVT SVT = Op.getOperand(0).getSimpleValueType();
10466 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
10467 SDValue Res =
10468 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
10469 return DAG.getBitcast(MVT::i32, Res);
10470}
10471
10472SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10473 SDLoc dl(Op);
10474 SDValue LHS = Op.getOperand(0);
10475 SDValue RHS = Op.getOperand(1);
10476
10477 // Determine if this is signed or unsigned comparison
10478 bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10479
10480 // Special case for Thumb1 UCMP only
10481 if (!IsSigned && Subtarget->isThumb1Only()) {
10482 // For Thumb unsigned comparison, use this sequence:
10483 // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10484 // sbc r2, r2 ; r2 = r2 - r2 - !carry
10485 // cmp r1, r0 ; compare RHS with LHS
10486 // sbc r1, r1 ; r1 = r1 - r1 - !carry
10487 // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
10488
10489 // First subtraction: LHS - RHS
10490 SDValue Sub1WithFlags = DAG.getNode(
10491 ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10492 SDValue Sub1Result = Sub1WithFlags.getValue(0);
10493 SDValue Flags1 = Sub1WithFlags.getValue(1);
10494
10495 // SUBE: Sub1Result - Sub1Result - !carry
10496 // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10497 SDValue Sbc1 =
10498 DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10499 Sub1Result, Sub1Result, Flags1);
10500 SDValue Sbc1Result = Sbc1.getValue(0);
10501
10502 // Second comparison: RHS vs LHS (reverse comparison)
10503 SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10504
10505 // SUBE: RHS - RHS - !carry
10506 // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned)
10507 SDValue Sbc2 = DAG.getNode(
10508 ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10509 SDValue Sbc2Result = Sbc2.getValue(0);
10510
10511 // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
10512 SDValue Result =
10513 DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
10514 if (Op.getValueType() != MVT::i32)
10515 Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10516
10517 return Result;
10518 }
10519
10520 // For the ARM assembly pattern:
10521 //   subs  r0, r0, r1   ; subtract RHS from LHS and set flags
10522 //   movgt r0, #1       ; if LHS > RHS, set result to 1 (GT for signed, HI for unsigned)
10523 //   mvnlt r0, #0       ; if LHS < RHS, set result to -1 (LT for signed, LO for unsigned)
10524 //                      ; if LHS == RHS, the result remains 0 from the subs
10525 //
10526
10527 // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
10528 unsigned Opcode = ARMISD::SUBC;
10529
10530 // Check if RHS is a subtraction against 0: (0 - X)
10531 if (RHS.getOpcode() == ISD::SUB) {
10532 SDValue SubLHS = RHS.getOperand(0);
10533 SDValue SubRHS = RHS.getOperand(1);
10534
10535 // Check if it's 0 - X
10536 if (isNullConstant(SubLHS)) {
10537 bool CanUseAdd = false;
10538 if (IsSigned) {
10539 // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
10540 if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10541 .getSignedMinValue()
10542 .isMinSignedValue()) {
10543 CanUseAdd = true;
10544 }
10545 } else {
10546 // For UCMP: only if X is known to never be zero
10547 if (DAG.isKnownNeverZero(SubRHS)) {
10548 CanUseAdd = true;
10549 }
10550 }
10551
10552 if (CanUseAdd) {
10553 Opcode = ARMISD::ADDC;
10554 RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10555 // LHS - (0 - X)
10556 }
10557 }
10558 }
10559
10560 // Generate the operation with flags
10561 SDValue OpWithFlags =
10562 DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10563
10564 SDValue OpResult = OpWithFlags.getValue(0);
10565 SDValue Flags = OpWithFlags.getValue(1);
10566
10567 // Constants for conditional moves
10568 SDValue One = DAG.getConstant(1, dl, MVT::i32);
10569 SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10570
10571 // Select condition codes based on signed vs unsigned
10572 ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10573 ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10574
10575 // First conditional move: if greater than, set to 1
10576 SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10577 SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10578 GTCondValue, Flags);
10579
10580 // Second conditional move: if less than, set to -1
10581 SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10582 SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10583 LTCondValue, Flags);
10584
10585 if (Op.getValueType() != MVT::i32)
10586 Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10587
10588 return Result2;
10589}
10590
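// Illustrative only: both paths above compute the usual three-way compare; a
// scalar reference for what the flag sequences are emulating:
#include <cstdint>
static int32_t scmpRef(int32_t a, int32_t b) { return (a > b) - (a < b); }
static int32_t ucmpRef(uint32_t a, uint32_t b) { return (a > b) - (a < b); }
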
10591SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
10592 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10593 switch (Op.getOpcode()) {
10594 default: llvm_unreachable("Don't know how to custom lower this!");
10595 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10596 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10597 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10598 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10599 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10600 case ISD::SELECT: return LowerSELECT(Op, DAG);
10601 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10602 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10603 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10604 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10605 case ISD::VASTART: return LowerVASTART(Op, DAG);
10606 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10607 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10608 case ISD::SINT_TO_FP:
10609 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10610 case ISD::STRICT_FP_TO_SINT:
10611 case ISD::STRICT_FP_TO_UINT:
10612 case ISD::FP_TO_SINT:
10613 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10614 case ISD::FP_TO_SINT_SAT:
10615 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10616 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10617 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10618 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10619 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10620 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10621 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10622 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10623 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10624 Subtarget);
10625 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10626 case ISD::SHL:
10627 case ISD::SRL:
10628 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10629 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10630 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10631 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10632 case ISD::SRL_PARTS:
10633 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10634 case ISD::CTTZ:
10635 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10636 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10637 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10638 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10639 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10640 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10641 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10642 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10643 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10644 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10645 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10646 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10647 case ISD::SIGN_EXTEND:
10648 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10649 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10650 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10651 case ISD::SET_FPMODE:
10652 return LowerSET_FPMODE(Op, DAG);
10653 case ISD::RESET_FPMODE:
10654 return LowerRESET_FPMODE(Op, DAG);
10655 case ISD::MUL: return LowerMUL(Op, DAG);
10656 case ISD::SDIV:
10657 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10658 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10659 return LowerSDIV(Op, DAG, Subtarget);
10660 case ISD::UDIV:
10661 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10662 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10663 return LowerUDIV(Op, DAG, Subtarget);
10664 case ISD::UADDO_CARRY:
10665 case ISD::USUBO_CARRY:
10666 return LowerUADDSUBO_CARRY(Op, DAG);
10667 case ISD::SADDO:
10668 case ISD::SSUBO:
10669 return LowerSignedALUO(Op, DAG);
10670 case ISD::UADDO:
10671 case ISD::USUBO:
10672 return LowerUnsignedALUO(Op, DAG);
10673 case ISD::SADDSAT:
10674 case ISD::SSUBSAT:
10675 case ISD::UADDSAT:
10676 case ISD::USUBSAT:
10677 return LowerADDSUBSAT(Op, DAG, Subtarget);
10678 case ISD::LOAD:
10679 return LowerPredicateLoad(Op, DAG);
10680 case ISD::STORE:
10681 return LowerSTORE(Op, DAG, Subtarget);
10682 case ISD::MLOAD:
10683 return LowerMLOAD(Op, DAG);
10684 case ISD::VECREDUCE_MUL:
10685 case ISD::VECREDUCE_AND:
10686 case ISD::VECREDUCE_OR:
10687 case ISD::VECREDUCE_XOR:
10688 return LowerVecReduce(Op, DAG, Subtarget);
10689 case ISD::VECREDUCE_FADD:
10690 case ISD::VECREDUCE_FMUL:
10691 case ISD::VECREDUCE_FMIN:
10692 case ISD::VECREDUCE_FMAX:
10693 return LowerVecReduceF(Op, DAG, Subtarget);
10694 case ISD::VECREDUCE_UMIN:
10695 case ISD::VECREDUCE_UMAX:
10696 case ISD::VECREDUCE_SMIN:
10697 case ISD::VECREDUCE_SMAX:
10698 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10699 case ISD::ATOMIC_LOAD:
10700 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
10701 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
10702 case ISD::SDIVREM:
10703 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10704 case ISD::DYNAMIC_STACKALLOC:
10705 if (Subtarget->isTargetWindows())
10706 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10707 llvm_unreachable("Don't know how to custom lower this!");
10708 case ISD::STRICT_FP_ROUND:
10709 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10710 case ISD::STRICT_FP_EXTEND:
10711 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10712 case ISD::STRICT_FSETCC:
10713 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10714 case ISD::SPONENTRY:
10715 return LowerSPONENTRY(Op, DAG);
10716 case ISD::FP_TO_BF16:
10717 return LowerFP_TO_BF16(Op, DAG);
10718 case ARMISD::WIN__DBZCHK: return SDValue();
10719 case ISD::UCMP:
10720 case ISD::SCMP:
10721 return LowerCMP(Op, DAG);
10722 case ISD::ABS:
10723 return LowerABS(Op, DAG);
10724 }
10725}
10726
10727static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
10728 SelectionDAG &DAG) {
10729 unsigned IntNo = N->getConstantOperandVal(0);
10730 unsigned Opc = 0;
10731 if (IntNo == Intrinsic::arm_smlald)
10732 Opc = ARMISD::SMLALD;
10733 else if (IntNo == Intrinsic::arm_smlaldx)
10734 Opc = ARMISD::SMLALDX;
10735 else if (IntNo == Intrinsic::arm_smlsld)
10736 Opc = ARMISD::SMLSLD;
10737 else if (IntNo == Intrinsic::arm_smlsldx)
10738 Opc = ARMISD::SMLSLDX;
10739 else
10740 return;
10741
10742 SDLoc dl(N);
10743 SDValue Lo, Hi;
10744 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10745
10746 SDValue LongMul = DAG.getNode(Opc, dl,
10747 DAG.getVTList(MVT::i32, MVT::i32),
10748 N->getOperand(1), N->getOperand(2),
10749 Lo, Hi);
10750 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10751 LongMul.getValue(0), LongMul.getValue(1)));
10752}
10753
10754/// ReplaceNodeResults - Replace the results of node with an illegal result
10755/// type with new values built out of custom code.
10756void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
10757 SmallVectorImpl<SDValue> &Results,
10758 SelectionDAG &DAG) const {
10759 SDValue Res;
10760 switch (N->getOpcode()) {
10761 default:
10762 llvm_unreachable("Don't know how to custom expand this!");
10763 case ISD::READ_REGISTER:
10764 ExpandREAD_REGISTER(N, Results, DAG);
10765 break;
10766 case ISD::BITCAST:
10767 Res = ExpandBITCAST(N, DAG, Subtarget);
10768 break;
10769 case ISD::SRL:
10770 case ISD::SRA:
10771 case ISD::SHL:
10772 Res = Expand64BitShift(N, DAG, Subtarget);
10773 break;
10774 case ISD::SREM:
10775 case ISD::UREM:
10776 Res = LowerREM(N, DAG);
10777 break;
10778 case ISD::SDIVREM:
10779 case ISD::UDIVREM:
10780 Res = LowerDivRem(SDValue(N, 0), DAG);
10781 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10782 Results.push_back(Res.getValue(0));
10783 Results.push_back(Res.getValue(1));
10784 return;
10785 case ISD::SADDSAT:
10786 case ISD::SSUBSAT:
10787 case ISD::UADDSAT:
10788 case ISD::USUBSAT:
10789 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10790 break;
10791 case ISD::READCYCLECOUNTER:
10792 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10793 return;
10794 case ISD::UDIV:
10795 case ISD::SDIV:
10796 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
10797 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10798 Results);
10799 case ISD::ATOMIC_CMP_SWAP:
10800 ReplaceCMP_SWAP_64Results(N, Results, DAG);
10801 return;
10802 case ISD::INTRINSIC_WO_CHAIN:
10803 return ReplaceLongIntrinsic(N, Results, DAG);
10804 case ISD::LOAD:
10805 LowerLOAD(N, Results, DAG);
10806 break;
10807 case ISD::TRUNCATE:
10808 Res = LowerTruncate(N, DAG, Subtarget);
10809 break;
10810 case ISD::SIGN_EXTEND:
10811 case ISD::ZERO_EXTEND:
10812 Res = LowerVectorExtend(N, DAG, Subtarget);
10813 break;
10814 case ISD::FP_TO_SINT_SAT:
10815 case ISD::FP_TO_UINT_SAT:
10816 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10817 break;
10818 }
10819 if (Res.getNode())
10820 Results.push_back(Res);
10821}
10822
10823//===----------------------------------------------------------------------===//
10824// ARM Scheduler Hooks
10825//===----------------------------------------------------------------------===//
10826
10827/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10828/// registers the function context.
10829void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10830 MachineBasicBlock *MBB,
10831 MachineBasicBlock *DispatchBB,
10832 int FI) const {
10833 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10834 "ROPI/RWPI not currently supported with SjLj");
10835 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10836 DebugLoc dl = MI.getDebugLoc();
10837 MachineFunction *MF = MBB->getParent();
10838 MachineRegisterInfo *MRI = &MF->getRegInfo();
10839 MachineConstantPool *MCP = MF->getConstantPool();
10840 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
10841 const Function &F = MF->getFunction();
10842
10843 bool isThumb = Subtarget->isThumb();
10844 bool isThumb2 = Subtarget->isThumb2();
10845
10846 unsigned PCLabelId = AFI->createPICLabelUId();
10847 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10848 ARMConstantPoolValue *CPV =
10849 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10850 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10851
10852 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10853 : &ARM::GPRRegClass;
10854
10855 // Grab constant pool and fixed stack memory operands.
10856 MachineMemOperand *CPMMO =
10857 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
10858 MachineMemOperand::MOLoad, 4, Align(4));
10859
10860 MachineMemOperand *FIMMOSt =
10861 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
10862 MachineMemOperand::MOStore, 4, Align(4));
10863
10864 // Load the address of the dispatch MBB into the jump buffer.
10865 if (isThumb2) {
10866 // Incoming value: jbuf
10867 // ldr.n r5, LCPI1_1
10868 // orr r5, r5, #1
10869 // add r5, pc
10870 // str r5, [$jbuf, #+4] ; &jbuf[1]
10871 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10872 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10873 .addConstantPoolIndex(CPI)
10874 .addMemOperand(CPMMO)
10875 .add(predOps(ARMCC::AL));
10876 // Set the low bit because of thumb mode.
10877 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10878 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10879 .addReg(NewVReg1, RegState::Kill)
10880 .addImm(0x01)
10881 .add(predOps(ARMCC::AL))
10882 .add(condCodeOp());
10883 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10884 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10885 .addReg(NewVReg2, RegState::Kill)
10886 .addImm(PCLabelId);
10887 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10888 .addReg(NewVReg3, RegState::Kill)
10889 .addFrameIndex(FI)
10890 .addImm(36) // &jbuf[1] :: pc
10891 .addMemOperand(FIMMOSt)
10892 .add(predOps(ARMCC::AL));
10893 } else if (isThumb) {
10894 // Incoming value: jbuf
10895 // ldr.n r1, LCPI1_4
10896 // add r1, pc
10897 // mov r2, #1
10898 // orrs r1, r2
10899 // add r2, $jbuf, #+4 ; &jbuf[1]
10900 // str r1, [r2]
10901 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10902 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10903 .addConstantPoolIndex(CPI)
10904 .addMemOperand(CPMMO)
10905 .add(predOps(ARMCC::AL));
10906 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10907 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10908 .addReg(NewVReg1, RegState::Kill)
10909 .addImm(PCLabelId);
10910 // Set the low bit because of thumb mode.
10911 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10912 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10913 .addReg(ARM::CPSR, RegState::Define)
10914 .addImm(1)
10915 .add(predOps(ARMCC::AL));
10916 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10917 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10918 .addReg(ARM::CPSR, RegState::Define)
10919 .addReg(NewVReg2, RegState::Kill)
10920 .addReg(NewVReg3, RegState::Kill)
10921 .add(predOps(ARMCC::AL));
10922 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10923 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10924 .addFrameIndex(FI)
10925 .addImm(36); // &jbuf[1] :: pc
10926 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10927 .addReg(NewVReg4, RegState::Kill)
10928 .addReg(NewVReg5, RegState::Kill)
10929 .addImm(0)
10930 .addMemOperand(FIMMOSt)
10931 .add(predOps(ARMCC::AL));
10932 } else {
10933 // Incoming value: jbuf
10934 // ldr r1, LCPI1_1
10935 // add r1, pc, r1
10936 // str r1, [$jbuf, #+4] ; &jbuf[1]
10937 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10938 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10939 .addConstantPoolIndex(CPI)
10940 .addImm(0)
10941 .addMemOperand(CPMMO)
10942 .add(predOps(ARMCC::AL));
10943 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10944 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10945 .addReg(NewVReg1, RegState::Kill)
10946 .addImm(PCLabelId)
10947 .add(predOps(ARMCC::AL));
10948 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10949 .addReg(NewVReg2, RegState::Kill)
10950 .addFrameIndex(FI)
10951 .addImm(36) // &jbuf[1] :: pc
10952 .addMemOperand(FIMMOSt)
10953 .add(predOps(ARMCC::AL));
10954 }
10955}
10956
10957void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10958 MachineBasicBlock *MBB) const {
10959 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10960 DebugLoc dl = MI.getDebugLoc();
10961 MachineFunction *MF = MBB->getParent();
10962 MachineRegisterInfo *MRI = &MF->getRegInfo();
10963 MachineFrameInfo &MFI = MF->getFrameInfo();
10964 int FI = MFI.getFunctionContextIndex();
10965
10966 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10967 : &ARM::GPRnopcRegClass;
10968
10969 // Get a mapping of the call site numbers to all of the landing pads they're
10970 // associated with.
10971 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10972 unsigned MaxCSNum = 0;
10973 for (MachineBasicBlock &BB : *MF) {
10974 if (!BB.isEHPad())
10975 continue;
10976
10977 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10978 // pad.
10979 for (MachineInstr &II : BB) {
10980 if (!II.isEHLabel())
10981 continue;
10982
10983 MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10984 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10985
10986 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10987 for (unsigned Idx : CallSiteIdxs) {
10988 CallSiteNumToLPad[Idx].push_back(&BB);
10989 MaxCSNum = std::max(MaxCSNum, Idx);
10990 }
10991 break;
10992 }
10993 }
10994
10995 // Get an ordered list of the machine basic blocks for the jump table.
10996 std::vector<MachineBasicBlock*> LPadList;
10997 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10998 LPadList.reserve(CallSiteNumToLPad.size());
10999 for (unsigned I = 1; I <= MaxCSNum; ++I) {
11000 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
11001 for (MachineBasicBlock *MBB : MBBList) {
11002 LPadList.push_back(MBB);
11003 InvokeBBs.insert_range(MBB->predecessors());
11004 }
11005 }
11006
11007 assert(!LPadList.empty() &&
11008 "No landing pad destinations for the dispatch jump table!");
11009
11010 // Create the jump table and associated information.
11011 MachineJumpTableInfo *JTI =
11012 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
11013 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
11014
11015 // Create the MBBs for the dispatch code.
11016
11017 // Shove the dispatch's address into the return slot in the function context.
11018 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
11019 DispatchBB->setIsEHPad();
11020
11021 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11022
11023 BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
11024 DispatchBB->addSuccessor(TrapBB);
11025
11026 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
11027 DispatchBB->addSuccessor(DispContBB);
11028
11029 // Insert the MBBs.
11030 MF->insert(MF->end(), DispatchBB);
11031 MF->insert(MF->end(), DispContBB);
11032 MF->insert(MF->end(), TrapBB);
11033
11034 // Insert code into the entry block that creates and registers the function
11035 // context.
11036 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
11037
11038 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
11041
11042 MachineInstrBuilder MIB;
11043 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
11044
11045 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
11046 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
11047
11048 // Add a register mask with no preserved registers. This results in all
11049 // registers being marked as clobbered. This can't work if the dispatch block
11050 // is in a Thumb1 function and is linked with ARM code which uses the FP
11051 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
11053
11054 bool IsPositionIndependent = isPositionIndependent();
11055 unsigned NumLPads = LPadList.size();
11056 if (Subtarget->isThumb2()) {
11057 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11058 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
11059 .addFrameIndex(FI)
11060 .addImm(4)
11061 .addMemOperand(FIMMOLd)
11063
11064 if (NumLPads < 256) {
11065 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
11066 .addReg(NewVReg1)
11067 .addImm(LPadList.size())
11069 } else {
11070 Register VReg1 = MRI->createVirtualRegister(TRC);
11071 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
11072 .addImm(NumLPads & 0xFFFF)
11074
11075 unsigned VReg2 = VReg1;
11076 if ((NumLPads & 0xFFFF0000) != 0) {
11077 VReg2 = MRI->createVirtualRegister(TRC);
11078 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
11079 .addReg(VReg1)
11080 .addImm(NumLPads >> 16)
11082 }
11083
11084 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
11085 .addReg(NewVReg1)
11086 .addReg(VReg2)
11088 }
11089
11090 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
11091 .addMBB(TrapBB)
11093 .addReg(ARM::CPSR);
11094
11095 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11096 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
11097 .addJumpTableIndex(MJTI)
11099
11100 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11101 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
11102 .addReg(NewVReg3, RegState::Kill)
11103 .addReg(NewVReg1)
11106 .add(condCodeOp());
11107
11108 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
11109 .addReg(NewVReg4, RegState::Kill)
11110 .addReg(NewVReg1)
11111 .addJumpTableIndex(MJTI);
11112 } else if (Subtarget->isThumb()) {
11113 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11114 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
11115 .addFrameIndex(FI)
11116 .addImm(1)
11117 .addMemOperand(FIMMOLd)
11119
11120 if (NumLPads < 256) {
11121 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
11122 .addReg(NewVReg1)
11123 .addImm(NumLPads)
11125 } else {
11126 MachineConstantPool *ConstantPool = MF->getConstantPool();
11127 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11128 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11129
11130 // MachineConstantPool wants an explicit alignment.
11131 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11132 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11133
11134 Register VReg1 = MRI->createVirtualRegister(TRC);
11135 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
11136 .addReg(VReg1, RegState::Define)
11139 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
11140 .addReg(NewVReg1)
11141 .addReg(VReg1)
11143 }
11144
11145 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
11146 .addMBB(TrapBB)
11148 .addReg(ARM::CPSR);
11149
11150 Register NewVReg2 = MRI->createVirtualRegister(TRC);
11151 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
11152 .addReg(ARM::CPSR, RegState::Define)
11153 .addReg(NewVReg1)
11154 .addImm(2)
11156
11157 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11158 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
11159 .addJumpTableIndex(MJTI)
11161
11162 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11163 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
11164 .addReg(ARM::CPSR, RegState::Define)
11165 .addReg(NewVReg2, RegState::Kill)
11166 .addReg(NewVReg3)
11168
11169 MachineMemOperand *JTMMOLd =
11170 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11172
11173 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11174 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
11175 .addReg(NewVReg4, RegState::Kill)
11176 .addImm(0)
11177 .addMemOperand(JTMMOLd)
11179
11180 unsigned NewVReg6 = NewVReg5;
11181 if (IsPositionIndependent) {
11182 NewVReg6 = MRI->createVirtualRegister(TRC);
11183 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
11184 .addReg(ARM::CPSR, RegState::Define)
11185 .addReg(NewVReg5, RegState::Kill)
11186 .addReg(NewVReg3)
11188 }
11189
11190 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
11191 .addReg(NewVReg6, RegState::Kill)
11192 .addJumpTableIndex(MJTI);
11193 } else {
11194 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11195 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
11196 .addFrameIndex(FI)
11197 .addImm(4)
11198 .addMemOperand(FIMMOLd)
11200
11201 if (NumLPads < 256) {
11202 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
11203 .addReg(NewVReg1)
11204 .addImm(NumLPads)
11206 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
11207 Register VReg1 = MRI->createVirtualRegister(TRC);
11208 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
11209 .addImm(NumLPads & 0xFFFF)
11211
11212 unsigned VReg2 = VReg1;
11213 if ((NumLPads & 0xFFFF0000) != 0) {
11214 VReg2 = MRI->createVirtualRegister(TRC);
11215 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
11216 .addReg(VReg1)
11217 .addImm(NumLPads >> 16)
11219 }
11220
11221 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11222 .addReg(NewVReg1)
11223 .addReg(VReg2)
11225 } else {
11226 MachineConstantPool *ConstantPool = MF->getConstantPool();
11227 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11228 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11229
11230 // MachineConstantPool wants an explicit alignment.
11231 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11232 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11233
11234 Register VReg1 = MRI->createVirtualRegister(TRC);
11235 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
11236 .addReg(VReg1, RegState::Define)
11238 .addImm(0)
11240 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11241 .addReg(NewVReg1)
11242 .addReg(VReg1, RegState::Kill)
11244 }
11245
11246 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
11247 .addMBB(TrapBB)
11249 .addReg(ARM::CPSR);
11250
11251 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11252 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
11253 .addReg(NewVReg1)
11256 .add(condCodeOp());
11257 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11258 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
11259 .addJumpTableIndex(MJTI)
11261
11262 MachineMemOperand *JTMMOLd =
11263 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11265 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11266 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11267 .addReg(NewVReg3, RegState::Kill)
11268 .addReg(NewVReg4)
11269 .addImm(0)
11270 .addMemOperand(JTMMOLd)
11272
11273 if (IsPositionIndependent) {
11274 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11275 .addReg(NewVReg5, RegState::Kill)
11276 .addReg(NewVReg4)
11277 .addJumpTableIndex(MJTI);
11278 } else {
11279 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11280 .addReg(NewVReg5, RegState::Kill)
11281 .addJumpTableIndex(MJTI);
11282 }
11283 }
11284
11285 // Add the jump table entries as successors to the MBB.
11286 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
11287 for (MachineBasicBlock *CurMBB : LPadList) {
11288 if (SeenMBBs.insert(CurMBB).second)
11289 DispContBB->addSuccessor(CurMBB);
11290 }
11291
11292 // N.B. the order the invoke BBs are processed in doesn't matter here.
11293 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11295 for (MachineBasicBlock *BB : InvokeBBs) {
11296
11297 // Remove the landing pad successor from the invoke block and replace it
11298 // with the new dispatch block.
11299 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11300 while (!Successors.empty()) {
11301 MachineBasicBlock *SMBB = Successors.pop_back_val();
11302 if (SMBB->isEHPad()) {
11303 BB->removeSuccessor(SMBB);
11304 MBBLPads.push_back(SMBB);
11305 }
11306 }
11307
11308 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11309 BB->normalizeSuccProbs();
11310
11311 // Find the invoke call and mark all of the callee-saved registers as
11312 // 'implicitly defined' so that they're spilled. This prevents code from
11313 // moving instructions to before the EH block, where they will never be
11314 // executed.
11316 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11317 if (!II->isCall()) continue;
11318
11319 DenseSet<unsigned> DefRegs;
11321 OI = II->operands_begin(), OE = II->operands_end();
11322 OI != OE; ++OI) {
11323 if (!OI->isReg()) continue;
11324 DefRegs.insert(OI->getReg());
11325 }
11326
11327 MachineInstrBuilder MIB(*MF, &*II);
11328
11329 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11330 unsigned Reg = SavedRegs[i];
11331 if (Subtarget->isThumb2() &&
11332 !ARM::tGPRRegClass.contains(Reg) &&
11333 !ARM::hGPRRegClass.contains(Reg))
11334 continue;
11335 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11336 continue;
11337 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11338 continue;
11339 if (!DefRegs.contains(Reg))
11341 }
11342
11343 break;
11344 }
11345 }
11346
11347 // Mark all former landing pads as non-landing pads. The dispatch is the only
11348 // landing pad now.
11349 for (MachineBasicBlock *MBBLPad : MBBLPads)
11350 MBBLPad->setIsEHPad(false);
11351
11352 // The instruction is gone now.
11353 MI.eraseFromParent();
11354}
11355
11356static
11357MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
11358 for (MachineBasicBlock *S : MBB->successors())
11359 if (S != Succ)
11360 return S;
11361 llvm_unreachable("Expecting a BB with two successors!");
11362}
11363
11364/// Return the load opcode for a given load size. If the load size is >= 8, a
11365/// NEON opcode will be returned.
11366static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11367 if (LdSize >= 8)
11368 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11369 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11370 if (IsThumb1)
11371 return LdSize == 4 ? ARM::tLDRi
11372 : LdSize == 2 ? ARM::tLDRHi
11373 : LdSize == 1 ? ARM::tLDRBi : 0;
11374 if (IsThumb2)
11375 return LdSize == 4 ? ARM::t2LDR_POST
11376 : LdSize == 2 ? ARM::t2LDRH_POST
11377 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11378 return LdSize == 4 ? ARM::LDR_POST_IMM
11379 : LdSize == 2 ? ARM::LDRH_POST
11380 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11381}
11382
11383/// Return the store opcode for a given store size. If the store size is >= 8, a
11384/// NEON opcode will be returned.
11385static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11386 if (StSize >= 8)
11387 return StSize == 16 ? ARM::VST1q32wb_fixed
11388 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11389 if (IsThumb1)
11390 return StSize == 4 ? ARM::tSTRi
11391 : StSize == 2 ? ARM::tSTRHi
11392 : StSize == 1 ? ARM::tSTRBi : 0;
11393 if (IsThumb2)
11394 return StSize == 4 ? ARM::t2STR_POST
11395 : StSize == 2 ? ARM::t2STRH_POST
11396 : StSize == 1 ? ARM::t2STRB_POST : 0;
11397 return StSize == 4 ? ARM::STR_POST_IMM
11398 : StSize == 2 ? ARM::STRH_POST
11399 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11400}
11401
11402/// Emit a post-increment load operation with the given size. The instructions
11403/// will be added to BB at Pos.
11405 const TargetInstrInfo *TII, const DebugLoc &dl,
11406 unsigned LdSize, unsigned Data, unsigned AddrIn,
11407 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11408 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11409 assert(LdOpc != 0 && "Should have a load opcode");
11410 if (LdSize >= 8) {
11411 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11412 .addReg(AddrOut, RegState::Define)
11413 .addReg(AddrIn)
11414 .addImm(0)
11416 } else if (IsThumb1) {
11417 // load + update AddrIn
11418 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11419 .addReg(AddrIn)
11420 .addImm(0)
11422 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11423 .add(t1CondCodeOp())
11424 .addReg(AddrIn)
11425 .addImm(LdSize)
11427 } else if (IsThumb2) {
11428 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11429 .addReg(AddrOut, RegState::Define)
11430 .addReg(AddrIn)
11431 .addImm(LdSize)
11433 } else { // arm
11434 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11435 .addReg(AddrOut, RegState::Define)
11436 .addReg(AddrIn)
11437 .addReg(0)
11438 .addImm(LdSize)
11440 }
11441}
11442
11443/// Emit a post-increment store operation with the given size. The instructions
11444/// will be added to BB at Pos.
11446 const TargetInstrInfo *TII, const DebugLoc &dl,
11447 unsigned StSize, unsigned Data, unsigned AddrIn,
11448 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11449 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11450 assert(StOpc != 0 && "Should have a store opcode");
11451 if (StSize >= 8) {
11452 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11453 .addReg(AddrIn)
11454 .addImm(0)
11455 .addReg(Data)
11457 } else if (IsThumb1) {
11458 // store + update AddrIn
11459 BuildMI(*BB, Pos, dl, TII->get(StOpc))
11460 .addReg(Data)
11461 .addReg(AddrIn)
11462 .addImm(0)
11464 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11465 .add(t1CondCodeOp())
11466 .addReg(AddrIn)
11467 .addImm(StSize)
11469 } else if (IsThumb2) {
11470 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11471 .addReg(Data)
11472 .addReg(AddrIn)
11473 .addImm(StSize)
11475 } else { // arm
11476 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11477 .addReg(Data)
11478 .addReg(AddrIn)
11479 .addReg(0)
11480 .addImm(StSize)
11482 }
11483}
11484
11486ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11487 MachineBasicBlock *BB) const {
11488 // This pseudo instruction has 4 operands: dst, src, size, alignment.
11489 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11490 // Otherwise, we will generate unrolled scalar copies.
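// For example, a 40-byte copy with 4-byte alignment that is under the inline
// threshold is unrolled into ten LDR_POST/STR_POST pairs; a larger copy
// becomes a counted loop plus a byte-wise epilogue for the remainder.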
11491 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11492 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11494
11495 Register dest = MI.getOperand(0).getReg();
11496 Register src = MI.getOperand(1).getReg();
11497 unsigned SizeVal = MI.getOperand(2).getImm();
11498 unsigned Alignment = MI.getOperand(3).getImm();
11499 DebugLoc dl = MI.getDebugLoc();
11500
11501 MachineFunction *MF = BB->getParent();
11502 MachineRegisterInfo &MRI = MF->getRegInfo();
11503 unsigned UnitSize = 0;
11504 const TargetRegisterClass *TRC = nullptr;
11505 const TargetRegisterClass *VecTRC = nullptr;
11506
11507 bool IsThumb1 = Subtarget->isThumb1Only();
11508 bool IsThumb2 = Subtarget->isThumb2();
11509 bool IsThumb = Subtarget->isThumb();
11510
11511 if (Alignment & 1) {
11512 UnitSize = 1;
11513 } else if (Alignment & 2) {
11514 UnitSize = 2;
11515 } else {
11516 // Check whether we can use NEON instructions.
11517 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11518 Subtarget->hasNEON()) {
11519 if ((Alignment % 16 == 0) && SizeVal >= 16)
11520 UnitSize = 16;
11521 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11522 UnitSize = 8;
11523 }
11524 // Can't use NEON instructions.
11525 if (UnitSize == 0)
11526 UnitSize = 4;
11527 }
11528
11529 // Select the correct opcode and register class for unit size load/store
11530 bool IsNeon = UnitSize >= 8;
11531 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11532 if (IsNeon)
11533 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11534 : UnitSize == 8 ? &ARM::DPRRegClass
11535 : nullptr;
11536
11537 unsigned BytesLeft = SizeVal % UnitSize;
11538 unsigned LoopSize = SizeVal - BytesLeft;
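// e.g. SizeVal = 37 with 16-byte alignment and NEON available gives
// UnitSize = 16, LoopSize = 32 and BytesLeft = 5.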
11539
11540 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11541 // Use LDR and STR to copy.
11542 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11543 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11544 unsigned srcIn = src;
11545 unsigned destIn = dest;
11546 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11547 Register srcOut = MRI.createVirtualRegister(TRC);
11548 Register destOut = MRI.createVirtualRegister(TRC);
11549 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11550 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11551 IsThumb1, IsThumb2);
11552 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11553 IsThumb1, IsThumb2);
11554 srcIn = srcOut;
11555 destIn = destOut;
11556 }
11557
11558 // Handle the leftover bytes with LDRB and STRB.
11559 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11560 // [destOut] = STRB_POST(scratch, destIn, 1)
11561 for (unsigned i = 0; i < BytesLeft; i++) {
11562 Register srcOut = MRI.createVirtualRegister(TRC);
11563 Register destOut = MRI.createVirtualRegister(TRC);
11564 Register scratch = MRI.createVirtualRegister(TRC);
11565 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11566 IsThumb1, IsThumb2);
11567 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11568 IsThumb1, IsThumb2);
11569 srcIn = srcOut;
11570 destIn = destOut;
11571 }
11572 MI.eraseFromParent(); // The instruction is gone now.
11573 return BB;
11574 }
11575
11576 // Expand the pseudo op to a loop.
11577 // thisMBB:
11578 // ...
11579 // movw varEnd, # --> with thumb2
11580 // movt varEnd, #
11581 // ldrcp varEnd, idx --> without thumb2
11582 // fallthrough --> loopMBB
11583 // loopMBB:
11584 // PHI varPhi, varEnd, varLoop
11585 // PHI srcPhi, src, srcLoop
11586 // PHI destPhi, dst, destLoop
11587 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11588 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11589 // subs varLoop, varPhi, #UnitSize
11590 // bne loopMBB
11591 // fallthrough --> exitMBB
11592 // exitMBB:
11593 // epilogue to handle left-over bytes
11594 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11595 // [destOut] = STRB_POST(scratch, destLoop, 1)
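// The loop below copies LoopSize bytes in UnitSize chunks; the exitMBB
// epilogue then copies the remaining BytesLeft bytes one byte at a time.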
11596 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11597 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11598 MF->insert(It, loopMBB);
11599 MF->insert(It, exitMBB);
11600
11601 // Set the call frame size on entry to the new basic blocks.
11602 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11603 loopMBB->setCallFrameSize(CallFrameSize);
11604 exitMBB->setCallFrameSize(CallFrameSize);
11605
11606 // Transfer the remainder of BB and its successor edges to exitMBB.
11607 exitMBB->splice(exitMBB->begin(), BB,
11608 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11610
11611 // Load an immediate into varEnd.
11612 Register varEnd = MRI.createVirtualRegister(TRC);
11613 if (Subtarget->useMovt()) {
11614 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11615 varEnd)
11616 .addImm(LoopSize);
11617 } else if (Subtarget->genExecuteOnly()) {
11618 assert(IsThumb && "Non-thumb expected to have used movt");
11619 BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11620 } else {
11621 MachineConstantPool *ConstantPool = MF->getConstantPool();
11623 const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11624
11625 // MachineConstantPool wants an explicit alignment.
11626 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11627 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11628 MachineMemOperand *CPMMO =
11631
11632 if (IsThumb)
11633 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11634 .addReg(varEnd, RegState::Define)
11637 .addMemOperand(CPMMO);
11638 else
11639 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11640 .addReg(varEnd, RegState::Define)
11642 .addImm(0)
11644 .addMemOperand(CPMMO);
11645 }
11646 BB->addSuccessor(loopMBB);
11647
11648 // Generate the loop body:
11649 // varPhi = PHI(varLoop, varEnd)
11650 // srcPhi = PHI(srcLoop, src)
11651 // destPhi = PHI(destLoop, dst)
11652 MachineBasicBlock *entryBB = BB;
11653 BB = loopMBB;
11654 Register varLoop = MRI.createVirtualRegister(TRC);
11655 Register varPhi = MRI.createVirtualRegister(TRC);
11656 Register srcLoop = MRI.createVirtualRegister(TRC);
11657 Register srcPhi = MRI.createVirtualRegister(TRC);
11658 Register destLoop = MRI.createVirtualRegister(TRC);
11659 Register destPhi = MRI.createVirtualRegister(TRC);
11660
11661 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11662 .addReg(varLoop).addMBB(loopMBB)
11663 .addReg(varEnd).addMBB(entryBB);
11664 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11665 .addReg(srcLoop).addMBB(loopMBB)
11666 .addReg(src).addMBB(entryBB);
11667 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11668 .addReg(destLoop).addMBB(loopMBB)
11669 .addReg(dest).addMBB(entryBB);
11670
11671 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11672 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11673 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11674 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11675 IsThumb1, IsThumb2);
11676 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11677 IsThumb1, IsThumb2);
11678
11679 // Decrement loop variable by UnitSize.
11680 if (IsThumb1) {
11681 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11682 .add(t1CondCodeOp())
11683 .addReg(varPhi)
11684 .addImm(UnitSize)
11686 } else {
11687 MachineInstrBuilder MIB =
11688 BuildMI(*BB, BB->end(), dl,
11689 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11690 MIB.addReg(varPhi)
11691 .addImm(UnitSize)
11693 .add(condCodeOp());
11694 MIB->getOperand(5).setReg(ARM::CPSR);
11695 MIB->getOperand(5).setIsDef(true);
11696 }
11697 BuildMI(*BB, BB->end(), dl,
11698 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11699 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11700
11701 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11702 BB->addSuccessor(loopMBB);
11703 BB->addSuccessor(exitMBB);
11704
11705 // Add epilogue to handle BytesLeft.
11706 BB = exitMBB;
11707 auto StartOfExit = exitMBB->begin();
11708
11709 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11710 // [destOut] = STRB_POST(scratch, destLoop, 1)
11711 unsigned srcIn = srcLoop;
11712 unsigned destIn = destLoop;
11713 for (unsigned i = 0; i < BytesLeft; i++) {
11714 Register srcOut = MRI.createVirtualRegister(TRC);
11715 Register destOut = MRI.createVirtualRegister(TRC);
11716 Register scratch = MRI.createVirtualRegister(TRC);
11717 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11718 IsThumb1, IsThumb2);
11719 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11720 IsThumb1, IsThumb2);
11721 srcIn = srcOut;
11722 destIn = destOut;
11723 }
11724
11725 MI.eraseFromParent(); // The instruction is gone now.
11726 return BB;
11727}
11728
11730ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11731 MachineBasicBlock *MBB) const {
11732 const TargetMachine &TM = getTargetMachine();
11733 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11734 DebugLoc DL = MI.getDebugLoc();
11735
11736 assert(Subtarget->isTargetWindows() &&
11737 "__chkstk is only supported on Windows");
11738 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11739
11740 // __chkstk takes the number of words to allocate on the stack in R4, and
11741 // returns the stack adjustment in number of bytes in R4. This will not
11742 // clobber any other registers (other than the obvious lr).
11743 //
11744 // Although, technically, IP should be considered a register which may be
11745 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11746 // thumb-2 environment, so there is no interworking required. As a result, we
11747 // do not expect a veneer to be emitted by the linker, clobbering IP.
11748 //
11749 // Each module receives its own copy of __chkstk, so no import thunk is
11750 // required, again, ensuring that IP is not clobbered.
11751 //
11752 // Finally, although some linkers may theoretically provide a trampoline for
11753 // out of range calls (which is quite common due to a 32M range limitation of
11754 // branches for Thumb), we can generate the long-call version via
11755 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11756 // IP.
11757
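// For example, to allocate 4 KiB the caller places 1024 (words) in R4; on
// return R4 holds 4096 (bytes), which the t2SUBrr emitted below subtracts
// from SP.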
11758 switch (TM.getCodeModel()) {
11759 case CodeModel::Tiny:
11760 llvm_unreachable("Tiny code model not available on ARM.");
11761 case CodeModel::Small:
11762 case CodeModel::Medium:
11763 case CodeModel::Kernel:
11764 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11766 .addExternalSymbol("__chkstk")
11769 .addReg(ARM::R12,
11771 .addReg(ARM::CPSR,
11773 break;
11774 case CodeModel::Large: {
11775 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
11776 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11777
11778 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11779 .addExternalSymbol("__chkstk");
11785 .addReg(ARM::R12,
11787 .addReg(ARM::CPSR,
11789 break;
11790 }
11791 }
11792
11793 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11794 .addReg(ARM::SP, RegState::Kill)
11795 .addReg(ARM::R4, RegState::Kill)
11798 .add(condCodeOp());
11799
11800 MI.eraseFromParent();
11801 return MBB;
11802}
11803
11805ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11806 MachineBasicBlock *MBB) const {
11807 DebugLoc DL = MI.getDebugLoc();
11808 MachineFunction *MF = MBB->getParent();
11809 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11810
11811 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
11812 MF->insert(++MBB->getIterator(), ContBB);
11813 ContBB->splice(ContBB->begin(), MBB,
11814 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11816 MBB->addSuccessor(ContBB);
11817
11818 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11819 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11820 MF->push_back(TrapBB);
11821 MBB->addSuccessor(TrapBB);
11822
11823 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11824 .addReg(MI.getOperand(0).getReg())
11825 .addImm(0)
11827 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11828 .addMBB(TrapBB)
11830 .addReg(ARM::CPSR);
11831
11832 MI.eraseFromParent();
11833 return ContBB;
11834}
11835
11836// The CPSR operand of SelectItr might be missing a kill marker
11837// because there were multiple uses of CPSR, and ISel didn't know
11838// which to mark. Figure out whether SelectItr should have had a
11839// kill marker, and set it if it should. Returns the correct kill
11840// marker value.
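// In short: if nothing between SelectItr and the next CPSR def reads CPSR,
// and CPSR is not live into any successor, then SelectItr holds the last use
// and receives the kill flag.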
11843 const TargetRegisterInfo* TRI) {
11844 // Scan forward through BB for a use/def of CPSR.
11845 MachineBasicBlock::iterator miI(std::next(SelectItr));
11846 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11847 const MachineInstr& mi = *miI;
11848 if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11849 return false;
11850 if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11851 break; // Should have kill-flag - update below.
11852 }
11853
11854 // If we hit the end of the block, check whether CPSR is live into a
11855 // successor.
11856 if (miI == BB->end()) {
11857 for (MachineBasicBlock *Succ : BB->successors())
11858 if (Succ->isLiveIn(ARM::CPSR))
11859 return false;
11860 }
11861
11862 // We found a def, or hit the end of the basic block and CPSR wasn't live
11863 // out. SelectMI should have a kill flag on CPSR.
11864 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11865 return true;
11866}
11867
11868/// Adds logic in the loop entry MBB to calculate the loop iteration count and
11869/// adds t2WhileLoopSetup and t2WhileLoopStart to generate a WLS loop.
11871 MachineBasicBlock *TpLoopBody,
11872 MachineBasicBlock *TpExit, Register OpSizeReg,
11873 const TargetInstrInfo *TII, DebugLoc Dl,
11875 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
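// e.g. n = 35 gives (35 + 15) >> 4 = 3 iterations: two full 16-byte vectors
// plus a final, predicated 3-byte iteration.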
11876 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11877 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11878 .addUse(OpSizeReg)
11879 .addImm(15)
11881 .addReg(0);
11882
11883 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11884 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11885 .addUse(AddDestReg, RegState::Kill)
11886 .addImm(4)
11888 .addReg(0);
11889
11890 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11891 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11892 .addUse(LsrDestReg, RegState::Kill);
11893
11894 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11895 .addUse(TotalIterationsReg)
11896 .addMBB(TpExit);
11897
11898 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11899 .addMBB(TpLoopBody)
11901
11902 return TotalIterationsReg;
11903}
11904
11905/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
11906/// t2DoLoopEnd. These are used by later passes to generate tail predicated
11907/// loops.
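/// The body builds a byte-lane predicate with MVE_VCTP8 from the remaining
/// element count, so the final iteration only touches the leftover bytes.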
11908static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11909 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11910 const TargetInstrInfo *TII, DebugLoc Dl,
11911 MachineRegisterInfo &MRI, Register OpSrcReg,
11912 Register OpDestReg, Register ElementCountReg,
11913 Register TotalIterationsReg, bool IsMemcpy) {
11914 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
11915 // array, loop iteration counter, predication counter.
11916
11917 Register SrcPhiReg, CurrSrcReg;
11918 if (IsMemcpy) {
11919 // Current position in the src array
11920 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11921 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11922 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11923 .addUse(OpSrcReg)
11924 .addMBB(TpEntry)
11925 .addUse(CurrSrcReg)
11926 .addMBB(TpLoopBody);
11927 }
11928
11929 // Current position in the dest array
11930 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11931 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11932 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11933 .addUse(OpDestReg)
11934 .addMBB(TpEntry)
11935 .addUse(CurrDestReg)
11936 .addMBB(TpLoopBody);
11937
11938 // Current loop counter
11939 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11940 Register RemainingLoopIterationsReg =
11941 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11942 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11943 .addUse(TotalIterationsReg)
11944 .addMBB(TpEntry)
11945 .addUse(RemainingLoopIterationsReg)
11946 .addMBB(TpLoopBody);
11947
11948 // Predication counter
11949 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11950 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11951 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11952 .addUse(ElementCountReg)
11953 .addMBB(TpEntry)
11954 .addUse(RemainingElementsReg)
11955 .addMBB(TpLoopBody);
11956
11957 // Pass predication counter to VCTP
11958 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11959 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11960 .addUse(PredCounterPhiReg)
11962 .addReg(0)
11963 .addReg(0);
11964
11965 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11966 .addUse(PredCounterPhiReg)
11967 .addImm(16)
11969 .addReg(0);
11970
11971 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11972 Register SrcValueReg;
11973 if (IsMemcpy) {
11974 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11975 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11976 .addDef(CurrSrcReg)
11977 .addDef(SrcValueReg)
11978 .addReg(SrcPhiReg)
11979 .addImm(16)
11981 .addUse(VccrReg)
11982 .addReg(0);
11983 } else
11984 SrcValueReg = OpSrcReg;
11985
11986 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11987 .addDef(CurrDestReg)
11988 .addUse(SrcValueReg)
11989 .addReg(DestPhiReg)
11990 .addImm(16)
11992 .addUse(VccrReg)
11993 .addReg(0);
11994
11995 // Add the pseudo instructions for decrementing the loop counter and marking
11996 // the end: t2DoLoopDec and t2DoLoopEnd.
11997 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11998 .addUse(LoopCounterPhiReg)
11999 .addImm(1);
12000
12001 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
12002 .addUse(RemainingLoopIterationsReg)
12003 .addMBB(TpLoopBody);
12004
12005 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
12006 .addMBB(TpExit)
12008}
12009
12012 MachineBasicBlock *BB) const {
12013 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
12014 DebugLoc dl = MI.getDebugLoc();
12015 bool isThumb2 = Subtarget->isThumb2();
12016 switch (MI.getOpcode()) {
12017 default: {
12018 MI.print(errs());
12019 llvm_unreachable("Unexpected instr type to insert");
12020 }
12021
12022 // Thumb1 post-indexed loads are really just single-register LDMs.
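// e.g. a post-indexed ldr r0, [r1], #4 has the same effect as ldm r1!, {r0}:
// load r0 from [r1] and advance the base by 4.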
12023 case ARM::tLDR_postidx: {
12024 MachineOperand Def(MI.getOperand(1));
12025 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
12026 .add(Def) // Rn_wb
12027 .add(MI.getOperand(2)) // Rn
12028 .add(MI.getOperand(3)) // PredImm
12029 .add(MI.getOperand(4)) // PredReg
12030 .add(MI.getOperand(0)) // Rt
12031 .cloneMemRefs(MI);
12032 MI.eraseFromParent();
12033 return BB;
12034 }
12035
12036 case ARM::MVE_MEMCPYLOOPINST:
12037 case ARM::MVE_MEMSETLOOPINST: {
12038
12039 // The transformation below expands the MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST pseudo
12040 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
12041 // the iteration count = ceil(size_in_bytes/16) in the TP entry block and
12042 // adds the relevant instructions in the TP loop Body for generation of a
12043 // WLSTP loop.
12044
12045 // Below is the relevant portion of the CFG after the transformation.
12046 // The Machine Basic Blocks are shown along with branch conditions (in
12047 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
12048 // portion of the CFG and may not necessarily be the entry/exit of the
12049 // function.
12050
12051 // (Relevant) CFG after transformation:
12052 // TP entry MBB
12053 // |
12054 // |-----------------|
12055 // (n <= 0) (n > 0)
12056 // | |
12057 // | TP loop Body MBB<--|
12058 // | | |
12059 // \ |___________|
12060 // \ /
12061 // TP exit MBB
12062
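// Here n is the byte count held in OpSizeReg; the t2WhileLoopStart emitted in
// the TP entry block branches directly to the TP exit MBB when the computed
// trip count is zero.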
12063 MachineFunction *MF = BB->getParent();
12064 MachineFunctionProperties &Properties = MF->getProperties();
12066
12067 Register OpDestReg = MI.getOperand(0).getReg();
12068 Register OpSrcReg = MI.getOperand(1).getReg();
12069 Register OpSizeReg = MI.getOperand(2).getReg();
12070
12071 // Allocate the required MBBs and add to parent function.
12072 MachineBasicBlock *TpEntry = BB;
12073 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
12074 MachineBasicBlock *TpExit;
12075
12076 MF->push_back(TpLoopBody);
12077
12078 // If any instructions are present in the current block after
12079 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
12080 // move the instructions into the newly created exit block. If there are no
12081 // instructions, add an explicit branch to the FallThrough block and then
12082 // split.
12083 //
12084 // The split is required for two reasons:
12085 // 1) A terminator (t2WhileLoopStart) will be placed at that site.
12086 // 2) Since a TPLoopBody will be added later, any phis in successor blocks
12087 // need to be updated. splitAt() already handles this.
12088 TpExit = BB->splitAt(MI, false);
12089 if (TpExit == BB) {
12090 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
12091 "block containing memcpy/memset Pseudo");
12092 TpExit = BB->getFallThrough();
12093 BuildMI(BB, dl, TII->get(ARM::t2B))
12094 .addMBB(TpExit)
12096 TpExit = BB->splitAt(MI, false);
12097 }
12098
12099 // Add logic for iteration count
12100 Register TotalIterationsReg =
12101 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
12102
12103 // Add the vectorized (and predicated) loads/store instructions
12104 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
12105 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
12106 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
12107
12108 // Required to avoid conflict with the MachineVerifier during testing.
12109 Properties.resetNoPHIs();
12110
12111 // Connect the blocks
12112 TpEntry->addSuccessor(TpLoopBody);
12113 TpLoopBody->addSuccessor(TpLoopBody);
12114 TpLoopBody->addSuccessor(TpExit);
12115
12116 // Reorder for a more natural layout
12117 TpLoopBody->moveAfter(TpEntry);
12118 TpExit->moveAfter(TpLoopBody);
12119
12120 // Finally, remove the memcpy Pseudo Instruction
12121 MI.eraseFromParent();
12122
12123 // Return the exit block as it may contain other instructions requiring a
12124 // custom inserter
12125 return TpExit;
12126 }
12127
12128 // The Thumb2 pre-indexed stores have the same MI operands; they just
12129 // define them differently in the .td files from the isel patterns, so
12130 // they need pseudos.
12131 case ARM::t2STR_preidx:
12132 MI.setDesc(TII->get(ARM::t2STR_PRE));
12133 return BB;
12134 case ARM::t2STRB_preidx:
12135 MI.setDesc(TII->get(ARM::t2STRB_PRE));
12136 return BB;
12137 case ARM::t2STRH_preidx:
12138 MI.setDesc(TII->get(ARM::t2STRH_PRE));
12139 return BB;
12140
12141 case ARM::STRi_preidx:
12142 case ARM::STRBi_preidx: {
12143 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
12144 : ARM::STRB_PRE_IMM;
12145 // Decode the offset.
12146 unsigned Offset = MI.getOperand(4).getImm();
12147 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
12149 if (isSub)
12150 Offset = -Offset;
12151
12152 MachineMemOperand *MMO = *MI.memoperands_begin();
12153 BuildMI(*BB, MI, dl, TII->get(NewOpc))
12154 .add(MI.getOperand(0)) // Rn_wb
12155 .add(MI.getOperand(1)) // Rt
12156 .add(MI.getOperand(2)) // Rn
12157 .addImm(Offset) // offset (skip GPR==zero_reg)
12158 .add(MI.getOperand(5)) // pred
12159 .add(MI.getOperand(6))
12160 .addMemOperand(MMO);
12161 MI.eraseFromParent();
12162 return BB;
12163 }
12164 case ARM::STRr_preidx:
12165 case ARM::STRBr_preidx:
12166 case ARM::STRH_preidx: {
12167 unsigned NewOpc;
12168 switch (MI.getOpcode()) {
12169 default: llvm_unreachable("unexpected opcode!");
12170 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
12171 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
12172 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
12173 }
12174 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
12175 for (const MachineOperand &MO : MI.operands())
12176 MIB.add(MO);
12177 MI.eraseFromParent();
12178 return BB;
12179 }
12180
12181 case ARM::tMOVCCr_pseudo: {
12182 // To "insert" a SELECT_CC instruction, we actually have to insert the
12183 // diamond control-flow pattern. The incoming instruction knows the
12184 // destination vreg to set, the condition code register to branch on, the
12185 // true/false values to select between, and a branch opcode to use.
12186 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12188
12189 // thisMBB:
12190 // ...
12191 // TrueVal = ...
12192 // cmpTY ccX, r1, r2
12193 // bCC copy1MBB
12194 // fallthrough --> copy0MBB
12195 MachineBasicBlock *thisMBB = BB;
12196 MachineFunction *F = BB->getParent();
12197 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12198 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12199 F->insert(It, copy0MBB);
12200 F->insert(It, sinkMBB);
12201
12202 // Set the call frame size on entry to the new basic blocks.
12203 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12204 copy0MBB->setCallFrameSize(CallFrameSize);
12205 sinkMBB->setCallFrameSize(CallFrameSize);
12206
12207 // Check whether CPSR is live past the tMOVCCr_pseudo.
12208 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12209 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12210 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12211 copy0MBB->addLiveIn(ARM::CPSR);
12212 sinkMBB->addLiveIn(ARM::CPSR);
12213 }
12214
12215 // Transfer the remainder of BB and its successor edges to sinkMBB.
12216 sinkMBB->splice(sinkMBB->begin(), BB,
12217 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12219
12220 BB->addSuccessor(copy0MBB);
12221 BB->addSuccessor(sinkMBB);
12222
12223 BuildMI(BB, dl, TII->get(ARM::tBcc))
12224 .addMBB(sinkMBB)
12225 .addImm(MI.getOperand(3).getImm())
12226 .addReg(MI.getOperand(4).getReg());
12227
12228 // copy0MBB:
12229 // %FalseValue = ...
12230 // # fallthrough to sinkMBB
12231 BB = copy0MBB;
12232
12233 // Update machine-CFG edges
12234 BB->addSuccessor(sinkMBB);
12235
12236 // sinkMBB:
12237 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12238 // ...
12239 BB = sinkMBB;
12240 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12241 .addReg(MI.getOperand(1).getReg())
12242 .addMBB(copy0MBB)
12243 .addReg(MI.getOperand(2).getReg())
12244 .addMBB(thisMBB);
12245
12246 MI.eraseFromParent(); // The pseudo instruction is gone now.
12247 return BB;
12248 }
12249
12250 case ARM::BCCi64:
12251 case ARM::BCCZi64: {
12252 // If there is an unconditional branch to the other successor, remove it.
12253 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12254
12255 // Compare both parts that make up the double comparison separately for
12256 // equality.
12257 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12258
12259 Register LHS1 = MI.getOperand(1).getReg();
12260 Register LHS2 = MI.getOperand(2).getReg();
12261 if (RHSisZero) {
12262 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12263 .addReg(LHS1)
12264 .addImm(0)
12266 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12267 .addReg(LHS2).addImm(0)
12268 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12269 } else {
12270 Register RHS1 = MI.getOperand(3).getReg();
12271 Register RHS2 = MI.getOperand(4).getReg();
12272 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12273 .addReg(LHS1)
12274 .addReg(RHS1)
12276 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12277 .addReg(LHS2).addReg(RHS2)
12278 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12279 }
12280
12281 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12282 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
12283 if (MI.getOperand(0).getImm() == ARMCC::NE)
12284 std::swap(destMBB, exitMBB);
12285
12286 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12287 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12288 if (isThumb2)
12289 BuildMI(BB, dl, TII->get(ARM::t2B))
12290 .addMBB(exitMBB)
12292 else
12293 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12294
12295 MI.eraseFromParent(); // The pseudo instruction is gone now.
12296 return BB;
12297 }
12298
12299 case ARM::Int_eh_sjlj_setjmp:
12300 case ARM::Int_eh_sjlj_setjmp_nofp:
12301 case ARM::tInt_eh_sjlj_setjmp:
12302 case ARM::t2Int_eh_sjlj_setjmp:
12303 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12304 return BB;
12305
12306 case ARM::Int_eh_sjlj_setup_dispatch:
12307 EmitSjLjDispatchBlock(MI, BB);
12308 return BB;
12309 case ARM::COPY_STRUCT_BYVAL_I32:
12310 ++NumLoopByVals;
12311 return EmitStructByval(MI, BB);
12312 case ARM::WIN__CHKSTK:
12313 return EmitLowered__chkstk(MI, BB);
12314 case ARM::WIN__DBZCHK:
12315 return EmitLowered__dbzchk(MI, BB);
12316 }
12317}
12318
12319/// Attaches vregs to MEMCPY that it will use as scratch registers
12320/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12321/// instead of as a custom inserter because we need the use list from the SDNode.
12322static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12323 MachineInstr &MI, const SDNode *Node) {
12324 bool isThumb1 = Subtarget->isThumb1Only();
12325
12326 MachineFunction *MF = MI.getParent()->getParent();
12328 MachineInstrBuilder MIB(*MF, MI);
12329
12330 // If the new dst/src is unused, mark it as dead.
12331 if (!Node->hasAnyUseOfValue(0)) {
12332 MI.getOperand(0).setIsDead(true);
12333 }
12334 if (!Node->hasAnyUseOfValue(1)) {
12335 MI.getOperand(1).setIsDead(true);
12336 }
12337
12338 // The MEMCPY both defines and kills the scratch registers.
12339 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12340 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12341 : &ARM::GPRRegClass);
12343 }
12344}
12345
12347 SDNode *Node) const {
12348 if (MI.getOpcode() == ARM::MEMCPY) {
12349 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12350 return;
12351 }
12352
12353 const MCInstrDesc *MCID = &MI.getDesc();
12354 // Adjust instructions that potentially set the 's' bit after isel, i.e. ADC, SBC, RSB,
12355 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12356 // operand is still set to noreg. If needed, set the optional operand's
12357 // register to CPSR, and remove the redundant implicit def.
12358 //
12359 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12360
12361 // Rename pseudo opcodes.
12362 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12363 unsigned ccOutIdx;
12364 if (NewOpc) {
12365 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12366 MCID = &TII->get(NewOpc);
12367
12368 assert(MCID->getNumOperands() ==
12369 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12370 && "converted opcode should be the same except for cc_out"
12371 " (and, on Thumb1, pred)");
12372
12373 MI.setDesc(*MCID);
12374
12375 // Add the optional cc_out operand
12376 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12377
12378 // On Thumb1, move all input operands to the end, then add the predicate
12379 if (Subtarget->isThumb1Only()) {
12380 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12381 MI.addOperand(MI.getOperand(1));
12382 MI.removeOperand(1);
12383 }
12384
12385 // Restore the ties
12386 for (unsigned i = MI.getNumOperands(); i--;) {
12387 const MachineOperand& op = MI.getOperand(i);
12388 if (op.isReg() && op.isUse()) {
12389 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12390 if (DefIdx != -1)
12391 MI.tieOperands(DefIdx, i);
12392 }
12393 }
12394
12396 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
12397 ccOutIdx = 1;
12398 } else
12399 ccOutIdx = MCID->getNumOperands() - 1;
12400 } else
12401 ccOutIdx = MCID->getNumOperands() - 1;
12402
12403 // Any ARM instruction that sets the 's' bit should specify an optional
12404 // "cc_out" operand in the last operand position.
12405 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12406 assert(!NewOpc && "Optional cc_out operand required");
12407 return;
12408 }
12409 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12410 // since we already have an optional CPSR def.
12411 bool definesCPSR = false;
12412 bool deadCPSR = false;
12413 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12414 ++i) {
12415 const MachineOperand &MO = MI.getOperand(i);
12416 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12417 definesCPSR = true;
12418 if (MO.isDead())
12419 deadCPSR = true;
12420 MI.removeOperand(i);
12421 break;
12422 }
12423 }
12424 if (!definesCPSR) {
12425 assert(!NewOpc && "Optional cc_out operand required");
12426 return;
12427 }
12428 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12429 if (deadCPSR) {
12430 assert(!MI.getOperand(ccOutIdx).getReg() &&
12431 "expect uninitialized optional cc_out operand");
12432 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12433 if (!Subtarget->isThumb1Only())
12434 return;
12435 }
12436
12437 // If this instruction was defined with an optional CPSR def and its dag node
12438 // had a live implicit CPSR def, then activate the optional CPSR def.
12439 MachineOperand &MO = MI.getOperand(ccOutIdx);
12440 MO.setReg(ARM::CPSR);
12441 MO.setIsDef(true);
12442}
12443
12444//===----------------------------------------------------------------------===//
12445// ARM Optimization Hooks
12446//===----------------------------------------------------------------------===//
12447
12448// Helper function that checks if N is a null or all ones constant.
12449static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12451}
12452
12453// Return true if N is conditionally 0 or all ones.
12454// Detects these expressions where cc is an i1 value:
12455//
12456// (select cc 0, y) [AllOnes=0]
12457// (select cc y, 0) [AllOnes=0]
12458// (zext cc) [AllOnes=0]
12459// (sext cc) [AllOnes=0/1]
12460// (select cc -1, y) [AllOnes=1]
12461// (select cc y, -1) [AllOnes=1]
12462//
12463// Invert is set when N is the null/all ones constant when CC is false.
12464// OtherOp is set to the alternative value of N.
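// e.g. for N = (select cc, 0, 42) with AllOnes = 0 this returns true with
// Invert = false and OtherOp = 42; for (select cc, 42, 0) Invert is true and
// OtherOp is still 42.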
12466 SDValue &CC, bool &Invert,
12467 SDValue &OtherOp,
12468 SelectionDAG &DAG) {
12469 switch (N->getOpcode()) {
12470 default: return false;
12471 case ISD::SELECT: {
12472 CC = N->getOperand(0);
12473 SDValue N1 = N->getOperand(1);
12474 SDValue N2 = N->getOperand(2);
12475 if (isZeroOrAllOnes(N1, AllOnes)) {
12476 Invert = false;
12477 OtherOp = N2;
12478 return true;
12479 }
12480 if (isZeroOrAllOnes(N2, AllOnes)) {
12481 Invert = true;
12482 OtherOp = N1;
12483 return true;
12484 }
12485 return false;
12486 }
12487 case ISD::ZERO_EXTEND:
12488 // (zext cc) can never be the all ones value.
12489 if (AllOnes)
12490 return false;
12491 [[fallthrough]];
12492 case ISD::SIGN_EXTEND: {
12493 SDLoc dl(N);
12494 EVT VT = N->getValueType(0);
12495 CC = N->getOperand(0);
12496 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12497 return false;
12498 Invert = !AllOnes;
12499 if (AllOnes)
12500 // When looking for an AllOnes constant, N is an sext, and the 'other'
12501 // value is 0.
12502 OtherOp = DAG.getConstant(0, dl, VT);
12503 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12504 // When looking for a 0 constant, N can be zext or sext.
12505 OtherOp = DAG.getConstant(1, dl, VT);
12506 else
12507 OtherOp = DAG.getAllOnesConstant(dl, VT);
12508 return true;
12509 }
12510 }
12511}
12512
12513// Combine a constant select operand into its use:
12514//
12515// (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
12516// (sub x, (select cc, 0, c)) -> (select cc, x, (sub x, c))
12517// (and (select cc, -1, c), x) -> (select cc, x, (and x, c)) [AllOnes=1]
12518// (or (select cc, 0, c), x) -> (select cc, x, (or x, c))
12519// (xor (select cc, 0, c), x) -> (select cc, x, (xor x, c))
12520//
12521// The transform is rejected if the select doesn't have a constant operand that
12522// is null, or all ones when AllOnes is set.
12523//
12524// Also recognize sext/zext from i1:
12525//
12526// (add (zext cc), x) -> (select cc (add x, 1), x)
12527// (add (sext cc), x) -> (select cc (add x, -1), x)
12528//
12529// These transformations eventually create predicated instructions.
12530//
12531// @param N The node to transform.
12532// @param Slct The N operand that is a select.
12533// @param OtherOp The other N operand (x above).
12534// @param DCI Context.
12535// @param AllOnes Require the select constant to be all ones instead of null.
12536// @returns The new node, or SDValue() on failure.
12537static
12540 bool AllOnes = false) {
12541 SelectionDAG &DAG = DCI.DAG;
12542 EVT VT = N->getValueType(0);
12543 SDValue NonConstantVal;
12544 SDValue CCOp;
12545 bool SwapSelectOps;
12546 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12547 NonConstantVal, DAG))
12548 return SDValue();
12549
12550 // Slct is now known to be the desired identity constant when CC is true.
12551 SDValue TrueVal = OtherOp;
12552 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12553 OtherOp, NonConstantVal);
12554 // Unless SwapSelectOps says CC should be false.
12555 if (SwapSelectOps)
12556 std::swap(TrueVal, FalseVal);
12557
12558 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12559 CCOp, TrueVal, FalseVal);
12560}
12561
12562// Attempt combineSelectAndUse on each operand of a commutative operator N.
12563static
12566 SDValue N0 = N->getOperand(0);
12567 SDValue N1 = N->getOperand(1);
12568 if (N0.getNode()->hasOneUse())
12569 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12570 return Result;
12571 if (N1.getNode()->hasOneUse())
12572 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12573 return Result;
12574 return SDValue();
12575}
12576
12578 // VUZP shuffle node.
12579 if (N->getOpcode() == ARMISD::VUZP)
12580 return true;
12581
12582 // "VUZP" on i32 is an alias for VTRN.
12583 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12584 return true;
12585
12586 return false;
12587}
12588
12591 const ARMSubtarget *Subtarget) {
12592 // Look for ADD(VUZP.0, VUZP.1).
12593 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12594 N0 == N1)
12595 return SDValue();
12596
12597 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12598 if (!N->getValueType(0).is64BitVector())
12599 return SDValue();
12600
12601 // Generate vpadd.
12602 SelectionDAG &DAG = DCI.DAG;
12603 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12604 SDLoc dl(N);
12605 SDNode *Unzip = N0.getNode();
12606 EVT VT = N->getValueType(0);
12607
12609 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12610 TLI.getPointerTy(DAG.getDataLayout())));
12611 Ops.push_back(Unzip->getOperand(0));
12612 Ops.push_back(Unzip->getOperand(1));
12613
12614 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12615}
12616
12619 const ARMSubtarget *Subtarget) {
12620 // Check for two extended operands.
12621 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12622 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12623 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12624 N1.getOpcode() == ISD::ZERO_EXTEND))
12625 return SDValue();
12626
12627 SDValue N00 = N0.getOperand(0);
12628 SDValue N10 = N1.getOperand(0);
12629
12630 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
12631 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12632 N00 == N10)
12633 return SDValue();
12634
12635 // We only recognize Q register paddl here; this can't be reached until
12636 // after type legalization.
12637 if (!N00.getValueType().is64BitVector() ||
12639 return SDValue();
12640
12641 // Generate vpaddl.
12642 SelectionDAG &DAG = DCI.DAG;
12643 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12644 SDLoc dl(N);
12645 EVT VT = N->getValueType(0);
12646
12648 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12649 unsigned Opcode;
12650 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12651 Opcode = Intrinsic::arm_neon_vpaddls;
12652 else
12653 Opcode = Intrinsic::arm_neon_vpaddlu;
12654 Ops.push_back(DAG.getConstant(Opcode, dl,
12655 TLI.getPointerTy(DAG.getDataLayout())));
12656 EVT ElemTy = N00.getValueType().getVectorElementType();
12657 unsigned NumElts = VT.getVectorNumElements();
12658 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12659 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12660 N00.getOperand(0), N00.getOperand(1));
12661 Ops.push_back(Concat);
12662
12663 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12664}
12665
12666// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12667// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12668// much easier to match.
12669static SDValue
12672 const ARMSubtarget *Subtarget) {
12673 // Only perform the optimization after legalization, and only if NEON is
12674 // available. We also expect both operands to be BUILD_VECTORs.
12675 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12676 || N0.getOpcode() != ISD::BUILD_VECTOR
12677 || N1.getOpcode() != ISD::BUILD_VECTOR)
12678 return SDValue();
12679
12680 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12681 EVT VT = N->getValueType(0);
12682 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12683 return SDValue();
12684
12685 // Check that the vector operands are of the right form.
12686 // N0 and N1 are BUILD_VECTOR nodes with N EXTRACT_VECTOR operands, where N
12687 // is the size of the formed vector.
12688 // Each EXTRACT_VECTOR should have the same input vector and an odd or even
12689 // index such that we have a pairwise add pattern.
12690
12691 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12692 if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12693 return SDValue();
12694 SDValue Vec = N0->getOperand(0)->getOperand(0);
12695 SDNode *V = Vec.getNode();
12696 unsigned nextIndex = 0;
12697
12698 // For each operand of the ADD (both are BUILD_VECTORs), check that each of
12699 // their operands is an EXTRACT_VECTOR with the same input vector and the
12700 // appropriate index.
12701 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12702 if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
12703 && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12704
12705 SDValue ExtVec0 = N0->getOperand(i);
12706 SDValue ExtVec1 = N1->getOperand(i);
12707
12708 // The first operand is the vector; verify it's the same.
12709 if (V != ExtVec0->getOperand(0).getNode() ||
12710 V != ExtVec1->getOperand(0).getNode())
12711 return SDValue();
12712
12713 // The second is the constant index; verify it's correct.
12714 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
12715 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
12716
12717 // For the constant, we want to see all the even or all the odd.
12718 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12719 || C1->getZExtValue() != nextIndex+1)
12720 return SDValue();
12721
12722 // Increment index.
12723 nextIndex+=2;
12724 } else
12725 return SDValue();
12726 }
12727
12728 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12729 // we're using the entire input vector, otherwise there's a size/legality
12730 // mismatch somewhere.
12731 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12732 VT.getVectorNumElements() != nextIndex*2)
12733 return SDValue();
12734
12735 // Create VPADDL node.
12736 SelectionDAG &DAG = DCI.DAG;
12737 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12738
12739 SDLoc dl(N);
12740
12741 // Build operand list.
12742 SmallVector<SDValue, 8> Ops;
12743 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12744 TLI.getPointerTy(DAG.getDataLayout())));
12745
12746 // Input is the vector.
12747 Ops.push_back(Vec);
12748
12749 // Get widened type and narrowed type.
12750 MVT widenType;
12751 unsigned numElem = VT.getVectorNumElements();
12752
12753 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12754 switch (inputLaneType.getSimpleVT().SimpleTy) {
12755 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12756 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12757 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12758 default:
12759 llvm_unreachable("Invalid vector element type for padd optimization.");
12760 }
12761
12762 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
12763 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12764 return DAG.getNode(ExtOp, dl, VT, tmp);
12765}
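// As a rough illustration: for a v8i8 vector v, the pattern
//   add(build_vector(v[0],v[2],v[4],v[6]), build_vector(v[1],v[3],v[5],v[7]))
// is the pairwise sum <v0+v1, v2+v3, v4+v5, v6+v7>, which is emitted here as a
// single vpaddl of v and then truncated or extended to the add's result type.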
12766
12767static SDValue findMUL_LOHI(SDValue V) {
12768 if (V->getOpcode() == ISD::UMUL_LOHI ||
12769 V->getOpcode() == ISD::SMUL_LOHI)
12770 return V;
12771 return SDValue();
12772}
12773
12774static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12775 TargetLowering::DAGCombinerInfo &DCI,
12776 const ARMSubtarget *Subtarget) {
12777 if (!Subtarget->hasBaseDSP())
12778 return SDValue();
12779
12780 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12781 // accumulates the product into a 64-bit value. The 16-bit values will
12782 // be sign extended somehow or SRA'd into 32-bit values
12783 // (addc (adde (mul 16bit, 16bit), lo), hi)
12784 SDValue Mul = AddcNode->getOperand(0);
12785 SDValue Lo = AddcNode->getOperand(1);
12786 if (Mul.getOpcode() != ISD::MUL) {
12787 Lo = AddcNode->getOperand(0);
12788 Mul = AddcNode->getOperand(1);
12789 if (Mul.getOpcode() != ISD::MUL)
12790 return SDValue();
12791 }
12792
12793 SDValue SRA = AddeNode->getOperand(0);
12794 SDValue Hi = AddeNode->getOperand(1);
12795 if (SRA.getOpcode() != ISD::SRA) {
12796 SRA = AddeNode->getOperand(1);
12797 Hi = AddeNode->getOperand(0);
12798 if (SRA.getOpcode() != ISD::SRA)
12799 return SDValue();
12800 }
12801 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12802 if (Const->getZExtValue() != 31)
12803 return SDValue();
12804 } else
12805 return SDValue();
12806
12807 if (SRA.getOperand(0) != Mul)
12808 return SDValue();
12809
12810 SelectionDAG &DAG = DCI.DAG;
12811 SDLoc dl(AddcNode);
12812 unsigned Opcode = 0;
12813 SDValue Op0;
12814 SDValue Op1;
12815
12816 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12817 Opcode = ARMISD::SMLALBB;
12818 Op0 = Mul.getOperand(0);
12819 Op1 = Mul.getOperand(1);
12820 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12821 Opcode = ARMISD::SMLALBT;
12822 Op0 = Mul.getOperand(0);
12823 Op1 = Mul.getOperand(1).getOperand(0);
12824 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12825 Opcode = ARMISD::SMLALTB;
12826 Op0 = Mul.getOperand(0).getOperand(0);
12827 Op1 = Mul.getOperand(1);
12828 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12829 Opcode = ARMISD::SMLALTT;
12830 Op0 = Mul->getOperand(0).getOperand(0);
12831 Op1 = Mul->getOperand(1).getOperand(0);
12832 }
12833
12834 if (!Op0 || !Op1)
12835 return SDValue();
12836
12837 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12838 Op0, Op1, Lo, Hi);
12839 // Replace the ADD nodes' uses with the SMLAL node's values.
12840 SDValue HiMLALResult(SMLAL.getNode(), 1);
12841 SDValue LoMLALResult(SMLAL.getNode(), 0);
12842
12843 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12844 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12845
12846 // Return original node to notify the driver to stop replacing.
12847 SDValue resNode(AddcNode, 0);
12848 return resNode;
12849}
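// As a rough illustration, a source-level accumulation such as
//   int64_t acc; int16_t a, b;  acc += (int32_t)a * b;
// legalizes so that the low add consumes (mul a, b) and the high add consumes
// (sra (mul a, b), 31); the combine above folds the whole sequence into a
// single SMLALBB node that updates both halves of acc.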
12850
12851static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
12852 TargetLowering::DAGCombinerInfo &DCI,
12853 const ARMSubtarget *Subtarget) {
12854 // Look for multiply add opportunities.
12855 // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
12856 // each add node consumes a value from ISD::UMUL_LOHI and there is
12857 // a glue link from the first add to the second add.
12858 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12859 // a S/UMLAL instruction.
12860 // UMUL_LOHI
12861 // / :lo \ :hi
12862 // V \ [no multiline comment]
12863 // loAdd -> ADDC |
12864 // \ :carry /
12865 // V V
12866 // ADDE <- hiAdd
12867 //
12868 // In the special case where only the higher part of a signed result is used
12869 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12870 // a constant with the exact value of 0x80000000, we recognize we are dealing
12871 // with a "rounded multiply and add" (or subtract) and transform it into
12872 // either an ARMISD::SMMLAR or an ARMISD::SMMLSR, respectively.
12873
12874 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12875 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12876 "Expect an ADDE or SUBE");
12877
12878 assert(AddeSubeNode->getNumOperands() == 3 &&
12879 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12880 "ADDE node has the wrong inputs");
12881
12882 // Check that we are chained to the right ADDC or SUBC node.
12883 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12884 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12885 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12886 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12887 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12888 return SDValue();
12889
12890 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12891 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12892
12893 // Check if the two operands are from the same mul_lohi node.
12894 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12895 return SDValue();
12896
12897 assert(AddcSubcNode->getNumValues() == 2 &&
12898 AddcSubcNode->getValueType(0) == MVT::i32 &&
12899 "Expect ADDC with two result values. First: i32");
12900
12901 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12902 // may be an SMLAL which multiplies two 16-bit values.
12903 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12904 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12905 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12906 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12907 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12908 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12909
12910 // Check for the triangle shape.
12911 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12912 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12913
12914 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12915 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12916 return SDValue();
12917
12918 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12919 bool IsLeftOperandMUL = false;
12920 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12921 if (MULOp == SDValue())
12922 MULOp = findMUL_LOHI(AddeSubeOp1);
12923 else
12924 IsLeftOperandMUL = true;
12925 if (MULOp == SDValue())
12926 return SDValue();
12927
12928 // Figure out the right opcode.
12929 unsigned Opc = MULOp->getOpcode();
12930 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12931
12932 // Figure out the high and low input values to the MLAL node.
12933 SDValue *HiAddSub = nullptr;
12934 SDValue *LoMul = nullptr;
12935 SDValue *LowAddSub = nullptr;
12936
12937 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12938 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
12939 return SDValue();
12940
12941 if (IsLeftOperandMUL)
12942 HiAddSub = &AddeSubeOp1;
12943 else
12944 HiAddSub = &AddeSubeOp0;
12945
12946 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
12947 // whose low result is fed to the ADDC/SUBC we are checking.
12948
12949 if (AddcSubcOp0 == MULOp.getValue(0)) {
12950 LoMul = &AddcSubcOp0;
12951 LowAddSub = &AddcSubcOp1;
12952 }
12953 if (AddcSubcOp1 == MULOp.getValue(0)) {
12954 LoMul = &AddcSubcOp1;
12955 LowAddSub = &AddcSubcOp0;
12956 }
12957
12958 if (!LoMul)
12959 return SDValue();
12960
12961 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
12962 // the replacement below will create a cycle.
12963 if (AddcSubcNode == HiAddSub->getNode() ||
12964 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12965 return SDValue();
12966
12967 // Create the merged node.
12968 SelectionDAG &DAG = DCI.DAG;
12969
12970 // Start building operand list.
12971 SmallVector<SDValue, 8> Ops;
12972 Ops.push_back(LoMul->getOperand(0));
12973 Ops.push_back(LoMul->getOperand(1));
12974
12975 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12976 // the case, we must be doing signed multiplication and only use the higher
12977 // part of the result of the MLAL; furthermore, the LowAddSub must be a
12978 // constant addition or subtraction with the value 0x80000000.
12979 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12980 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12981 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12982 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12983 0x80000000) {
12984 Ops.push_back(*HiAddSub);
12985 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12986 FinalOpc = ARMISD::SMMLSR;
12987 } else {
12988 FinalOpc = ARMISD::SMMLAR;
12989 }
12990 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12991 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
12992
12993 return SDValue(AddeSubeNode, 0);
12994 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12995 // SMMLS is generated during instruction selection and the rest of this
12996 // function can not handle the case where AddcSubcNode is a SUBC.
12997 return SDValue();
12998
12999 // Finish building the operand list for {U/S}MLAL
13000 Ops.push_back(*LowAddSub);
13001 Ops.push_back(*HiAddSub);
13002
13003 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
13004 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13005
13006 // Replace the ADD nodes' uses with the MLAL node's values.
13007 SDValue HiMLALResult(MLALNode.getNode(), 1);
13008 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
13009
13010 SDValue LoMLALResult(MLALNode.getNode(), 0);
13011 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
13012
13013 // Return original node to notify the driver to stop replacing.
13014 return SDValue(AddeSubeNode, 0);
13015}
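// As a rough illustration of the SMMLAR special case above: when only the high
// half of the 64-bit result is used and the low half adds the rounding
// constant,
//   (ADDE (ADDC lo(SMUL_LOHI a, b), 0x80000000), hi(SMUL_LOHI a, b), c)
// becomes (SMMLAR a, b, c), i.e. c + (((int64_t)a * b + 0x80000000) >> 32).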
13016
13017static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
13018 TargetLowering::DAGCombinerInfo &DCI,
13019 const ARMSubtarget *Subtarget) {
13020 // UMAAL is similar to UMLAL except that it adds two unsigned values.
13021 // While trying to combine for the other MLAL nodes, first search for the
13022 // chance to use UMAAL. Check if Addc uses a node which has already
13023 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
13024 // as the addend, and it's handled in PerformUMLALCombine.
13025
13026 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13027 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13028
13029 // Check that we have a glued ADDC node.
13030 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
13031 if (AddcNode->getOpcode() != ARMISD::ADDC)
13032 return SDValue();
13033
13034 // Find the converted UMAAL or quit if it doesn't exist.
13035 SDNode *UmlalNode = nullptr;
13036 SDValue AddHi;
13037 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
13038 UmlalNode = AddcNode->getOperand(0).getNode();
13039 AddHi = AddcNode->getOperand(1);
13040 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
13041 UmlalNode = AddcNode->getOperand(1).getNode();
13042 AddHi = AddcNode->getOperand(0);
13043 } else {
13044 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13045 }
13046
13047 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
13048 // the ADDC as well as Zero.
13049 if (!isNullConstant(UmlalNode->getOperand(3)))
13050 return SDValue();
13051
13052 if ((isNullConstant(AddeNode->getOperand(0)) &&
13053 AddeNode->getOperand(1).getNode() == UmlalNode) ||
13054 (AddeNode->getOperand(0).getNode() == UmlalNode &&
13055 isNullConstant(AddeNode->getOperand(1)))) {
13056 SelectionDAG &DAG = DCI.DAG;
13057 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
13058 UmlalNode->getOperand(2), AddHi };
13059 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
13060 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13061
13062 // Replace the ADD nodes' uses with the UMAAL node's values.
13063 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
13064 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
13065
13066 // Return original node to notify the driver to stop replacing.
13067 return SDValue(AddeNode, 0);
13068 }
13069 return SDValue();
13070}
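// As a rough illustration, ARMISD::UMAAL produces the 64-bit value
// a * b + lo + hi from four 32-bit inputs, so a source pattern like
//   uint64_t r = (uint64_t)a * b + lo + hi;   // a, b, lo, hi are uint32_t
// can end up as a single UMAAL once the UMLAL feeding the ADDC/ADDE pair has
// been matched by the combine above.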
13071
13072static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
13073 const ARMSubtarget *Subtarget) {
13074 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13075 return SDValue();
13076
13077 // Check that we have a pair of ADDC and ADDE as operands.
13078 // Both addends of the ADDE must be zero.
13079 SDNode* AddcNode = N->getOperand(2).getNode();
13080 SDNode* AddeNode = N->getOperand(3).getNode();
13081 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
13082 (AddeNode->getOpcode() == ARMISD::ADDE) &&
13083 isNullConstant(AddeNode->getOperand(0)) &&
13084 isNullConstant(AddeNode->getOperand(1)) &&
13085 (AddeNode->getOperand(2).getNode() == AddcNode))
13086 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
13087 DAG.getVTList(MVT::i32, MVT::i32),
13088 {N->getOperand(0), N->getOperand(1),
13089 AddcNode->getOperand(0), AddcNode->getOperand(1)});
13090 else
13091 return SDValue();
13092}
13093
13094static SDValue PerformAddcSubcCombine(SDNode *N,
13095 TargetLowering::DAGCombinerInfo &DCI,
13096 const ARMSubtarget *Subtarget) {
13097 SelectionDAG &DAG(DCI.DAG);
13098
13099 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
13100 // (SUBC (ADDE 0, 0, C), 1) -> C
13101 SDValue LHS = N->getOperand(0);
13102 SDValue RHS = N->getOperand(1);
13103 if (LHS->getOpcode() == ARMISD::ADDE &&
13104 isNullConstant(LHS->getOperand(0)) &&
13105 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
13106 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
13107 }
13108 }
13109
13110 if (Subtarget->isThumb1Only()) {
13111 SDValue RHS = N->getOperand(1);
13112 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
13113 int32_t imm = C->getSExtValue();
13114 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
13115 SDLoc DL(N);
13116 RHS = DAG.getConstant(-imm, DL, MVT::i32);
13117 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
13118 : ARMISD::ADDC;
13119 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
13120 }
13121 }
13122 }
13123
13124 return SDValue();
13125}
13126
13127static SDValue PerformAddeSubeCombine(SDNode *N,
13128 TargetLowering::DAGCombinerInfo &DCI,
13129 const ARMSubtarget *Subtarget) {
13130 if (Subtarget->isThumb1Only()) {
13131 SelectionDAG &DAG = DCI.DAG;
13132 SDValue RHS = N->getOperand(1);
13133 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
13134 int64_t imm = C->getSExtValue();
13135 if (imm < 0) {
13136 SDLoc DL(N);
13137
13138 // The with-carry-in form matches bitwise not instead of the negation.
13139 // Effectively, the inverse interpretation of the carry flag already
13140 // accounts for part of the negation.
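 // For example, assuming the usual AArch32 carry-as-inverted-borrow
 // convention: with imm == -5, ~imm == 4 and
 //   ADDE(x, -5, carry) == x - 5 + carry == x - 4 - (1 - carry)
 //                      == SUBE(x, 4, carry).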
13141 RHS = DAG.getConstant(~imm, DL, MVT::i32);
13142
13143 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
13144 : ARMISD::ADDE;
13145 return DAG.getNode(Opcode, DL, N->getVTList(),
13146 N->getOperand(0), RHS, N->getOperand(2));
13147 }
13148 }
13149 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
13150 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
13151 }
13152 return SDValue();
13153}
13154
13155static SDValue PerformSELECTCombine(SDNode *N,
13156 TargetLowering::DAGCombinerInfo &DCI,
13157 const ARMSubtarget *Subtarget) {
13158 if (!Subtarget->hasMVEIntegerOps())
13159 return SDValue();
13160
13161 SDLoc dl(N);
13162 SDValue SetCC;
13163 SDValue LHS;
13164 SDValue RHS;
13165 ISD::CondCode CC;
13166 SDValue TrueVal;
13167 SDValue FalseVal;
13168
13169 if (N->getOpcode() == ISD::SELECT &&
13170 N->getOperand(0)->getOpcode() == ISD::SETCC) {
13171 SetCC = N->getOperand(0);
13172 LHS = SetCC->getOperand(0);
13173 RHS = SetCC->getOperand(1);
13174 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
13175 TrueVal = N->getOperand(1);
13176 FalseVal = N->getOperand(2);
13177 } else if (N->getOpcode() == ISD::SELECT_CC) {
13178 LHS = N->getOperand(0);
13179 RHS = N->getOperand(1);
13180 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
13181 TrueVal = N->getOperand(2);
13182 FalseVal = N->getOperand(3);
13183 } else {
13184 return SDValue();
13185 }
13186
13187 unsigned int Opcode = 0;
13188 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
13189 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
13190 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
13191 Opcode = ARMISD::VMINVu;
13192 if (CC == ISD::SETUGT)
13193 std::swap(TrueVal, FalseVal);
13194 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
13195 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
13196 (CC == ISD::SETLT || CC == ISD::SETGT)) {
13197 Opcode = ARMISD::VMINVs;
13198 if (CC == ISD::SETGT)
13199 std::swap(TrueVal, FalseVal);
13200 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
13201 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13202 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13203 Opcode = ARMISD::VMAXVu;
13204 if (CC == ISD::SETULT)
13205 std::swap(TrueVal, FalseVal);
13206 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13207 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13208 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13209 Opcode = ARMISD::VMAXVs;
13210 if (CC == ISD::SETLT)
13211 std::swap(TrueVal, FalseVal);
13212 } else
13213 return SDValue();
13214
13215 // Normalise to the right hand side being the vector reduction
13216 switch (TrueVal->getOpcode()) {
13217 case ISD::VECREDUCE_UMIN:
13218 case ISD::VECREDUCE_SMIN:
13219 case ISD::VECREDUCE_UMAX:
13220 case ISD::VECREDUCE_SMAX:
13221 std::swap(LHS, RHS);
13222 std::swap(TrueVal, FalseVal);
13223 break;
13224 }
13225
13226 EVT VectorType = FalseVal->getOperand(0).getValueType();
13227
13228 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13229 VectorType != MVT::v4i32)
13230 return SDValue();
13231
13232 EVT VectorScalarType = VectorType.getVectorElementType();
13233
13234 // The values being selected must also be the ones being compared
13235 if (TrueVal != LHS || FalseVal != RHS)
13236 return SDValue();
13237
13238 EVT LeftType = LHS->getValueType(0);
13239 EVT RightType = RHS->getValueType(0);
13240
13241 // The types must match the reduced type too
13242 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13243 return SDValue();
13244
13245 // Legalise the scalar to an i32
13246 if (VectorScalarType != MVT::i32)
13247 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13248
13249 // Generate the reduction as an i32 for legalisation purposes
13250 auto Reduction =
13251 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13252
13253 // The result isn't actually an i32 so truncate it back to its original type
13254 if (VectorScalarType != MVT::i32)
13255 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13256
13257 return Reduction;
13258}
13259
13260// A special combine for the vqdmulh family of instructions. This is one of a
13261// set of patterns that could match this instruction. The base pattern you
13262// would expect is min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13263// This matches the variant min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13264// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15)) as
13265// the max is unnecessary.
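// As a rough numeric check for the i16 case: vqdmulh.s16 returns
// sat((2 * x * y) >> 16) == sat((x * y) >> 15). For x = y = 16384 this is
// (16384 * 16384) >> 15 = 8192; for the single overflowing input
// x = y = -32768 the raw value 32768 is clamped to 32767, which is exactly
// what the smin against (1 << 15) - 1 in the matched pattern provides.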
13266static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
13267 EVT VT = N->getValueType(0);
13268 SDValue Shft;
13269 ConstantSDNode *Clamp;
13270
13271 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13272 return SDValue();
13273
13274 if (N->getOpcode() == ISD::SMIN) {
13275 Shft = N->getOperand(0);
13276 Clamp = isConstOrConstSplat(N->getOperand(1));
13277 } else if (N->getOpcode() == ISD::VSELECT) {
13278 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13279 SDValue Cmp = N->getOperand(0);
13280 if (Cmp.getOpcode() != ISD::SETCC ||
13281 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13282 Cmp.getOperand(0) != N->getOperand(1) ||
13283 Cmp.getOperand(1) != N->getOperand(2))
13284 return SDValue();
13285 Shft = N->getOperand(1);
13286 Clamp = isConstOrConstSplat(N->getOperand(2));
13287 } else
13288 return SDValue();
13289
13290 if (!Clamp)
13291 return SDValue();
13292
13293 MVT ScalarType;
13294 int ShftAmt = 0;
13295 switch (Clamp->getSExtValue()) {
13296 case (1 << 7) - 1:
13297 ScalarType = MVT::i8;
13298 ShftAmt = 7;
13299 break;
13300 case (1 << 15) - 1:
13301 ScalarType = MVT::i16;
13302 ShftAmt = 15;
13303 break;
13304 case (1ULL << 31) - 1:
13305 ScalarType = MVT::i32;
13306 ShftAmt = 31;
13307 break;
13308 default:
13309 return SDValue();
13310 }
13311
13312 if (Shft.getOpcode() != ISD::SRA)
13313 return SDValue();
13314 ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
13315 if (!N1 || N1->getSExtValue() != ShftAmt)
13316 return SDValue();
13317
13318 SDValue Mul = Shft.getOperand(0);
13319 if (Mul.getOpcode() != ISD::MUL)
13320 return SDValue();
13321
13322 SDValue Ext0 = Mul.getOperand(0);
13323 SDValue Ext1 = Mul.getOperand(1);
13324 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13325 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13326 return SDValue();
13327 EVT VecVT = Ext0.getOperand(0).getValueType();
13328 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13329 return SDValue();
13330 if (Ext1.getOperand(0).getValueType() != VecVT ||
13331 VecVT.getScalarType() != ScalarType ||
13332 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13333 return SDValue();
13334
13335 SDLoc DL(Mul);
13336 unsigned LegalLanes = 128 / (ShftAmt + 1);
13337 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13338 // For types smaller than a legal vector, extend to be legal and only use the
13339 // needed lanes.
13340 if (VecVT.getSizeInBits() < 128) {
13341 EVT ExtVecVT =
13342 MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()),
13343 VecVT.getVectorNumElements());
13344 SDValue Inp0 =
13345 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13346 SDValue Inp1 =
13347 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13348 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13349 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13350 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13351 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13352 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13353 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13354 }
13355
13356 // For larger types, split into legal sized chunks.
13357 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13358 unsigned NumParts = VecVT.getSizeInBits() / 128;
13359 SmallVector<SDValue> Parts;
13360 for (unsigned I = 0; I < NumParts; ++I) {
13361 SDValue Inp0 =
13362 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13363 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13364 SDValue Inp1 =
13365 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13366 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13367 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13368 Parts.push_back(VQDMULH);
13369 }
13370 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13371 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13372}
13373
13374static SDValue PerformVSELECTCombine(SDNode *N,
13375 TargetLowering::DAGCombinerInfo &DCI,
13376 const ARMSubtarget *Subtarget) {
13377 if (!Subtarget->hasMVEIntegerOps())
13378 return SDValue();
13379
13380 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13381 return V;
13382
13383 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13384 //
13385 // We need to re-implement this optimization here as the implementation in the
13386 // Target-Independent DAGCombiner does not handle the kind of constant we make
13387 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13388 // good reason, allowing truncation there would break other targets).
13389 //
13390 // Currently, this is only done for MVE, as it's the only target that benefits
13391 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
13392 if (N->getOperand(0).getOpcode() != ISD::XOR)
13393 return SDValue();
13394 SDValue XOR = N->getOperand(0);
13395
13396 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13397 // It is important to check with truncation allowed as the BUILD_VECTORs we
13398 // generate in those situations will truncate their operands.
13399 ConstantSDNode *Const =
13400 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13401 /*AllowTruncation*/ true);
13402 if (!Const || !Const->isOne())
13403 return SDValue();
13404
13405 // Rewrite into vselect(cond, rhs, lhs).
13406 SDValue Cond = XOR->getOperand(0);
13407 SDValue LHS = N->getOperand(1);
13408 SDValue RHS = N->getOperand(2);
13409 EVT Type = N->getValueType(0);
13410 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13411}
13412
13413// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
13414static SDValue PerformVSetCCToVCTPCombine(SDNode *N,
13415 TargetLowering::DAGCombinerInfo &DCI,
13416 const ARMSubtarget *Subtarget) {
13417 SDValue Op0 = N->getOperand(0);
13418 SDValue Op1 = N->getOperand(1);
13419 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13420 EVT VT = N->getValueType(0);
13421
13422 if (!Subtarget->hasMVEIntegerOps() ||
13423 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
13424 return SDValue();
13425
13426 if (CC == ISD::SETUGE) {
13427 std::swap(Op0, Op1);
13428 CC = ISD::SETULT;
13429 }
13430
13431 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13432 Op0.getOpcode() != ISD::BUILD_VECTOR)
13433 return SDValue();
13434
13435 // Check first operand is BuildVector of 0,1,2,...
13436 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13437 if (!Op0.getOperand(I).isUndef() &&
13438 !(isa<ConstantSDNode>(Op0.getOperand(I)) &&
13439 Op0.getConstantOperandVal(I) == I))
13440 return SDValue();
13441 }
13442
13443 // The second is a Splat of Op1S
13444 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13445 if (!Op1S)
13446 return SDValue();
13447
13448 unsigned Opc;
13449 switch (VT.getVectorNumElements()) {
13450 case 2:
13451 Opc = Intrinsic::arm_mve_vctp64;
13452 break;
13453 case 4:
13454 Opc = Intrinsic::arm_mve_vctp32;
13455 break;
13456 case 8:
13457 Opc = Intrinsic::arm_mve_vctp16;
13458 break;
13459 case 16:
13460 Opc = Intrinsic::arm_mve_vctp8;
13461 break;
13462 default:
13463 return SDValue();
13464 }
13465
13466 SDLoc DL(N);
13467 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13468 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13469 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13470}
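// As a rough illustration for v4i1: the matched setcc compares the build
// vector <0,1,2,3> against splat(n) with ULT, so lane i is active exactly when
// i < n. That is the behaviour of VCTP32; for example n == 3 gives <1,1,1,0>.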
13471
13472/// PerformADDECombine - Target-specific dag combine transform from
13473/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13474/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
13475static SDValue PerformADDECombine(SDNode *N,
13476 TargetLowering::DAGCombinerInfo &DCI,
13477 const ARMSubtarget *Subtarget) {
13478 // Only ARM and Thumb2 support UMLAL/SMLAL.
13479 if (Subtarget->isThumb1Only())
13480 return PerformAddeSubeCombine(N, DCI, Subtarget);
13481
13482 // Only perform the checks after legalize when the pattern is available.
13483 if (DCI.isBeforeLegalize()) return SDValue();
13484
13485 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13486}
13487
13488/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13489/// operands N0 and N1. This is a helper for PerformADDCombine that is
13490/// called with the default operands, and if that fails, with commuted
13491/// operands.
13492static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
13493 TargetLowering::DAGCombinerInfo &DCI,
13494 const ARMSubtarget *Subtarget){
13495 // Attempt to create vpadd for this add.
13496 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13497 return Result;
13498
13499 // Attempt to create vpaddl for this add.
13500 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13501 return Result;
13502 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13503 Subtarget))
13504 return Result;
13505
13506 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
13507 if (N0.getNode()->hasOneUse())
13508 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13509 return Result;
13510 return SDValue();
13511}
13512
13513static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
13514 EVT VT = N->getValueType(0);
13515 SDValue N0 = N->getOperand(0);
13516 SDValue N1 = N->getOperand(1);
13517 SDLoc dl(N);
13518
13519 auto IsVecReduce = [](SDValue Op) {
13520 switch (Op.getOpcode()) {
13521 case ISD::VECREDUCE_ADD:
13522 case ARMISD::VADDVs:
13523 case ARMISD::VADDVu:
13524 case ARMISD::VMLAVs:
13525 case ARMISD::VMLAVu:
13526 return true;
13527 }
13528 return false;
13529 };
13530
13531 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13532 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13533 // add(add(X, vecreduce(Y)), vecreduce(Z))
13534 // to make better use of vaddva style instructions.
13535 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13536 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13537 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13538 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13539 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13540 }
13541 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13542 // add(add(add(A, C), reduce(B)), reduce(D))
13543 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13544 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
13545 unsigned N0RedOp = 0;
13546 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13547 N0RedOp = 1;
13548 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13549 return SDValue();
13550 }
13551
13552 unsigned N1RedOp = 0;
13553 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13554 N1RedOp = 1;
13555 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13556 return SDValue();
13557
13558 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13559 N1.getOperand(1 - N1RedOp));
13560 SDValue Add1 =
13561 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13562 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13563 }
13564 return SDValue();
13565 };
13566 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13567 return R;
13568 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13569 return R;
13570
13571 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13572 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13573 // by ascending load offsets. This can help cores prefetch if the order of
13574 // loads is more predictable.
13575 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13576 // Check if two reductions are known to load data where one is before/after
13577 // another. Return negative if N0 loads data before N1, positive if N1 is
13578 // before N0, and 0 if nothing is known.
13579 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13580 // Look through to the first operand of a MUL, for the VMLA case.
13581 // Currently only looks at the first operand, in the hope they are equal.
13582 if (N0.getOpcode() == ISD::MUL)
13583 N0 = N0.getOperand(0);
13584 if (N1.getOpcode() == ISD::MUL)
13585 N1 = N1.getOperand(0);
13586
13587 // Return true if the two operands are loads to the same object and the
13588 // offset of the first is known to be less than the offset of the second.
13589 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13590 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13591 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13592 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13593 Load1->isIndexed())
13594 return 0;
13595
13596 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13597 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13598
13599 if (!BaseLocDecomp0.getBase() ||
13600 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13601 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13602 return 0;
13603 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13604 return -1;
13605 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13606 return 1;
13607 return 0;
13608 };
13609
13610 SDValue X;
13611 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13612 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
13613 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13614 N0.getOperand(1).getOperand(0));
13615 if (IsBefore < 0) {
13616 X = N0.getOperand(0);
13617 N0 = N0.getOperand(1);
13618 } else if (IsBefore > 0) {
13619 X = N0.getOperand(1);
13620 N0 = N0.getOperand(0);
13621 } else
13622 return SDValue();
13623 } else if (IsVecReduce(N0.getOperand(0))) {
13624 X = N0.getOperand(1);
13625 N0 = N0.getOperand(0);
13626 } else if (IsVecReduce(N0.getOperand(1))) {
13627 X = N0.getOperand(0);
13628 N0 = N0.getOperand(1);
13629 } else
13630 return SDValue();
13631 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13632 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13633 // Note this is backward to how you would expect. We create
13634 // add(reduce(load + 16), reduce(load + 0)) so that the
13635 // add(reduce(load+16), X) is combined into VADDVA(X, load+16)), leaving
13636 // the X as VADDV(load + 0)
13637 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13638 } else
13639 return SDValue();
13640
13641 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13642 return SDValue();
13643
13644 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13645 return SDValue();
13646
13647 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13648 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13649 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13650 };
13651 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13652 return R;
13653 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13654 return R;
13655 return SDValue();
13656}
13657
13658static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
13659 const ARMSubtarget *Subtarget) {
13660 if (!Subtarget->hasMVEIntegerOps())
13661 return SDValue();
13662
13663 if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
13664 return R;
13665
13666 EVT VT = N->getValueType(0);
13667 SDValue N0 = N->getOperand(0);
13668 SDValue N1 = N->getOperand(1);
13669 SDLoc dl(N);
13670
13671 if (VT != MVT::i64)
13672 return SDValue();
13673
13674 // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
13675 // will look like:
13676 // t1: i32,i32 = ARMISD::VADDLVs x
13677 // t2: i64 = build_pair t1, t1:1
13678 // t3: i64 = add t2, y
13679 // Otherwise we try to push the add up above VADDLVAx, to potentially allow
13680 // the add to be simplified separately.
13681 // We also need to check for sext / zext and commutative adds.
13682 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13683 SDValue NB) {
13684 if (NB->getOpcode() != ISD::BUILD_PAIR)
13685 return SDValue();
13686 SDValue VecRed = NB->getOperand(0);
13687 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13688 VecRed.getResNo() != 0 ||
13689 NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
13690 return SDValue();
13691
13692 if (VecRed->getOpcode() == OpcodeA) {
13693 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
13694 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13695 VecRed.getOperand(0), VecRed.getOperand(1));
13696 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13697 }
13698
13700 std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);
13701
13702 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13703 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13704 Ops.push_back(VecRed->getOperand(I));
13705 SDValue Red =
13706 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13707 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13708 SDValue(Red.getNode(), 1));
13709 };
13710
13711 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13712 return M;
13713 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13714 return M;
13715 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13716 return M;
13717 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13718 return M;
13719 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13720 return M;
13721 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13722 return M;
13723 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13724 return M;
13725 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13726 return M;
13727 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13728 return M;
13729 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13730 return M;
13731 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13732 return M;
13733 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13734 return M;
13735 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13736 return M;
13737 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13738 return M;
13739 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13740 return M;
13741 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13742 return M;
13743 return SDValue();
13744}
13745
13746bool
13747ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
13748 CombineLevel Level) const {
13749 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13750 N->getOpcode() == ISD::SRL) &&
13751 "Expected shift op");
13752
13753 SDValue ShiftLHS = N->getOperand(0);
13754 if (!ShiftLHS->hasOneUse())
13755 return false;
13756
13757 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
13758 !ShiftLHS.getOperand(0)->hasOneUse())
13759 return false;
13760
13761 if (Level == BeforeLegalizeTypes)
13762 return true;
13763
13764 if (N->getOpcode() != ISD::SHL)
13765 return true;
13766
13767 if (Subtarget->isThumb1Only()) {
13768 // Avoid making expensive immediates by commuting shifts. (This logic
13769 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13770 // for free.)
13771 if (N->getOpcode() != ISD::SHL)
13772 return true;
13773 SDValue N1 = N->getOperand(0);
13774 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13775 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13776 return true;
13777 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
13778 if (Const->getAPIntValue().ult(256))
13779 return false;
13780 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13781 Const->getAPIntValue().sgt(-256))
13782 return false;
13783 }
13784 return true;
13785 }
13786
13787 // Turn off commute-with-shift transform after legalization, so it doesn't
13788 // conflict with PerformSHLSimplify. (We could try to detect when
13789 // PerformSHLSimplify would trigger more precisely, but it isn't
13790 // really necessary.)
13791 return false;
13792}
13793
13794bool ARMTargetLowering::isDesirableToCommuteXorWithShift(
13795 const SDNode *N) const {
13796 assert(N->getOpcode() == ISD::XOR &&
13797 (N->getOperand(0).getOpcode() == ISD::SHL ||
13798 N->getOperand(0).getOpcode() == ISD::SRL) &&
13799 "Expected XOR(SHIFT) pattern");
13800
13801 // Only commute if the entire NOT mask is a hidden shifted mask.
13802 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13803 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13804 if (XorC && ShiftC) {
13805 unsigned MaskIdx, MaskLen;
13806 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13807 unsigned ShiftAmt = ShiftC->getZExtValue();
13808 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
13809 if (N->getOperand(0).getOpcode() == ISD::SHL)
13810 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
13811 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13812 }
13813 }
13814
13815 return false;
13816}
13817
13818bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
13819 const SDNode *N) const {
13820 assert(((N->getOpcode() == ISD::SHL &&
13821 N->getOperand(0).getOpcode() == ISD::SRL) ||
13822 (N->getOpcode() == ISD::SRL &&
13823 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13824 "Expected shift-shift mask");
13825
13826 if (!Subtarget->isThumb1Only())
13827 return true;
13828
13829 EVT VT = N->getValueType(0);
13830 if (VT.getScalarSizeInBits() > 32)
13831 return true;
13832
13833 return false;
13834}
13835
13836bool ARMTargetLowering::shouldFoldSelectWithIdentityConstant(
13837 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
13838 SDValue Y) const {
13839 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT) &&
13840 SelectOpcode == ISD::VSELECT;
13841}
13842
13843bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
13844 if (!Subtarget->hasNEON()) {
13845 if (Subtarget->isThumb1Only())
13846 return VT.getScalarSizeInBits() <= 32;
13847 return true;
13848 }
13849 return VT.isScalarInteger();
13850}
13851
13852bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
13853 EVT VT) const {
13854 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13855 return false;
13856
13857 switch (FPVT.getSimpleVT().SimpleTy) {
13858 case MVT::f16:
13859 return Subtarget->hasVFP2Base();
13860 case MVT::f32:
13861 return Subtarget->hasVFP2Base();
13862 case MVT::f64:
13863 return Subtarget->hasFP64();
13864 case MVT::v4f32:
13865 case MVT::v8f16:
13866 return Subtarget->hasMVEFloatOps();
13867 default:
13868 return false;
13869 }
13870}
13871
13872static SDValue PerformSHLSimplify(SDNode *N,
13873 TargetLowering::DAGCombinerInfo &DCI,
13874 const ARMSubtarget *ST) {
13875 // Allow the generic combiner to identify potential bswaps.
13876 if (DCI.isBeforeLegalize())
13877 return SDValue();
13878
13879 // DAG combiner will fold:
13880 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13881 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
13882 // Other code patterns that can also be modified have the following form:
13883 // b + ((a << 1) | 510)
13884 // b + ((a << 1) & 510)
13885 // b + ((a << 1) ^ 510)
13886 // b + ((a << 1) + 510)
13887
13888 // Many instructions can perform the shift for free, but it requires both
13889 // the operands to be registers. If c1 << c2 is too large, a mov immediate
13890 // instruction will be needed. So, unfold back to the original pattern if:
13891 // - c1 and c2 are small enough that they don't require mov imms.
13892 // - the user(s) of the node can perform a shl
13893
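 // As a rough worked example: for N = (add (shl x, 1), 510), c1 == 510 is not
 // an encodable modified immediate, but c1 >> c2 == 255 is, so N is unfolded to
 // (shl (add x, 255), 1); an outer user such as (add b, N) can then fold the
 // shl into its shifted-register operand instead of materializing 510.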
13894 // No shifted operands for 16-bit instructions.
13895 if (ST->isThumb() && ST->isThumb1Only())
13896 return SDValue();
13897
13898 // Check that all the users could perform the shl themselves.
13899 for (auto *U : N->users()) {
13900 switch(U->getOpcode()) {
13901 default:
13902 return SDValue();
13903 case ISD::SUB:
13904 case ISD::ADD:
13905 case ISD::AND:
13906 case ISD::OR:
13907 case ISD::XOR:
13908 case ISD::SETCC:
13909 case ARMISD::CMP:
13910 // Check that the user isn't already using a constant because there
13911 // aren't any instructions that support an immediate operand and a
13912 // shifted operand.
13913 if (isa<ConstantSDNode>(U->getOperand(0)) ||
13914 isa<ConstantSDNode>(U->getOperand(1)))
13915 return SDValue();
13916
13917 // Check that it's not already using a shift.
13918 if (U->getOperand(0).getOpcode() == ISD::SHL ||
13919 U->getOperand(1).getOpcode() == ISD::SHL)
13920 return SDValue();
13921 break;
13922 }
13923 }
13924
13925 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13926 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13927 return SDValue();
13928
13929 if (N->getOperand(0).getOpcode() != ISD::SHL)
13930 return SDValue();
13931
13932 SDValue SHL = N->getOperand(0);
13933
13934 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
13935 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
13936 if (!C1ShlC2 || !C2)
13937 return SDValue();
13938
13939 APInt C2Int = C2->getAPIntValue();
13940 APInt C1Int = C1ShlC2->getAPIntValue();
13941 unsigned C2Width = C2Int.getBitWidth();
13942 if (C2Int.uge(C2Width))
13943 return SDValue();
13944 uint64_t C2Value = C2Int.getZExtValue();
13945
13946 // Check that performing a lshr will not lose any information.
13947 APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
13948 if ((C1Int & Mask) != C1Int)
13949 return SDValue();
13950
13951 // Shift the first constant.
13952 C1Int.lshrInPlace(C2Int);
13953
13954 // The immediates are encoded as an 8-bit value that can be rotated.
13955 auto LargeImm = [](const APInt &Imm) {
13956 unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
13957 return Imm.getBitWidth() - Zeros > 8;
13958 };
13959
13960 if (LargeImm(C1Int) || LargeImm(C2Int))
13961 return SDValue();
13962
13963 SelectionDAG &DAG = DCI.DAG;
13964 SDLoc dl(N);
13965 SDValue X = SHL.getOperand(0);
13966 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
13967 DAG.getConstant(C1Int, dl, MVT::i32));
13968 // Shift left to compensate for the lshr of C1Int.
13969 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
13970
13971 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
13972 SHL.dump(); N->dump());
13973 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
13974 return Res;
13975}
13976
13977
13978/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
13979///
13980static SDValue PerformADDCombine(SDNode *N,
13981 TargetLowering::DAGCombinerInfo &DCI,
13982 const ARMSubtarget *Subtarget) {
13983 SDValue N0 = N->getOperand(0);
13984 SDValue N1 = N->getOperand(1);
13985
13986 // Only works one way, because it needs an immediate operand.
13987 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
13988 return Result;
13989
13990 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
13991 return Result;
13992
13993 // First try with the default operand order.
13994 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
13995 return Result;
13996
13997 // If that didn't work, try again with the operands commuted.
13998 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
13999}
14000
14001// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
14002// providing -X is as cheap as X (currently, just a constant).
14003static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
14004 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
14005 return SDValue();
14006 SDValue CSINC = N->getOperand(1);
14007 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
14008 return SDValue();
14009
14010 ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));
14011 if (!X)
14012 return SDValue();
14013
14014 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
14015 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
14016 CSINC.getOperand(0)),
14017 CSINC.getOperand(1), CSINC.getOperand(2),
14018 CSINC.getOperand(3));
14019}
14020
14021static bool isNegatedInteger(SDValue Op) {
14022 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
14023}
14024
14025// Try to fold
14026//
14027// (neg (cmov X, Y)) -> (cmov (neg X), (neg Y))
14028//
14029// The folding helps cmov to be matched with csneg without generating
14030// redundant neg instruction.
14031static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG) {
14032 if (!isNegatedInteger(SDValue(N, 0)))
14033 return SDValue();
14034
14035 SDValue CMov = N->getOperand(1);
14036 if (CMov.getOpcode() != ARMISD::CMOV || !CMov->hasOneUse())
14037 return SDValue();
14038
14039 SDValue N0 = CMov.getOperand(0);
14040 SDValue N1 = CMov.getOperand(1);
14041
14042 // If neither of them is a negation, the fold is not worthwhile, as it
14043 // introduces two additional negations while removing only one.
14044 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
14045 return SDValue();
14046
14047 SDLoc DL(N);
14048 EVT VT = CMov.getValueType();
14049
14050 SDValue N0N = DAG.getNegative(N0, DL, VT);
14051 SDValue N1N = DAG.getNegative(N1, DL, VT);
14052 return DAG.getNode(ARMISD::CMOV, DL, VT, N0N, N1N, CMov.getOperand(2),
14053 CMov.getOperand(3));
14054}
14055
14056/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
14057///
14058static SDValue PerformSUBCombine(SDNode *N,
14059 TargetLowering::DAGCombinerInfo &DCI,
14060 const ARMSubtarget *Subtarget) {
14061 SDValue N0 = N->getOperand(0);
14062 SDValue N1 = N->getOperand(1);
14063
14064 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
14065 if (N1.getNode()->hasOneUse())
14066 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
14067 return Result;
14068
14069 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
14070 return R;
14071
14072 if (SDValue Val = performNegCMovCombine(N, DCI.DAG))
14073 return Val;
14074
14075 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
14076 return SDValue();
14077
14078 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
14079 // so that we can readily pattern match more mve instructions which can use
14080 // a scalar operand.
14081 SDValue VDup = N->getOperand(1);
14082 if (VDup->getOpcode() != ARMISD::VDUP)
14083 return SDValue();
14084
14085 SDValue VMov = N->getOperand(0);
14086 if (VMov->getOpcode() == ISD::BITCAST)
14087 VMov = VMov->getOperand(0);
14088
14089 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
14090 return SDValue();
14091
14092 SDLoc dl(N);
14093 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
14094 DCI.DAG.getConstant(0, dl, MVT::i32),
14095 VDup->getOperand(0));
14096 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
14097}
14098
14099/// PerformVMULCombine
14100/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
14101/// special multiplier accumulator forwarding.
14102/// vmul d3, d0, d2
14103/// vmla d3, d1, d2
14104/// is faster than
14105/// vadd d3, d0, d1
14106/// vmul d3, d3, d2
14107// However, for (A + B) * (A + B),
14108// vadd d2, d0, d1
14109// vmul d3, d0, d2
14110// vmla d3, d1, d2
14111// is slower than
14112// vadd d2, d0, d1
14113// vmul d3, d2, d2
14114static SDValue PerformVMULCombine(SDNode *N,
14115 TargetLowering::DAGCombinerInfo &DCI,
14116 const ARMSubtarget *Subtarget) {
14117 if (!Subtarget->hasVMLxForwarding())
14118 return SDValue();
14119
14120 SelectionDAG &DAG = DCI.DAG;
14121 SDValue N0 = N->getOperand(0);
14122 SDValue N1 = N->getOperand(1);
14123 unsigned Opcode = N0.getOpcode();
14124 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14125 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
14126 Opcode = N1.getOpcode();
14127 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14128 Opcode != ISD::FADD && Opcode != ISD::FSUB)
14129 return SDValue();
14130 std::swap(N0, N1);
14131 }
14132
14133 if (N0 == N1)
14134 return SDValue();
14135
14136 EVT VT = N->getValueType(0);
14137 SDLoc DL(N);
14138 SDValue N00 = N0->getOperand(0);
14139 SDValue N01 = N0->getOperand(1);
14140 return DAG.getNode(Opcode, DL, VT,
14141 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
14142 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
14143}
14144
14145static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
14146 const ARMSubtarget *Subtarget) {
14147 EVT VT = N->getValueType(0);
14148 if (VT != MVT::v2i64)
14149 return SDValue();
14150
14151 SDValue N0 = N->getOperand(0);
14152 SDValue N1 = N->getOperand(1);
14153
14154 auto IsSignExt = [&](SDValue Op) {
14155 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
14156 return SDValue();
14157 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
14158 if (VT.getScalarSizeInBits() == 32)
14159 return Op->getOperand(0);
14160 return SDValue();
14161 };
14162 auto IsZeroExt = [&](SDValue Op) {
14163 // Zero extends are a little more awkward. At the point we are matching
14164 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
14165 // That might be before or after a bitcast depending on how the and is
14166 // placed. Because this has to look through bitcasts, it is currently only
14167 // supported on LE.
14168 if (!Subtarget->isLittle())
14169 return SDValue();
14170
14171 SDValue And = Op;
14172 if (And->getOpcode() == ISD::BITCAST)
14173 And = And->getOperand(0);
14174 if (And->getOpcode() != ISD::AND)
14175 return SDValue();
14176 SDValue Mask = And->getOperand(1);
14177 if (Mask->getOpcode() == ISD::BITCAST)
14178 Mask = Mask->getOperand(0);
14179
14180 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
14181 Mask.getValueType() != MVT::v4i32)
14182 return SDValue();
14183 if (isAllOnesConstant(Mask->getOperand(0)) &&
14184 isNullConstant(Mask->getOperand(1)) &&
14185 isAllOnesConstant(Mask->getOperand(2)) &&
14186 isNullConstant(Mask->getOperand(3)))
14187 return And->getOperand(0);
14188 return SDValue();
14189 };
14190
14191 SDLoc dl(N);
14192 if (SDValue Op0 = IsSignExt(N0)) {
14193 if (SDValue Op1 = IsSignExt(N1)) {
14194 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14195 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14196 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
14197 }
14198 }
14199 if (SDValue Op0 = IsZeroExt(N0)) {
14200 if (SDValue Op1 = IsZeroExt(N1)) {
14201 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14202 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14203 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
14204 }
14205 }
14206
14207 return SDValue();
14208}
14209
14210static SDValue PerformMULCombine(SDNode *N,
14211 TargetLowering::DAGCombinerInfo &DCI,
14212 const ARMSubtarget *Subtarget) {
14213 SelectionDAG &DAG = DCI.DAG;
14214
14215 EVT VT = N->getValueType(0);
14216 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
14217 return PerformMVEVMULLCombine(N, DAG, Subtarget);
14218
14219 if (Subtarget->isThumb1Only())
14220 return SDValue();
14221
14222 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14223 return SDValue();
14224
14225 if (VT.is64BitVector() || VT.is128BitVector())
14226 return PerformVMULCombine(N, DCI, Subtarget);
14227 if (VT != MVT::i32)
14228 return SDValue();
14229
14230 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14231 if (!C)
14232 return SDValue();
14233
14234 int64_t MulAmt = C->getSExtValue();
14235 unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);
14236
14237 ShiftAmt = ShiftAmt & (32 - 1);
14238 SDValue V = N->getOperand(0);
14239 SDLoc DL(N);
14240
14241 SDValue Res;
14242 MulAmt >>= ShiftAmt;
14243
14244 if (MulAmt >= 0) {
14245 if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
14246 // (mul x, 2^N + 1) => (add (shl x, N), x)
14247 Res = DAG.getNode(ISD::ADD, DL, VT,
14248 V,
14249 DAG.getNode(ISD::SHL, DL, VT,
14250 V,
14251 DAG.getConstant(Log2_32(MulAmt - 1), DL,
14252 MVT::i32)));
14253 } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
14254 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14255 Res = DAG.getNode(ISD::SUB, DL, VT,
14256 DAG.getNode(ISD::SHL, DL, VT,
14257 V,
14258 DAG.getConstant(Log2_32(MulAmt + 1), DL,
14259 MVT::i32)),
14260 V);
14261 } else
14262 return SDValue();
14263 } else {
14264 uint64_t MulAmtAbs = -MulAmt;
14265 if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
14266 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
14267 Res = DAG.getNode(ISD::SUB, DL, VT,
14268 V,
14269 DAG.getNode(ISD::SHL, DL, VT,
14270 V,
14271 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
14272 MVT::i32)));
14273 } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
14274 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14275 Res = DAG.getNode(ISD::ADD, DL, VT,
14276 V,
14277 DAG.getNode(ISD::SHL, DL, VT,
14278 V,
14279 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
14280 MVT::i32)));
14281 Res = DAG.getNode(ISD::SUB, DL, VT,
14282 DAG.getConstant(0, DL, MVT::i32), Res);
14283 } else
14284 return SDValue();
14285 }
14286
14287 if (ShiftAmt != 0)
14288 Res = DAG.getNode(ISD::SHL, DL, VT,
14289 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
14290
14291 // Do not add new nodes to DAG combiner worklist.
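// (The 'false' argument is CombineTo's AddTo flag: N is replaced by Res, but
// Res itself is not queued for further combining.)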
14292 DCI.CombineTo(N, Res, false);
14293 return SDValue();
14294}
14295
14296static SDValue CombineANDShift(SDNode *N,
14297 TargetLowering::DAGCombinerInfo &DCI,
14298 const ARMSubtarget *Subtarget) {
14299 // Allow DAGCombine to pattern-match before we touch the canonical form.
14300 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14301 return SDValue();
14302
14303 if (N->getValueType(0) != MVT::i32)
14304 return SDValue();
14305
14306 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14307 if (!N1C)
14308 return SDValue();
14309
14310 uint32_t C1 = (uint32_t)N1C->getZExtValue();
14311 // Don't transform uxtb/uxth.
14312 if (C1 == 255 || C1 == 65535)
14313 return SDValue();
14314
14315 SDNode *N0 = N->getOperand(0).getNode();
14316 if (!N0->hasOneUse())
14317 return SDValue();
14318
14319 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
14320 return SDValue();
14321
14322 bool LeftShift = N0->getOpcode() == ISD::SHL;
14323
14324 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
14325 if (!N01C)
14326 return SDValue();
14327
14328 uint32_t C2 = (uint32_t)N01C->getZExtValue();
14329 if (!C2 || C2 >= 32)
14330 return SDValue();
14331
14332 // Clear irrelevant bits in the mask.
14333 if (LeftShift)
14334 C1 &= (-1U << C2);
14335 else
14336 C1 &= (-1U >> C2);
14337
14338 SelectionDAG &DAG = DCI.DAG;
14339 SDLoc DL(N);
14340
14341 // We have a pattern of the form "(and (shl x, c2) c1)" or
14342 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
14343 // transform to a pair of shifts, to save materializing c1.
14344
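// For example, (and (srl x, 2), 0x00ffffff) becomes (srl (shl x, 6), 8):
// both new operands are shift amounts, so the 24-bit mask never has to be
// built in a register.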
14345 // First pattern: right shift, then mask off leading bits.
14346 // FIXME: Use demanded bits?
14347 if (!LeftShift && isMask_32(C1)) {
14348 uint32_t C3 = llvm::countl_zero(C1);
14349 if (C2 < C3) {
14350 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14351 DAG.getConstant(C3 - C2, DL, MVT::i32));
14352 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14353 DAG.getConstant(C3, DL, MVT::i32));
14354 }
14355 }
14356
14357 // First pattern, reversed: left shift, then mask off trailing bits.
14358 if (LeftShift && isMask_32(~C1)) {
14359 uint32_t C3 = llvm::countr_zero(C1);
14360 if (C2 < C3) {
14361 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14362 DAG.getConstant(C3 - C2, DL, MVT::i32));
14363 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14364 DAG.getConstant(C3, DL, MVT::i32));
14365 }
14366 }
14367
14368 // Second pattern: left shift, then mask off leading bits.
14369 // FIXME: Use demanded bits?
14370 if (LeftShift && isShiftedMask_32(C1)) {
14371 uint32_t Trailing = llvm::countr_zero(C1);
14372 uint32_t C3 = llvm::countl_zero(C1);
14373 if (Trailing == C2 && C2 + C3 < 32) {
14374 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14375 DAG.getConstant(C2 + C3, DL, MVT::i32));
14376 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14377 DAG.getConstant(C3, DL, MVT::i32));
14378 }
14379 }
14380
14381 // Second pattern, reversed: right shift, then mask off trailing bits.
14382 // FIXME: Handle other patterns of known/demanded bits.
14383 if (!LeftShift && isShiftedMask_32(C1)) {
14384 uint32_t Leading = llvm::countl_zero(C1);
14385 uint32_t C3 = llvm::countr_zero(C1);
14386 if (Leading == C2 && C2 + C3 < 32) {
14387 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14388 DAG.getConstant(C2 + C3, DL, MVT::i32));
14389 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14390 DAG.getConstant(C3, DL, MVT::i32));
14391 }
14392 }
14393
14394 // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
14395 // if "c1 >> c2" is a cheaper immediate than "c1"
14396 if (LeftShift &&
14397 HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {
14398
14399 SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
14400 DAG.getConstant(C1 >> C2, DL, MVT::i32));
14401 return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
14402 DAG.getConstant(C2, DL, MVT::i32));
14403 }
14404
14405 return SDValue();
14406}
14407
14408static SDValue PerformANDCombine(SDNode *N,
14409 TargetLowering::DAGCombinerInfo &DCI,
14410 const ARMSubtarget *Subtarget) {
14411 // Attempt to use immediate-form VBIC
14412 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14413 SDLoc dl(N);
14414 EVT VT = N->getValueType(0);
14415 SelectionDAG &DAG = DCI.DAG;
14416
14417 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14418 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14419 return SDValue();
14420
14421 APInt SplatBits, SplatUndef;
14422 unsigned SplatBitSize;
14423 bool HasAnyUndefs;
14424 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14425 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14426 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14427 SplatBitSize == 64) {
14428 EVT VbicVT;
14429 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
14430 SplatUndef.getZExtValue(), SplatBitSize,
14431 DAG, dl, VbicVT, VT, OtherModImm);
14432 if (Val.getNode()) {
14433 SDValue Input =
14434 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
14435 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
14436 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
14437 }
14438 }
14439 }
14440
14441 if (!Subtarget->isThumb1Only()) {
14442 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
14443 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
14444 return Result;
14445
14446 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14447 return Result;
14448 }
14449
14450 if (Subtarget->isThumb1Only())
14451 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14452 return Result;
14453
14454 return SDValue();
14455}
14456
14457// Try combining OR nodes to SMULWB, SMULWT.
14458static SDValue PerformORCombineToSMULWBT(SDNode *OR,
14459 TargetLowering::DAGCombinerInfo &DCI,
14460 const ARMSubtarget *Subtarget) {
14461 if (!Subtarget->hasV6Ops() ||
14462 (Subtarget->isThumb() &&
14463 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
14464 return SDValue();
14465
14466 SDValue SRL = OR->getOperand(0);
14467 SDValue SHL = OR->getOperand(1);
14468
14469 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
14470 SRL = OR->getOperand(1);
14471 SHL = OR->getOperand(0);
14472 }
14473 if (!isSRL16(SRL) || !isSHL16(SHL))
14474 return SDValue();
14475
14476 // The first operands to the shifts need to be the two results from the
14477 // same smul_lohi node.
14478 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
14479 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
14480 return SDValue();
14481
14482 SDNode *SMULLOHI = SRL.getOperand(0).getNode();
14483 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
14484 SHL.getOperand(0) != SDValue(SMULLOHI, 1))
14485 return SDValue();
14486
14487 // Now we have:
14488 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
14489 // For SMULW[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
14490 // For SMULWB the 16-bit value will be sign extended somehow.
14491 // For SMULWT only the SRA is required.
14492 // Check both sides of SMUL_LOHI
14493 SDValue OpS16 = SMULLOHI->getOperand(0);
14494 SDValue OpS32 = SMULLOHI->getOperand(1);
14495
14496 SelectionDAG &DAG = DCI.DAG;
14497 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
14498 OpS16 = OpS32;
14499 OpS32 = SMULLOHI->getOperand(0);
14500 }
14501
14502 SDLoc dl(OR);
14503 unsigned Opcode = 0;
14504 if (isS16(OpS16, DAG))
14505 Opcode = ARMISD::SMULWB;
14506 else if (isSRA16(OpS16)) {
14507 Opcode = ARMISD::SMULWT;
14508 OpS16 = OpS16->getOperand(0);
14509 }
14510 else
14511 return SDValue();
14512
14513 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
14514 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
14515 return SDValue(OR, 0);
14516}
14517
14518static SDValue PerformORCombineToBFI(SDNode *N,
14519 TargetLowering::DAGCombinerInfo &DCI,
14520 const ARMSubtarget *Subtarget) {
14521 // BFI is only available on V6T2+
14522 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14523 return SDValue();
14524
14525 EVT VT = N->getValueType(0);
14526 SDValue N0 = N->getOperand(0);
14527 SDValue N1 = N->getOperand(1);
14528 SelectionDAG &DAG = DCI.DAG;
14529 SDLoc DL(N);
14530 // 1) or (and A, mask), val => ARMbfi A, val, mask
14531 // iff (val & mask) == val
14532 //
14533 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14534 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14535 // && mask == ~mask2
14536 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14537 // && ~mask == mask2
14538 // (i.e., copy a bitfield value into another bitfield of the same width)
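// For case (1), e.g. (or (and A, 0xffff00ff), 0x00004500) becomes
// ARMbfi A, 0x45, 0xffff00ff: the constant is shifted down by the position of
// the cleared field (8) and inserted into bits 8-15 of A.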
14539
14540 if (VT != MVT::i32)
14541 return SDValue();
14542
14543 SDValue N00 = N0.getOperand(0);
14544
14545 // The value and the mask need to be constants so we can verify this is
14546 // actually a bitfield set. If the mask is 0xffff, we can do better
14547 // via a movt instruction, so don't use BFI in that case.
14548 SDValue MaskOp = N0.getOperand(1);
14549 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
14550 if (!MaskC)
14551 return SDValue();
14552 unsigned Mask = MaskC->getZExtValue();
14553 if (Mask == 0xffff)
14554 return SDValue();
14555 SDValue Res;
14556 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14557 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
14558 if (N1C) {
14559 unsigned Val = N1C->getZExtValue();
14560 if ((Val & ~Mask) != Val)
14561 return SDValue();
14562
14563 if (ARM::isBitFieldInvertedMask(Mask)) {
14564 Val >>= llvm::countr_zero(~Mask);
14565
14566 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14567 DAG.getConstant(Val, DL, MVT::i32),
14568 DAG.getConstant(Mask, DL, MVT::i32));
14569
14570 DCI.CombineTo(N, Res, false);
14571 // Return value from the original node to inform the combiner that N is
14572 // now dead.
14573 return SDValue(N, 0);
14574 }
14575 } else if (N1.getOpcode() == ISD::AND) {
14576 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14577 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14578 if (!N11C)
14579 return SDValue();
14580 unsigned Mask2 = N11C->getZExtValue();
14581
14582 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
14583 // as is to match.
14584 if (ARM::isBitFieldInvertedMask(Mask) &&
14585 (Mask == ~Mask2)) {
14586 // The pack halfword instruction works better for masks that fit it,
14587 // so use that when it's available.
14588 if (Subtarget->hasDSP() &&
14589 (Mask == 0xffff || Mask == 0xffff0000))
14590 return SDValue();
14591 // 2a
14592 unsigned amt = llvm::countr_zero(Mask2);
14593 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14594 DAG.getConstant(amt, DL, MVT::i32));
14595 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14596 DAG.getConstant(Mask, DL, MVT::i32));
14597 DCI.CombineTo(N, Res, false);
14598 // Return value from the original node to inform the combiner that N is
14599 // now dead.
14600 return SDValue(N, 0);
14601 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14602 (~Mask == Mask2)) {
14603 // The pack halfword instruction works better for masks that fit it,
14604 // so use that when it's available.
14605 if (Subtarget->hasDSP() &&
14606 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14607 return SDValue();
14608 // 2b
14609 unsigned lsb = llvm::countr_zero(Mask);
14610 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14611 DAG.getConstant(lsb, DL, MVT::i32));
14612 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14613 DAG.getConstant(Mask2, DL, MVT::i32));
14614 DCI.CombineTo(N, Res, false);
14615 // Return value from the original node to inform the combiner that N is
14616 // now dead.
14617 return SDValue(N, 0);
14618 }
14619 }
14620
14621 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14622 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14623 ARM::isBitFieldInvertedMask(Mask)) {
14624 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14625 // where lsb(mask) == #shamt and masked bits of B are known zero.
14626 SDValue ShAmt = N00.getOperand(1);
14627 unsigned ShAmtC = ShAmt->getAsZExtVal();
14628 unsigned LSB = llvm::countr_zero(Mask);
14629 if (ShAmtC != LSB)
14630 return SDValue();
14631
14632 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14633 DAG.getConstant(~Mask, DL, MVT::i32));
14634
14635 DCI.CombineTo(N, Res, false);
14636 // Return value from the original node to inform the combiner that N is
14637 // now dead.
14638 return SDValue(N, 0);
14639 }
14640
14641 return SDValue();
14642}
14643
14644static bool isValidMVECond(unsigned CC, bool IsFloat) {
14645 switch (CC) {
14646 case ARMCC::EQ:
14647 case ARMCC::NE:
14648 case ARMCC::LE:
14649 case ARMCC::GT:
14650 case ARMCC::GE:
14651 case ARMCC::LT:
14652 return true;
14653 case ARMCC::HS:
14654 case ARMCC::HI:
14655 return !IsFloat;
14656 default:
14657 return false;
14658 };
14659}
14660
14661static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
14662 if (N->getOpcode() == ARMISD::VCMP)
14663 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14664 else if (N->getOpcode() == ARMISD::VCMPZ)
14665 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14666 else
14667 llvm_unreachable("Not a VCMP/VCMPZ!");
14668}
14669
14670static bool CanInvertMVEVCMP(SDValue N) {
14671 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
14672 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14673}
14674
14675static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG,
14676 const ARMSubtarget *Subtarget) {
14677 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
14678 // together with predicates
14679 EVT VT = N->getValueType(0);
14680 SDLoc DL(N);
14681 SDValue N0 = N->getOperand(0);
14682 SDValue N1 = N->getOperand(1);
14683
14684 auto IsFreelyInvertable = [&](SDValue V) {
14685 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14686 return CanInvertMVEVCMP(V);
14687 return false;
14688 };
14689
14690 // At least one operand must be freely invertible.
14691 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14692 return SDValue();
14693
14694 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14695 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14696 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14697 return DAG.getLogicalNOT(DL, And, VT);
14698}
14699
14700/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
14701static SDValue PerformORCombine(SDNode *N,
14702 TargetLowering::DAGCombinerInfo &DCI,
14703 const ARMSubtarget *Subtarget) {
14704 // Attempt to use immediate-form VORR
14705 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14706 SDLoc dl(N);
14707 EVT VT = N->getValueType(0);
14708 SelectionDAG &DAG = DCI.DAG;
14709
14710 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14711 return SDValue();
14712
14713 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14714 VT == MVT::v8i1 || VT == MVT::v16i1))
14715 return PerformORCombine_i1(N, DAG, Subtarget);
14716
14717 APInt SplatBits, SplatUndef;
14718 unsigned SplatBitSize;
14719 bool HasAnyUndefs;
14720 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14721 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14722 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14723 SplatBitSize == 64) {
14724 EVT VorrVT;
14725 SDValue Val =
14726 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14727 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14728 if (Val.getNode()) {
14729 SDValue Input =
14730 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
14731 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14732 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
14733 }
14734 }
14735 }
14736
14737 if (!Subtarget->isThumb1Only()) {
14738 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14739 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14740 return Result;
14741 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14742 return Result;
14743 }
14744
14745 SDValue N0 = N->getOperand(0);
14746 SDValue N1 = N->getOperand(1);
14747
14748 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
14749 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14751
14752 // The code below optimizes (or (and X, Y), Z).
14753 // The AND operand needs to have a single user to make these optimizations
14754 // profitable.
14755 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14756 return SDValue();
14757
14758 APInt SplatUndef;
14759 unsigned SplatBitSize;
14760 bool HasAnyUndefs;
14761
14762 APInt SplatBits0, SplatBits1;
14763 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
14764 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
14765 // Ensure that the second operands of both ands are constants
14766 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14767 HasAnyUndefs) && !HasAnyUndefs) {
14768 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14769 HasAnyUndefs) && !HasAnyUndefs) {
14770 // Ensure that the bit widths of the constants are the same and that
14771 // the splat arguments are logical inverses as per the pattern we
14772 // are trying to simplify.
14773 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14774 SplatBits0 == ~SplatBits1) {
14775 // Canonicalize the vector type to make instruction selection
14776 // simpler.
14777 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14778 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14779 N0->getOperand(1),
14780 N0->getOperand(0),
14781 N1->getOperand(0));
14782 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);
14783 }
14784 }
14785 }
14786 }
14787
14788 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14789 // reasonable.
14790 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14791 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14792 return Res;
14793 }
14794
14795 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14796 return Result;
14797
14798 return SDValue();
14799}
14800
14801static SDValue PerformXORCombine(SDNode *N,
14802 TargetLowering::DAGCombinerInfo &DCI,
14803 const ARMSubtarget *Subtarget) {
14804 EVT VT = N->getValueType(0);
14805 SelectionDAG &DAG = DCI.DAG;
14806
14807 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14808 return SDValue();
14809
14810 if (!Subtarget->isThumb1Only()) {
14811 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14812 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14813 return Result;
14814
14815 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14816 return Result;
14817 }
14818
14819 if (Subtarget->hasMVEIntegerOps()) {
14820 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14821 SDValue N0 = N->getOperand(0);
14822 SDValue N1 = N->getOperand(1);
14823 const TargetLowering *TLI = Subtarget->getTargetLowering();
14824 if (TLI->isConstTrueVal(N1) &&
14825 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14826 if (CanInvertMVEVCMP(N0)) {
14827 SDLoc DL(N0);
14828 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));
14829
14830 SmallVector<SDValue, 4> Ops;
14831 Ops.push_back(N0->getOperand(0));
14832 if (N0->getOpcode() == ARMISD::VCMP)
14833 Ops.push_back(N0->getOperand(1));
14834 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14835 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14836 }
14837 }
14838 }
14839
14840 return SDValue();
14841}
14842
14843// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14844// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14845// their position in "to" (Rd).
14846static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14847 assert(N->getOpcode() == ARMISD::BFI);
14848
14849 SDValue From = N->getOperand(1);
14850 ToMask = ~N->getConstantOperandAPInt(2);
14851 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14852
14853 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14854 // #C in the base of the SHR.
14855 if (From->getOpcode() == ISD::SRL &&
14856 isa<ConstantSDNode>(From->getOperand(1))) {
14857 APInt Shift = From->getConstantOperandAPInt(1);
14858 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14859 FromMask <<= Shift.getLimitedValue(31);
14860 From = From->getOperand(0);
14861 }
14862
14863 return From;
14864}
14865
14866// If A and B contain one contiguous set of bits, does A | B == A . B?
14867//
14868// Neither A nor B may be zero.
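// For example, A = 0b1100 and B = 0b0011 concatenate properly: the lowest set
// bit of A (bit 2) sits directly above the highest set bit of B (bit 1).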
14869static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14870 unsigned LastActiveBitInA = A.countr_zero();
14871 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14872 return LastActiveBitInA - 1 == FirstActiveBitInB;
14873}
14874
14875static SDValue FindBFIToCombineWith(SDNode *N) {
14876 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14877 APInt ToMask, FromMask;
14878 SDValue From = ParseBFI(N, ToMask, FromMask);
14879 SDValue To = N->getOperand(0);
14880
14881 SDValue V = To;
14882 if (V.getOpcode() != ARMISD::BFI)
14883 return SDValue();
14884
14885 APInt NewToMask, NewFromMask;
14886 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
14887 if (NewFrom != From)
14888 return SDValue();
14889
14890 // Do the written bits conflict with any we've seen so far?
14891 if ((NewToMask & ToMask).getBoolValue())
14892 // Conflicting bits.
14893 return SDValue();
14894
14895 // Are the new bits contiguous when combined with the old bits?
14896 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14897 BitsProperlyConcatenate(FromMask, NewFromMask))
14898 return V;
14899 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14900 BitsProperlyConcatenate(NewFromMask, FromMask))
14901 return V;
14902
14903 return SDValue();
14904}
14905
14906static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
14907 SDValue N0 = N->getOperand(0);
14908 SDValue N1 = N->getOperand(1);
14909
14910 if (N1.getOpcode() == ISD::AND) {
14911 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14912 // the bits being cleared by the AND are not demanded by the BFI.
14913 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14914 if (!N11C)
14915 return SDValue();
14916 unsigned InvMask = N->getConstantOperandVal(2);
14917 unsigned LSB = llvm::countr_zero(~InvMask);
14918 unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
14919 assert(Width <
14920 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14921 "undefined behavior");
14922 unsigned Mask = (1u << Width) - 1;
14923 unsigned Mask2 = N11C->getZExtValue();
14924 if ((Mask & (~Mask2)) == 0)
14925 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
14926 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
14927 return SDValue();
14928 }
14929
14930 // Look for another BFI to combine with.
14931 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
14932 // We've found a BFI.
14933 APInt ToMask1, FromMask1;
14934 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
14935
14936 APInt ToMask2, FromMask2;
14937 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
14938 assert(From1 == From2);
14939 (void)From2;
14940
14941 // Create a new BFI, combining the two together.
14942 APInt NewFromMask = FromMask1 | FromMask2;
14943 APInt NewToMask = ToMask1 | ToMask2;
14944
14945 EVT VT = N->getValueType(0);
14946 SDLoc dl(N);
14947
14948 if (NewFromMask[0] == 0)
14949 From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
14950 DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
14951 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
14952 DAG.getConstant(~NewToMask, dl, VT));
14953 }
14954
14955 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
14956 // that lower bit insertions are performed first, provided that M1 and M2
14957 // do not overlap. This can allow multiple BFI instructions to be combined
14958 // together by the other folds above.
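// For example, if the outer BFI inserts bits 0-7 and the inner one inserts
// bits 16-23, they are swapped so that the bits 0-7 insertion becomes the
// inner BFI.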
14959 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
14960 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
14961 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
14962
14963 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
14964 ToMask1.countl_zero() < ToMask2.countl_zero())
14965 return SDValue();
14966
14967 EVT VT = N->getValueType(0);
14968 SDLoc dl(N);
14969 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
14970 N->getOperand(1), N->getOperand(2));
14971 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
14972 N0.getOperand(2));
14973 }
14974
14975 return SDValue();
14976}
14977
14978// Check that N is CMPZ(CSINC(0, 0, CC, X)),
14979// or CMPZ(CMOV(1, 0, CC, X))
14980// return X if valid.
14981static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
14982 if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
14983 return SDValue();
14984 SDValue CSInc = Cmp->getOperand(0);
14985
14986 // Ignore any `And 1` nodes that may not yet have been removed. We are
14987 // looking for a value that produces 1/0, so these have no effect on the
14988 // code.
14989 while (CSInc.getOpcode() == ISD::AND &&
14990 isa<ConstantSDNode>(CSInc.getOperand(1)) &&
14991 CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
14992 CSInc = CSInc.getOperand(0);
14993
14994 if (CSInc.getOpcode() == ARMISD::CSINC &&
14995 isNullConstant(CSInc.getOperand(0)) &&
14996 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
14997 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
14998 return CSInc.getOperand(3);
14999 }
15000 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
15001 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15002 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
15003 return CSInc.getOperand(3);
15004 }
15005 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
15006 isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
15007 CC = ARMCC::getOppositeCondition(
15008 (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
15009 return CSInc.getOperand(3);
15010 }
15011 return SDValue();
15012}
15013
15014static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
15015 // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
15016 // t92: flags = ARMISD::CMPZ t74, 0
15017 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
15018 // t96: flags = ARMISD::CMPZ t93, 0
15019 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
15020 ARMCC::CondCodes Cond;
15021 if (SDValue C = IsCMPZCSINC(N, Cond))
15022 if (Cond == ARMCC::EQ)
15023 return C;
15024 return SDValue();
15025}
15026
15026
15027static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) {
15028 // Fold away an unnecessary CMPZ/CSINC
15029 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
15030 // if C1==EQ -> CSXYZ A, B, C2, D
15031 // if C1==NE -> CSXYZ A, B, NOT(C2), D
15032 ARMCC::CondCodes Cond;
15033 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
15034 if (N->getConstantOperandVal(2) == ARMCC::EQ)
15035 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15036 N->getOperand(1),
15037 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
15038 if (N->getConstantOperandVal(2) == ARMCC::NE)
15039 return DAG.getNode(
15040 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15041 N->getOperand(1),
15042 DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C);
15043 }
15044 return SDValue();
15045}
15046
15047/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
15048/// ARMISD::VMOVRRD.
15049static SDValue PerformVMOVRRDCombine(SDNode *N,
15050 TargetLowering::DAGCombinerInfo &DCI,
15051 const ARMSubtarget *Subtarget) {
15052 // vmovrrd(vmovdrr x, y) -> x,y
15053 SDValue InDouble = N->getOperand(0);
15054 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
15055 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
15056
15057 // vmovrrd(load f64) -> (load i32), (load i32)
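// The second i32 load below reads from the original address plus 4; on
// big-endian targets the two halves are swapped so each GPR still receives
// the correct word.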
15058 SDNode *InNode = InDouble.getNode();
15059 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
15060 InNode->getValueType(0) == MVT::f64 &&
15061 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
15062 !cast<LoadSDNode>(InNode)->isVolatile()) {
15063 // TODO: Should this be done for non-FrameIndex operands?
15064 LoadSDNode *LD = cast<LoadSDNode>(InNode);
15065
15066 SelectionDAG &DAG = DCI.DAG;
15067 SDLoc DL(LD);
15068 SDValue BasePtr = LD->getBasePtr();
15069 SDValue NewLD1 =
15070 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
15071 LD->getAlign(), LD->getMemOperand()->getFlags());
15072
15073 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
15074 DAG.getConstant(4, DL, MVT::i32));
15075
15076 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
15077 LD->getPointerInfo().getWithOffset(4),
15078 commonAlignment(LD->getAlign(), 4),
15079 LD->getMemOperand()->getFlags());
15080
15081 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
15082 if (DCI.DAG.getDataLayout().isBigEndian())
15083 std::swap (NewLD1, NewLD2);
15084 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
15085 return Result;
15086 }
15087
15088 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
15089 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
15090 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15091 isa<ConstantSDNode>(InDouble.getOperand(1))) {
15092 SDValue BV = InDouble.getOperand(0);
15093 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
15094 // change lane order under big endian.
15095 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
15096 while (
15097 (BV.getOpcode() == ISD::BITCAST ||
15098 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
15099 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
15100 BVSwap = BV.getOpcode() == ISD::BITCAST;
15101 BV = BV.getOperand(0);
15102 }
15103 if (BV.getValueType() != MVT::v4i32)
15104 return SDValue();
15105
15106 // Handle buildvectors, pulling out the correct lane depending on
15107 // endianness.
15108 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
15109 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
15110 SDValue Op0 = BV.getOperand(Offset);
15111 SDValue Op1 = BV.getOperand(Offset + 1);
15112 if (!Subtarget->isLittle() && BVSwap)
15113 std::swap(Op0, Op1);
15114
15115 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15116 }
15117
15118 // A chain of insert_vectors, grabbing the correct value of the chain of
15119 // inserts.
15120 SDValue Op0, Op1;
15121 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
15122 if (isa<ConstantSDNode>(BV.getOperand(2))) {
15123 if (BV.getConstantOperandVal(2) == Offset && !Op0)
15124 Op0 = BV.getOperand(1);
15125 if (BV.getConstantOperandVal(2) == Offset + 1 && !Op1)
15126 Op1 = BV.getOperand(1);
15127 }
15128 BV = BV.getOperand(0);
15129 }
15130 if (!Subtarget->isLittle() && BVSwap)
15131 std::swap(Op0, Op1);
15132 if (Op0 && Op1)
15133 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15134 }
15135
15136 return SDValue();
15137}
15138
15139/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
15140/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
15141static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
15142 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
15143 SDValue Op0 = N->getOperand(0);
15144 SDValue Op1 = N->getOperand(1);
15145 if (Op0.getOpcode() == ISD::BITCAST)
15146 Op0 = Op0.getOperand(0);
15147 if (Op1.getOpcode() == ISD::BITCAST)
15148 Op1 = Op1.getOperand(0);
15149 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
15150 Op0.getNode() == Op1.getNode() &&
15151 Op0.getResNo() == 0 && Op1.getResNo() == 1)
15152 return DAG.getNode(ISD::BITCAST, SDLoc(N),
15153 N->getValueType(0), Op0.getOperand(0));
15154 return SDValue();
15155}
15156
15157static SDValue PerformVMOVhrCombine(SDNode *N,
15158 TargetLowering::DAGCombinerInfo &DCI) {
15159 SDValue Op0 = N->getOperand(0);
15160
15161 // VMOVhr (VMOVrh (X)) -> X
15162 if (Op0->getOpcode() == ARMISD::VMOVrh)
15163 return Op0->getOperand(0);
15164
15165 // FullFP16: half values are passed in S-registers, and we don't
15166 // need any of the bitcast and moves:
15167 //
15168 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
15169 // t5: i32 = bitcast t2
15170 // t18: f16 = ARMISD::VMOVhr t5
15171 // =>
15172 // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
15173 if (Op0->getOpcode() == ISD::BITCAST) {
15174 SDValue Copy = Op0->getOperand(0);
15175 if (Copy.getValueType() == MVT::f32 &&
15176 Copy->getOpcode() == ISD::CopyFromReg) {
15177 bool HasGlue = Copy->getNumOperands() == 3;
15178 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
15179 HasGlue ? Copy->getOperand(2) : SDValue()};
15180 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
15181 SDValue NewCopy =
15183 DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
15184 ArrayRef(Ops, HasGlue ? 3 : 2));
15185
15186 // Update Users, Chains, and Potential Glue.
15187 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
15188 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
15189 if (HasGlue)
15190 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
15191 NewCopy.getValue(2));
15192
15193 return NewCopy;
15194 }
15195 }
15196
15197 // fold (VMOVhr (load x)) -> (load (f16*)x)
15198 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
15199 if (LN0->hasOneUse() && LN0->isUnindexed() &&
15200 LN0->getMemoryVT() == MVT::i16) {
15201 SDValue Load =
15202 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
15203 LN0->getBasePtr(), LN0->getMemOperand());
15204 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15205 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
15206 return Load;
15207 }
15208 }
15209
15210 // Only the bottom 16 bits of the source register are used.
15211 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15212 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15213 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
15214 return SDValue(N, 0);
15215
15216 return SDValue();
15217}
15218
15219static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) {
15220 SDValue N0 = N->getOperand(0);
15221 EVT VT = N->getValueType(0);
15222
15223 // fold (VMOVrh (fpconst x)) -> const x
15224 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
15225 APFloat V = C->getValueAPF();
15226 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
15227 }
15228
15229 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15230 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
15231 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15232
15233 SDValue Load =
15234 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15235 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
15236 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15237 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
15238 return Load;
15239 }
15240
15241 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
15242 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15243 isa<ConstantSDNode>(N0->getOperand(1)))
15244 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
15245 N0->getOperand(1));
15246
15247 return SDValue();
15248}
15249
15250/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15251/// are normal, non-volatile loads. If so, it is profitable to bitcast an
15252/// i64 vector to have f64 elements, since the value can then be loaded
15253/// directly into a VFP register.
15254static bool hasNormalLoadOperand(SDNode *N) {
15255 unsigned NumElts = N->getValueType(0).getVectorNumElements();
15256 for (unsigned i = 0; i < NumElts; ++i) {
15257 SDNode *Elt = N->getOperand(i).getNode();
15258 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
15259 return true;
15260 }
15261 return false;
15262}
15263
15264/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15265/// ISD::BUILD_VECTOR.
15266static SDValue PerformBUILD_VECTORCombine(SDNode *N,
15267 TargetLowering::DAGCombinerInfo &DCI,
15268 const ARMSubtarget *Subtarget) {
15269 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15270 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15271 // into a pair of GPRs, which is fine when the value is used as a scalar,
15272 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15273 SelectionDAG &DAG = DCI.DAG;
15274 if (N->getNumOperands() == 2)
15275 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15276 return RV;
15277
15278 // Load i64 elements as f64 values so that type legalization does not split
15279 // them up into i32 values.
15280 EVT VT = N->getValueType(0);
15281 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15282 return SDValue();
15283 SDLoc dl(N);
15285 unsigned NumElts = VT.getVectorNumElements();
15286 for (unsigned i = 0; i < NumElts; ++i) {
15287 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15288 Ops.push_back(V);
15289 // Make the DAGCombiner fold the bitcast.
15290 DCI.AddToWorklist(V.getNode());
15291 }
15292 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15293 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
15294 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
15295}
15296
15297/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
15298static SDValue
15299PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15300 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15301 // At that time, we may have inserted bitcasts from integer to float.
15302 // If these bitcasts have survived DAGCombine, change the lowering of this
15303 // BUILD_VECTOR into something more vector friendly, i.e., one that does not
15304 // force the use of floating point types.
15305
15306 // Make sure we can change the type of the vector.
15307 // This is possible iff:
15308 // 1. The vector is only used in a bitcast to an integer type. I.e.,
15309 // 1.1. Vector is used only once.
15310 // 1.2. Use is a bit convert to an integer type.
15311 // 2. The size of its operands is 32 bits (64 bits are not legal).
15312 EVT VT = N->getValueType(0);
15313 EVT EltVT = VT.getVectorElementType();
15314
15315 // Check 1.1. and 2.
15316 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15317 return SDValue();
15318
15319 // By construction, the input type must be float.
15320 assert(EltVT == MVT::f32 && "Unexpected type!");
15321
15322 // Check 1.2.
15323 SDNode *Use = *N->user_begin();
15324 if (Use->getOpcode() != ISD::BITCAST ||
15325 Use->getValueType(0).isFloatingPoint())
15326 return SDValue();
15327
15328 // Check profitability.
15329 // Model is, if more than half of the relevant operands are bitcast from
15330 // i32, turn the build_vector into a sequence of insert_vector_elt.
15331 // Relevant operands are everything that is not statically
15332 // (i.e., at compile time) bitcasted.
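// For example, on a 4-element node with one constant operand and two operands
// bitcast from i32, NumOfBitCastedElts (2) exceeds NumOfRelevantElts / 2
// (3 / 2 == 1), so the rewrite below is considered profitable.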
15333 unsigned NumOfBitCastedElts = 0;
15334 unsigned NumElts = VT.getVectorNumElements();
15335 unsigned NumOfRelevantElts = NumElts;
15336 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15337 SDValue Elt = N->getOperand(Idx);
15338 if (Elt->getOpcode() == ISD::BITCAST) {
15339 // Assume only bit cast to i32 will go away.
15340 if (Elt->getOperand(0).getValueType() == MVT::i32)
15341 ++NumOfBitCastedElts;
15342 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
15343 // Constants are statically casted, thus do not count them as
15344 // relevant operands.
15345 --NumOfRelevantElts;
15346 }
15347
15348 // Check if more than half of the elements require a non-free bitcast.
15349 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15350 return SDValue();
15351
15352 SelectionDAG &DAG = DCI.DAG;
15353 // Create the new vector type.
15354 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15355 // Check if the type is legal.
15356 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15357 if (!TLI.isTypeLegal(VecVT))
15358 return SDValue();
15359
15360 // Combine:
15361 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15362 // => BITCAST INSERT_VECTOR_ELT
15363 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15364 // (BITCAST EN), N.
15365 SDValue Vec = DAG.getUNDEF(VecVT);
15366 SDLoc dl(N);
15367 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15368 SDValue V = N->getOperand(Idx);
15369 if (V.isUndef())
15370 continue;
15371 if (V.getOpcode() == ISD::BITCAST &&
15372 V->getOperand(0).getValueType() == MVT::i32)
15373 // Fold obvious case.
15374 V = V.getOperand(0);
15375 else {
15376 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15377 // Make the DAGCombiner fold the bitcasts.
15378 DCI.AddToWorklist(V.getNode());
15379 }
15380 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15381 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
15382 }
15383 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
15384 // Make the DAGCombiner fold the bitcasts.
15385 DCI.AddToWorklist(Vec.getNode());
15386 return Vec;
15387}
15388
15389static SDValue
15390PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15391 EVT VT = N->getValueType(0);
15392 SDValue Op = N->getOperand(0);
15393 SDLoc dl(N);
15394
15395 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15396 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15397 // If the valuetypes are the same, we can remove the cast entirely.
15398 if (Op->getOperand(0).getValueType() == VT)
15399 return Op->getOperand(0);
15400 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15401 }
15402
15403 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15404 // more VPNOT which might get folded as else predicates.
15405 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15406 SDValue X =
15407 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15409 DCI.DAG.getConstant(65535, dl, MVT::i32));
15410 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
15411 }
15412
15413 // Only the bottom 16 bits of the source register are used.
15414 if (Op.getValueType() == MVT::i32) {
15415 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15416 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15417 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15418 return SDValue(N, 0);
15419 }
15420 return SDValue();
15421}
15422
15423static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG,
15424 const ARMSubtarget *ST) {
15425 EVT VT = N->getValueType(0);
15426 SDValue Op = N->getOperand(0);
15427 SDLoc dl(N);
15428
15429 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
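// (On big endian they differ: a BITCAST must preserve the in-memory lane
// layout, which can require reordering elements, whereas VECTOR_REG_CAST just
// reinterprets the register contents.)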
15430 if (ST->isLittle())
15431 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
15432
15433 // VT VECTOR_REG_CAST (VT Op) -> Op
15434 if (Op.getValueType() == VT)
15435 return Op;
15436 // VECTOR_REG_CAST undef -> undef
15437 if (Op.isUndef())
15438 return DAG.getUNDEF(VT);
15439
15440 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
15441 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
15442 // If the valuetypes are the same, we can remove the cast entirely.
15443 if (Op->getOperand(0).getValueType() == VT)
15444 return Op->getOperand(0);
15445 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
15446 }
15447
15448 return SDValue();
15449}
15450
15451static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG,
15452 const ARMSubtarget *Subtarget) {
15453 if (!Subtarget->hasMVEIntegerOps())
15454 return SDValue();
15455
15456 EVT VT = N->getValueType(0);
15457 SDValue Op0 = N->getOperand(0);
15458 SDValue Op1 = N->getOperand(1);
15459 ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
15460 SDLoc dl(N);
15461
15462 // vcmp X, 0, cc -> vcmpz X, cc
15463 if (isZeroVector(Op1))
15464 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
15465
15466 unsigned SwappedCond = getSwappedCondition(Cond);
15467 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
15468 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
15469 if (isZeroVector(Op0))
15470 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15471 DAG.getConstant(SwappedCond, dl, MVT::i32));
15472 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
15473 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15474 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15475 DAG.getConstant(SwappedCond, dl, MVT::i32));
15476 }
15477
15478 return SDValue();
15479}
15480
15481/// PerformInsertEltCombine - Target-specific dag combine xforms for
15482/// ISD::INSERT_VECTOR_ELT.
15483static SDValue PerformInsertEltCombine(SDNode *N,
15484 TargetLowering::DAGCombinerInfo &DCI) {
15485 // Bitcast an i64 load inserted into a vector to f64.
15486 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15487 EVT VT = N->getValueType(0);
15488 SDNode *Elt = N->getOperand(1).getNode();
15489 if (VT.getVectorElementType() != MVT::i64 ||
15490 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15491 return SDValue();
15492
15493 SelectionDAG &DAG = DCI.DAG;
15494 SDLoc dl(N);
15495 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
15496 VT.getVectorNumElements());
15497 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
15498 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15499 // Make the DAGCombiner fold the bitcasts.
15500 DCI.AddToWorklist(Vec.getNode());
15501 DCI.AddToWorklist(V.getNode());
15502 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
15503 Vec, V, N->getOperand(2));
15504 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
15505}
15506
15507// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15508// directly or bitcast to an integer if the original is a float vector.
15509// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15510// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
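// For example, extract(v4i32 x, 2) and extract(v4i32 x, 3) become the two
// results of VMOVRRD(extract(v2f64 vector_reg_cast(x), 1)).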
15511static SDValue
15512PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15513 EVT VT = N->getValueType(0);
15514 SDLoc dl(N);
15515
15516 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15517 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15518 return SDValue();
15519
15520 SDValue Ext = SDValue(N, 0);
15521 if (Ext.getOpcode() == ISD::BITCAST &&
15522 Ext.getOperand(0).getValueType() == MVT::f32)
15523 Ext = Ext.getOperand(0);
15524 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15525 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
15526 Ext.getConstantOperandVal(1) % 2 != 0)
15527 return SDValue();
15528 if (Ext->hasOneUse() && (Ext->user_begin()->getOpcode() == ISD::SINT_TO_FP ||
15529 Ext->user_begin()->getOpcode() == ISD::UINT_TO_FP))
15530 return SDValue();
15531
15532 SDValue Op0 = Ext.getOperand(0);
15533 EVT VecVT = Op0.getValueType();
15534 unsigned ResNo = Op0.getResNo();
15535 unsigned Lane = Ext.getConstantOperandVal(1);
15536 if (VecVT.getVectorNumElements() != 4)
15537 return SDValue();
15538
15539 // Find another extract, of Lane + 1
15540 auto OtherIt = find_if(Op0->users(), [&](SDNode *V) {
15541 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15542 isa<ConstantSDNode>(V->getOperand(1)) &&
15543 V->getConstantOperandVal(1) == Lane + 1 &&
15544 V->getOperand(0).getResNo() == ResNo;
15545 });
15546 if (OtherIt == Op0->users().end())
15547 return SDValue();
15548
15549 // For float extracts, we need to be converting to a i32 for both vector
15550 // lanes.
15551 SDValue OtherExt(*OtherIt, 0);
15552 if (OtherExt.getValueType() != MVT::i32) {
15553 if (!OtherExt->hasOneUse() ||
15554 OtherExt->user_begin()->getOpcode() != ISD::BITCAST ||
15555 OtherExt->user_begin()->getValueType(0) != MVT::i32)
15556 return SDValue();
15557 OtherExt = SDValue(*OtherExt->user_begin(), 0);
15558 }
15559
15560 // Convert the type to a f64 and extract with a VMOVRRD.
15561 SDValue F64 = DCI.DAG.getNode(
15562 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15563 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15564 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15565 SDValue VMOVRRD =
15566 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15567
15568 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
15569 return VMOVRRD;
15570}
15571
15572static SDValue PerformExtractEltCombine(SDNode *N,
15573 TargetLowering::DAGCombinerInfo &DCI,
15574 const ARMSubtarget *ST) {
15575 SDValue Op0 = N->getOperand(0);
15576 EVT VT = N->getValueType(0);
15577 SDLoc dl(N);
15578
15579 // extract (vdup x) -> x
15580 if (Op0->getOpcode() == ARMISD::VDUP) {
15581 SDValue X = Op0->getOperand(0);
15582 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15583 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
15584 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15585 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
15586 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15587 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
15588
15589 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15590 X = X->getOperand(0);
15591 if (X.getValueType() == VT)
15592 return X;
15593 }
15594
15595 // extract ARM_BUILD_VECTOR -> x
15596 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15597 isa<ConstantSDNode>(N->getOperand(1)) &&
15598 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
15599 return Op0.getOperand(N->getConstantOperandVal(1));
15600 }
15601
15602 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15603 if (Op0.getValueType() == MVT::v4i32 &&
15604 isa<ConstantSDNode>(N->getOperand(1)) &&
15605 Op0.getOpcode() == ISD::BITCAST &&
15607 Op0.getOperand(0).getValueType() == MVT::v2f64) {
15608 SDValue BV = Op0.getOperand(0);
15609 unsigned Offset = N->getConstantOperandVal(1);
15610 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
15611 if (MOV.getOpcode() == ARMISD::VMOVDRR)
15612 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15613 }
15614
15615 // extract x, n; extract x, n+1 -> VMOVRRD x
15616 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15617 return R;
15618
15619 // extract (MVETrunc(x)) -> extract x
15620 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
15621 unsigned Idx = N->getConstantOperandVal(1);
15622 unsigned Vec =
15623 Idx / Op0->getOperand(0).getValueType().getVectorNumElements();
15624 unsigned SubIdx =
15625 Idx % Op0->getOperand(0).getValueType().getVectorNumElements();
15626 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15627 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15628 }
15629
15630 return SDValue();
15631}
15632
15634 SDValue Op = N->getOperand(0);
15635 EVT VT = N->getValueType(0);
15636
15637 // sext_inreg(VGETLANEu) -> VGETLANEs
15638 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15639 cast<VTSDNode>(N->getOperand(1))->getVT() ==
15640 Op.getOperand(0).getValueType().getScalarType())
15641 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15642 Op.getOperand(1));
15643
15644 return SDValue();
15645}
15646
15647static SDValue
15649 SDValue Vec = N->getOperand(0);
15650 SDValue SubVec = N->getOperand(1);
15651 uint64_t IdxVal = N->getConstantOperandVal(2);
15652 EVT VecVT = Vec.getValueType();
15653 EVT SubVT = SubVec.getValueType();
15654
15655 // Only do this for legal fixed vector types.
15656 if (!VecVT.isFixedLengthVector() ||
15657 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15659 return SDValue();
15660
15661 // Ignore widening patterns.
15662 if (IdxVal == 0 && Vec.isUndef())
15663 return SDValue();
15664
15665 // Subvector must be half the width and an "aligned" insertion.
15666 unsigned NumSubElts = SubVT.getVectorNumElements();
15667 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15668 (IdxVal != 0 && IdxVal != NumSubElts))
15669 return SDValue();
15670
15671 // Fold insert_subvector -> concat_vectors
15672 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15673 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15674 SDLoc DL(N);
15675 SDValue Lo, Hi;
15676 if (IdxVal == 0) {
15677 Lo = SubVec;
15678 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15679 DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15680 } else {
15681 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15682 DCI.DAG.getVectorIdxConstant(0, DL));
15683 Hi = SubVec;
15684 }
15685 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15686}
15687
15688// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
15689static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,
15690 SelectionDAG &DAG) {
15691 SDValue Trunc = N->getOperand(0);
15692 EVT VT = Trunc.getValueType();
15693 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15694 return SDValue();
15695
15696 SDLoc DL(Trunc);
15697 if (isVMOVNTruncMask(N->getMask(), VT, false))
15698 return DAG.getNode(
15699 ARMISD::VMOVN, DL, VT,
15700 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15701 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15702 DAG.getConstant(1, DL, MVT::i32));
15703 else if (isVMOVNTruncMask(N->getMask(), VT, true))
15704 return DAG.getNode(
15705 ARMISD::VMOVN, DL, VT,
15706 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15707 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15708 DAG.getConstant(1, DL, MVT::i32));
15709 return SDValue();
15710}
15711
15712/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15713/// ISD::VECTOR_SHUFFLE.
15714static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
15715 if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG))
15716 return R;
15717
15718 // The LLVM shufflevector instruction does not require the shuffle mask
15719 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15720 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15721 // operands do not match the mask length, they are extended by concatenating
15722 // them with undef vectors. That is probably the right thing for other
15723 // targets, but for NEON it is better to concatenate two double-register
15724 // size vector operands into a single quad-register size vector. Do that
15725 // transformation here:
15726 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15727 // shuffle(concat(v1, v2), undef)
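// For example, with v8i16 operands (HalfElts == 4), original mask element 9
// (lane 1 of the second operand) becomes 4 + 9 - 8 == 5 in the combined
// vector, and any element that referred to an undef upper half becomes -1.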
15728 SDValue Op0 = N->getOperand(0);
15729 SDValue Op1 = N->getOperand(1);
15730 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15731 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15732 Op0.getNumOperands() != 2 ||
15733 Op1.getNumOperands() != 2)
15734 return SDValue();
15735 SDValue Concat0Op1 = Op0.getOperand(1);
15736 SDValue Concat1Op1 = Op1.getOperand(1);
15737 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15738 return SDValue();
15739 // Skip the transformation if any of the types are illegal.
15740 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15741 EVT VT = N->getValueType(0);
15742 if (!TLI.isTypeLegal(VT) ||
15743 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15744 !TLI.isTypeLegal(Concat1Op1.getValueType()))
15745 return SDValue();
15746
15747 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15748 Op0.getOperand(0), Op1.getOperand(0));
15749 // Translate the shuffle mask.
15750 SmallVector<int, 16> NewMask;
15751 unsigned NumElts = VT.getVectorNumElements();
15752 unsigned HalfElts = NumElts/2;
15753 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
15754 for (unsigned n = 0; n < NumElts; ++n) {
15755 int MaskElt = SVN->getMaskElt(n);
15756 int NewElt = -1;
15757 if (MaskElt < (int)HalfElts)
15758 NewElt = MaskElt;
15759 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15760 NewElt = HalfElts + MaskElt - NumElts;
15761 NewMask.push_back(NewElt);
15762 }
15763 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15764 DAG.getUNDEF(VT), NewMask);
15765}
15766
15767/// Load/store instruction that can be merged with a base address
15768/// update
15773 unsigned AddrOpIdx;
15774};
15775
15777 /// Instruction that updates a pointer
15779 /// Pointer increment operand
15781 /// Pointer increment value if it is a constant, or 0 otherwise
15782 unsigned ConstInc;
15783};
15784
15786 // Check that the add is independent of the load/store.
15787 // Otherwise, folding it would create a cycle. Search through Addr
15788 // as well, since the User may not be a direct user of Addr and
15789 // only share a base pointer.
15792 Worklist.push_back(N);
15793 Worklist.push_back(User);
15794 const unsigned MaxSteps = 1024;
15795 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
15796 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
15797 return false;
15798 return true;
15799}
15800
15802 struct BaseUpdateUser &User,
15803 bool SimpleConstIncOnly,
15805 SelectionDAG &DAG = DCI.DAG;
15806 SDNode *N = Target.N;
15807 MemSDNode *MemN = cast<MemSDNode>(N);
15808 SDLoc dl(N);
15809
15810 // Find the new opcode for the updating load/store.
15811 bool isLoadOp = true;
15812 bool isLaneOp = false;
15813 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15814 // as an operand.
15815 bool hasAlignment = true;
15816 unsigned NewOpc = 0;
15817 unsigned NumVecs = 0;
15818 if (Target.isIntrinsic) {
15819 unsigned IntNo = N->getConstantOperandVal(1);
15820 switch (IntNo) {
15821 default:
15822 llvm_unreachable("unexpected intrinsic for Neon base update");
15823 case Intrinsic::arm_neon_vld1:
15824 NewOpc = ARMISD::VLD1_UPD;
15825 NumVecs = 1;
15826 break;
15827 case Intrinsic::arm_neon_vld2:
15828 NewOpc = ARMISD::VLD2_UPD;
15829 NumVecs = 2;
15830 break;
15831 case Intrinsic::arm_neon_vld3:
15832 NewOpc = ARMISD::VLD3_UPD;
15833 NumVecs = 3;
15834 break;
15835 case Intrinsic::arm_neon_vld4:
15836 NewOpc = ARMISD::VLD4_UPD;
15837 NumVecs = 4;
15838 break;
15839 case Intrinsic::arm_neon_vld1x2:
15840 NewOpc = ARMISD::VLD1x2_UPD;
15841 NumVecs = 2;
15842 hasAlignment = false;
15843 break;
15844 case Intrinsic::arm_neon_vld1x3:
15845 NewOpc = ARMISD::VLD1x3_UPD;
15846 NumVecs = 3;
15847 hasAlignment = false;
15848 break;
15849 case Intrinsic::arm_neon_vld1x4:
15850 NewOpc = ARMISD::VLD1x4_UPD;
15851 NumVecs = 4;
15852 hasAlignment = false;
15853 break;
15854 case Intrinsic::arm_neon_vld2dup:
15855 NewOpc = ARMISD::VLD2DUP_UPD;
15856 NumVecs = 2;
15857 break;
15858 case Intrinsic::arm_neon_vld3dup:
15859 NewOpc = ARMISD::VLD3DUP_UPD;
15860 NumVecs = 3;
15861 break;
15862 case Intrinsic::arm_neon_vld4dup:
15863 NewOpc = ARMISD::VLD4DUP_UPD;
15864 NumVecs = 4;
15865 break;
15866 case Intrinsic::arm_neon_vld2lane:
15867 NewOpc = ARMISD::VLD2LN_UPD;
15868 NumVecs = 2;
15869 isLaneOp = true;
15870 break;
15871 case Intrinsic::arm_neon_vld3lane:
15872 NewOpc = ARMISD::VLD3LN_UPD;
15873 NumVecs = 3;
15874 isLaneOp = true;
15875 break;
15876 case Intrinsic::arm_neon_vld4lane:
15877 NewOpc = ARMISD::VLD4LN_UPD;
15878 NumVecs = 4;
15879 isLaneOp = true;
15880 break;
15881 case Intrinsic::arm_neon_vst1:
15882 NewOpc = ARMISD::VST1_UPD;
15883 NumVecs = 1;
15884 isLoadOp = false;
15885 break;
15886 case Intrinsic::arm_neon_vst2:
15887 NewOpc = ARMISD::VST2_UPD;
15888 NumVecs = 2;
15889 isLoadOp = false;
15890 break;
15891 case Intrinsic::arm_neon_vst3:
15892 NewOpc = ARMISD::VST3_UPD;
15893 NumVecs = 3;
15894 isLoadOp = false;
15895 break;
15896 case Intrinsic::arm_neon_vst4:
15897 NewOpc = ARMISD::VST4_UPD;
15898 NumVecs = 4;
15899 isLoadOp = false;
15900 break;
15901 case Intrinsic::arm_neon_vst2lane:
15902 NewOpc = ARMISD::VST2LN_UPD;
15903 NumVecs = 2;
15904 isLoadOp = false;
15905 isLaneOp = true;
15906 break;
15907 case Intrinsic::arm_neon_vst3lane:
15908 NewOpc = ARMISD::VST3LN_UPD;
15909 NumVecs = 3;
15910 isLoadOp = false;
15911 isLaneOp = true;
15912 break;
15913 case Intrinsic::arm_neon_vst4lane:
15914 NewOpc = ARMISD::VST4LN_UPD;
15915 NumVecs = 4;
15916 isLoadOp = false;
15917 isLaneOp = true;
15918 break;
15919 case Intrinsic::arm_neon_vst1x2:
15920 NewOpc = ARMISD::VST1x2_UPD;
15921 NumVecs = 2;
15922 isLoadOp = false;
15923 hasAlignment = false;
15924 break;
15925 case Intrinsic::arm_neon_vst1x3:
15926 NewOpc = ARMISD::VST1x3_UPD;
15927 NumVecs = 3;
15928 isLoadOp = false;
15929 hasAlignment = false;
15930 break;
15931 case Intrinsic::arm_neon_vst1x4:
15932 NewOpc = ARMISD::VST1x4_UPD;
15933 NumVecs = 4;
15934 isLoadOp = false;
15935 hasAlignment = false;
15936 break;
15937 }
15938 } else {
15939 isLaneOp = true;
15940 switch (N->getOpcode()) {
15941 default:
15942 llvm_unreachable("unexpected opcode for Neon base update");
15943 case ARMISD::VLD1DUP:
15944 NewOpc = ARMISD::VLD1DUP_UPD;
15945 NumVecs = 1;
15946 break;
15947 case ARMISD::VLD2DUP:
15948 NewOpc = ARMISD::VLD2DUP_UPD;
15949 NumVecs = 2;
15950 break;
15951 case ARMISD::VLD3DUP:
15952 NewOpc = ARMISD::VLD3DUP_UPD;
15953 NumVecs = 3;
15954 break;
15955 case ARMISD::VLD4DUP:
15956 NewOpc = ARMISD::VLD4DUP_UPD;
15957 NumVecs = 4;
15958 break;
15959 case ISD::LOAD:
15960 NewOpc = ARMISD::VLD1_UPD;
15961 NumVecs = 1;
15962 isLaneOp = false;
15963 break;
15964 case ISD::STORE:
15965 NewOpc = ARMISD::VST1_UPD;
15966 NumVecs = 1;
15967 isLaneOp = false;
15968 isLoadOp = false;
15969 break;
15970 }
15971 }
15972
15973 // Find the size of memory referenced by the load/store.
15974 EVT VecTy;
15975 if (isLoadOp) {
15976 VecTy = N->getValueType(0);
15977 } else if (Target.isIntrinsic) {
15978 VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
15979 } else {
15980 assert(Target.isStore &&
15981 "Node has to be a load, a store, or an intrinsic!");
15982 VecTy = N->getOperand(1).getValueType();
15983 }
15984
15985 bool isVLDDUPOp =
15986 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
15987 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
15988
15989 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15990 if (isLaneOp || isVLDDUPOp)
15991 NumBytes /= VecTy.getVectorNumElements();
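  // As a rough illustration: a vld2-lane of v2i32 has NumVecs == 2 and
  // VecTy == v2i32, giving 2 * 64 / 8 = 16 bytes, which the per-lane
  // adjustment reduces to 16 / 2 = 8 bytes -- the two 32-bit lanes the
  // instruction actually accesses.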
15992
15993 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
15994 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
15995 // separate instructions that make it harder to use a non-constant update.
15996 return false;
15997 }
15998
15999 if (SimpleConstIncOnly && User.ConstInc != NumBytes)
16000 return false;
16001
16002 if (!isValidBaseUpdate(N, User.N))
16003 return false;
16004
16005 // OK, we found an ADD we can fold into the base update.
16006 // Now, create a _UPD node, taking care of not breaking alignment.
16007
16008 EVT AlignedVecTy = VecTy;
16009 Align Alignment = MemN->getAlign();
16010
16011 // If this is a less-than-standard-aligned load/store, change the type to
16012 // match the standard alignment.
16013 // The alignment is overlooked when selecting _UPD variants; and it's
16014 // easier to introduce bitcasts here than fix that.
16015 // There are 3 ways to get to this base-update combine:
16016 // - intrinsics: they are assumed to be properly aligned (to the standard
16017 // alignment of the memory type), so we don't need to do anything.
16018 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
16019 // intrinsics, so, likewise, there's nothing to do.
16020 // - generic load/store instructions: the alignment is specified as an
16021 // explicit operand, rather than implicitly as the standard alignment
16022 // of the memory type (like the intrinsics). We need to change the
16023 // memory type to match the explicit alignment. That way, we don't
16024 // generate non-standard-aligned ARMISD::VLDx nodes.
16025 if (isa<LSBaseSDNode>(N)) {
16026 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
16027 MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
16028 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
16029 assert(!isLaneOp && "Unexpected generic load/store lane.");
16030 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
16031 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
16032 }
16033 // Don't set an explicit alignment on regular load/stores that we want
16034 // to transform to VLD/VST 1_UPD nodes.
16035 // This matches the behavior of regular load/stores, which only get an
16036 // explicit alignment if the MMO alignment is larger than the standard
16037 // alignment of the memory type.
16038 // Intrinsics, however, always get an explicit alignment, set to the
16039 // alignment of the MMO.
16040 Alignment = Align(1);
16041 }
16042
16043 // Create the new updating load/store node.
16044 // First, create an SDVTList for the new updating node's results.
16045 EVT Tys[6];
16046 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16047 unsigned n;
16048 for (n = 0; n < NumResultVecs; ++n)
16049 Tys[n] = AlignedVecTy;
16050 Tys[n++] = MVT::i32;
16051 Tys[n] = MVT::Other;
16052 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16053
16054 // Then, gather the new node's operands.
16055 SmallVector<SDValue, 8> Ops;
16056 Ops.push_back(N->getOperand(0)); // incoming chain
16057 Ops.push_back(N->getOperand(Target.AddrOpIdx));
16058 Ops.push_back(User.Inc);
16059
16060 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
16061 // Try to match the intrinsic's signature
16062 Ops.push_back(StN->getValue());
16063 } else {
16064 // Loads (and of course intrinsics) match the intrinsics' signature,
16065 // so just add all but the alignment operand.
16066 unsigned LastOperand =
16067 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
16068 for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
16069 Ops.push_back(N->getOperand(i));
16070 }
16071
16072 // For all node types, the alignment operand is always the last one.
16073 Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
16074
16075 // If this is a non-standard-aligned STORE, the penultimate operand is the
16076 // stored value. Bitcast it to the aligned type.
16077 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
16078 SDValue &StVal = Ops[Ops.size() - 2];
16079 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
16080 }
16081
16082 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
16083 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
16084 MemN->getMemOperand());
16085
16086 // Update the uses.
16087 SmallVector<SDValue, 5> NewResults;
16088 for (unsigned i = 0; i < NumResultVecs; ++i)
16089 NewResults.push_back(SDValue(UpdN.getNode(), i));
16090
16091 // If this is a non-standard-aligned LOAD, the first result is the loaded
16092 // value. Bitcast it to the expected result type.
16093 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
16094 SDValue &LdVal = NewResults[0];
16095 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
16096 }
16097
16098 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16099 DCI.CombineTo(N, NewResults);
16100 DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
16101
16102 return true;
16103}
16104
16105// If (opcode ptr inc) is an ADD-like instruction, return the
16106// increment value. Otherwise return 0.
16107static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
16108 SDValue Inc, const SelectionDAG &DAG) {
16109 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc);
16110 if (!CInc)
16111 return 0;
16112
16113 switch (Opcode) {
16114 case ARMISD::VLD1_UPD:
16115 case ISD::ADD:
16116 return CInc->getZExtValue();
16117 case ISD::OR: {
16118 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
16119 // (OR ptr inc) is the same as (ADD ptr inc)
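      // (for example, if Ptr is known to be at least 16-byte aligned, an
      // (or Ptr, 8) shares no set bits with 8 and behaves exactly like
      // (add Ptr, 8), so 8 can be treated as the increment).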
16120 return CInc->getZExtValue();
16121 }
16122 return 0;
16123 }
16124 default:
16125 return 0;
16126 }
16127}
16128
16129static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
16130 switch (N->getOpcode()) {
16131 case ISD::ADD:
16132 case ISD::OR: {
16133 if (isa<ConstantSDNode>(N->getOperand(1))) {
16134 *Ptr = N->getOperand(0);
16135 *CInc = N->getOperand(1);
16136 return true;
16137 }
16138 return false;
16139 }
16140 case ARMISD::VLD1_UPD: {
16141 if (isa<ConstantSDNode>(N->getOperand(2))) {
16142 *Ptr = N->getOperand(1);
16143 *CInc = N->getOperand(2);
16144 return true;
16145 }
16146 return false;
16147 }
16148 default:
16149 return false;
16150 }
16151}
16152
16153/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
16154/// NEON load/store intrinsics, and generic vector load/stores, to merge
16155/// base address updates.
16156/// For generic load/stores, the memory type is assumed to be a vector.
16157/// The caller is assumed to have checked legality.
16158static SDValue CombineBaseUpdate(SDNode *N,
16159 TargetLowering::DAGCombinerInfo &DCI) {
16160 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
16161 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
16162 const bool isStore = N->getOpcode() == ISD::STORE;
16163 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
16164 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
16165
16166 // Limit the number of possible base-updates we look at to prevent degenerate
16167 // cases.
16168 unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;
16169
16170 SDValue Addr = N->getOperand(AddrOpIdx);
16171
16172 SmallVector<BaseUpdateUser, 8> BaseUpdates;
16173
16174 // Search for a use of the address operand that is an increment.
16175 for (SDUse &Use : Addr->uses()) {
16176 SDNode *User = Use.getUser();
16177 if (Use.getResNo() != Addr.getResNo() || User->getNumOperands() != 2)
16178 continue;
16179
16180 SDValue Inc = User->getOperand(Use.getOperandNo() == 1 ? 0 : 1);
16181 unsigned ConstInc =
16182 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
16183
16184 if (ConstInc || User->getOpcode() == ISD::ADD) {
16185 BaseUpdates.push_back({User, Inc, ConstInc});
16186 if (BaseUpdates.size() >= MaxBaseUpdates)
16187 break;
16188 }
16189 }
16190
16191 // If the address is a constant pointer increment itself, find
16192 // another constant increment that has the same base operand
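  // For example, if Addr is (add Base, 16) and Base is also used by
  // (add Base, 32), the latter acts as an increment of 32 - 16 = 16 relative
  // to Addr, so it is recorded below with that adjusted constant.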
16193 SDValue Base;
16194 SDValue CInc;
16195 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
16196 unsigned Offset =
16197 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
16198 for (SDUse &Use : Base->uses()) {
16199
16200 SDNode *User = Use.getUser();
16201 if (Use.getResNo() != Base.getResNo() || User == Addr.getNode() ||
16202 User->getNumOperands() != 2)
16203 continue;
16204
16205 SDValue UserInc = User->getOperand(Use.getOperandNo() == 0 ? 1 : 0);
16206 unsigned UserOffset =
16207 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
16208
16209 if (!UserOffset || UserOffset <= Offset)
16210 continue;
16211
16212 unsigned NewConstInc = UserOffset - Offset;
16213 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16214 BaseUpdates.push_back({User, NewInc, NewConstInc});
16215 if (BaseUpdates.size() >= MaxBaseUpdates)
16216 break;
16217 }
16218 }
16219
16220 // Try to fold the load/store with an update that matches memory
16221 // access size. This should work well for sequential loads.
16222 unsigned NumValidUpd = BaseUpdates.size();
16223 for (unsigned I = 0; I < NumValidUpd; I++) {
16224 BaseUpdateUser &User = BaseUpdates[I];
16225 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16226 return SDValue();
16227 }
16228
16229 // Try to fold with other users. Non-constant updates are considered
16230 // first, and constant updates are sorted to not break a sequence of
16231 // strided accesses (if there is any).
16232 llvm::stable_sort(BaseUpdates,
16233 [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16234 return LHS.ConstInc < RHS.ConstInc;
16235 });
16236 for (BaseUpdateUser &User : BaseUpdates) {
16237 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16238 return SDValue();
16239 }
16240 return SDValue();
16241}
16242
16243static SDValue PerformVLDCombine(SDNode *N,
16244 TargetLowering::DAGCombinerInfo &DCI) {
16245 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16246 return SDValue();
16247
16248 return CombineBaseUpdate(N, DCI);
16249}
16250
16251static SDValue PerformMVEVLDCombine(SDNode *N,
16252 TargetLowering::DAGCombinerInfo &DCI) {
16253 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16254 return SDValue();
16255
16256 SelectionDAG &DAG = DCI.DAG;
16257 SDValue Addr = N->getOperand(2);
16258 MemSDNode *MemN = cast<MemSDNode>(N);
16259 SDLoc dl(N);
16260
16261 // For stores where there are multiple intrinsics, we only actually want
16262 // to post-inc the last of them.
16263 unsigned IntNo = N->getConstantOperandVal(1);
16264 if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16265 return SDValue();
16266 if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16267 return SDValue();
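  // (The operand tested above is the stage index of the multi-part MVE store
  // intrinsic: a vst2q is split into two calls with indices 0 and 1, and a
  // vst4q into four calls with indices 0..3, so only the final stage is a
  // candidate for post-incrementing the base.)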
16268
16269 // Search for a use of the address operand that is an increment.
16270 for (SDUse &Use : Addr->uses()) {
16271 SDNode *User = Use.getUser();
16272 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
16273 continue;
16274
16275 // Check that the add is independent of the load/store. Otherwise, folding
16276 // it would create a cycle. We can avoid searching through Addr as it's a
16277 // predecessor to both.
16278 SmallPtrSet<const SDNode *, 32> Visited;
16279 SmallVector<const SDNode *, 16> Worklist;
16280 Visited.insert(Addr.getNode());
16281 Worklist.push_back(N);
16282 Worklist.push_back(User);
16283 const unsigned MaxSteps = 1024;
16284 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
16285 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
16286 continue;
16287
16288 // Find the new opcode for the updating load/store.
16289 bool isLoadOp = true;
16290 unsigned NewOpc = 0;
16291 unsigned NumVecs = 0;
16292 switch (IntNo) {
16293 default:
16294 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16295 case Intrinsic::arm_mve_vld2q:
16296 NewOpc = ARMISD::VLD2_UPD;
16297 NumVecs = 2;
16298 break;
16299 case Intrinsic::arm_mve_vld4q:
16300 NewOpc = ARMISD::VLD4_UPD;
16301 NumVecs = 4;
16302 break;
16303 case Intrinsic::arm_mve_vst2q:
16304 NewOpc = ARMISD::VST2_UPD;
16305 NumVecs = 2;
16306 isLoadOp = false;
16307 break;
16308 case Intrinsic::arm_mve_vst4q:
16309 NewOpc = ARMISD::VST4_UPD;
16310 NumVecs = 4;
16311 isLoadOp = false;
16312 break;
16313 }
16314
16315 // Find the size of memory referenced by the load/store.
16316 EVT VecTy;
16317 if (isLoadOp) {
16318 VecTy = N->getValueType(0);
16319 } else {
16320 VecTy = N->getOperand(3).getValueType();
16321 }
16322
16323 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16324
16325 // If the increment is a constant, it must match the memory ref size.
16326 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
16327 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc);
16328 if (!CInc || CInc->getZExtValue() != NumBytes)
16329 continue;
16330
16331 // Create the new updating load/store node.
16332 // First, create an SDVTList for the new updating node's results.
16333 EVT Tys[6];
16334 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16335 unsigned n;
16336 for (n = 0; n < NumResultVecs; ++n)
16337 Tys[n] = VecTy;
16338 Tys[n++] = MVT::i32;
16339 Tys[n] = MVT::Other;
16340 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16341
16342 // Then, gather the new node's operands.
16343 SmallVector<SDValue, 8> Ops;
16344 Ops.push_back(N->getOperand(0)); // incoming chain
16345 Ops.push_back(N->getOperand(2)); // ptr
16346 Ops.push_back(Inc);
16347
16348 for (unsigned i = 3; i < N->getNumOperands(); ++i)
16349 Ops.push_back(N->getOperand(i));
16350
16351 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
16352 MemN->getMemOperand());
16353
16354 // Update the uses.
16355 SmallVector<SDValue, 5> NewResults;
16356 for (unsigned i = 0; i < NumResultVecs; ++i)
16357 NewResults.push_back(SDValue(UpdN.getNode(), i));
16358
16359 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16360 DCI.CombineTo(N, NewResults);
16361 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
16362
16363 break;
16364 }
16365
16366 return SDValue();
16367}
16368
16369/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16370/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16371/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16372/// return true.
16373static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
16374 SelectionDAG &DAG = DCI.DAG;
16375 EVT VT = N->getValueType(0);
16376 // vldN-dup instructions only support 64-bit vectors for N > 1.
16377 if (!VT.is64BitVector())
16378 return false;
16379
16380 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
16381 SDNode *VLD = N->getOperand(0).getNode();
16382 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16383 return false;
16384 unsigned NumVecs = 0;
16385 unsigned NewOpc = 0;
16386 unsigned IntNo = VLD->getConstantOperandVal(1);
16387 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16388 NumVecs = 2;
16389 NewOpc = ARMISD::VLD2DUP;
16390 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16391 NumVecs = 3;
16392 NewOpc = ARMISD::VLD3DUP;
16393 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16394 NumVecs = 4;
16395 NewOpc = ARMISD::VLD4DUP;
16396 } else {
16397 return false;
16398 }
16399
16400 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16401 // numbers match the load.
16402 unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16403 for (SDUse &Use : VLD->uses()) {
16404 // Ignore uses of the chain result.
16405 if (Use.getResNo() == NumVecs)
16406 continue;
16407 SDNode *User = Use.getUser();
16408 if (User->getOpcode() != ARMISD::VDUPLANE ||
16409 VLDLaneNo != User->getConstantOperandVal(1))
16410 return false;
16411 }
16412
16413 // Create the vldN-dup node.
16414 EVT Tys[5];
16415 unsigned n;
16416 for (n = 0; n < NumVecs; ++n)
16417 Tys[n] = VT;
16418 Tys[n] = MVT::Other;
16419 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
16420 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
16421 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
16422 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16423 Ops, VLDMemInt->getMemoryVT(),
16424 VLDMemInt->getMemOperand());
16425
16426 // Update the uses.
16427 for (SDUse &Use : VLD->uses()) {
16428 unsigned ResNo = Use.getResNo();
16429 // Ignore uses of the chain result.
16430 if (ResNo == NumVecs)
16431 continue;
16432 DCI.CombineTo(Use.getUser(), SDValue(VLDDup.getNode(), ResNo));
16433 }
16434
16435 // Now the vldN-lane intrinsic is dead except for its chain result.
16436 // Update uses of the chain.
16437 std::vector<SDValue> VLDDupResults;
16438 for (unsigned n = 0; n < NumVecs; ++n)
16439 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16440 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16441 DCI.CombineTo(VLD, VLDDupResults);
16442
16443 return true;
16444}
16445
16446/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16447/// ARMISD::VDUPLANE.
16448static SDValue PerformVDUPLANECombine(SDNode *N,
16449 TargetLowering::DAGCombinerInfo &DCI,
16450 const ARMSubtarget *Subtarget) {
16451 SDValue Op = N->getOperand(0);
16452 EVT VT = N->getValueType(0);
16453
16454 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16455 if (Subtarget->hasMVEIntegerOps()) {
16456 EVT ExtractVT = VT.getVectorElementType();
16457 // We need to ensure we are creating a legal type.
16458 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16459 ExtractVT = MVT::i32;
16460 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
16461 N->getOperand(0), N->getOperand(1));
16462 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
16463 }
16464
16465 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16466 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
16467 if (CombineVLDDUP(N, DCI))
16468 return SDValue(N, 0);
16469
16470 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16471 // redundant. Ignore bit_converts for now; element sizes are checked below.
16472 while (Op.getOpcode() == ISD::BITCAST)
16473 Op = Op.getOperand(0);
16474 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16475 return SDValue();
16476
16477 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16478 unsigned EltSize = Op.getScalarValueSizeInBits();
16479 // The canonical VMOV for a zero vector uses a 32-bit element size.
16480 unsigned Imm = Op.getConstantOperandVal(0);
16481 unsigned EltBits;
16482 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
16483 EltSize = 8;
16484 if (EltSize > VT.getScalarSizeInBits())
16485 return SDValue();
16486
16487 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
16488}
16489
16490/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
16491static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
16492 const ARMSubtarget *Subtarget) {
16493 SDValue Op = N->getOperand(0);
16494 SDLoc dl(N);
16495
16496 if (Subtarget->hasMVEIntegerOps()) {
16497 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16498 // need to come from a GPR.
16499 if (Op.getValueType() == MVT::f32)
16500 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16501 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
16502 else if (Op.getValueType() == MVT::f16)
16503 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16504 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16505 }
16506
16507 if (!Subtarget->hasNEON())
16508 return SDValue();
16509
16510 // Match VDUP(LOAD) -> VLD1DUP.
16511 // We match this pattern here rather than waiting for isel because the
16512 // transform is only legal for unindexed loads.
16513 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
16514 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16515 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
16516 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16517 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16518 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16519 SDValue VLDDup =
16520 DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
16521 LD->getMemoryVT(), LD->getMemOperand());
16522 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
16523 return VLDDup;
16524 }
16525
16526 return SDValue();
16527}
16528
16529static SDValue PerformLOADCombine(SDNode *N,
16530 TargetLowering::DAGCombinerInfo &DCI,
16531 const ARMSubtarget *Subtarget) {
16532 EVT VT = N->getValueType(0);
16533
16534 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16535 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16536 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16537 return CombineBaseUpdate(N, DCI);
16538
16539 return SDValue();
16540}
16541
16542// Optimize trunc store (of multiple scalars) to shuffle and store. First,
16543// pack all of the elements in one place. Next, store to memory in fewer
16544// chunks.
16545static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
16546 SelectionDAG &DAG) {
16547 SDValue StVal = St->getValue();
16548 EVT VT = StVal.getValueType();
16549 if (!St->isTruncatingStore() || !VT.isVector())
16550 return SDValue();
16551 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16552 EVT StVT = St->getMemoryVT();
16553 unsigned NumElems = VT.getVectorNumElements();
16554 assert(StVT != VT && "Cannot truncate to the same type");
16555 unsigned FromEltSz = VT.getScalarSizeInBits();
16556 unsigned ToEltSz = StVT.getScalarSizeInBits();
16557
16558 // From, To sizes and ElemCount must be pow of two
16559 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
16560 return SDValue();
16561
16562 // We are going to use the original vector elt for storing.
16563 // Accumulated smaller vector elements must be a multiple of the store size.
16564 if (0 != (NumElems * FromEltSz) % ToEltSz)
16565 return SDValue();
16566
16567 unsigned SizeRatio = FromEltSz / ToEltSz;
16568 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16569
16570 // Create a type on which we perform the shuffle.
16571 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
16572 NumElems * SizeRatio);
16573 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16574
16575 SDLoc DL(St);
16576 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
16577 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16578 for (unsigned i = 0; i < NumElems; ++i)
16579 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16580 : i * SizeRatio;
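  // For example (little-endian), truncating v4i32 to v4i16 views the source
  // as v8i16 and picks elements {0, 2, 4, 6}, so the low halves of the wide
  // elements end up packed at the bottom of the register.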
16581
16582 // Can't shuffle using an illegal type.
16583 if (!TLI.isTypeLegal(WideVecVT))
16584 return SDValue();
16585
16586 SDValue Shuff = DAG.getVectorShuffle(
16587 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
16588 // At this point all of the data is stored at the bottom of the
16589 // register. We now need to save it to mem.
16590
16591 // Find the largest store unit
16592 MVT StoreType = MVT::i8;
16593 for (MVT Tp : MVT::integer_valuetypes()) {
16594 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16595 StoreType = Tp;
16596 }
16597 // Didn't find a legal store type.
16598 if (!TLI.isTypeLegal(StoreType))
16599 return SDValue();
16600
16601 // Bitcast the original vector into a vector of store-size units
16602 EVT StoreVecVT =
16603 EVT::getVectorVT(*DAG.getContext(), StoreType,
16604 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16605 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16606 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
16607 SmallVector<SDValue, 8> Chains;
16608 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
16609 TLI.getPointerTy(DAG.getDataLayout()));
16610 SDValue BasePtr = St->getBasePtr();
16611
16612 // Perform one or more big stores into memory.
16613 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16614 for (unsigned I = 0; I < E; I++) {
16615 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
16616 ShuffWide, DAG.getIntPtrConstant(I, DL));
16617 SDValue Ch =
16618 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
16619 St->getAlign(), St->getMemOperand()->getFlags());
16620 BasePtr =
16621 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
16622 Chains.push_back(Ch);
16623 }
16624 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16625}
16626
16627// Try taking a single vector store from an fpround (which would otherwise turn
16628// into an expensive buildvector) and splitting it into a series of narrowing
16629// stores.
16630static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
16631 SelectionDAG &DAG) {
16632 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16633 return SDValue();
16634 SDValue Trunc = St->getValue();
16635 if (Trunc->getOpcode() != ISD::FP_ROUND)
16636 return SDValue();
16637 EVT FromVT = Trunc->getOperand(0).getValueType();
16638 EVT ToVT = Trunc.getValueType();
16639 if (!ToVT.isVector())
16640 return SDValue();
16642 EVT ToEltVT = ToVT.getVectorElementType();
16643 EVT FromEltVT = FromVT.getVectorElementType();
16644
16645 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16646 return SDValue();
16647
16648 unsigned NumElements = 4;
16649 if (FromVT.getVectorNumElements() % NumElements != 0)
16650 return SDValue();
16651
16652 // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
16653 // use the VMOVN over splitting the store. We are looking for patterns of:
16654 // !rev: 0 N 1 N+1 2 N+2 ...
16655 // rev: N 0 N+1 1 N+2 2 ...
16656 // The shuffle may either be a single source (in which case N = NumElts/2) or
16657 // two inputs extended with concat to the same size (in which case N =
16658 // NumElts).
16659 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16660 ArrayRef<int> M = SVN->getMask();
16661 unsigned NumElts = ToVT.getVectorNumElements();
16662 if (SVN->getOperand(1).isUndef())
16663 NumElts /= 2;
16664
16665 unsigned Off0 = Rev ? NumElts : 0;
16666 unsigned Off1 = Rev ? 0 : NumElts;
16667
16668 for (unsigned I = 0; I < NumElts; I += 2) {
16669 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16670 return false;
16671 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16672 return false;
16673 }
16674
16675 return true;
16676 };
16677
16678 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
16679 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16680 return SDValue();
16681
16682 LLVMContext &C = *DAG.getContext();
16683 SDLoc DL(St);
16684 // Details about the old store
16685 SDValue Ch = St->getChain();
16686 SDValue BasePtr = St->getBasePtr();
16687 Align Alignment = St->getBaseAlign();
16688 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16689 AAMDNodes AAInfo = St->getAAInfo();
16690
16691 // We split the store into slices of NumElements. fp16 trunc stores are vcvt
16692 // and then stored as truncating integer stores.
16693 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
16694 EVT NewToVT = EVT::getVectorVT(
16695 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
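  // For instance, a v8f32 -> v8f16 fp_round feeding this store is handled as
  // two v4f32 slices: each is narrowed with VCVTN, reinterpreted as v4i32,
  // and written out as a v4i32 -> v4i16 truncating integer store.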
16696
16697 SmallVector<SDValue, 4> Stores;
16698 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16699 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16700 SDValue NewPtr =
16701 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16702
16703 SDValue Extract =
16704 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16705 DAG.getConstant(i * NumElements, DL, MVT::i32));
16706
16707 SDValue FPTrunc =
16708 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16709 Extract, DAG.getConstant(0, DL, MVT::i32));
16710 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16711
16712 SDValue Store = DAG.getTruncStore(
16713 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16714 NewToVT, Alignment, MMOFlags, AAInfo);
16715 Stores.push_back(Store);
16716 }
16717 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16718}
16719
16720// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16721// into an expensive buildvector) and splitting it into a series of narrowing
16722// stores.
16723static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
16724 SelectionDAG &DAG) {
16725 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16726 return SDValue();
16727 SDValue Trunc = St->getValue();
16728 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16729 return SDValue();
16730 EVT FromVT = Trunc->getOperand(0).getValueType();
16731 EVT ToVT = Trunc.getValueType();
16732
16733 LLVMContext &C = *DAG.getContext();
16734 SDLoc DL(St);
16735 // Details about the old store
16736 SDValue Ch = St->getChain();
16737 SDValue BasePtr = St->getBasePtr();
16738 Align Alignment = St->getBaseAlign();
16739 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16740 AAMDNodes AAInfo = St->getAAInfo();
16741
16742 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
16743 FromVT.getVectorNumElements());
16744
16745 SmallVector<SDValue, 4> Stores;
16746 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
16747 unsigned NewOffset =
16748 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16749 SDValue NewPtr =
16750 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16751
16752 SDValue Extract = Trunc.getOperand(i);
16753 SDValue Store = DAG.getTruncStore(
16754 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16755 NewToVT, Alignment, MMOFlags, AAInfo);
16756 Stores.push_back(Store);
16757 }
16758 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16759}
16760
16761// Given a floating point store from an extracted vector, with an integer
16762// VGETLANE that already exists, store the existing VGETLANEu directly. This can
16763// help reduce fp register pressure, doesn't require the fp extract and allows
16764// use of more integer post-inc stores not available with vstr.
16765static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
16766 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16767 return SDValue();
16768 SDValue Extract = St->getValue();
16769 EVT VT = Extract.getValueType();
16770 // For now only uses f16. This may be useful for f32 too, but that will
16771 // be bitcast(extract), not the VGETLANEu we currently check here.
16772 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16773 return SDValue();
16774
16775 SDNode *GetLane =
16776 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16777 {Extract.getOperand(0), Extract.getOperand(1)});
16778 if (!GetLane)
16779 return SDValue();
16780
16781 LLVMContext &C = *DAG.getContext();
16782 SDLoc DL(St);
16783 // Create a new integer store to replace the existing floating point version.
16784 SDValue Ch = St->getChain();
16785 SDValue BasePtr = St->getBasePtr();
16786 Align Alignment = St->getBaseAlign();
16787 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16788 AAMDNodes AAInfo = St->getAAInfo();
16789 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
16790 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
16791 St->getPointerInfo(), NewToVT, Alignment,
16792 MMOFlags, AAInfo);
16793
16794 return Store;
16795}
16796
16797/// PerformSTORECombine - Target-specific dag combine xforms for
16798/// ISD::STORE.
16799static SDValue PerformSTORECombine(SDNode *N,
16800 TargetLowering::DAGCombinerInfo &DCI,
16801 const ARMSubtarget *Subtarget) {
16802 StoreSDNode *St = cast<StoreSDNode>(N);
16803 if (St->isVolatile())
16804 return SDValue();
16805 SDValue StVal = St->getValue();
16806 EVT VT = StVal.getValueType();
16807
16808 if (Subtarget->hasNEON())
16809 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16810 return Store;
16811
16812 if (Subtarget->hasMVEFloatOps())
16813 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16814 return NewToken;
16815
16816 if (Subtarget->hasMVEIntegerOps()) {
16817 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16818 return NewChain;
16819 if (SDValue NewToken =
16820 PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))
16821 return NewToken;
16822 }
16823
16824 if (!ISD::isNormalStore(St))
16825 return SDValue();
16826
16827 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16828 // ARM stores of arguments in the same cache line.
16829 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16830 StVal.getNode()->hasOneUse()) {
16831 SelectionDAG &DAG = DCI.DAG;
16832 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16833 SDLoc DL(St);
16834 SDValue BasePtr = St->getBasePtr();
16835 SDValue NewST1 = DAG.getStore(
16836 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
16837 BasePtr, St->getPointerInfo(), St->getBaseAlign(),
16838 St->getMemOperand()->getFlags());
16839
16840 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16841 DAG.getConstant(4, DL, MVT::i32));
16842 return DAG.getStore(NewST1.getValue(0), DL,
16843 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
16844 OffsetPtr, St->getPointerInfo().getWithOffset(4),
16845 St->getBaseAlign(), St->getMemOperand()->getFlags());
16846 }
16847
16848 if (StVal.getValueType() == MVT::i64 &&
16850 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
16851 // Bitcast an i64 store extracted from a vector to f64.
16852 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16853 SelectionDAG &DAG = DCI.DAG;
16854 SDLoc dl(StVal);
16855 SDValue IntVec = StVal.getOperand(0);
16856 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16857 IntVec.getValueType().getVectorNumElements());
16858 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16859 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16860 Vec, StVal.getOperand(1));
16861 dl = SDLoc(N);
16862 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16863 // Make the DAGCombiner fold the bitcasts.
16864 DCI.AddToWorklist(Vec.getNode());
16865 DCI.AddToWorklist(ExtElt.getNode());
16866 DCI.AddToWorklist(V.getNode());
16867 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16868 St->getPointerInfo(), St->getAlign(),
16869 St->getMemOperand()->getFlags(), St->getAAInfo());
16870 }
16871
16872 // If this is a legal vector store, try to combine it into a VST1_UPD.
16873 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16874 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16875 return CombineBaseUpdate(N, DCI);
16876
16877 return SDValue();
16878}
16879
16880/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16881/// can replace combinations of VMUL and VCVT (floating-point to integer)
16882/// when the VMUL has a constant operand that is a power of 2.
16883///
16884/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16885/// vmul.f32 d16, d17, d16
16886/// vcvt.s32.f32 d16, d16
16887/// becomes:
16888/// vcvt.s32.f32 d16, d16, #3
16889static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
16890 const ARMSubtarget *Subtarget) {
16891 if (!Subtarget->hasNEON())
16892 return SDValue();
16893
16894 SDValue Op = N->getOperand(0);
16895 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16896 Op.getOpcode() != ISD::FMUL)
16897 return SDValue();
16898
16899 SDValue ConstVec = Op->getOperand(1);
16900 if (!isa<BuildVectorSDNode>(ConstVec))
16901 return SDValue();
16902
16903 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16904 uint32_t FloatBits = FloatTy.getSizeInBits();
16905 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
16906 uint32_t IntBits = IntTy.getSizeInBits();
16907 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16908 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16909 // These instructions only exist converting from f32 to i32. We can handle
16910 // smaller integers by generating an extra truncate, but larger ones would
16911 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16912 // these instructions only support v2i32/v4i32 types.
16913 return SDValue();
16914 }
16915
16916 BitVector UndefElements;
16917 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
16918 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
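  // C is the base-2 log of the splatted multiplier and becomes the #fbits
  // immediate of the fixed-point VCVT, which only encodes 1..32 fractional
  // bits; anything outside that range is rejected below.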
16919 if (C == -1 || C == 0 || C > 32)
16920 return SDValue();
16921
16922 SDLoc dl(N);
16923 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16924 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16925 Intrinsic::arm_neon_vcvtfp2fxu;
16926 SDValue FixConv = DAG.getNode(
16927 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16928 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16929 DAG.getConstant(C, dl, MVT::i32));
16930
16931 if (IntBits < FloatBits)
16932 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
16933
16934 return FixConv;
16935}
16936
16937static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG,
16938 const ARMSubtarget *Subtarget) {
16939 if (!Subtarget->hasMVEFloatOps())
16940 return SDValue();
16941
16942 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
16943 // The second form can be more easily turned into a predicated vadd, and
16944 // possibly combined into a fma to become a predicated vfma.
16945 SDValue Op0 = N->getOperand(0);
16946 SDValue Op1 = N->getOperand(1);
16947 EVT VT = N->getValueType(0);
16948 SDLoc DL(N);
16949
16950 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
16951 // which these VMOV's represent.
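  // (1664 and 2688 below are the modified-immediate encodings of a splat of
  // 0x80000000 and 0x8000 respectively, i.e. -0.0 in f32 and f16 lanes, while
  // an immediate of 0 encodes a splat of +0.0.)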
16952 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
16953 if (Op.getOpcode() != ISD::BITCAST ||
16954 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
16955 return false;
16956 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
16957 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
16958 return true;
16959 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
16960 return true;
16961 return false;
16962 };
16963
16964 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
16965 std::swap(Op0, Op1);
16966
16967 if (Op1.getOpcode() != ISD::VSELECT)
16968 return SDValue();
16969
16970 SDNodeFlags FaddFlags = N->getFlags();
16971 bool NSZ = FaddFlags.hasNoSignedZeros();
16972 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
16973 return SDValue();
16974
16975 SDValue FAdd =
16976 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
16977 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
16978}
16979
16980static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG) {
16981 SDValue LHS = N->getOperand(0);
16982 SDValue RHS = N->getOperand(1);
16983 EVT VT = N->getValueType(0);
16984 SDLoc DL(N);
16985
16986 if (!N->getFlags().hasAllowReassociation())
16987 return SDValue();
16988
16989 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
16990 auto ReassocComplex = [&](SDValue A, SDValue B) {
16991 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
16992 return SDValue();
16993 unsigned Opc = A.getConstantOperandVal(0);
16994 if (Opc != Intrinsic::arm_mve_vcmlaq)
16995 return SDValue();
16996 SDValue VCMLA = DAG.getNode(
16997 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
16998 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
16999 A.getOperand(3), A.getOperand(4));
17000 VCMLA->setFlags(A->getFlags());
17001 return VCMLA;
17002 };
17003 if (SDValue R = ReassocComplex(LHS, RHS))
17004 return R;
17005 if (SDValue R = ReassocComplex(RHS, LHS))
17006 return R;
17007
17008 return SDValue();
17009}
17010
17011static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
17012 const ARMSubtarget *Subtarget) {
17013 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
17014 return S;
17015 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
17016 return S;
17017 return SDValue();
17018}
17019
17020/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
17021/// can replace combinations of VCVT (integer to floating-point) and VMUL
17022/// when the VMUL has a constant operand that is a power of 2.
17023///
17024/// Example (assume d17 = <float 0.125, float 0.125>):
17025/// vcvt.f32.s32 d16, d16
17026/// vmul.f32 d16, d16, d17
17027/// becomes:
17028/// vcvt.f32.s32 d16, d16, #3
17029static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG,
17030 const ARMSubtarget *Subtarget) {
17031 if (!Subtarget->hasNEON())
17032 return SDValue();
17033
17034 SDValue Op = N->getOperand(0);
17035 unsigned OpOpcode = Op.getNode()->getOpcode();
17036 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
17037 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
17038 return SDValue();
17039
17040 SDValue ConstVec = N->getOperand(1);
17041 if (!isa<BuildVectorSDNode>(ConstVec))
17042 return SDValue();
17043
17044 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17045 uint32_t FloatBits = FloatTy.getSizeInBits();
17046 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17047 uint32_t IntBits = IntTy.getSizeInBits();
17048 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17049 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
17050 // These instructions only exist converting from i32 to f32. We can handle
17051 // smaller integers by generating an extra extend, but larger ones would
17052 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
17053 // these instructions only support v2i32/v4i32 types.
17054 return SDValue();
17055 }
17056
17057 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
17058 APFloat Recip(0.0f);
17059 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
17060 return SDValue();
17061
17062 bool IsExact;
17063 APSInt IntVal(33);
17064 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
17065 APFloat::opOK ||
17066 !IsExact)
17067 return SDValue();
17068
17069 int32_t C = IntVal.exactLogBase2();
17070 if (C == -1 || C == 0 || C > 32)
17071 return SDValue();
17072
17073 SDLoc DL(N);
17074 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
17075 SDValue ConvInput = Op.getOperand(0);
17076 if (IntBits < FloatBits)
17077 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17078 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
17079
17080 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
17081 : Intrinsic::arm_neon_vcvtfxu2fp;
17082 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17083 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17084 DAG.getConstant(C, DL, MVT::i32));
17085}
17086
17087static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
17088 const ARMSubtarget *ST) {
17089 if (!ST->hasMVEIntegerOps())
17090 return SDValue();
17091
17092 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
17093 EVT ResVT = N->getValueType(0);
17094 SDValue N0 = N->getOperand(0);
17095 SDLoc dl(N);
17096
17097 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
17098 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
17099 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
17100 N0.getValueType() == MVT::v16i8)) {
17101 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
17102 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
17103 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
17104 }
17105
17106 // We are looking for something that will have illegal types if left alone,
17107 // but that we can convert to a single instruction under MVE. For example
17108 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
17109 // or
17110 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
17111
17112 // The legal cases are:
17113 // VADDV u/s 8/16/32
17114 // VMLAV u/s 8/16/32
17115 // VADDLV u/s 32
17116 // VMLALV u/s 16/32
17117
17118 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
17119 // extend it and use v4i32 instead.
17120 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
17121 EVT AVT = A.getValueType();
17122 return any_of(ExtTypes, [&](MVT Ty) {
17123 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
17124 AVT.bitsLE(Ty);
17125 });
17126 };
17127 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
17128 EVT AVT = A.getValueType();
17129 if (!AVT.is128BitVector())
17130 A = DAG.getNode(ExtendCode, dl,
17131 AVT.changeVectorElementType(MVT::getIntegerVT(
17132 128 / AVT.getVectorMinNumElements())),
17133 A);
17134 return A;
17135 };
17136 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
17137 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
17138 return SDValue();
17139 SDValue A = N0->getOperand(0);
17140 if (ExtTypeMatches(A, ExtTypes))
17141 return ExtendIfNeeded(A, ExtendCode);
17142 return SDValue();
17143 };
17144 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
17145 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
17146 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17147 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17148 return SDValue();
17149 Mask = N0->getOperand(0);
17150 SDValue Ext = N0->getOperand(1);
17151 if (Ext->getOpcode() != ExtendCode)
17152 return SDValue();
17153 SDValue A = Ext->getOperand(0);
17154 if (ExtTypeMatches(A, ExtTypes))
17155 return ExtendIfNeeded(A, ExtendCode);
17156 return SDValue();
17157 };
17158 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17159 SDValue &A, SDValue &B) {
17160 // For a vmla we are trying to match a larger pattern:
17161 // ExtA = sext/zext A
17162 // ExtB = sext/zext B
17163 // Mul = mul ExtA, ExtB
17164 // vecreduce.add Mul
17165 // There might also be an extra extend between the mul and the addreduce, so
17166 // long as the bitwidth is high enough to make them equivalent (for example
17167 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
17168 if (ResVT != RetTy)
17169 return false;
17170 SDValue Mul = N0;
17171 if (Mul->getOpcode() == ExtendCode &&
17172 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17173 ResVT.getScalarSizeInBits())
17174 Mul = Mul->getOperand(0);
17175 if (Mul->getOpcode() != ISD::MUL)
17176 return false;
17177 SDValue ExtA = Mul->getOperand(0);
17178 SDValue ExtB = Mul->getOperand(1);
17179 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17180 return false;
17181 A = ExtA->getOperand(0);
17182 B = ExtB->getOperand(0);
17183 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17184 A = ExtendIfNeeded(A, ExtendCode);
17185 B = ExtendIfNeeded(B, ExtendCode);
17186 return true;
17187 }
17188 return false;
17189 };
17190 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17191 SDValue &A, SDValue &B, SDValue &Mask) {
17192 // Same as the pattern above with a select for the zero predicated lanes
17193 // ExtA = sext/zext A
17194 // ExtB = sext/zext B
17195 // Mul = mul ExtA, ExtB
17196 // N0 = select Mask, Mul, 0
17197 // vecreduce.add N0
17198 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17199 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17200 return false;
17201 Mask = N0->getOperand(0);
17202 SDValue Mul = N0->getOperand(1);
17203 if (Mul->getOpcode() == ExtendCode &&
17204 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17205 ResVT.getScalarSizeInBits())
17206 Mul = Mul->getOperand(0);
17207 if (Mul->getOpcode() != ISD::MUL)
17208 return false;
17209 SDValue ExtA = Mul->getOperand(0);
17210 SDValue ExtB = Mul->getOperand(1);
17211 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17212 return false;
17213 A = ExtA->getOperand(0);
17214 B = ExtB->getOperand(0);
17215 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17216 A = ExtendIfNeeded(A, ExtendCode);
17217 B = ExtendIfNeeded(B, ExtendCode);
17218 return true;
17219 }
17220 return false;
17221 };
17222 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17223 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17224 // reductions. The operands are extended with MVEEXT, but as they are
17225 // reductions the lane orders do not matter. MVEEXT may be combined with
17226 // loads to produce two extending loads, or else they will be expanded to
17227 // VREV/VMOVL.
17228 EVT VT = Ops[0].getValueType();
17229 if (VT == MVT::v16i8) {
17230 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17231 "Unexpected illegal long reduction opcode");
17232 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17233
17234 SDValue Ext0 =
17235 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17236 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17237 SDValue Ext1 =
17238 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17239 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17240
17241 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17242 Ext0, Ext1);
17243 SDValue MLA1 =
17244 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17245 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17246 Ext0.getValue(1), Ext1.getValue(1));
17247 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17248 }
17249 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
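    // The long reductions produce the 64-bit result as two i32 halves (low in
    // result 0, high in result 1); BUILD_PAIR combines them back into an i64.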
17250 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17251 SDValue(Node.getNode(), 1));
17252 };
17253
17254 SDValue A, B;
17255 SDValue Mask;
17256 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17257 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17258 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17259 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17260 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17261 A, B))
17262 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17263 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17264 A, B))
17265 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17266 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17267 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17268 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17269 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17270 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17271 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17272
17273 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17274 Mask))
17275 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17276 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17277 Mask))
17278 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17279 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17280 Mask))
17281 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17282 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17283 Mask))
17284 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17285 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17286 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17287 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17288 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17289 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17290 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17291
17292 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17293 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17294 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17295 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17296 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17297 return Create64bitNode(ARMISD::VADDLVs, {A});
17298 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17299 return Create64bitNode(ARMISD::VADDLVu, {A});
17300 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17301 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17302 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17303 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17304 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17305 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17306
17307 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17308 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17309 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17310 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17311 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17312 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17313 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17314 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17315 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17316 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17317 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17318 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17319 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17320 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17321
17322 // Some complications. We can get a case where the two inputs of the mul are
17323 // the same, then the output sext will have been helpfully converted to a
17324 // zext. Turn it back.
17325 SDValue Op = N0;
17326 if (Op->getOpcode() == ISD::VSELECT)
17327 Op = Op->getOperand(1);
17328 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17329 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17330 SDValue Mul = Op->getOperand(0);
17331 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17332 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
17333 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
17334 if (Op != N0)
17335 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17336 N0->getOperand(0), Ext, N0->getOperand(2));
17337 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17338 }
17339 }
17340
17341 return SDValue();
17342}
17343
17344// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17345// the lanes are used. Due to the reduction being commutative the shuffle can be
17346// removed.
17347static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG) {
17348 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17349 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17350 if (!Shuf || !Shuf->getOperand(1).isUndef())
17351 return SDValue();
17352
17353 // Check all elements are used once in the mask.
17354 ArrayRef<int> Mask = Shuf->getMask();
17355 APInt SetElts(Mask.size(), 0);
17356 for (int E : Mask) {
17357 if (E < 0 || E >= (int)Mask.size())
17358 return SDValue();
17359 SetElts.setBit(E);
17360 }
17361 if (!SetElts.isAllOnes())
17362 return SDValue();
17363
17364 if (N->getNumOperands() != VecOp + 1) {
17365 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17366 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17367 return SDValue();
17368 }
17369
17371 for (SDValue Op : N->ops()) {
17372 if (Op.getValueType().isVector())
17373 Ops.push_back(Op.getOperand(0));
17374 else
17375 Ops.push_back(Op);
17376 }
17377 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17378}
17379
17380static SDValue PerformVMOVNCombine(SDNode *N,
17381 TargetLowering::DAGCombinerInfo &DCI) {
17382 SDValue Op0 = N->getOperand(0);
17383 SDValue Op1 = N->getOperand(1);
17384 unsigned IsTop = N->getConstantOperandVal(2);
17385
17386 // VMOVNT a undef -> a
17387 // VMOVNB a undef -> a
17388 // VMOVNB undef a -> a
17389 if (Op1->isUndef())
17390 return Op0;
17391 if (Op0->isUndef() && !IsTop)
17392 return Op1;
17393
17394 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17395 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
17396 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17397 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17398 Op1->getConstantOperandVal(2) == 0)
17399 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17400 Op0, Op1->getOperand(1), N->getOperand(2));
17401
17402 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17403 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17404 // into the top or bottom lanes.
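// As a concrete sketch for a v8i16 VMOVN: APInt::getSplat(8, 0b01) produces
// 0b01010101, so only the even (bottom) lanes of Op1 are demanded, while
// APInt::getSplat(8, 0b10) produces 0b10101010, i.e. the odd (top) lanes of
// Op0 when we are inserting into the bottom lanes.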
17405 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17406 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17407 APInt Op0DemandedElts =
17408 IsTop ? Op1DemandedElts
17409 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17410
17411 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17412 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17413 return SDValue(N, 0);
17414 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17415 return SDValue(N, 0);
17416
17417 return SDValue();
17418}
17419
17420 static SDValue PerformVQMOVNCombine(SDNode *N,
17421 TargetLowering::DAGCombinerInfo &DCI) {
17422 SDValue Op0 = N->getOperand(0);
17423 unsigned IsTop = N->getConstantOperandVal(2);
17424
17425 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17426 APInt Op0DemandedElts =
17427 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17428 : APInt::getHighBitsSet(2, 1));
17429
17430 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17431 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17432 return SDValue(N, 0);
17433 return SDValue();
17434}
17435
17436 static SDValue PerformVQDMULHCombine(SDNode *N,
17437 TargetLowering::DAGCombinerInfo &DCI) {
17438 EVT VT = N->getValueType(0);
17439 SDValue LHS = N->getOperand(0);
17440 SDValue RHS = N->getOperand(1);
17441
17442 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17443 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17444 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
17445 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17446 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17447 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17448 SDLoc DL(N);
17449 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17450 LHS.getOperand(0), RHS.getOperand(0));
17451 SDValue UndefV = LHS.getOperand(1);
17452 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17453 }
17454 return SDValue();
17455}
17456
17457 static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
17458 SDLoc DL(N);
17459 SDValue Op0 = N->getOperand(0);
17460 SDValue Op1 = N->getOperand(1);
17461
17462 // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
17463 // uses of the intrinsics.
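// For instance, a long shift lsll(lo, hi, -3) coming out of the intrinsics is
// rewritten below as lsrl(lo, hi, 3), while a shift amount of 0 simply
// forwards the two input halves unchanged.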
17464 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17465 int ShiftAmt = C->getSExtValue();
17466 if (ShiftAmt == 0) {
17467 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17468 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17469 return SDValue();
17470 }
17471
17472 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17473 unsigned NewOpcode =
17474 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17475 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17476 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17477 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17478 return NewShift;
17479 }
17480 }
17481
17482 return SDValue();
17483}
17484
17485/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
17486 SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
17487 DAGCombinerInfo &DCI) const {
17488 SelectionDAG &DAG = DCI.DAG;
17489 unsigned IntNo = N->getConstantOperandVal(0);
17490 switch (IntNo) {
17491 default:
17492 // Don't do anything for most intrinsics.
17493 break;
17494
17495 // Vector shifts: check for immediate versions and lower them.
17496 // Note: This is done during DAG combining instead of DAG legalizing because
17497 // the build_vectors for 64-bit vector element shift counts are generally
17498 // not legal, and it is hard to see their values after they get legalized to
17499 // loads from a constant pool.
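// For example, an arm.neon.vshifts call whose shift-count operand is a
// build_vector splat of +3 is lowered here to VSHLIMM with immediate 3, while
// a splat of -3 selects the immediate right-shift form instead.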
17500 case Intrinsic::arm_neon_vshifts:
17501 case Intrinsic::arm_neon_vshiftu:
17502 case Intrinsic::arm_neon_vrshifts:
17503 case Intrinsic::arm_neon_vrshiftu:
17504 case Intrinsic::arm_neon_vrshiftn:
17505 case Intrinsic::arm_neon_vqshifts:
17506 case Intrinsic::arm_neon_vqshiftu:
17507 case Intrinsic::arm_neon_vqshiftsu:
17508 case Intrinsic::arm_neon_vqshiftns:
17509 case Intrinsic::arm_neon_vqshiftnu:
17510 case Intrinsic::arm_neon_vqshiftnsu:
17511 case Intrinsic::arm_neon_vqrshiftns:
17512 case Intrinsic::arm_neon_vqrshiftnu:
17513 case Intrinsic::arm_neon_vqrshiftnsu: {
17514 EVT VT = N->getOperand(1).getValueType();
17515 int64_t Cnt;
17516 unsigned VShiftOpc = 0;
17517
17518 switch (IntNo) {
17519 case Intrinsic::arm_neon_vshifts:
17520 case Intrinsic::arm_neon_vshiftu:
17521 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17522 VShiftOpc = ARMISD::VSHLIMM;
17523 break;
17524 }
17525 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17526 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17527 : ARMISD::VSHRuIMM);
17528 break;
17529 }
17530 return SDValue();
17531
17532 case Intrinsic::arm_neon_vrshifts:
17533 case Intrinsic::arm_neon_vrshiftu:
17534 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17535 break;
17536 return SDValue();
17537
17538 case Intrinsic::arm_neon_vqshifts:
17539 case Intrinsic::arm_neon_vqshiftu:
17540 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17541 break;
17542 return SDValue();
17543
17544 case Intrinsic::arm_neon_vqshiftsu:
17545 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17546 break;
17547 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17548
17549 case Intrinsic::arm_neon_vrshiftn:
17550 case Intrinsic::arm_neon_vqshiftns:
17551 case Intrinsic::arm_neon_vqshiftnu:
17552 case Intrinsic::arm_neon_vqshiftnsu:
17553 case Intrinsic::arm_neon_vqrshiftns:
17554 case Intrinsic::arm_neon_vqrshiftnu:
17555 case Intrinsic::arm_neon_vqrshiftnsu:
17556 // Narrowing shifts require an immediate right shift.
17557 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17558 break;
17559 llvm_unreachable("invalid shift count for narrowing vector shift "
17560 "intrinsic");
17561
17562 default:
17563 llvm_unreachable("unhandled vector shift");
17564 }
17565
17566 switch (IntNo) {
17567 case Intrinsic::arm_neon_vshifts:
17568 case Intrinsic::arm_neon_vshiftu:
17569 // Opcode already set above.
17570 break;
17571 case Intrinsic::arm_neon_vrshifts:
17572 VShiftOpc = ARMISD::VRSHRsIMM;
17573 break;
17574 case Intrinsic::arm_neon_vrshiftu:
17575 VShiftOpc = ARMISD::VRSHRuIMM;
17576 break;
17577 case Intrinsic::arm_neon_vrshiftn:
17578 VShiftOpc = ARMISD::VRSHRNIMM;
17579 break;
17580 case Intrinsic::arm_neon_vqshifts:
17581 VShiftOpc = ARMISD::VQSHLsIMM;
17582 break;
17583 case Intrinsic::arm_neon_vqshiftu:
17584 VShiftOpc = ARMISD::VQSHLuIMM;
17585 break;
17586 case Intrinsic::arm_neon_vqshiftsu:
17587 VShiftOpc = ARMISD::VQSHLsuIMM;
17588 break;
17589 case Intrinsic::arm_neon_vqshiftns:
17590 VShiftOpc = ARMISD::VQSHRNsIMM;
17591 break;
17592 case Intrinsic::arm_neon_vqshiftnu:
17593 VShiftOpc = ARMISD::VQSHRNuIMM;
17594 break;
17595 case Intrinsic::arm_neon_vqshiftnsu:
17596 VShiftOpc = ARMISD::VQSHRNsuIMM;
17597 break;
17598 case Intrinsic::arm_neon_vqrshiftns:
17599 VShiftOpc = ARMISD::VQRSHRNsIMM;
17600 break;
17601 case Intrinsic::arm_neon_vqrshiftnu:
17602 VShiftOpc = ARMISD::VQRSHRNuIMM;
17603 break;
17604 case Intrinsic::arm_neon_vqrshiftnsu:
17605 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17606 break;
17607 }
17608
17609 SDLoc dl(N);
17610 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17611 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17612 }
17613
17614 case Intrinsic::arm_neon_vshiftins: {
17615 EVT VT = N->getOperand(1).getValueType();
17616 int64_t Cnt;
17617 unsigned VShiftOpc = 0;
17618
17619 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17620 VShiftOpc = ARMISD::VSLIIMM;
17621 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17622 VShiftOpc = ARMISD::VSRIIMM;
17623 else {
17624 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17625 }
17626
17627 SDLoc dl(N);
17628 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17629 N->getOperand(1), N->getOperand(2),
17630 DAG.getConstant(Cnt, dl, MVT::i32));
17631 }
17632
17633 case Intrinsic::arm_neon_vqrshifts:
17634 case Intrinsic::arm_neon_vqrshiftu:
17635 // No immediate versions of these to check for.
17636 break;
17637
17638 case Intrinsic::arm_neon_vbsl: {
17639 SDLoc dl(N);
17640 return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
17641 N->getOperand(2), N->getOperand(3));
17642 }
17643 case Intrinsic::arm_mve_vqdmlah:
17644 case Intrinsic::arm_mve_vqdmlash:
17645 case Intrinsic::arm_mve_vqrdmlah:
17646 case Intrinsic::arm_mve_vqrdmlash:
17647 case Intrinsic::arm_mve_vmla_n_predicated:
17648 case Intrinsic::arm_mve_vmlas_n_predicated:
17649 case Intrinsic::arm_mve_vqdmlah_predicated:
17650 case Intrinsic::arm_mve_vqdmlash_predicated:
17651 case Intrinsic::arm_mve_vqrdmlah_predicated:
17652 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17653 // These intrinsics all take an i32 scalar operand which is narrowed to the
17654 // size of a single lane of the vector type they return. So we don't need
17655 // any bits of that operand above that point, which allows us to eliminate
17656 // uxth/sxth.
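// E.g. for a v8i16 result only the low 16 bits of the i32 scalar operand are
// demanded, so a preceding (and x, 0xffff) or a sign extend from i16 of that
// operand can be stripped by SimplifyDemandedBits.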
17657 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17658 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17659 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17660 return SDValue();
17661 break;
17662 }
17663
17664 case Intrinsic::arm_mve_minv:
17665 case Intrinsic::arm_mve_maxv:
17666 case Intrinsic::arm_mve_minav:
17667 case Intrinsic::arm_mve_maxav:
17668 case Intrinsic::arm_mve_minv_predicated:
17669 case Intrinsic::arm_mve_maxv_predicated:
17670 case Intrinsic::arm_mve_minav_predicated:
17671 case Intrinsic::arm_mve_maxav_predicated: {
17672 // These intrinsics all take an i32 scalar operand which is narrowed to the
17673 // size of a single lane of the vector type they take as the other input.
17674 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17675 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17676 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17677 return SDValue();
17678 break;
17679 }
17680
17681 case Intrinsic::arm_mve_addv: {
17682 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17683 // which allows PerformADDVecReduce to turn it into VADDLV when possible.
17684 bool Unsigned = N->getConstantOperandVal(2);
17685 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17686 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17687 }
17688
17689 case Intrinsic::arm_mve_addlv:
17690 case Intrinsic::arm_mve_addlv_predicated: {
17691 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR,
17692 // which recombines the two outputs into an i64.
17693 bool Unsigned = N->getConstantOperandVal(2);
17694 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17695 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
17696 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
17697
17698 SmallVector<SDValue, 8> Ops;
17699 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17700 if (i != 2) // skip the unsigned flag
17701 Ops.push_back(N->getOperand(i));
17702
17703 SDLoc dl(N);
17704 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17705 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17706 val.getValue(1));
17707 }
17708 }
17709
17710 return SDValue();
17711}
17712
17713/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17714/// lowers them. As with the vector shift intrinsics, this is done during DAG
17715/// combining instead of DAG legalizing because the build_vectors for 64-bit
17716/// vector element shift counts are generally not legal, and it is hard to see
17717/// their values after they get legalized to loads from a constant pool.
17720 const ARMSubtarget *ST) {
17721 SelectionDAG &DAG = DCI.DAG;
17722 EVT VT = N->getValueType(0);
17723
17724 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17725 N->getOperand(0)->getOpcode() == ISD::AND &&
17726 N->getOperand(0)->hasOneUse()) {
17727 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17728 return SDValue();
17729 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17730 // usually show up because instcombine prefers to canonicalize it to
17731 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17732 // out of GEP lowering in some cases.
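// As a worked example (constants chosen purely for illustration):
// (shl (and x, 0x3ffff), 2) has MaskedBits == 14 and ShiftAmt == 2, so it is
// rewritten as (srl (shl x, 14), 12), which clears the same high bits and
// places the value two bits higher using only shift instructions.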
17733 SDValue N0 = N->getOperand(0);
17734 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17735 if (!ShiftAmtNode)
17736 return SDValue();
17737 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17738 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17739 if (!AndMaskNode)
17740 return SDValue();
17741 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17742 // Don't transform uxtb/uxth.
17743 if (AndMask == 255 || AndMask == 65535)
17744 return SDValue();
17745 if (isMask_32(AndMask)) {
17746 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17747 if (MaskedBits > ShiftAmt) {
17748 SDLoc DL(N);
17749 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17750 DAG.getConstant(MaskedBits, DL, MVT::i32));
17751 return DAG.getNode(
17752 ISD::SRL, DL, MVT::i32, SHL,
17753 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17754 }
17755 }
17756 }
17757
17758 // Nothing to be done for scalar shifts.
17759 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17760 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17761 return SDValue();
17762 if (ST->hasMVEIntegerOps())
17763 return SDValue();
17764
17765 int64_t Cnt;
17766
17767 switch (N->getOpcode()) {
17768 default: llvm_unreachable("unexpected shift opcode");
17769
17770 case ISD::SHL:
17771 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17772 SDLoc dl(N);
17773 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17774 DAG.getConstant(Cnt, dl, MVT::i32));
17775 }
17776 break;
17777
17778 case ISD::SRA:
17779 case ISD::SRL:
17780 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17781 unsigned VShiftOpc =
17782 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17783 SDLoc dl(N);
17784 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17785 DAG.getConstant(Cnt, dl, MVT::i32));
17786 }
17787 }
17788 return SDValue();
17789}
17790
17791 // Look for a sign, zero or fp extend of a larger-than-legal load. This can be
17792// split into multiple extending loads, which are simpler to deal with than an
17793// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17794// to convert the type to an f32.
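// For instance, a zext of a v8i8 load used as v8i32 is split below into two
// v4i8 -> v4i32 extending loads, one from the original pointer and one from
// pointer+4 bytes, whose results are concatenated back into a v8i32.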
17795 static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
17796 SDValue N0 = N->getOperand(0);
17797 if (N0.getOpcode() != ISD::LOAD)
17798 return SDValue();
17799 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
17800 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17801 LD->getExtensionType() != ISD::NON_EXTLOAD)
17802 return SDValue();
17803 EVT FromVT = LD->getValueType(0);
17804 EVT ToVT = N->getValueType(0);
17805 if (!ToVT.isVector())
17806 return SDValue();
17808 EVT ToEltVT = ToVT.getVectorElementType();
17809 EVT FromEltVT = FromVT.getVectorElementType();
17810
17811 unsigned NumElements = 0;
17812 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17813 NumElements = 4;
17814 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17815 NumElements = 4;
17816 if (NumElements == 0 ||
17817 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17818 FromVT.getVectorNumElements() % NumElements != 0 ||
17819 !isPowerOf2_32(NumElements))
17820 return SDValue();
17821
17822 LLVMContext &C = *DAG.getContext();
17823 SDLoc DL(LD);
17824 // Details about the old load
17825 SDValue Ch = LD->getChain();
17826 SDValue BasePtr = LD->getBasePtr();
17827 Align Alignment = LD->getBaseAlign();
17828 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17829 AAMDNodes AAInfo = LD->getAAInfo();
17830
17831 ISD::LoadExtType NewExtType =
17832 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17833 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17834 EVT NewFromVT = EVT::getVectorVT(
17835 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17836 EVT NewToVT = EVT::getVectorVT(
17837 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17838
17841 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17842 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17843 SDValue NewPtr =
17844 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17845
17846 SDValue NewLoad =
17847 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17848 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17849 Alignment, MMOFlags, AAInfo);
17850 Loads.push_back(NewLoad);
17851 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17852 }
17853
17854 // Float truncs need to be extended with VCVTB's into their floating point types.
17855 if (FromEltVT == MVT::f16) {
17857
17858 for (unsigned i = 0; i < Loads.size(); i++) {
17859 SDValue LoadBC =
17860 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17861 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17862 DAG.getConstant(0, DL, MVT::i32));
17863 Extends.push_back(FPExt);
17864 }
17865
17866 Loads = Extends;
17867 }
17868
17869 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17870 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17871 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
17872}
17873
17874/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
17875/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
17877 const ARMSubtarget *ST) {
17878 SDValue N0 = N->getOperand(0);
17879
17880 // Check for sign- and zero-extensions of vector extract operations of 8- and
17881 // 16-bit vector elements. NEON and MVE support these directly. They are
17882 // handled during DAG combining because type legalization will promote them
17883 // to 32-bit types and it is messy to recognize the operations after that.
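// For example, an i32 (sext (extract_vector_elt v8i16 x, 3)) is turned here
// into a single sign-extending lane-extract node, instead of the extract plus
// separate sxth that would otherwise survive type legalization.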
17884 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
17886 SDValue Vec = N0.getOperand(0);
17887 SDValue Lane = N0.getOperand(1);
17888 EVT VT = N->getValueType(0);
17889 EVT EltVT = N0.getValueType();
17890 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17891
17892 if (VT == MVT::i32 &&
17893 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
17894 TLI.isTypeLegal(Vec.getValueType()) &&
17895 isa<ConstantSDNode>(Lane)) {
17896
17897 unsigned Opc = 0;
17898 switch (N->getOpcode()) {
17899 default: llvm_unreachable("unexpected opcode");
17900 case ISD::SIGN_EXTEND:
17902 break;
17903 case ISD::ZERO_EXTEND:
17904 case ISD::ANY_EXTEND:
17906 break;
17907 }
17908 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
17909 }
17910 }
17911
17912 if (ST->hasMVEIntegerOps())
17913 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17914 return NewLoad;
17915
17916 return SDValue();
17917}
17918
17920 const ARMSubtarget *ST) {
17921 if (ST->hasMVEFloatOps())
17922 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17923 return NewLoad;
17924
17925 return SDValue();
17926}
17927
17928// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
17929// constant bounds.
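// For example, clamping x to [-128, 127] via smin(smax(x, -128), 127) becomes
// a single SSAT, and clamping to [0, 255] via smin(smax(x, 0), 255) becomes a
// single USAT; in both cases MinC + 1 must be a power of two.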
17930 static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG,
17931 const ARMSubtarget *Subtarget) {
17932 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
17933 !Subtarget->isThumb2())
17934 return SDValue();
17935
17936 EVT VT = Op.getValueType();
17937 SDValue Op0 = Op.getOperand(0);
17938
17939 if (VT != MVT::i32 ||
17940 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
17941 !isa<ConstantSDNode>(Op.getOperand(1)) ||
17943 return SDValue();
17944
17945 SDValue Min = Op;
17946 SDValue Max = Op0;
17947 SDValue Input = Op0.getOperand(0);
17948 if (Min.getOpcode() == ISD::SMAX)
17949 std::swap(Min, Max);
17950
17951 APInt MinC = Min.getConstantOperandAPInt(1);
17952 APInt MaxC = Max.getConstantOperandAPInt(1);
17953
17954 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
17955 !(MinC + 1).isPowerOf2())
17956 return SDValue();
17957
17958 SDLoc DL(Op);
17959 if (MinC == ~MaxC)
17960 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
17961 DAG.getConstant(MinC.countr_one(), DL, VT));
17962 if (MaxC == 0)
17963 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
17964 DAG.getConstant(MinC.countr_one(), DL, VT));
17965
17966 return SDValue();
17967}
17968
17969/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
17970/// saturates.
17972 const ARMSubtarget *ST) {
17973 EVT VT = N->getValueType(0);
17974 SDValue N0 = N->getOperand(0);
17975
17976 if (VT == MVT::i32)
17977 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
17978
17979 if (!ST->hasMVEIntegerOps())
17980 return SDValue();
17981
17982 if (SDValue V = PerformVQDMULHCombine(N, DAG))
17983 return V;
17984
17985 if (VT != MVT::v4i32 && VT != MVT::v8i16)
17986 return SDValue();
17987
17988 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
17989 // Check one is a smin and the other is a smax
17990 if (Min->getOpcode() != ISD::SMIN)
17991 std::swap(Min, Max);
17992 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
17993 return false;
17994
17995 APInt SaturateC;
17996 if (VT == MVT::v4i32)
17997 SaturateC = APInt(32, (1 << 15) - 1, true);
17998 else //if (VT == MVT::v8i16)
17999 SaturateC = APInt(16, (1 << 7) - 1, true);
18000
18001 APInt MinC, MaxC;
18002 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18003 MinC != SaturateC)
18004 return false;
18005 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
18006 MaxC != ~SaturateC)
18007 return false;
18008 return true;
18009 };
18010
18011 if (IsSignedSaturate(N, N0.getNode())) {
18012 SDLoc DL(N);
18013 MVT ExtVT, HalfVT;
18014 if (VT == MVT::v4i32) {
18015 HalfVT = MVT::v8i16;
18016 ExtVT = MVT::v4i16;
18017 } else { // if (VT == MVT::v8i16)
18018 HalfVT = MVT::v16i8;
18019 ExtVT = MVT::v8i8;
18020 }
18021
18022 // Create a VQMOVNB with undef top lanes, then sign extend it into the top
18023 // half. That extend will hopefully be removed if only the bottom bits are
18024 // demanded (through a truncating store, for example).
18025 SDValue VQMOVN =
18026 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
18027 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
18028 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18029 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
18030 DAG.getValueType(ExtVT));
18031 }
18032
18033 auto IsUnsignedSaturate = [&](SDNode *Min) {
18034 // For unsigned, we just need to check for <= 0xffff
18035 if (Min->getOpcode() != ISD::UMIN)
18036 return false;
18037
18038 APInt SaturateC;
18039 if (VT == MVT::v4i32)
18040 SaturateC = APInt(32, (1 << 16) - 1, true);
18041 else //if (VT == MVT::v8i16)
18042 SaturateC = APInt(16, (1 << 8) - 1, true);
18043
18044 APInt MinC;
18045 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18046 MinC != SaturateC)
18047 return false;
18048 return true;
18049 };
18050
18051 if (IsUnsignedSaturate(N)) {
18052 SDLoc DL(N);
18053 MVT HalfVT;
18054 unsigned ExtConst;
18055 if (VT == MVT::v4i32) {
18056 HalfVT = MVT::v8i16;
18057 ExtConst = 0x0000FFFF;
18058 } else { //if (VT == MVT::v8i16)
18059 HalfVT = MVT::v16i8;
18060 ExtConst = 0x00FF;
18061 }
18062
18063 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
18064 // an AND. That extend will hopefully be removed if only the bottom bits are
18065 // demanded (through a truncating store, for example).
18066 SDValue VQMOVN =
18067 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
18068 DAG.getConstant(0, DL, MVT::i32));
18069 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18070 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
18071 DAG.getConstant(ExtConst, DL, VT));
18072 }
18073
18074 return SDValue();
18075}
18076
18079 if (!C)
18080 return nullptr;
18081 const APInt *CV = &C->getAPIntValue();
18082 return CV->isPowerOf2() ? CV : nullptr;
18083}
18084
18086 // If we have a CMOV, OR and AND combination such as:
18087 // if (x & CN)
18088 // y |= CM;
18089 //
18090 // And:
18091 // * CN is a single bit;
18092 // * All bits covered by CM are known zero in y
18093 //
18094 // Then we can convert this into a sequence of BFI instructions. This will
18095 // always be a win if CM is a single bit, will always be no worse than the
18096 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
18097 // three bits (due to the extra IT instruction).
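// As a sketch with made-up constants: for "if (x & 4) y |= 0x30", where bits
// 4 and 5 of y are known to be zero, CN == 4 selects bit 2 of x, so x is
// shifted right by 2 and then inserted into bit 4 and bit 5 of y with two BFI
// instructions, removing the compare and the conditional OR.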
18098
18099 SDValue Op0 = CMOV->getOperand(0);
18100 SDValue Op1 = CMOV->getOperand(1);
18101 auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
18102 SDValue CmpZ = CMOV->getOperand(3);
18103
18104 // The compare must be against zero.
18105 if (!isNullConstant(CmpZ->getOperand(1)))
18106 return SDValue();
18107
18108 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
18109 SDValue And = CmpZ->getOperand(0);
18110 if (And->getOpcode() != ISD::AND)
18111 return SDValue();
18112 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
18113 if (!AndC)
18114 return SDValue();
18115 SDValue X = And->getOperand(0);
18116
18117 if (CC == ARMCC::EQ) {
18118 // We're performing an "equal to zero" compare. Swap the operands so we
18119 // canonicalize on a "not equal to zero" compare.
18120 std::swap(Op0, Op1);
18121 } else {
18122 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
18123 }
18124
18125 if (Op1->getOpcode() != ISD::OR)
18126 return SDValue();
18127
18129 if (!OrC)
18130 return SDValue();
18131 SDValue Y = Op1->getOperand(0);
18132
18133 if (Op0 != Y)
18134 return SDValue();
18135
18136 // Now, is it profitable to continue?
18137 APInt OrCI = OrC->getAPIntValue();
18138 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
18139 if (OrCI.popcount() > Heuristic)
18140 return SDValue();
18141
18142 // Lastly, can we determine that the bits defined by OrCI
18143 // are zero in Y?
18144 KnownBits Known = DAG.computeKnownBits(Y);
18145 if ((OrCI & Known.Zero) != OrCI)
18146 return SDValue();
18147
18148 // OK, we can do the combine.
18149 SDValue V = Y;
18150 SDLoc dl(X);
18151 EVT VT = X.getValueType();
18152 unsigned BitInX = AndC->logBase2();
18153
18154 if (BitInX != 0) {
18155 // We must shift X first.
18156 X = DAG.getNode(ISD::SRL, dl, VT, X,
18157 DAG.getConstant(BitInX, dl, VT));
18158 }
18159
18160 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
18161 BitInY < NumActiveBits; ++BitInY) {
18162 if (OrCI[BitInY] == 0)
18163 continue;
18164 APInt Mask(VT.getSizeInBits(), 0);
18165 Mask.setBit(BitInY);
18166 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
18167 // Confusingly, the operand is an *inverted* mask.
18168 DAG.getConstant(~Mask, dl, VT));
18169 }
18170
18171 return V;
18172}
18173
18174// Given N, the value controlling the conditional branch, search for the loop
18175// intrinsic, returning it, along with how the value is used. We need to handle
18176// patterns such as the following:
18177// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
18178// (brcond (setcc (loop.decrement), 0, eq), exit)
18179// (brcond (setcc (loop.decrement), 0, ne), header)
18181 bool &Negate) {
18182 switch (N->getOpcode()) {
18183 default:
18184 break;
18185 case ISD::XOR: {
18186 if (!isa<ConstantSDNode>(N.getOperand(1)))
18187 return SDValue();
18188 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
18189 return SDValue();
18190 Negate = !Negate;
18191 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
18192 }
18193 case ISD::SETCC: {
18194 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
18195 if (!Const)
18196 return SDValue();
18197 if (Const->isZero())
18198 Imm = 0;
18199 else if (Const->isOne())
18200 Imm = 1;
18201 else
18202 return SDValue();
18203 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
18204 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
18205 }
18206 case ISD::INTRINSIC_W_CHAIN: {
18207 unsigned IntOp = N.getConstantOperandVal(1);
18208 if (IntOp != Intrinsic::test_start_loop_iterations &&
18209 IntOp != Intrinsic::loop_decrement_reg)
18210 return SDValue();
18211 return N;
18212 }
18213 }
18214 return SDValue();
18215}
18216
18219 const ARMSubtarget *ST) {
18220
18221 // The hwloop intrinsics that we're interested in are used for control-flow,
18222 // either for entering or exiting the loop:
18223 // - test.start.loop.iterations will test whether its operand is zero. If it
18224 // is zero, the following branch should not enter the loop.
18225 // - loop.decrement.reg also tests whether its operand is zero. If it is
18226 // zero, the following branch should not branch back to the beginning of
18227 // the loop.
18228 // So here, we need to check how the brcond is using the result of each of
18229 // the intrinsics to ensure that we're branching to the right place at the
18230 // right time.
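// For example, (brcond (setcc (test.start.loop.iterations n), 0, eq), exit)
// is rewritten below into t = WLSSETUP n followed by WLS t, exit: the WLS
// branches to the exit block when the trip count is zero and otherwise falls
// through into the loop.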
18231
18232 ISD::CondCode CC;
18233 SDValue Cond;
18234 int Imm = 1;
18235 bool Negate = false;
18236 SDValue Chain = N->getOperand(0);
18237 SDValue Dest;
18238
18239 if (N->getOpcode() == ISD::BRCOND) {
18240 CC = ISD::SETEQ;
18241 Cond = N->getOperand(1);
18242 Dest = N->getOperand(2);
18243 } else {
18244 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18245 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
18246 Cond = N->getOperand(2);
18247 Dest = N->getOperand(4);
18248 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
18249 if (!Const->isOne() && !Const->isZero())
18250 return SDValue();
18251 Imm = Const->getZExtValue();
18252 } else
18253 return SDValue();
18254 }
18255
18256 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
18257 if (!Int)
18258 return SDValue();
18259
18260 if (Negate)
18261 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18262
18263 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18264 return (CC == ISD::SETEQ && Imm == 0) ||
18265 (CC == ISD::SETNE && Imm == 1) ||
18266 (CC == ISD::SETLT && Imm == 1) ||
18267 (CC == ISD::SETULT && Imm == 1);
18268 };
18269
18270 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18271 return (CC == ISD::SETEQ && Imm == 1) ||
18272 (CC == ISD::SETNE && Imm == 0) ||
18273 (CC == ISD::SETGT && Imm == 0) ||
18274 (CC == ISD::SETUGT && Imm == 0) ||
18275 (CC == ISD::SETGE && Imm == 1) ||
18276 (CC == ISD::SETUGE && Imm == 1);
18277 };
18278
18279 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18280 "unsupported condition");
18281
18282 SDLoc dl(Int);
18283 SelectionDAG &DAG = DCI.DAG;
18284 SDValue Elements = Int.getOperand(2);
18285 unsigned IntOp = Int->getConstantOperandVal(1);
18286 assert((N->hasOneUse() && N->user_begin()->getOpcode() == ISD::BR) &&
18287 "expected single br user");
18288 SDNode *Br = *N->user_begin();
18289 SDValue OtherTarget = Br->getOperand(1);
18290
18291 // Update the unconditional branch to branch to the given Dest.
18292 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18293 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
18294 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18295 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
18296 };
18297
18298 if (IntOp == Intrinsic::test_start_loop_iterations) {
18299 SDValue Res;
18300 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18301 // We expect this 'instruction' to branch when the counter is zero.
18302 if (IsTrueIfZero(CC, Imm)) {
18303 SDValue Ops[] = {Chain, Setup, Dest};
18304 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18305 } else {
18306 // The logic is the reverse of what we need for WLS, so find the other
18307 // basic block target: the target of the following br.
18308 UpdateUncondBr(Br, Dest, DAG);
18309
18310 SDValue Ops[] = {Chain, Setup, OtherTarget};
18311 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18312 }
18313 // Update LR count to the new value
18314 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
18315 // Update chain
18316 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
18317 return Res;
18318 } else {
18319 SDValue Size =
18320 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18321 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
18322 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18323 DAG.getVTList(MVT::i32, MVT::Other), Args);
18324 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
18325
18326 // We expect this instruction to branch when the count is not zero.
18327 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18328
18329 // Update the unconditional branch to target the loop preheader if we've
18330 // found the condition has been reversed.
18331 if (Target == OtherTarget)
18332 UpdateUncondBr(Br, Dest, DAG);
18333
18334 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18335 SDValue(LoopDec.getNode(), 1), Chain);
18336
18337 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18338 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18339 }
18340 return SDValue();
18341}
18342
18343/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
18344SDValue
18346 SDValue Cmp = N->getOperand(3);
18347 if (Cmp.getOpcode() != ARMISD::CMPZ)
18348 // Only looking at NE cases.
18349 return SDValue();
18350
18351 SDLoc dl(N);
18352 SDValue LHS = Cmp.getOperand(0);
18353 SDValue RHS = Cmp.getOperand(1);
18354 SDValue Chain = N->getOperand(0);
18355 SDValue BB = N->getOperand(1);
18356 SDValue ARMcc = N->getOperand(2);
18358
18359 // (brcond Chain BB ne (cmpz (and (cmov 0 1 CC Flags) 1) 0))
18360 // -> (brcond Chain BB CC Flags)
18361 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18362 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
18363 LHS->getOperand(0)->hasOneUse() &&
18364 isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
18365 isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
18366 isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
18367 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, BB,
18368 LHS->getOperand(0)->getOperand(2),
18369 LHS->getOperand(0)->getOperand(3));
18370 }
18371
18372 return SDValue();
18373}
18374
18375/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
18376SDValue
18378 SDValue Cmp = N->getOperand(3);
18379 if (Cmp.getOpcode() != ARMISD::CMPZ)
18380 // Only looking at EQ and NE cases.
18381 return SDValue();
18382
18383 EVT VT = N->getValueType(0);
18384 SDLoc dl(N);
18385 SDValue LHS = Cmp.getOperand(0);
18386 SDValue RHS = Cmp.getOperand(1);
18387 SDValue FalseVal = N->getOperand(0);
18388 SDValue TrueVal = N->getOperand(1);
18389 SDValue ARMcc = N->getOperand(2);
18391
18392 // BFI is only available on V6T2+.
18393 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
18395 if (R)
18396 return R;
18397 }
18398
18399 // Simplify
18400 // mov r1, r0
18401 // cmp r1, x
18402 // mov r0, y
18403 // moveq r0, x
18404 // to
18405 // cmp r0, x
18406 // movne r0, y
18407 //
18408 // mov r1, r0
18409 // cmp r1, x
18410 // mov r0, x
18411 // movne r0, y
18412 // to
18413 // cmp r0, x
18414 // movne r0, y
18415 /// FIXME: Turn this into a target neutral optimization?
18416 SDValue Res;
18417 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18418 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, Cmp);
18419 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
18420 SDValue ARMcc;
18421 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
18422 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, NewCmp);
18423 }
18424
18425 // (cmov F T ne (cmpz (cmov 0 1 CC Flags) 0))
18426 // -> (cmov F T CC Flags)
18427 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18428 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18429 isNullConstant(RHS)) {
18430 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
18431 LHS->getOperand(2), LHS->getOperand(3));
18432 }
18433
18434 if (!VT.isInteger())
18435 return SDValue();
18436
18437 // Fold away an unnecessary CMPZ/CMOV
18438 // CMOV A, B, C1, (CMPZ (CMOV 1, 0, C2, D), 0) ->
18439 // if C1==EQ -> CMOV A, B, C2, D
18440 // if C1==NE -> CMOV A, B, NOT(C2), D
18441 if (N->getConstantOperandVal(2) == ARMCC::EQ ||
18442 N->getConstantOperandVal(2) == ARMCC::NE) {
18444 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
18445 if (N->getConstantOperandVal(2) == ARMCC::NE)
18447 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18448 N->getOperand(1),
18449 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
18450 }
18451 }
18452
18453 // Materialize a boolean comparison for integers so we can avoid branching.
18454 if (isNullConstant(FalseVal)) {
18455 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
18456 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
18457 // If x == y then x - y == 0 and ARM's CLZ will return 32; shifting it
18458 // right by 5 bits turns that 32 into 1, otherwise the result is 0.
18459 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
18460 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18461 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18462 DAG.getConstant(5, dl, MVT::i32));
18463 } else {
18464 // CMOV 0, 1, ==, (CMPZ x, y) ->
18465 // (UADDO_CARRY (SUB x, y), t:0, t:1)
18466 // where t = (USUBO_CARRY 0, (SUB x, y), 0)
18467 //
18468 // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
18469 // x != y. In other words, a carry C == 1 when x == y, C == 0
18470 // otherwise.
18471 // The final UADDO_CARRY computes
18472 // x - y + (0 - (x - y)) + C == C
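// Working through the arithmetic with example values: for x = 5, y = 3 the
// subtraction gives 2, 0 - 2 borrows, so C == 0 and the final sum is
// 2 + (-2) + 0 == 0; for x == y the subtraction gives 0, there is no borrow,
// C == 1 and the result is 1, exactly the boolean value of (x == y).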
18473 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18474 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18475 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
18476 // ISD::USUBO_CARRY returns a borrow, but what we actually want here is
18477 // the carry.
18478 SDValue Carry =
18479 DAG.getNode(ISD::SUB, dl, MVT::i32,
18480 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18481 Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
18482 }
18483 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
18484 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
18485 // This seems pointless but will allow us to combine it further below.
18486 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18487 SDValue Sub =
18488 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18489 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
18490 Sub.getValue(1));
18491 FalseVal = Sub;
18492 }
18493 } else if (isNullConstant(TrueVal)) {
18494 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
18495 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
18496 // This seems pointless but will allow us to combine it further below.
18497 // Note that we change == to != as this is the dual of the case above.
18498 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18499 SDValue Sub =
18500 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18501 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18502 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18503 Sub.getValue(1));
18504 FalseVal = Sub;
18505 }
18506 }
18507
18508 // On Thumb1, the DAG above may be further combined if z is a power of 2
18509 // (z == 2 ^ K).
18510 // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
18511 // t1 = (USUBO (SUB x, y), 1)
18512 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18513 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18514 //
18515 // This also handles the special case of comparing against zero; it's
18516 // essentially the same pattern, except there's no SUBC:
18517 // CMOV x, z, !=, (CMPZ x, 0) ->
18518 // t1 = (USUBO x, 1)
18519 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18520 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18521 const APInt *TrueConst;
18522 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18523 ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
18524 FalseVal.getOperand(1) == RHS) ||
18525 (FalseVal == LHS && isNullConstant(RHS))) &&
18526 (TrueConst = isPowerOf2Constant(TrueVal))) {
18527 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18528 unsigned ShiftAmount = TrueConst->logBase2();
18529 if (ShiftAmount)
18530 TrueVal = DAG.getConstant(1, dl, VT);
18531 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
18532 Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
18533 Subc.getValue(1));
18534
18535 if (ShiftAmount)
18536 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18537 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18538 }
18539
18540 if (Res.getNode()) {
18541 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
18542 // Capture demanded bits information that would otherwise be lost.
18543 if (Known.Zero == 0xfffffffe)
18544 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18545 DAG.getValueType(MVT::i1));
18546 else if (Known.Zero == 0xffffff00)
18547 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18548 DAG.getValueType(MVT::i8));
18549 else if (Known.Zero == 0xffff0000)
18550 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18551 DAG.getValueType(MVT::i16));
18552 }
18553
18554 return Res;
18555}
18556
18559 const ARMSubtarget *ST) {
18560 SelectionDAG &DAG = DCI.DAG;
18561 SDValue Src = N->getOperand(0);
18562 EVT DstVT = N->getValueType(0);
18563
18564 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18565 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18566 EVT SrcVT = Src.getValueType();
18567 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18568 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
18569 }
18570
18571 // We may have a bitcast of something that has already had this bitcast
18572 // combine performed on it, so skip past any VECTOR_REG_CASTs.
18573 if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
18574 Src.getOperand(0).getValueType().getScalarSizeInBits() <=
18575 Src.getValueType().getScalarSizeInBits())
18576 Src = Src.getOperand(0);
18577
18578 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18579 // would be generated is at least the width of the element type.
18580 EVT SrcVT = Src.getValueType();
18581 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18582 Src.getOpcode() == ARMISD::VMVNIMM ||
18583 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18584 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18585 DAG.getDataLayout().isBigEndian())
18586 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
18587
18588 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18589 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18590 return R;
18591
18592 return SDValue();
18593}
18594
18595// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
18596// node into stack operations after legalizeOps.
18599 SelectionDAG &DAG = DCI.DAG;
18600 EVT VT = N->getValueType(0);
18601 SDLoc DL(N);
18602
18603 // MVETrunc(Undef, Undef) -> Undef
18604 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
18605 return DAG.getUNDEF(VT);
18606
18607 // MVETrunc(MVETrunc(a, b), MVETrunc(c, d)) -> MVETrunc(a, b, c, d)
18608 if (N->getNumOperands() == 2 &&
18609 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
18610 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
18611 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
18612 N->getOperand(0).getOperand(1),
18613 N->getOperand(1).getOperand(0),
18614 N->getOperand(1).getOperand(1));
18615
18616 // MVETrunc(shuffle, shuffle) -> VMOVN
18617 if (N->getNumOperands() == 2 &&
18618 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18619 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18620 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
18621 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
18622
18623 if (S0->getOperand(0) == S1->getOperand(0) &&
18624 S0->getOperand(1) == S1->getOperand(1)) {
18625 // Construct complete shuffle mask
18626 SmallVector<int, 8> Mask(S0->getMask());
18627 Mask.append(S1->getMask().begin(), S1->getMask().end());
18628
18629 if (isVMOVNTruncMask(Mask, VT, false))
18630 return DAG.getNode(
18631 ARMISD::VMOVN, DL, VT,
18632 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18633 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18634 DAG.getConstant(1, DL, MVT::i32));
18635 if (isVMOVNTruncMask(Mask, VT, true))
18636 return DAG.getNode(
18637 ARMISD::VMOVN, DL, VT,
18638 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18639 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18640 DAG.getConstant(1, DL, MVT::i32));
18641 }
18642 }
18643
18644 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18645 // truncate to a buildvector to allow the generic optimisations to kick in.
18646 if (all_of(N->ops(), [](SDValue Op) {
18647 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18648 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18649 (Op.getOpcode() == ISD::BITCAST &&
18650 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
18651 })) {
18652 SmallVector<SDValue, 8> Extracts;
18653 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18654 SDValue O = N->getOperand(Op);
18655 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18656 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18657 DAG.getConstant(i, DL, MVT::i32));
18658 Extracts.push_back(Ext);
18659 }
18660 }
18661 return DAG.getBuildVector(VT, DL, Extracts);
18662 }
18663
18664 // If we are late in the legalization process and nothing has optimised
18665 // the trunc to anything better, lower it to a stack store and reload,
18666 // performing the truncation whilst keeping the lanes in the correct order:
18667 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
18668 if (!DCI.isAfterLegalizeDAG())
18669 return SDValue();
18670
18671 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18672 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18673 int NumIns = N->getNumOperands();
18674 assert((NumIns == 2 || NumIns == 4) &&
18675 "Expected 2 or 4 inputs to an MVETrunc");
18676 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18677 if (N->getNumOperands() == 4)
18678 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
18679
18680 SmallVector<SDValue> Chains;
18681 for (int I = 0; I < NumIns; I++) {
18682 SDValue Ptr = DAG.getNode(
18683 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18684 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
18686 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
18687 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
18688 Ptr, MPI, StoreVT, Align(4));
18689 Chains.push_back(Ch);
18690 }
18691
18692 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18693 MachinePointerInfo MPI =
18695 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
18696}
18697
18698 // Take an MVEEXT(load x) and split that into (extload x, extload x+8)
18700 SelectionDAG &DAG) {
18701 SDValue N0 = N->getOperand(0);
18702 LoadSDNode *LD = dyn_cast<LoadSDNode>(N0);
18703 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18704 return SDValue();
18705
18706 EVT FromVT = LD->getMemoryVT();
18707 EVT ToVT = N->getValueType(0);
18708 if (!ToVT.isVector())
18709 return SDValue();
18710 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18711 EVT ToEltVT = ToVT.getVectorElementType();
18712 EVT FromEltVT = FromVT.getVectorElementType();
18713
18714 unsigned NumElements = 0;
18715 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18716 NumElements = 4;
18717 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18718 NumElements = 8;
18719 assert(NumElements != 0);
18720
18721 ISD::LoadExtType NewExtType =
18722 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18723 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18724 LD->getExtensionType() != ISD::EXTLOAD &&
18725 LD->getExtensionType() != NewExtType)
18726 return SDValue();
18727
18728 LLVMContext &C = *DAG.getContext();
18729 SDLoc DL(LD);
18730 // Details about the old load
18731 SDValue Ch = LD->getChain();
18732 SDValue BasePtr = LD->getBasePtr();
18733 Align Alignment = LD->getBaseAlign();
18734 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18735 AAMDNodes AAInfo = LD->getAAInfo();
18736
18737 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
18738 EVT NewFromVT = EVT::getVectorVT(
18739 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
18740 EVT NewToVT = EVT::getVectorVT(
18741 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
18742
18745 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
18746 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18747 SDValue NewPtr =
18748 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
18749
18750 SDValue NewLoad =
18751 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
18752 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
18753 Alignment, MMOFlags, AAInfo);
18754 Loads.push_back(NewLoad);
18755 Chains.push_back(SDValue(NewLoad.getNode(), 1));
18756 }
18757
18758 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18759 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18760 return DAG.getMergeValues(Loads, DL);
18761}
18762
18763 // Perform combines for MVEEXT. If it has not been optimized to anything better
18764 // before lowering, it gets converted to a stack store and extloads that perform
18765 // the extend whilst still keeping the same lane ordering.
18768 SelectionDAG &DAG = DCI.DAG;
18769 EVT VT = N->getValueType(0);
18770 SDLoc DL(N);
18771 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18772 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18773
18774 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18775 *DAG.getContext());
18776 auto Extend = [&](SDValue V) {
18777 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
18778 return N->getOpcode() == ARMISD::MVESEXT
18779 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
18780 DAG.getValueType(ExtVT))
18781 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
18782 };
18783
18784 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
18785 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
18786 SDValue Ext = Extend(N->getOperand(0));
18787 return DAG.getMergeValues({Ext, Ext}, DL);
18788 }
18789
18790 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18791 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
18792 ArrayRef<int> Mask = SVN->getMask();
18793 assert(Mask.size() == 2 * VT.getVectorNumElements());
18794 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18795 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18796 SDValue Op0 = SVN->getOperand(0);
18797 SDValue Op1 = SVN->getOperand(1);
18798
18799 auto CheckInregMask = [&](int Start, int Offset) {
18800 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18801 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18802 return false;
18803 return true;
18804 };
18805 SDValue V0 = SDValue(N, 0);
18806 SDValue V1 = SDValue(N, 1);
18807 if (CheckInregMask(0, 0))
18808 V0 = Extend(Op0);
18809 else if (CheckInregMask(0, 1))
18810 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18811 else if (CheckInregMask(0, Mask.size()))
18812 V0 = Extend(Op1);
18813 else if (CheckInregMask(0, Mask.size() + 1))
18814 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18815
18816 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
18817 V1 = Extend(Op1);
18818 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
18819 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18820 else if (CheckInregMask(VT.getVectorNumElements(), 0))
18821 V1 = Extend(Op0);
18822 else if (CheckInregMask(VT.getVectorNumElements(), 1))
18823 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18824
18825 if (V0.getNode() != N || V1.getNode() != N)
18826 return DAG.getMergeValues({V0, V1}, DL);
18827 }
18828
18829 // MVEEXT(load) -> extload, extload
18830 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
18832 return L;
18833
18834 if (!DCI.isAfterLegalizeDAG())
18835 return SDValue();
18836
18837 // Lower to a stack store and reload:
18838 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
18839 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18840 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18841 int NumOuts = N->getNumValues();
18842 assert((NumOuts == 2 || NumOuts == 4) &&
18843 "Expected 2 or 4 outputs to an MVEEXT");
18844 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18845 *DAG.getContext());
18846 if (N->getNumOperands() == 4)
18847 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
18848
18849 MachinePointerInfo MPI =
18851 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
18852 StackPtr, MPI, Align(4));
18853
18855 for (int I = 0; I < NumOuts; I++) {
18856 SDValue Ptr = DAG.getNode(
18857 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18858 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
18860 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
18861 SDValue Load = DAG.getExtLoad(
18862 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
18863 VT, Chain, Ptr, MPI, LoadVT, Align(4));
18864 Loads.push_back(Load);
18865 }
18866
18867 return DAG.getMergeValues(Loads, DL);
18868}
18869
18871 DAGCombinerInfo &DCI) const {
18872 switch (N->getOpcode()) {
18873 default: break;
18874 case ISD::SELECT_CC:
18875 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
18876 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
18877 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
18878 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
18879 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
18880 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
18881 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
18882 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
18883 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
18884 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
18885 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
18886 case ISD::BRCOND:
18887 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
18888 case ARMISD::ADDC:
18889 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
18890 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
18891 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
18892 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
18893 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
18894 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
18895 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
18896 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
18897 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
18900 return PerformExtractEltCombine(N, DCI, Subtarget);
18904 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
18905 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
18906 case ISD::FP_TO_SINT:
18907 case ISD::FP_TO_UINT:
18908 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
18909 case ISD::FADD:
18910 return PerformFADDCombine(N, DCI.DAG, Subtarget);
18911 case ISD::FMUL:
18912 return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
18914 return PerformIntrinsicCombine(N, DCI);
18915 case ISD::SHL:
18916 case ISD::SRA:
18917 case ISD::SRL:
18918 return PerformShiftCombine(N, DCI, Subtarget);
18919 case ISD::SIGN_EXTEND:
18920 case ISD::ZERO_EXTEND:
18921 case ISD::ANY_EXTEND:
18922 return PerformExtendCombine(N, DCI.DAG, Subtarget);
18923 case ISD::FP_EXTEND:
18924 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
18925 case ISD::SMIN:
18926 case ISD::UMIN:
18927 case ISD::SMAX:
18928 case ISD::UMAX:
18929 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
18930 case ARMISD::CMOV:
18931 return PerformCMOVCombine(N, DCI.DAG);
18932 case ARMISD::BRCOND:
18933 return PerformBRCONDCombine(N, DCI.DAG);
18934 case ARMISD::CMPZ:
18935 return PerformCMPZCombine(N, DCI.DAG);
18936 case ARMISD::CSINC:
18937 case ARMISD::CSINV:
18938 case ARMISD::CSNEG:
18939 return PerformCSETCombine(N, DCI.DAG);
18940 case ISD::LOAD:
18941 return PerformLOADCombine(N, DCI, Subtarget);
18942 case ARMISD::VLD1DUP:
18943 case ARMISD::VLD2DUP:
18944 case ARMISD::VLD3DUP:
18945 case ARMISD::VLD4DUP:
18946 return PerformVLDCombine(N, DCI);
18947 case ARMISD::BUILD_VECTOR:
18948 return PerformARMBUILD_VECTORCombine(N, DCI);
18949 case ISD::BITCAST:
18950 return PerformBITCASTCombine(N, DCI, Subtarget);
18951 case ARMISD::PREDICATE_CAST:
18952 return PerformPREDICATE_CASTCombine(N, DCI);
18953 case ARMISD::VECTOR_REG_CAST:
18954 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
18955 case ARMISD::MVETRUNC:
18956 return PerformMVETruncCombine(N, DCI);
18957 case ARMISD::MVESEXT:
18958 case ARMISD::MVEZEXT:
18959 return PerformMVEExtCombine(N, DCI);
18960 case ARMISD::VCMP:
18961 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
18962 case ISD::VECREDUCE_ADD:
18963 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
18964 case ARMISD::VADDVs:
18965 case ARMISD::VADDVu:
18966 case ARMISD::VADDLVs:
18967 case ARMISD::VADDLVu:
18968 case ARMISD::VADDLVAs:
18969 case ARMISD::VADDLVAu:
18970 case ARMISD::VMLAVs:
18971 case ARMISD::VMLAVu:
18972 case ARMISD::VMLALVs:
18973 case ARMISD::VMLALVu:
18974 case ARMISD::VMLALVAs:
18975 case ARMISD::VMLALVAu:
18976 return PerformReduceShuffleCombine(N, DCI.DAG);
18977 case ARMISD::VMOVN:
18978 return PerformVMOVNCombine(N, DCI);
18979 case ARMISD::VQMOVNs:
18980 case ARMISD::VQMOVNu:
18981 return PerformVQMOVNCombine(N, DCI);
18982 case ARMISD::VQDMULH:
18983 return PerformVQDMULHCombine(N, DCI);
18984 case ARMISD::ASRL:
18985 case ARMISD::LSRL:
18986 case ARMISD::LSLL:
18987 return PerformLongShiftCombine(N, DCI.DAG);
18988 case ARMISD::SMULWB: {
18989 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18990 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
18991 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
18992 return SDValue();
18993 break;
18994 }
18995 case ARMISD::SMULWT: {
18996 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18997 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
18998 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
18999 return SDValue();
19000 break;
19001 }
19002 case ARMISD::SMLALBB:
19003 case ARMISD::QADD16b:
19004 case ARMISD::QSUB16b:
19005 case ARMISD::UQADD16b:
19006 case ARMISD::UQSUB16b: {
19007 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19008 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19009 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19010 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19011 return SDValue();
19012 break;
19013 }
19014 case ARMISD::SMLALBT: {
19015 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
19016 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19017 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
19018 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19019 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
19020 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
19021 return SDValue();
19022 break;
19023 }
19024 case ARMISD::SMLALTB: {
19025 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
19026 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19027 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
19028 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19029 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
19030 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
19031 return SDValue();
19032 break;
19033 }
19034 case ARMISD::SMLALTT: {
19035 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19036 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19037 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19038 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19039 return SDValue();
19040 break;
19041 }
19042 case ARMISD::QADD8b:
19043 case ARMISD::QSUB8b:
19044 case ARMISD::UQADD8b:
19045 case ARMISD::UQSUB8b: {
19046 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19047 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
19048 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19049 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19050 return SDValue();
19051 break;
19052 }
19053 case ARMISD::VBSP:
19054 if (N->getOperand(1) == N->getOperand(2))
19055 return N->getOperand(1);
19056 return SDValue();
19057 case ISD::INTRINSIC_VOID:
19058 case ISD::INTRINSIC_W_CHAIN:
19059 switch (N->getConstantOperandVal(1)) {
19060 case Intrinsic::arm_neon_vld1:
19061 case Intrinsic::arm_neon_vld1x2:
19062 case Intrinsic::arm_neon_vld1x3:
19063 case Intrinsic::arm_neon_vld1x4:
19064 case Intrinsic::arm_neon_vld2:
19065 case Intrinsic::arm_neon_vld3:
19066 case Intrinsic::arm_neon_vld4:
19067 case Intrinsic::arm_neon_vld2lane:
19068 case Intrinsic::arm_neon_vld3lane:
19069 case Intrinsic::arm_neon_vld4lane:
19070 case Intrinsic::arm_neon_vld2dup:
19071 case Intrinsic::arm_neon_vld3dup:
19072 case Intrinsic::arm_neon_vld4dup:
19073 case Intrinsic::arm_neon_vst1:
19074 case Intrinsic::arm_neon_vst1x2:
19075 case Intrinsic::arm_neon_vst1x3:
19076 case Intrinsic::arm_neon_vst1x4:
19077 case Intrinsic::arm_neon_vst2:
19078 case Intrinsic::arm_neon_vst3:
19079 case Intrinsic::arm_neon_vst4:
19080 case Intrinsic::arm_neon_vst2lane:
19081 case Intrinsic::arm_neon_vst3lane:
19082 case Intrinsic::arm_neon_vst4lane:
19083 return PerformVLDCombine(N, DCI);
19084 case Intrinsic::arm_mve_vld2q:
19085 case Intrinsic::arm_mve_vld4q:
19086 case Intrinsic::arm_mve_vst2q:
19087 case Intrinsic::arm_mve_vst4q:
19088 return PerformMVEVLDCombine(N, DCI);
19089 default: break;
19090 }
19091 break;
19092 }
19093 return SDValue();
19094}
19095
19096 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
19097 EVT VT) const {
19098 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
19099}
19100
19101 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
19102 Align Alignment,
19103 MachineMemOperand::Flags,
19104 unsigned *Fast) const {
19105 // Depends what it gets converted into if the type is weird.
19106 if (!VT.isSimple())
19107 return false;
19108
19109 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
19110 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
19111 auto Ty = VT.getSimpleVT().SimpleTy;
19112
19113 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
19114 // Unaligned access can use (for example) LDRB, LDRH, LDR
19115 if (AllowsUnaligned) {
19116 if (Fast)
19117 *Fast = Subtarget->hasV7Ops();
19118 return true;
19119 }
19120 }
19121
19122 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
19123 // For any little-endian targets with neon, we can support unaligned ld/st
19124 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
19125 // A big-endian target may also explicitly support unaligned accesses
19126 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
19127 if (Fast)
19128 *Fast = 1;
19129 return true;
19130 }
19131 }
19132
19133 if (!Subtarget->hasMVEIntegerOps())
19134 return false;
19135
19136 // These are for predicates
19137 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
19138 Ty == MVT::v2i1)) {
19139 if (Fast)
19140 *Fast = 1;
19141 return true;
19142 }
19143
19144 // These are for truncated stores/narrowing loads. They are fine so long as
19145 // the alignment is at least the size of the item being loaded
19146 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
19147 Alignment >= VT.getScalarSizeInBits() / 8) {
19148 if (Fast)
19149 *Fast = true;
19150 return true;
19151 }
19152
19153 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
19154 // VSTRW.U32 all store the vector register in exactly the same format, and
19155 // differ only in the range of their immediate offset field and the required
19156 // alignment. So there is always a store that can be used, regardless of
19157 // actual type.
19158 //
19159 // For big endian, that is not the case, but we can still emit a (VSTRB.U8;
19160 // VREV64.8) pair and get the same effect. This will likely be better than
19161 // aligning the vector through the stack.
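// For example (little endian), a 1-byte-aligned v4i32 store can be emitted as
// "vstrb.8 q0, [r0]" instead of "vstrw.32 q0, [r0]"; both write the same bytes
// and differ only in their alignment requirement and immediate offset range.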
19162 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
19163 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
19164 Ty == MVT::v2f64) {
19165 if (Fast)
19166 *Fast = 1;
19167 return true;
19168 }
19169
19170 return false;
19171}
19172
19173 EVT ARMTargetLowering::getOptimalMemOpType(
19174 LLVMContext &Context, const MemOp &Op,
19175 const AttributeList &FuncAttributes) const {
19176 // See if we can use NEON instructions for this...
19177 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
19178 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
19179 unsigned Fast;
19180 if (Op.size() >= 16 &&
19181 (Op.isAligned(Align(16)) ||
19182 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
19184 Fast))) {
19185 return MVT::v2f64;
19186 } else if (Op.size() >= 8 &&
19187 (Op.isAligned(Align(8)) ||
19189 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
19190 Fast))) {
19191 return MVT::f64;
19192 }
19193 }
19194
19195 // Let the target-independent logic figure it out.
19196 return MVT::Other;
19197}
19198
19199// 64-bit integers are split into their high and low parts and held in two
19200// different registers, so the trunc is free since the low register can just
19201// be used.
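// For example, an i64 held in the register pair r0/r1 is truncated to i32 by
// simply continuing to use the register holding the low half; no instruction
// is needed.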
19202bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19203 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19204 return false;
19205 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19206 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19207 return (SrcBits == 64 && DestBits == 32);
19208}
19209
19210 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
19211 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19212 !DstVT.isInteger())
19213 return false;
19214 unsigned SrcBits = SrcVT.getSizeInBits();
19215 unsigned DestBits = DstVT.getSizeInBits();
19216 return (SrcBits == 64 && DestBits == 32);
19217}
19218
19219 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
19220 if (Val.getOpcode() != ISD::LOAD)
19221 return false;
19222
19223 EVT VT1 = Val.getValueType();
19224 if (!VT1.isSimple() || !VT1.isInteger() ||
19225 !VT2.isSimple() || !VT2.isInteger())
19226 return false;
19227
19228 switch (VT1.getSimpleVT().SimpleTy) {
19229 default: break;
19230 case MVT::i1:
19231 case MVT::i8:
19232 case MVT::i16:
19233 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19234 return true;
19235 }
19236
19237 return false;
19238}
19239
19240 bool ARMTargetLowering::isFNegFree(EVT VT) const {
19241 if (!VT.isSimple())
19242 return false;
19243
19244 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19245 // negate values directly (fneg is free). So, we don't want to let the DAG
19246 // combiner rewrite fneg into xors and some other instructions. For f16 and
19247 // FullFP16 argument passing, some bitcast nodes may be introduced,
19248 // triggering this DAG combine rewrite, so we are avoiding that with this.
19249 switch (VT.getSimpleVT().SimpleTy) {
19250 default: break;
19251 case MVT::f16:
19252 return Subtarget->hasFullFP16();
19253 }
19254
19255 return false;
19256}
19257
19258 Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
19259 if (!Subtarget->hasMVEIntegerOps())
19260 return nullptr;
19261 Type *SVIType = SVI->getType();
19262 Type *ScalarType = SVIType->getScalarType();
19263
19264 if (ScalarType->isFloatTy())
19265 return Type::getInt32Ty(SVIType->getContext());
19266 if (ScalarType->isHalfTy())
19267 return Type::getInt16Ty(SVIType->getContext());
19268 return nullptr;
19269}
19270
19271 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
19272 EVT VT = ExtVal.getValueType();
19273
19274 if (!isTypeLegal(VT))
19275 return false;
19276
19277 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19278 if (Ld->isExpandingLoad())
19279 return false;
19280 }
19281
19282 if (Subtarget->hasMVEIntegerOps())
19283 return true;
19284
19285 // Don't create a loadext if we can fold the extension into a wide/long
19286 // instruction.
19287 // If there's more than one user instruction, the loadext is desirable no
19288 // matter what. There can be two uses by the same instruction.
19289 if (ExtVal->use_empty() ||
19290 !ExtVal->user_begin()->isOnlyUserOf(ExtVal.getNode()))
19291 return true;
19292
19293 SDNode *U = *ExtVal->user_begin();
19294 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19295 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19296 return false;
19297
19298 return true;
19299}
19300
19301 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
19302 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19303 return false;
19304
19305 if (!isTypeLegal(EVT::getEVT(Ty1)))
19306 return false;
19307
19308 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19309
19310 // Assuming the caller doesn't have a zeroext or signext return parameter,
19311 // truncation all the way down to i1 is valid.
19312 return true;
19313}
19314
19315/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19316/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19317/// expanded to FMAs when this method returns true, otherwise fmuladd is
19318/// expanded to fmul + fadd.
19319///
19320/// ARM supports both fused and unfused multiply-add operations; we already
19321/// lower a pair of fmul and fadd to the latter so it's not clear that there
19322/// would be a gain or that the gain would be worthwhile enough to risk
19323/// correctness bugs.
19324///
19325/// For MVE, we set this to true as it helps simplify the need for some
19326/// patterns (and we don't have the non-fused floating point instruction).
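/// For example, with MVE floating point a v4f32 llvm.fmuladd becomes a single
/// VFMA.F32, while a scalar f32 fmuladd on a subtarget without useFPVFMx()
/// is instead expanded to separate VMUL.F32 and VADD.F32 instructions.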
19327bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19328 EVT VT) const {
19329 if (Subtarget->useSoftFloat())
19330 return false;
19331
19332 if (!VT.isSimple())
19333 return false;
19334
19335 switch (VT.getSimpleVT().SimpleTy) {
19336 case MVT::v4f32:
19337 case MVT::v8f16:
19338 return Subtarget->hasMVEFloatOps();
19339 case MVT::f16:
19340 return Subtarget->useFPVFMx16();
19341 case MVT::f32:
19342 return Subtarget->useFPVFMx();
19343 case MVT::f64:
19344 return Subtarget->useFPVFMx64();
19345 default:
19346 break;
19347 }
19348
19349 return false;
19350}
19351
19352static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19353 if (V < 0)
19354 return false;
19355
19356 unsigned Scale = 1;
19357 switch (VT.getSimpleVT().SimpleTy) {
19358 case MVT::i1:
19359 case MVT::i8:
19360 // Scale == 1;
19361 break;
19362 case MVT::i16:
19363 // Scale == 2;
19364 Scale = 2;
19365 break;
19366 default:
19367 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19368 // Scale == 4;
19369 Scale = 4;
19370 break;
19371 }
19372
19373 if ((V & (Scale - 1)) != 0)
19374 return false;
19375 return isUInt<5>(V / Scale);
19376}
19377
19378static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19379 const ARMSubtarget *Subtarget) {
19380 if (!VT.isInteger() && !VT.isFloatingPoint())
19381 return false;
19382 if (VT.isVector() && Subtarget->hasNEON())
19383 return false;
19384 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19385 !Subtarget->hasMVEFloatOps())
19386 return false;
19387
19388 bool IsNeg = false;
19389 if (V < 0) {
19390 IsNeg = true;
19391 V = -V;
19392 }
19393
19394 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19395
19396 // MVE: size * imm7
19397 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19398 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19399 case MVT::i32:
19400 case MVT::f32:
19401 return isShiftedUInt<7,2>(V);
19402 case MVT::i16:
19403 case MVT::f16:
19404 return isShiftedUInt<7,1>(V);
19405 case MVT::i8:
19406 return isUInt<7>(V);
19407 default:
19408 return false;
19409 }
19410 }
19411
19412 // half VLDR: 2 * imm8
19413 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19414 return isShiftedUInt<8, 1>(V);
19415 // VLDR and LDRD: 4 * imm8
19416 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19417 return isShiftedUInt<8, 2>(V);
19418
19419 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19420 // + imm12 or - imm8
19421 if (IsNeg)
19422 return isUInt<8>(V);
19423 return isUInt<12>(V);
19424 }
19425
19426 return false;
19427}
19428
19429/// isLegalAddressImmediate - Return true if the integer value can be used
19430/// as the offset of the target addressing mode for load / store of the
19431/// given type.
19432static bool isLegalAddressImmediate(int64_t V, EVT VT,
19433 const ARMSubtarget *Subtarget) {
19434 if (V == 0)
19435 return true;
19436
19437 if (!VT.isSimple())
19438 return false;
19439
19440 if (Subtarget->isThumb1Only())
19441 return isLegalT1AddressImmediate(V, VT);
19442 else if (Subtarget->isThumb2())
19443 return isLegalT2AddressImmediate(V, VT, Subtarget);
19444
19445 // ARM mode.
19446 if (V < 0)
19447 V = - V;
19448 switch (VT.getSimpleVT().SimpleTy) {
19449 default: return false;
19450 case MVT::i1:
19451 case MVT::i8:
19452 case MVT::i32:
19453 // +- imm12
19454 return isUInt<12>(V);
19455 case MVT::i16:
19456 // +- imm8
19457 return isUInt<8>(V);
19458 case MVT::f32:
19459 case MVT::f64:
19460 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19461 return false;
19462 return isShiftedUInt<8, 2>(V);
19463 }
19464}
19465
19466 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
19467 EVT VT) const {
19468 int Scale = AM.Scale;
19469 if (Scale < 0)
19470 return false;
19471
19472 switch (VT.getSimpleVT().SimpleTy) {
19473 default: return false;
19474 case MVT::i1:
19475 case MVT::i8:
19476 case MVT::i16:
19477 case MVT::i32:
19478 if (Scale == 1)
19479 return true;
19480 // r + r << imm
19481 Scale = Scale & ~1;
19482 return Scale == 2 || Scale == 4 || Scale == 8;
19483 case MVT::i64:
19484 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19485 // version in Thumb mode.
19486 // r + r
19487 if (Scale == 1)
19488 return true;
19489 // r * 2 (this can be lowered to r + r).
19490 if (!AM.HasBaseReg && Scale == 2)
19491 return true;
19492 return false;
19493 case MVT::isVoid:
19494 // Note, we allow "void" uses (basically, uses that aren't loads or
19495 // stores), because ARM allows folding a scale into many arithmetic
19496 // operations. This should be made more precise and revisited later.
19497
19498 // Allow r << imm, but the imm has to be a multiple of two.
19499 if (Scale & 1) return false;
19500 return isPowerOf2_32(Scale);
19501 }
19502}
19503
19504 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
19505 EVT VT) const {
19506 const int Scale = AM.Scale;
19507
19508 // Negative scales are not supported in Thumb1.
19509 if (Scale < 0)
19510 return false;
19511
19512 // Thumb1 addressing modes do not support register scaling excepting the
19513 // following cases:
19514 // 1. Scale == 1 means no scaling.
19515 // 2. Scale == 2: this can be lowered to r + r if there is no base register.
19516 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19517}
19518
19519/// isLegalAddressingMode - Return true if the addressing mode represented
19520/// by AM is legal for this target, for a load/store of the specified type.
19521 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
19522 const AddrMode &AM, Type *Ty,
19523 unsigned AS, Instruction *I) const {
19524 EVT VT = getValueType(DL, Ty, true);
19525 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19526 return false;
19527
19528 // Can never fold addr of global into load/store.
19529 if (AM.BaseGV)
19530 return false;
19531
19532 switch (AM.Scale) {
19533 case 0: // no scale reg, must be "r+i" or "r", or "i".
19534 break;
19535 default:
19536 // ARM doesn't support any R+R*scale+imm addr modes.
19537 if (AM.BaseOffs)
19538 return false;
19539
19540 if (!VT.isSimple())
19541 return false;
19542
19543 if (Subtarget->isThumb1Only())
19544 return isLegalT1ScaledAddressingMode(AM, VT);
19545
19546 if (Subtarget->isThumb2())
19547 return isLegalT2ScaledAddressingMode(AM, VT);
19548
19549 int Scale = AM.Scale;
19550 switch (VT.getSimpleVT().SimpleTy) {
19551 default: return false;
19552 case MVT::i1:
19553 case MVT::i8:
19554 case MVT::i32:
19555 if (Scale < 0) Scale = -Scale;
19556 if (Scale == 1)
19557 return true;
19558 // r + r << imm
19559 return isPowerOf2_32(Scale & ~1);
19560 case MVT::i16:
19561 case MVT::i64:
19562 // r +/- r
19563 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19564 return true;
19565 // r * 2 (this can be lowered to r + r).
19566 if (!AM.HasBaseReg && Scale == 2)
19567 return true;
19568 return false;
19569
19570 case MVT::isVoid:
19571 // Note, we allow "void" uses (basically, uses that aren't loads or
19572 // stores), because ARM allows folding a scale into many arithmetic
19573 // operations. This should be made more precise and revisited later.
19574
19575 // Allow r << imm, but the imm has to be a multiple of two.
19576 if (Scale & 1) return false;
19577 return isPowerOf2_32(Scale);
19578 }
19579 }
19580 return true;
19581}
19582
19583 /// isLegalICmpImmediate - Return true if the specified immediate is a legal
19584 /// icmp immediate, that is, the target has icmp instructions which can compare
19585 /// a register against the immediate without having to materialize the
19586 /// immediate into a register.
19587 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
19588 // Thumb2 and ARM modes can use cmn for negative immediates.
19589 if (!Subtarget->isThumb())
19590 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19591 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19592 if (Subtarget->isThumb2())
19593 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19594 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19595 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19596 return Imm >= 0 && Imm <= 255;
19597}
19598
19599 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
19600 /// *or sub* immediate, that is, the target has add or sub instructions which
19601 /// can add the immediate to a register without having to materialize the
19602 /// immediate into a register.
19603 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
19604 // Same encoding for add/sub, just flip the sign.
19605 uint64_t AbsImm = AbsoluteValue(Imm);
19606 if (!Subtarget->isThumb())
19607 return ARM_AM::getSOImmVal(AbsImm) != -1;
19608 if (Subtarget->isThumb2())
19609 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19610 // Thumb1 only has 8-bit unsigned immediate.
19611 return AbsImm <= 255;
19612}
19613
19614// Return false to prevent folding
19615// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19616// if the folding leads to worse code.
19617 bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode,
19618 SDValue ConstNode) const {
19619 // Let the DAGCombiner decide for vector types and large types.
19620 const EVT VT = AddNode.getValueType();
19621 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19622 return true;
19623
19624 // It is worse if c0 is legal add immediate, while c1*c0 is not
19625 // and has to be composed by at least two instructions.
19626 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19627 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19628 const int64_t C0 = C0Node->getSExtValue();
19629 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
19630 if (!isLegalAddImmediate(C0) || !CA.isSignedIntN(32))
19631 return true;
19632 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19633 return false;
19634
19635 // Default to true and let the DAGCombiner decide.
19636 return true;
19637}
19638
19639 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
19640 bool isSEXTLoad, SDValue &Base,
19641 SDValue &Offset, bool &isInc,
19642 SelectionDAG &DAG) {
19643 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19644 return false;
19645
19646 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19647 // AddressingMode 3
19648 Base = Ptr->getOperand(0);
19649 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19650 int RHSC = (int)RHS->getZExtValue();
19651 if (RHSC < 0 && RHSC > -256) {
19652 assert(Ptr->getOpcode() == ISD::ADD);
19653 isInc = false;
19654 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19655 return true;
19656 }
19657 }
19658 isInc = (Ptr->getOpcode() == ISD::ADD);
19659 Offset = Ptr->getOperand(1);
19660 return true;
19661 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19662 // AddressingMode 2
19663 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19664 int RHSC = (int)RHS->getZExtValue();
19665 if (RHSC < 0 && RHSC > -0x1000) {
19666 assert(Ptr->getOpcode() == ISD::ADD);
19667 isInc = false;
19668 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19669 Base = Ptr->getOperand(0);
19670 return true;
19671 }
19672 }
19673
19674 if (Ptr->getOpcode() == ISD::ADD) {
19675 isInc = true;
19676 ARM_AM::ShiftOpc ShOpcVal=
19677 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
19678 if (ShOpcVal != ARM_AM::no_shift) {
19679 Base = Ptr->getOperand(1);
19680 Offset = Ptr->getOperand(0);
19681 } else {
19682 Base = Ptr->getOperand(0);
19683 Offset = Ptr->getOperand(1);
19684 }
19685 return true;
19686 }
19687
19688 isInc = (Ptr->getOpcode() == ISD::ADD);
19689 Base = Ptr->getOperand(0);
19690 Offset = Ptr->getOperand(1);
19691 return true;
19692 }
19693
19694 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19695 return false;
19696}
19697
19698 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
19699 bool isSEXTLoad, SDValue &Base,
19700 SDValue &Offset, bool &isInc,
19701 SelectionDAG &DAG) {
19702 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19703 return false;
19704
19705 Base = Ptr->getOperand(0);
19706 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19707 int RHSC = (int)RHS->getZExtValue();
19708 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19709 assert(Ptr->getOpcode() == ISD::ADD);
19710 isInc = false;
19711 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19712 return true;
19713 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19714 isInc = Ptr->getOpcode() == ISD::ADD;
19715 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19716 return true;
19717 }
19718 }
19719
19720 return false;
19721}
19722
19723static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19724 bool isSEXTLoad, bool IsMasked, bool isLE,
19725 SDValue &Base, SDValue &Offset,
19726 bool &isInc, SelectionDAG &DAG) {
19727 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19728 return false;
19729 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19730 return false;
19731
19732 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19733 // as opposed to a vldrw.32). This can allow extra addressing modes or
19734 // alignments for what is otherwise an equivalent instruction.
19735 bool CanChangeType = isLE && !IsMasked;
19736
19737 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
19738 int RHSC = (int)RHS->getZExtValue();
19739
19740 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19741 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19742 assert(Ptr->getOpcode() == ISD::ADD);
19743 isInc = false;
19744 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19745 return true;
19746 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19747 isInc = Ptr->getOpcode() == ISD::ADD;
19748 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19749 return true;
19750 }
19751 return false;
19752 };
19753
19754 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19755 // (in BE/masked) type.
19756 Base = Ptr->getOperand(0);
19757 if (VT == MVT::v4i16) {
19758 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19759 return true;
19760 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19761 if (IsInRange(RHSC, 0x80, 1))
19762 return true;
19763 } else if (Alignment >= 4 &&
19764 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19765 IsInRange(RHSC, 0x80, 4))
19766 return true;
19767 else if (Alignment >= 2 &&
19768 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19769 IsInRange(RHSC, 0x80, 2))
19770 return true;
19771 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19772 return true;
19773 return false;
19774}
19775
19776/// getPreIndexedAddressParts - returns true by value, base pointer and
19777/// offset pointer and addressing mode by reference if the node's address
19778/// can be legally represented as pre-indexed load / store address.
19779bool
19780 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
19781 SDValue &Offset,
19782 ISD::MemIndexedMode &AM,
19783 SelectionDAG &DAG) const {
19784 if (Subtarget->isThumb1Only())
19785 return false;
19786
19787 EVT VT;
19788 SDValue Ptr;
19789 Align Alignment;
19790 bool isSEXTLoad = false;
19791 bool IsMasked = false;
19792 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19793 Ptr = LD->getBasePtr();
19794 VT = LD->getMemoryVT();
19795 Alignment = LD->getAlign();
19796 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19797 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19798 Ptr = ST->getBasePtr();
19799 VT = ST->getMemoryVT();
19800 Alignment = ST->getAlign();
19801 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19802 Ptr = LD->getBasePtr();
19803 VT = LD->getMemoryVT();
19804 Alignment = LD->getAlign();
19805 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19806 IsMasked = true;
19807 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19808 Ptr = ST->getBasePtr();
19809 VT = ST->getMemoryVT();
19810 Alignment = ST->getAlign();
19811 IsMasked = true;
19812 } else
19813 return false;
19814
19815 bool isInc;
19816 bool isLegal = false;
19817 if (VT.isVector())
19818 isLegal = Subtarget->hasMVEIntegerOps() &&
19819 getMVEIndexedAddressParts(
19820 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19821 Subtarget->isLittle(), Base, Offset, isInc, DAG);
19822 else {
19823 if (Subtarget->isThumb2())
19824 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19825 Offset, isInc, DAG);
19826 else
19827 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19828 Offset, isInc, DAG);
19829 }
19830 if (!isLegal)
19831 return false;
19832
19833 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
19834 return true;
19835}
19836
19837/// getPostIndexedAddressParts - returns true by value, base pointer and
19838/// offset pointer and addressing mode by reference if this node can be
19839/// combined with a load / store to form a post-indexed load / store.
19840 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
19841 SDValue &Base,
19842 SDValue &Offset,
19843 ISD::MemIndexedMode &AM,
19844 SelectionDAG &DAG) const {
19845 EVT VT;
19846 SDValue Ptr;
19847 Align Alignment;
19848 bool isSEXTLoad = false, isNonExt;
19849 bool IsMasked = false;
19850 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19851 VT = LD->getMemoryVT();
19852 Ptr = LD->getBasePtr();
19853 Alignment = LD->getAlign();
19854 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19855 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19856 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19857 VT = ST->getMemoryVT();
19858 Ptr = ST->getBasePtr();
19859 Alignment = ST->getAlign();
19860 isNonExt = !ST->isTruncatingStore();
19861 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19862 VT = LD->getMemoryVT();
19863 Ptr = LD->getBasePtr();
19864 Alignment = LD->getAlign();
19865 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19866 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19867 IsMasked = true;
19868 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19869 VT = ST->getMemoryVT();
19870 Ptr = ST->getBasePtr();
19871 Alignment = ST->getAlign();
19872 isNonExt = !ST->isTruncatingStore();
19873 IsMasked = true;
19874 } else
19875 return false;
19876
19877 if (Subtarget->isThumb1Only()) {
19878 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
19879 // must be non-extending/truncating, i32, with an offset of 4.
19880 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
19881 if (Op->getOpcode() != ISD::ADD || !isNonExt)
19882 return false;
19883 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
19884 if (!RHS || RHS->getZExtValue() != 4)
19885 return false;
19886 if (Alignment < Align(4))
19887 return false;
19888
19889 Offset = Op->getOperand(1);
19890 Base = Op->getOperand(0);
19891 AM = ISD::POST_INC;
19892 return true;
19893 }
19894
19895 bool isInc;
19896 bool isLegal = false;
19897 if (VT.isVector())
19898 isLegal = Subtarget->hasMVEIntegerOps() &&
19899 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
19900 Subtarget->isLittle(), Base, Offset,
19901 isInc, DAG);
19902 else {
19903 if (Subtarget->isThumb2())
19904 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19905 isInc, DAG);
19906 else
19907 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19908 isInc, DAG);
19909 }
19910 if (!isLegal)
19911 return false;
19912
19913 if (Ptr != Base) {
19914 // Swap base ptr and offset to catch more post-index load / store when
19915 // it's legal. In Thumb2 mode, offset must be an immediate.
19916 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
19917 !Subtarget->isThumb2())
19918 std::swap(Base, Offset);
19919
19920 // Post-indexed load / store update the base pointer.
19921 if (Ptr != Base)
19922 return false;
19923 }
19924
19925 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
19926 return true;
19927}
19928
19929 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
19930 KnownBits &Known,
19931 const APInt &DemandedElts,
19932 const SelectionDAG &DAG,
19933 unsigned Depth) const {
19934 unsigned BitWidth = Known.getBitWidth();
19935 Known.resetAll();
19936 switch (Op.getOpcode()) {
19937 default: break;
19938 case ARMISD::ADDC:
19939 case ARMISD::ADDE:
19940 case ARMISD::SUBC:
19941 case ARMISD::SUBE:
19942 // Special cases when we convert a carry to a boolean.
19943 if (Op.getResNo() == 0) {
19944 SDValue LHS = Op.getOperand(0);
19945 SDValue RHS = Op.getOperand(1);
19946 // (ADDE 0, 0, C) will give us a single bit.
19947 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
19948 isNullConstant(RHS)) {
19949 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
19950 return;
19951 }
19952 }
19953 break;
19954 case ARMISD::CMOV: {
19955 // Bits are known zero/one if known on the LHS and RHS.
19956 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
19957 if (Known.isUnknown())
19958 return;
19959
19960 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
19961 Known = Known.intersectWith(KnownRHS);
19962 return;
19963 }
19964 case ISD::INTRINSIC_W_CHAIN: {
19965 Intrinsic::ID IntID =
19966 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
19967 switch (IntID) {
19968 default: return;
19969 case Intrinsic::arm_ldaex:
19970 case Intrinsic::arm_ldrex: {
19971 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
19972 unsigned MemBits = VT.getScalarSizeInBits();
19973 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
19974 return;
19975 }
19976 }
19977 }
19978 case ARMISD::BFI: {
19979 // Conservatively, we can recurse down the first operand
19980 // and just mask out all affected bits.
19981 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
19982
19983 // The operand to BFI is already a mask suitable for removing the bits it
19984 // sets.
19985 const APInt &Mask = Op.getConstantOperandAPInt(2);
19986 Known.Zero &= Mask;
19987 Known.One &= Mask;
19988 return;
19989 }
19990 case ARMISD::VGETLANEs:
19991 case ARMISD::VGETLANEu: {
19992 const SDValue &SrcSV = Op.getOperand(0);
19993 EVT VecVT = SrcSV.getValueType();
19994 assert(VecVT.isVector() && "VGETLANE expected a vector type");
19995 const unsigned NumSrcElts = VecVT.getVectorNumElements();
19996 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
19997 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
19998 "VGETLANE index out of bounds");
19999 unsigned Idx = Pos->getZExtValue();
20000 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
20001 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
20002
20003 EVT VT = Op.getValueType();
20004 const unsigned DstSz = VT.getScalarSizeInBits();
20005 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
20006 (void)SrcSz;
20007 assert(SrcSz == Known.getBitWidth());
20008 assert(DstSz > SrcSz);
20009 if (Op.getOpcode() == ARMISD::VGETLANEs)
20010 Known = Known.sext(DstSz);
20011 else {
20012 Known = Known.zext(DstSz);
20013 }
20014 assert(DstSz == Known.getBitWidth());
20015 break;
20016 }
20017 case ARMISD::VMOVrh: {
20018 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20019 assert(KnownOp.getBitWidth() == 16);
20020 Known = KnownOp.zext(32);
20021 break;
20022 }
20023 case ARMISD::CSINC:
20024 case ARMISD::CSINV:
20025 case ARMISD::CSNEG: {
20026 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20027 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
20028
20029 // The result is either:
20030 // CSINC: KnownOp0 or KnownOp1 + 1
20031 // CSINV: KnownOp0 or ~KnownOp1
20032 // CSNEG: KnownOp0 or KnownOp1 * -1
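// For example, if KnownOp1 is known to be 0, then for CSINC the second
// candidate value is known to be 1, and the final Known is the intersection
// of that with KnownOp0, since either value may be selected at run time.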
20033 if (Op.getOpcode() == ARMISD::CSINC)
20034 KnownOp1 =
20035 KnownBits::add(KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
20036 else if (Op.getOpcode() == ARMISD::CSINV)
20037 std::swap(KnownOp1.Zero, KnownOp1.One);
20038 else if (Op.getOpcode() == ARMISD::CSNEG)
20039 KnownOp1 = KnownBits::mul(KnownOp1,
20040 KnownBits::makeConstant(APInt::getAllOnes(32)));
20041
20042 Known = KnownOp0.intersectWith(KnownOp1);
20043 break;
20044 }
20045 case ARMISD::VORRIMM:
20046 case ARMISD::VBICIMM: {
20047 unsigned Encoded = Op.getConstantOperandVal(1);
20048 unsigned DecEltBits = 0;
20049 uint64_t DecodedVal = ARM_AM::decodeVMOVModImm(Encoded, DecEltBits);
20050
20051 unsigned EltBits = Op.getScalarValueSizeInBits();
20052 if (EltBits != DecEltBits) {
20053 // Be conservative: only update Known when EltBits == DecEltBits.
20054 // This is believed to always be true for VORRIMM/VBICIMM today, but if
20055 // that changes in the future, doing nothing here is safer than risking
20056 // subtle bugs.
20057 break;
20058 }
20059
20060 KnownBits KnownLHS = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20061 bool IsVORR = Op.getOpcode() == ARMISD::VORRIMM;
20062 APInt Imm(DecEltBits, DecodedVal);
20063
20064 Known.One = IsVORR ? (KnownLHS.One | Imm) : (KnownLHS.One & ~Imm);
20065 Known.Zero = IsVORR ? (KnownLHS.Zero & ~Imm) : (KnownLHS.Zero | Imm);
20066 break;
20067 }
20068 }
20069}
20070
20071 bool ARMTargetLowering::targetShrinkDemandedConstant(
20072 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
20073 TargetLoweringOpt &TLO) const {
20074 // Delay optimization, so we don't have to deal with illegal types, or block
20075 // optimizations.
20076 if (!TLO.LegalOps)
20077 return false;
20078
20079 // Only optimize AND for now.
20080 if (Op.getOpcode() != ISD::AND)
20081 return false;
20082
20083 EVT VT = Op.getValueType();
20084
20085 // Ignore vectors.
20086 if (VT.isVector())
20087 return false;
20088
20089 assert(VT == MVT::i32 && "Unexpected integer type");
20090
20091 // Make sure the RHS really is a constant.
20092 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20093 if (!C)
20094 return false;
20095
20096 unsigned Mask = C->getZExtValue();
20097
20098 unsigned Demanded = DemandedBits.getZExtValue();
20099 unsigned ShrunkMask = Mask & Demanded;
20100 unsigned ExpandedMask = Mask | ~Demanded;
20101
20102 // If the mask is all zeros, let the target-independent code replace the
20103 // result with zero.
20104 if (ShrunkMask == 0)
20105 return false;
20106
20107 // If the mask is all ones, erase the AND. (Currently, the target-independent
20108 // code won't do this, so we have to do it explicitly to avoid an infinite
20109 // loop in obscure cases.)
20110 if (ExpandedMask == ~0U)
20111 return TLO.CombineTo(Op, Op.getOperand(0));
20112
20113 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
20114 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
20115 };
20116 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
20117 if (NewMask == Mask)
20118 return true;
20119 SDLoc DL(Op);
20120 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
20121 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
20122 return TLO.CombineTo(Op, NewOp);
20123 };
20124
20125 // Prefer uxtb mask.
20126 if (IsLegalMask(0xFF))
20127 return UseMask(0xFF);
20128
20129 // Prefer uxth mask.
20130 if (IsLegalMask(0xFFFF))
20131 return UseMask(0xFFFF);
20132
20133 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
20134 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20135 if (ShrunkMask < 256)
20136 return UseMask(ShrunkMask);
20137
20138 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
20139 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20140 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
20141 return UseMask(ExpandedMask);
20142
20143 // Potential improvements:
20144 //
20145 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
20146 // We could try to prefer Thumb1 immediates which can be lowered to a
20147 // two-instruction sequence.
20148 // We could try to recognize more legal ARM/Thumb2 immediates here.
20149
20150 return false;
20151}
20152
20153 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
20154 SDValue Op, const APInt &OriginalDemandedBits,
20155 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
20156 unsigned Depth) const {
20157 unsigned Opc = Op.getOpcode();
20158
20159 switch (Opc) {
20160 case ARMISD::ASRL:
20161 case ARMISD::LSRL: {
20162 // If this is result 0 and the other result is unused, see if the demand
20163 // bits allow us to shrink this long shift into a standard small shift in
20164 // the opposite direction.
20165 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
20166 isa<ConstantSDNode>(Op->getOperand(2))) {
20167 unsigned ShAmt = Op->getConstantOperandVal(2);
20168 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
20169 << (32 - ShAmt)))
20170 return TLO.CombineTo(
20171 Op, TLO.DAG.getNode(
20172 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
20173 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
20174 }
20175 break;
20176 }
20177 case ARMISD::VBICIMM: {
20178 SDValue Op0 = Op.getOperand(0);
20179 unsigned ModImm = Op.getConstantOperandVal(1);
20180 unsigned EltBits = 0;
20181 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
20182 if ((OriginalDemandedBits & Mask) == 0)
20183 return TLO.CombineTo(Op, Op0);
20184 }
20185 }
20186
20187 return TargetLowering::SimplifyDemandedBitsForTargetNode(
20188 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
20189}
20190
20191//===----------------------------------------------------------------------===//
20192// ARM Inline Assembly Support
20193//===----------------------------------------------------------------------===//
20194
20195const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
20196 // At this point, we have to lower this constraint to something else, so we
20197 // lower it to an "r" or "w". However, by doing this we will force the result
20198 // to be in register, while the X constraint is much more permissive.
20199 //
20200 // Although we are correct (we are free to emit anything, without
20201 // constraints), we might break use cases that would expect us to be more
20202 // efficient and emit something else.
20203 if (!Subtarget->hasVFP2Base())
20204 return "r";
20205 if (ConstraintVT.isFloatingPoint())
20206 return "w";
20207 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20208 (ConstraintVT.getSizeInBits() == 64 ||
20209 ConstraintVT.getSizeInBits() == 128))
20210 return "w";
20211
20212 return "r";
20213}
20214
20215/// getConstraintType - Given a constraint letter, return the type of
20216/// constraint it is for this target.
20217 ARMTargetLowering::ConstraintType
20218 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
20219 unsigned S = Constraint.size();
20220 if (S == 1) {
20221 switch (Constraint[0]) {
20222 default: break;
20223 case 'l': return C_RegisterClass;
20224 case 'w': return C_RegisterClass;
20225 case 'h': return C_RegisterClass;
20226 case 'x': return C_RegisterClass;
20227 case 't': return C_RegisterClass;
20228 case 'j': return C_Immediate; // Constant for movw.
20229 // An address with a single base register. Due to the way we
20230 // currently handle addresses it is the same as an 'r' memory constraint.
20231 case 'Q': return C_Memory;
20232 }
20233 } else if (S == 2) {
20234 switch (Constraint[0]) {
20235 default: break;
20236 case 'T': return C_RegisterClass;
20237 // All 'U+' constraints are addresses.
20238 case 'U': return C_Memory;
20239 }
20240 }
20241 return TargetLowering::getConstraintType(Constraint);
20242}
20243
20244/// Examine constraint type and operand type and determine a weight value.
20245/// This object must already have been set up with the operand type
20246/// and the current alternative constraint selected.
20247 TargetLowering::ConstraintWeight
20248 ARMTargetLowering::getSingleConstraintMatchWeight(
20249 AsmOperandInfo &info, const char *constraint) const {
20250 ConstraintWeight weight = CW_Invalid;
20251 Value *CallOperandVal = info.CallOperandVal;
20252 // If we don't have a value, we can't do a match,
20253 // but allow it at the lowest weight.
20254 if (!CallOperandVal)
20255 return CW_Default;
20256 Type *type = CallOperandVal->getType();
20257 // Look at the constraint type.
20258 switch (*constraint) {
20259 default:
20260 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
20261 break;
20262 case 'l':
20263 if (type->isIntegerTy()) {
20264 if (Subtarget->isThumb())
20265 weight = CW_SpecificReg;
20266 else
20267 weight = CW_Register;
20268 }
20269 break;
20270 case 'w':
20271 if (type->isFloatingPointTy())
20272 weight = CW_Register;
20273 break;
20274 }
20275 return weight;
20276}
20277
20278static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
20279 if (PR == 0 || VT == MVT::Other)
20280 return false;
20281 if (ARM::SPRRegClass.contains(PR))
20282 return VT != MVT::f32 && VT != MVT::f16 && VT != MVT::i32;
20283 if (ARM::DPRRegClass.contains(PR))
20284 return VT != MVT::f64 && !VT.is64BitVector();
20285 return false;
20286}
20287
20288using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20289
20290 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
20291 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20292 switch (Constraint.size()) {
20293 case 1:
20294 // GCC ARM Constraint Letters
20295 switch (Constraint[0]) {
20296 case 'l': // Low regs or general regs.
20297 if (Subtarget->isThumb())
20298 return RCPair(0U, &ARM::tGPRRegClass);
20299 return RCPair(0U, &ARM::GPRRegClass);
20300 case 'h': // High regs or no regs.
20301 if (Subtarget->isThumb())
20302 return RCPair(0U, &ARM::hGPRRegClass);
20303 break;
20304 case 'r':
20305 if (Subtarget->isThumb1Only())
20306 return RCPair(0U, &ARM::tGPRRegClass);
20307 return RCPair(0U, &ARM::GPRRegClass);
20308 case 'w':
20309 if (VT == MVT::Other)
20310 break;
20311 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20312 return RCPair(0U, &ARM::SPRRegClass);
20313 if (VT.getSizeInBits() == 64)
20314 return RCPair(0U, &ARM::DPRRegClass);
20315 if (VT.getSizeInBits() == 128)
20316 return RCPair(0U, &ARM::QPRRegClass);
20317 break;
20318 case 'x':
20319 if (VT == MVT::Other)
20320 break;
20321 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20322 return RCPair(0U, &ARM::SPR_8RegClass);
20323 if (VT.getSizeInBits() == 64)
20324 return RCPair(0U, &ARM::DPR_8RegClass);
20325 if (VT.getSizeInBits() == 128)
20326 return RCPair(0U, &ARM::QPR_8RegClass);
20327 break;
20328 case 't':
20329 if (VT == MVT::Other)
20330 break;
20331 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20332 return RCPair(0U, &ARM::SPRRegClass);
20333 if (VT.getSizeInBits() == 64)
20334 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20335 if (VT.getSizeInBits() == 128)
20336 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20337 break;
20338 }
20339 break;
20340
20341 case 2:
20342 if (Constraint[0] == 'T') {
20343 switch (Constraint[1]) {
20344 default:
20345 break;
20346 case 'e':
20347 return RCPair(0U, &ARM::tGPREvenRegClass);
20348 case 'o':
20349 return RCPair(0U, &ARM::tGPROddRegClass);
20350 }
20351 }
20352 break;
20353
20354 default:
20355 break;
20356 }
20357
20358 if (StringRef("{cc}").equals_insensitive(Constraint))
20359 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20360
20361 auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20362 if (isIncompatibleReg(RCP.first, VT))
20363 return {0, nullptr};
20364 return RCP;
20365}
20366
20367/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20368/// vector. If it is invalid, don't add anything to Ops.
20369 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
20370 StringRef Constraint,
20371 std::vector<SDValue> &Ops,
20372 SelectionDAG &DAG) const {
20373 SDValue Result;
20374
20375 // Currently only support length 1 constraints.
20376 if (Constraint.size() != 1)
20377 return;
20378
20379 char ConstraintLetter = Constraint[0];
20380 switch (ConstraintLetter) {
20381 default: break;
20382 case 'j':
20383 case 'I': case 'J': case 'K': case 'L':
20384 case 'M': case 'N': case 'O':
20385 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
20386 if (!C)
20387 return;
20388
20389 int64_t CVal64 = C->getSExtValue();
20390 int CVal = (int) CVal64;
20391 // None of these constraints allow values larger than 32 bits. Check
20392 // that the value fits in an int.
20393 if (CVal != CVal64)
20394 return;
20395
20396 switch (ConstraintLetter) {
20397 case 'j':
20398 // Constant suitable for movw, must be between 0 and
20399 // 65535.
20400 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20401 if (CVal >= 0 && CVal <= 65535)
20402 break;
20403 return;
20404 case 'I':
20405 if (Subtarget->isThumb1Only()) {
20406 // This must be a constant between 0 and 255, for ADD
20407 // immediates.
20408 if (CVal >= 0 && CVal <= 255)
20409 break;
20410 } else if (Subtarget->isThumb2()) {
20411 // A constant that can be used as an immediate value in a
20412 // data-processing instruction.
20413 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20414 break;
20415 } else {
20416 // A constant that can be used as an immediate value in a
20417 // data-processing instruction.
20418 if (ARM_AM::getSOImmVal(CVal) != -1)
20419 break;
20420 }
20421 return;
20422
20423 case 'J':
20424 if (Subtarget->isThumb1Only()) {
20425 // This must be a constant between -255 and -1, for negated ADD
20426 // immediates. This can be used in GCC with an "n" modifier that
20427 // prints the negated value, for use with SUB instructions. It is
20428 // not useful otherwise but is implemented for compatibility.
20429 if (CVal >= -255 && CVal <= -1)
20430 break;
20431 } else {
20432 // This must be a constant between -4095 and 4095. This is suitable
20433 // for use as the immediate offset field in LDR and STR instructions
20434 // such as LDR r0,[r1,#offset].
20435 if (CVal >= -4095 && CVal <= 4095)
20436 break;
20437 }
20438 return;
20439
20440 case 'K':
20441 if (Subtarget->isThumb1Only()) {
20442 // A 32-bit value where only one byte has a nonzero value. Exclude
20443 // zero to match GCC. This constraint is used by GCC internally for
20444 // constants that can be loaded with a move/shift combination.
20445 // It is not useful otherwise but is implemented for compatibility.
20446 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20447 break;
20448 } else if (Subtarget->isThumb2()) {
20449 // A constant whose bitwise inverse can be used as an immediate
20450 // value in a data-processing instruction. This can be used in GCC
20451 // with a "B" modifier that prints the inverted value, for use with
20452 // BIC and MVN instructions. It is not useful otherwise but is
20453 // implemented for compatibility.
20454 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20455 break;
20456 } else {
20457 // A constant whose bitwise inverse can be used as an immediate
20458 // value in a data-processing instruction. This can be used in GCC
20459 // with a "B" modifier that prints the inverted value, for use with
20460 // BIC and MVN instructions. It is not useful otherwise but is
20461 // implemented for compatibility.
20462 if (ARM_AM::getSOImmVal(~CVal) != -1)
20463 break;
20464 }
20465 return;
20466
20467 case 'L':
20468 if (Subtarget->isThumb1Only()) {
20469 // This must be a constant between -7 and 7,
20470 // for 3-operand ADD/SUB immediate instructions.
20471 if (CVal >= -7 && CVal < 7)
20472 break;
20473 } else if (Subtarget->isThumb2()) {
20474 // A constant whose negation can be used as an immediate value in a
20475 // data-processing instruction. This can be used in GCC with an "n"
20476 // modifier that prints the negated value, for use with SUB
20477 // instructions. It is not useful otherwise but is implemented for
20478 // compatibility.
20479 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20480 break;
20481 } else {
20482 // A constant whose negation can be used as an immediate value in a
20483 // data-processing instruction. This can be used in GCC with an "n"
20484 // modifier that prints the negated value, for use with SUB
20485 // instructions. It is not useful otherwise but is implemented for
20486 // compatibility.
20487 if (ARM_AM::getSOImmVal(-CVal) != -1)
20488 break;
20489 }
20490 return;
20491
20492 case 'M':
20493 if (Subtarget->isThumb1Only()) {
20494 // This must be a multiple of 4 between 0 and 1020, for
20495 // ADD sp + immediate.
20496 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20497 break;
20498 } else {
20499 // A power of two or a constant between 0 and 32. This is used in
20500 // GCC for the shift amount on shifted register operands, but it is
20501 // useful in general for any shift amounts.
20502 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20503 break;
20504 }
20505 return;
20506
20507 case 'N':
20508 if (Subtarget->isThumb1Only()) {
20509 // This must be a constant between 0 and 31, for shift amounts.
20510 if (CVal >= 0 && CVal <= 31)
20511 break;
20512 }
20513 return;
20514
20515 case 'O':
20516 if (Subtarget->isThumb1Only()) {
20517 // This must be a multiple of 4 between -508 and 508, for
20518 // ADD/SUB sp = sp + immediate.
20519 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20520 break;
20521 }
20522 return;
20523 }
20524 Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType());
20525 break;
20526 }
20527
20528 if (Result.getNode()) {
20529 Ops.push_back(Result);
20530 return;
20531 }
20532 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20533}
20534
20535static RTLIB::Libcall getDivRemLibcall(
20536 const SDNode *N, MVT::SimpleValueType SVT) {
20537 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20538 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20539 "Unhandled Opcode in getDivRemLibcall");
20540 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20541 N->getOpcode() == ISD::SREM;
20542 RTLIB::Libcall LC;
20543 switch (SVT) {
20544 default: llvm_unreachable("Unexpected request for libcall!");
20545 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20546 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20547 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20548 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20549 }
20550 return LC;
20551}
20552
20553 static TargetLowering::ArgListTy getDivRemArgList(
20554 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20555 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20556 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20557 "Unhandled Opcode in getDivRemArgList");
20558 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20559 N->getOpcode() == ISD::SREM;
20560 TargetLowering::ArgListTy Args;
20561 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20562 EVT ArgVT = N->getOperand(i).getValueType();
20563 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20564 TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy);
20565 Entry.IsSExt = isSigned;
20566 Entry.IsZExt = !isSigned;
20567 Args.push_back(Entry);
20568 }
20569 if (Subtarget->isTargetWindows() && Args.size() >= 2)
20570 std::swap(Args[0], Args[1]);
20571 return Args;
20572}
20573
20574SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20575 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20576 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20577 Subtarget->isTargetWindows()) &&
20578 "Register-based DivRem lowering only");
20579 unsigned Opcode = Op->getOpcode();
20580 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20581 "Invalid opcode for Div/Rem lowering");
20582 bool isSigned = (Opcode == ISD::SDIVREM);
20583 EVT VT = Op->getValueType(0);
20584 SDLoc dl(Op);
20585
20586 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20587 SmallVector<SDValue> Result;
20588 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20589 SDValue Res0 =
20590 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20591 SDValue Res1 =
20592 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20593 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20594 {Res0, Res1});
20595 }
20596 }
20597
20598 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
20599
20600 // If the target has hardware divide, use divide + multiply + subtract:
20601 // div = a / b
20602 // rem = a - b * div
20603 // return {div, rem}
20604 // This should be lowered into UDIV/SDIV + MLS later on.
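// For example, on a core with hardware divide an i32 sdivrem becomes roughly
// "sdiv r2, r0, r1; mls r3, r2, r1, r0" instead of a call to __aeabi_idivmod.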
20605 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20606 : Subtarget->hasDivideInARMMode();
20607 if (hasDivide && Op->getValueType(0).isSimple() &&
20608 Op->getSimpleValueType(0) == MVT::i32) {
20609 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20610 const SDValue Dividend = Op->getOperand(0);
20611 const SDValue Divisor = Op->getOperand(1);
20612 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20613 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20614 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
20615
20616 SDValue Values[2] = {Div, Rem};
20617 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20618 }
20619
20620 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20621 VT.getSimpleVT().SimpleTy);
20622 SDValue InChain = DAG.getEntryNode();
20623
20624 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
20625 DAG.getContext(),
20626 Subtarget);
20627
20630
20631 Type *RetTy = StructType::get(Ty, Ty);
20632
20633 if (Subtarget->isTargetWindows())
20634 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
20635
20636 TargetLowering::CallLoweringInfo CLI(DAG);
20637 CLI.setDebugLoc(dl).setChain(InChain)
20638 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
20640
20641 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20642 return CallInfo.first;
20643}
20644
20645// Lowers REM using divmod helpers
20646// see RTABI section 4.2/4.3
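// For example (illustrative): an i32 srem on an AEABI target becomes a call
// to __aeabi_idivmod, which returns {quotient, remainder} in {r0, r1}; only
// the remainder half of that result is used here.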
20647SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20648 EVT VT = N->getValueType(0);
20649
20650 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20651 SmallVector<SDValue> Result;
20652 if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20653 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20654 Result[0], Result[1]);
20655 }
20656
20657 // Build return types (div and rem)
20658 std::vector<Type*> RetTyParams;
20659 Type *RetTyElement;
20660
20661 switch (VT.getSimpleVT().SimpleTy) {
20662 default: llvm_unreachable("Unexpected request for libcall!");
20663 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20664 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20665 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20666 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20667 }
20668
20669 RetTyParams.push_back(RetTyElement);
20670 RetTyParams.push_back(RetTyElement);
20671 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20672 Type *RetTy = StructType::get(*DAG.getContext(), ret);
20673
20674 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20675 SimpleTy);
20676 SDValue InChain = DAG.getEntryNode();
20677 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
20678 Subtarget);
20679 bool isSigned = N->getOpcode() == ISD::SREM;
20682
20683 if (Subtarget->isTargetWindows())
20684 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20685
20686 // Lower call
20687 CallLoweringInfo CLI(DAG);
20688 CLI.setChain(InChain)
20689 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
20691 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20692
20693 // Return second (rem) result operand (first contains div)
20694 SDNode *ResNode = CallResult.first.getNode();
20695 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20696 return ResNode->getOperand(1);
20697}
20698
20699SDValue
20700ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20701 assert(Subtarget->isTargetWindows() && "unsupported target platform");
20702 SDLoc DL(Op);
20703
20704 // Get the inputs.
20705 SDValue Chain = Op.getOperand(0);
20706 SDValue Size = Op.getOperand(1);
20707
20709 "no-stack-arg-probe")) {
20710 MaybeAlign Align =
20711 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
20712 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20713 Chain = SP.getValue(1);
20714 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
20715 if (Align)
20716 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20717 DAG.getSignedConstant(-Align->value(), DL, MVT::i32));
20718 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20719 SDValue Ops[2] = { SP, Chain };
20720 return DAG.getMergeValues(Ops, DL);
20721 }
20722
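 // The stack probe interface works in units of 4-byte words passed in r4
 // (hence the shift right by 2 below); the adjusted stack pointer is read
 // back from SP after the probe.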
20723 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20724 DAG.getConstant(2, DL, MVT::i32));
20725
20726 SDValue Glue;
20727 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20728 Glue = Chain.getValue(1);
20729
20730 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20731 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
20732
20733 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20734 Chain = NewSP.getValue(1);
20735
20736 SDValue Ops[2] = { NewSP, Chain };
20737 return DAG.getMergeValues(Ops, DL);
20738}
20739
20740SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20741 bool IsStrict = Op->isStrictFPOpcode();
20742 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20743 const unsigned DstSz = Op.getValueType().getSizeInBits();
20744 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
20745 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
20746 "Unexpected type for custom-lowering FP_EXTEND");
20747
20748 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20749 "With both FP DP and 16, any FP conversion is legal!");
20750
20751 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
20752 "With FP16, 16 to 32 conversion is legal!");
20753
20754 // Converting from 32 -> 64 is valid if we have FP64.
20755 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
20756 // FIXME: Remove this when we have strict fp instruction selection patterns
20757 if (IsStrict) {
20758 SDLoc Loc(Op);
20759 SDValue Result = DAG.getNode(ISD::FP_EXTEND,
20760 Loc, Op.getValueType(), SrcVal);
20761 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
20762 }
20763 return Op;
20764 }
20765
20766 // Either we are converting from 16 -> 64, without FP16 and/or without
20767 // double-precision FP (or without Armv8 FP), so we must do it in two
20768 // steps.
20769 // Or we are converting from 32 -> 64 without double-precision FP, or from
20770 // 16 -> 32 without FP16, so we must use a function call.
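 // For example (illustrative): an f16 -> f64 extend with only FP16 support is
 // done as an f16 -> f32 instruction followed by an f32 -> f64 libcall; with
 // neither feature available, both steps become libcalls.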
20771 SDLoc Loc(Op);
20772 RTLIB::Libcall LC;
20773 MakeLibCallOptions CallOptions;
20774 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20775 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
20776 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
20777 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
20778 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
20779 if (Supported) {
20780 if (IsStrict) {
20781 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
20782 {DstVT, MVT::Other}, {Chain, SrcVal});
20783 Chain = SrcVal.getValue(1);
20784 } else {
20785 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
20786 }
20787 } else {
20788 LC = RTLIB::getFPEXT(SrcVT, DstVT);
20789 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20790 "Unexpected type for custom-lowering FP_EXTEND");
20791 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20792 Loc, Chain);
20793 }
20794 }
20795
20796 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
20797}
20798
20799SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20800 bool IsStrict = Op->isStrictFPOpcode();
20801
20802 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20803 EVT SrcVT = SrcVal.getValueType();
20804 EVT DstVT = Op.getValueType();
20805 const unsigned DstSz = Op.getValueType().getSizeInBits();
20806 const unsigned SrcSz = SrcVT.getSizeInBits();
20807 (void)DstSz;
20808 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
20809 "Unexpected type for custom-lowering FP_ROUND");
20810
20811 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20812 "With both FP DP and 16, any FP conversion is legal!");
20813
20814 SDLoc Loc(Op);
20815
20816 // A 32 -> 16 round is a single instruction if we have FP16.
20817 if (SrcSz == 32 && Subtarget->hasFP16())
20818 return Op;
20819
20820 // Lib call from 32 -> 16 / 64 -> [32, 16]
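 // For example (illustrative): an f64 -> f16 round is emitted as a single
 // truncating libcall chosen by RTLIB::getFPROUND, rather than as two steps.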
20821 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
20822 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20823 "Unexpected type for custom-lowering FP_ROUND");
20824 MakeLibCallOptions CallOptions;
20825 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20826 SDValue Result;
20827 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20828 Loc, Chain);
20829 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
20830}
20831
20832bool
20833ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
20834 // The ARM target isn't yet aware of offsets.
20835 return false;
20836}
20837
20838bool ARM::isBitFieldInvertedMask(unsigned v) {
20839 if (v == 0xffffffff)
20840 return false;
20841
20842 // there can be 1's on either or both "outsides", all the "inside"
20843 // bits must be 0's
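 // e.g. 0xff0000ff is such a mask (ones on both outsides, zeros inside),
 // while 0xf0f0f0f0 is not.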
20844 return isShiftedMask_32(~v);
20845}
20846
20847/// isFPImmLegal - Returns true if the target can instruction select the
20848/// specified FP immediate natively. If false, the legalizer will
20849/// materialize the FP immediate as a load from a constant pool.
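/// For example (illustrative): with VFPv3, values such as 1.0, 0.5 or -2.0 fit
/// the 8-bit VFP immediate encoding and can be materialized with a VMOV, while
/// a value like 0.1 cannot and is loaded from a constant pool instead.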
20850bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
20851 bool ForCodeSize) const {
20852 if (!Subtarget->hasVFP3Base())
20853 return false;
20854 if (VT == MVT::f16 && Subtarget->hasFullFP16())
20855 return ARM_AM::getFP16Imm(Imm) != -1;
20856 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
20857 ARM_AM::getFP32FP16Imm(Imm) != -1)
20858 return true;
20859 if (VT == MVT::f32)
20860 return ARM_AM::getFP32Imm(Imm) != -1;
20861 if (VT == MVT::f64 && Subtarget->hasFP64())
20862 return ARM_AM::getFP64Imm(Imm) != -1;
20863 return false;
20864}
20865
20866/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
20867/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
20868/// specified in the intrinsic calls.
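/// For example (illustrative): a vld2 whose result is { <4 x i32>, <4 x i32> }
/// (256 bits in total) is conservatively described as a v4i64-sized load.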
20869bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
20870 const CallInst &I,
20871 MachineFunction &MF,
20872 unsigned Intrinsic) const {
20873 switch (Intrinsic) {
20874 case Intrinsic::arm_neon_vld1:
20875 case Intrinsic::arm_neon_vld2:
20876 case Intrinsic::arm_neon_vld3:
20877 case Intrinsic::arm_neon_vld4:
20878 case Intrinsic::arm_neon_vld2lane:
20879 case Intrinsic::arm_neon_vld3lane:
20880 case Intrinsic::arm_neon_vld4lane:
20881 case Intrinsic::arm_neon_vld2dup:
20882 case Intrinsic::arm_neon_vld3dup:
20883 case Intrinsic::arm_neon_vld4dup: {
20884 Info.opc = ISD::INTRINSIC_W_CHAIN;
20885 // Conservatively set memVT to the entire set of vectors loaded.
20886 auto &DL = I.getDataLayout();
20887 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20888 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20889 Info.ptrVal = I.getArgOperand(0);
20890 Info.offset = 0;
20891 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20892 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20893 // volatile loads with NEON intrinsics not supported
20894 Info.flags = MachineMemOperand::MOLoad;
20895 return true;
20896 }
20897 case Intrinsic::arm_neon_vld1x2:
20898 case Intrinsic::arm_neon_vld1x3:
20899 case Intrinsic::arm_neon_vld1x4: {
20900 Info.opc = ISD::INTRINSIC_W_CHAIN;
20901 // Conservatively set memVT to the entire set of vectors loaded.
20902 auto &DL = I.getDataLayout();
20903 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20904 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20905 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
20906 Info.offset = 0;
20907 Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne();
20908 // volatile loads with NEON intrinsics not supported
20909 Info.flags = MachineMemOperand::MOLoad;
20910 return true;
20911 }
20912 case Intrinsic::arm_neon_vst1:
20913 case Intrinsic::arm_neon_vst2:
20914 case Intrinsic::arm_neon_vst3:
20915 case Intrinsic::arm_neon_vst4:
20916 case Intrinsic::arm_neon_vst2lane:
20917 case Intrinsic::arm_neon_vst3lane:
20918 case Intrinsic::arm_neon_vst4lane: {
20919 Info.opc = ISD::INTRINSIC_VOID;
20920 // Conservatively set memVT to the entire set of vectors stored.
20921 auto &DL = I.getDataLayout();
20922 unsigned NumElts = 0;
20923 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20924 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20925 if (!ArgTy->isVectorTy())
20926 break;
20927 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20928 }
20929 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20930 Info.ptrVal = I.getArgOperand(0);
20931 Info.offset = 0;
20932 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20933 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20934 // volatile stores with NEON intrinsics not supported
20935 Info.flags = MachineMemOperand::MOStore;
20936 return true;
20937 }
20938 case Intrinsic::arm_neon_vst1x2:
20939 case Intrinsic::arm_neon_vst1x3:
20940 case Intrinsic::arm_neon_vst1x4: {
20941 Info.opc = ISD::INTRINSIC_VOID;
20942 // Conservatively set memVT to the entire set of vectors stored.
20943 auto &DL = I.getDataLayout();
20944 unsigned NumElts = 0;
20945 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20946 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20947 if (!ArgTy->isVectorTy())
20948 break;
20949 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20950 }
20951 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20952 Info.ptrVal = I.getArgOperand(0);
20953 Info.offset = 0;
20954 Info.align = I.getParamAlign(0).valueOrOne();
20955 // volatile stores with NEON intrinsics not supported
20956 Info.flags = MachineMemOperand::MOStore;
20957 return true;
20958 }
20959 case Intrinsic::arm_mve_vld2q:
20960 case Intrinsic::arm_mve_vld4q: {
20961 Info.opc = ISD::INTRINSIC_W_CHAIN;
20962 // Conservatively set memVT to the entire set of vectors loaded.
20963 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
20964 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
20965 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20966 Info.ptrVal = I.getArgOperand(0);
20967 Info.offset = 0;
20968 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20969 // volatile loads with MVE intrinsics not supported
20970 Info.flags = MachineMemOperand::MOLoad;
20971 return true;
20972 }
20973 case Intrinsic::arm_mve_vst2q:
20974 case Intrinsic::arm_mve_vst4q: {
20975 Info.opc = ISD::INTRINSIC_VOID;
20976 // Conservatively set memVT to the entire set of vectors stored.
20977 Type *VecTy = I.getArgOperand(1)->getType();
20978 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
20979 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20980 Info.ptrVal = I.getArgOperand(0);
20981 Info.offset = 0;
20982 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20983 // volatile stores with MVE intrinsics not supported
20984 Info.flags = MachineMemOperand::MOStore;
20985 return true;
20986 }
20987 case Intrinsic::arm_mve_vldr_gather_base:
20988 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
20989 Info.opc = ISD::INTRINSIC_W_CHAIN;
20990 Info.ptrVal = nullptr;
20991 Info.memVT = MVT::getVT(I.getType());
20992 Info.align = Align(1);
20993 Info.flags |= MachineMemOperand::MOLoad;
20994 return true;
20995 }
20996 case Intrinsic::arm_mve_vldr_gather_base_wb:
20997 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
20998 Info.opc = ISD::INTRINSIC_W_CHAIN;
20999 Info.ptrVal = nullptr;
21000 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
21001 Info.align = Align(1);
21002 Info.flags |= MachineMemOperand::MOLoad;
21003 return true;
21004 }
21005 case Intrinsic::arm_mve_vldr_gather_offset:
21006 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
21007 Info.opc = ISD::INTRINSIC_W_CHAIN;
21008 Info.ptrVal = nullptr;
21009 MVT DataVT = MVT::getVT(I.getType());
21010 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
21011 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21012 DataVT.getVectorNumElements());
21013 Info.align = Align(1);
21014 Info.flags |= MachineMemOperand::MOLoad;
21015 return true;
21016 }
21017 case Intrinsic::arm_mve_vstr_scatter_base:
21018 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
21019 Info.opc = ISD::INTRINSIC_VOID;
21020 Info.ptrVal = nullptr;
21021 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21022 Info.align = Align(1);
21023 Info.flags |= MachineMemOperand::MOStore;
21024 return true;
21025 }
21026 case Intrinsic::arm_mve_vstr_scatter_base_wb:
21027 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
21028 Info.opc = ISD::INTRINSIC_W_CHAIN;
21029 Info.ptrVal = nullptr;
21030 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21031 Info.align = Align(1);
21032 Info.flags |= MachineMemOperand::MOStore;
21033 return true;
21034 }
21035 case Intrinsic::arm_mve_vstr_scatter_offset:
21036 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
21037 Info.opc = ISD::INTRINSIC_VOID;
21038 Info.ptrVal = nullptr;
21039 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
21040 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
21041 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21042 DataVT.getVectorNumElements());
21043 Info.align = Align(1);
21044 Info.flags |= MachineMemOperand::MOStore;
21045 return true;
21046 }
21047 case Intrinsic::arm_ldaex:
21048 case Intrinsic::arm_ldrex: {
21049 auto &DL = I.getDataLayout();
21050 Type *ValTy = I.getParamElementType(0);
21051 Info.opc = ISD::INTRINSIC_W_CHAIN;
21052 Info.memVT = MVT::getVT(ValTy);
21053 Info.ptrVal = I.getArgOperand(0);
21054 Info.offset = 0;
21055 Info.align = DL.getABITypeAlign(ValTy);
21057 return true;
21058 }
21059 case Intrinsic::arm_stlex:
21060 case Intrinsic::arm_strex: {
21061 auto &DL = I.getDataLayout();
21062 Type *ValTy = I.getParamElementType(1);
21063 Info.opc = ISD::INTRINSIC_W_CHAIN;
21064 Info.memVT = MVT::getVT(ValTy);
21065 Info.ptrVal = I.getArgOperand(1);
21066 Info.offset = 0;
21067 Info.align = DL.getABITypeAlign(ValTy);
21069 return true;
21070 }
21071 case Intrinsic::arm_stlexd:
21072 case Intrinsic::arm_strexd:
21073 Info.opc = ISD::INTRINSIC_W_CHAIN;
21074 Info.memVT = MVT::i64;
21075 Info.ptrVal = I.getArgOperand(2);
21076 Info.offset = 0;
21077 Info.align = Align(8);
21079 return true;
21080
21081 case Intrinsic::arm_ldaexd:
21082 case Intrinsic::arm_ldrexd:
21083 Info.opc = ISD::INTRINSIC_W_CHAIN;
21084 Info.memVT = MVT::i64;
21085 Info.ptrVal = I.getArgOperand(0);
21086 Info.offset = 0;
21087 Info.align = Align(8);
21089 return true;
21090
21091 default:
21092 break;
21093 }
21094
21095 return false;
21096}
21097
21098/// Returns true if it is beneficial to convert a load of a constant
21099/// to just the constant itself.
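/// For example (illustrative): a 32-bit constant can usually be rematerialized
/// cheaply (e.g. with a movw/movt pair where available) instead of being loaded
/// from a constant pool, so constants of up to 32 bits are reported as
/// profitable to convert.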
21100bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
21101 Type *Ty) const {
21102 assert(Ty->isIntegerTy());
21103
21104 unsigned Bits = Ty->getPrimitiveSizeInBits();
21105 if (Bits == 0 || Bits > 32)
21106 return false;
21107 return true;
21108}
21109
21110bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
21111 unsigned Index) const {
21112 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
21113 return false;
21114
21115 return (Index == 0 || Index == ResVT.getVectorNumElements());
21116}
21117
21118Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
21119 ARM_MB::MemBOpt Domain) const {
21120 // First, if the target has no DMB, see what fallback we can use.
21121 if (!Subtarget->hasDataBarrier()) {
21122 // Some ARMv6 cpus can support data barriers with an mcr instruction.
21123 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
21124 // here.
21125 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
21126 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
21127 Builder.getInt32(0), Builder.getInt32(7),
21128 Builder.getInt32(10), Builder.getInt32(5)};
21129 return Builder.CreateIntrinsic(Intrinsic::arm_mcr, args);
21130 } else {
21131 // Instead of using barriers, atomic accesses on these subtargets use
21132 // libcalls.
21133 llvm_unreachable("makeDMB on a target so old that it has no barriers");
21134 }
21135 } else {
21136 // Only a full system barrier exists in the M-class architectures.
21137 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
21138 Constant *CDomain = Builder.getInt32(Domain);
21139 return Builder.CreateIntrinsic(Intrinsic::arm_dmb, CDomain);
21140 }
21141}
21142
21143// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
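// Rough summary (assuming the fence-based lowering is in use): a seq_cst
// store gets a leading "dmb ish" (or "dmb ishst"), and an acquire or seq_cst
// load gets a trailing "dmb ish".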
21144Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
21145 Instruction *Inst,
21146 AtomicOrdering Ord) const {
21147 switch (Ord) {
21148 case AtomicOrdering::NotAtomic:
21149 case AtomicOrdering::Unordered:
21150 llvm_unreachable("Invalid fence: unordered/non-atomic");
21151 case AtomicOrdering::Monotonic:
21152 case AtomicOrdering::Acquire:
21153 return nullptr; // Nothing to do
21154 case AtomicOrdering::SequentiallyConsistent:
21155 if (!Inst->hasAtomicStore())
21156 return nullptr; // Nothing to do
21157 [[fallthrough]];
21158 case AtomicOrdering::Release:
21159 case AtomicOrdering::AcquireRelease:
21160 if (Subtarget->preferISHSTBarriers())
21161 return makeDMB(Builder, ARM_MB::ISHST);
21162 // FIXME: add a comment with a link to documentation justifying this.
21163 else
21164 return makeDMB(Builder, ARM_MB::ISH);
21165 }
21166 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
21167}
21168
21169Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
21170 Instruction *Inst,
21171 AtomicOrdering Ord) const {
21172 switch (Ord) {
21173 case AtomicOrdering::NotAtomic:
21174 case AtomicOrdering::Unordered:
21175 llvm_unreachable("Invalid fence: unordered/not-atomic");
21176 case AtomicOrdering::Monotonic:
21177 case AtomicOrdering::Release:
21178 return nullptr; // Nothing to do
21179 case AtomicOrdering::Acquire:
21180 case AtomicOrdering::AcquireRelease:
21181 case AtomicOrdering::SequentiallyConsistent:
21182 return makeDMB(Builder, ARM_MB::ISH);
21183 }
21184 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
21185}
21186
21187// Loads and stores less than 64 bits are already atomic; ones above that
21188// are doomed anyway, so defer to the default libcall and blame the OS when
21189// things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
21190// anything for those.
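// For example (illustrative): a 64-bit atomic store on an A-profile core with
// the required support is expanded into an ldrexd/strexd loop, while 8/16/32
// bit atomic stores are left as ordinary stores plus any needed barriers.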
21193 bool has64BitAtomicStore;
21194 if (Subtarget->isMClass())
21195 has64BitAtomicStore = false;
21196 else if (Subtarget->isThumb())
21197 has64BitAtomicStore = Subtarget->hasV7Ops();
21198 else
21199 has64BitAtomicStore = Subtarget->hasV6Ops();
21200
21201 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21202 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21203 : AtomicExpansionKind::None;
21204}
21205
21206// Loads and stores less than 64 bits are already atomic; ones above that
21207// are doomed anyway, so defer to the default libcall and blame the OS when
21208// things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
21209// anything for those.
21210// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21211// guarantee, see DDI0406C ARM architecture reference manual,
21212// sections A8.8.72-74 LDRD)
21215 bool has64BitAtomicLoad;
21216 if (Subtarget->isMClass())
21217 has64BitAtomicLoad = false;
21218 else if (Subtarget->isThumb())
21219 has64BitAtomicLoad = Subtarget->hasV7Ops();
21220 else
21221 has64BitAtomicLoad = Subtarget->hasV6Ops();
21222
21223 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21224 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21225 : AtomicExpansionKind::None;
21226}
21227
21228// For the real atomic operations, we have ldrex/strex up to 32 bits,
21229// and up to 64 bits on the non-M profiles
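// Illustrative shape of the resulting LL/SC loop for an i32 "atomicrmw add"
// (sketch only):
//   1: ldrex   r1, [r0]
//      add     r1, r1, r2
//      strex   r3, r1, [r0]
//      cmp     r3, #0
//      bne     1b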
21232 if (AI->isFloatingPointOperation())
21233 return AtomicExpansionKind::CmpXChg;
21234
21235 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21236 bool hasAtomicRMW;
21237 if (Subtarget->isMClass())
21238 hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21239 else if (Subtarget->isThumb())
21240 hasAtomicRMW = Subtarget->hasV7Ops();
21241 else
21242 hasAtomicRMW = Subtarget->hasV6Ops();
21243 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21244 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21245 // implement atomicrmw without spilling. If the target address is also on
21246 // the stack and close enough to the spill slot, this can lead to a
21247 // situation where the monitor always gets cleared and the atomic operation
21248 // can never succeed. So at -O0 lower this operation to a CAS loop.
21249 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21250 return AtomicExpansionKind::CmpXChg;
21251 return AtomicExpansionKind::LLSC;
21252 }
21253 return AtomicExpansionKind::None;
21254}
21255
21256// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21257// bits, and up to 64 bits on the non-M profiles.
21260 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21261 // implement cmpxchg without spilling. If the address being exchanged is also
21262 // on the stack and close enough to the spill slot, this can lead to a
21263 // situation where the monitor always gets cleared and the atomic operation
21264 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21265 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
21266 bool HasAtomicCmpXchg;
21267 if (Subtarget->isMClass())
21268 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21269 else if (Subtarget->isThumb())
21270 HasAtomicCmpXchg = Subtarget->hasV7Ops();
21271 else
21272 HasAtomicCmpXchg = Subtarget->hasV6Ops();
21273 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21274 HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21275 return AtomicExpansionKind::LLSC;
21276 return AtomicExpansionKind::None;
21277}
21278
21279bool ARMTargetLowering::shouldInsertFencesForAtomic(
21280 const Instruction *I) const {
21281 return InsertFencesForAtomic;
21282}
21283
21285 // ROPI/RWPI are not supported currently.
21286 return !Subtarget->isROPI() && !Subtarget->isRWPI();
21287}
21288
21290 // MSVC CRT provides functionalities for stack protection.
21291 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
21292 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21293
21294 RTLIB::LibcallImpl SecurityCookieVar =
21295 getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
21296 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
21297 SecurityCookieVar != RTLIB::Unsupported) {
21298 // MSVC CRT has a global variable holding security cookie.
21299 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
21300 PointerType::getUnqual(M.getContext()));
21301
21302 // MSVC CRT has a function to validate security cookie.
21303 FunctionCallee SecurityCheckCookie =
21304 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
21305 Type::getVoidTy(M.getContext()),
21306 PointerType::getUnqual(M.getContext()));
21307 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21308 F->addParamAttr(0, Attribute::AttrKind::InReg);
21309 }
21310
21312}
21313
21315 // MSVC CRT has a function to validate security cookie.
21316 RTLIB::LibcallImpl SecurityCheckCookie =
21317 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21318 if (SecurityCheckCookie != RTLIB::Unsupported)
21319 return M.getFunction(getLibcallImplName(SecurityCheckCookie));
21321}
21322
21324 unsigned &Cost) const {
21325 // If we do not have NEON, vector types are not natively supported.
21326 if (!Subtarget->hasNEON())
21327 return false;
21328
21329 // Floating point values and vector values map to the same register file.
21330 // Therefore, although we could do a store+extract of a vector type, it is
21331 // better to leave it as a float, since we have more freedom in the
21332 // addressing mode for those.
21333 if (VectorTy->isFPOrFPVectorTy())
21334 return false;
21335
21336 // If the index is unknown at compile time, this is very expensive to lower
21337 // and it is not possible to combine the store with the extract.
21338 if (!isa<ConstantInt>(Idx))
21339 return false;
21340
21341 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21342 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21343 // We can do a store + vector extract on any vector that fits perfectly in a D
21344 // or Q register.
21345 if (BitWidth == 64 || BitWidth == 128) {
21346 Cost = 0;
21347 return true;
21348 }
21349 return false;
21350}
21351
21353 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
21354 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
21355 unsigned Opcode = Op.getOpcode();
21356 switch (Opcode) {
21357 case ARMISD::VORRIMM:
21358 case ARMISD::VBICIMM:
21359 return false;
21360 }
21362 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
21363}
21364
21366 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21367}
21368
21370 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21371}
21372
21374 const Instruction &AndI) const {
21375 if (!Subtarget->hasV7Ops())
21376 return false;
21377
21378 // Sink the `and` instruction only if the mask would fit into a modified
21379 // immediate operand.
21381 if (!Mask || Mask->getValue().getBitWidth() > 32u)
21382 return false;
21383 auto MaskVal = unsigned(Mask->getValue().getZExtValue());
21384 return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21385 : ARM_AM::getSOImmVal(MaskVal)) != -1;
21386}
21387
21390 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21391 if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
21394 ExpansionFactor);
21395}
21396
21398 Value *Addr,
21399 AtomicOrdering Ord) const {
21400 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21401 bool IsAcquire = isAcquireOrStronger(Ord);
21402
21403 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21404 // intrinsic must return {i32, i32} and we have to recombine them into a
21405 // single i64 here.
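 // Roughly the IR that gets built below (little-endian case, sketch only):
 //   %lohi = call { i32, i32 } @llvm.arm.ldrexd(ptr %addr)
 //   %lo   = extractvalue { i32, i32 } %lohi, 0
 //   %hi   = extractvalue { i32, i32 } %lohi, 1
 //   %val  = or i64 (zext %lo), (zext %hi shifted left by 32)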
21406 if (ValueTy->getPrimitiveSizeInBits() == 64) {
21408 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21409
21410 Value *LoHi =
21411 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
21412
21413 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21414 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
21415 if (!Subtarget->isLittle())
21416 std::swap (Lo, Hi);
21417 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21418 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21419 return Builder.CreateOr(
21420 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21421 }
21422
21423 Type *Tys[] = { Addr->getType() };
21424 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21425 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
21426
21427 CI->addParamAttr(
21428 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21429 return Builder.CreateTruncOrBitCast(CI, ValueTy);
21430}
21431
21433 IRBuilderBase &Builder) const {
21434 if (!Subtarget->hasV7Ops())
21435 return;
21436 Builder.CreateIntrinsic(Intrinsic::arm_clrex, {});
21437}
21438
21440 Value *Val, Value *Addr,
21441 AtomicOrdering Ord) const {
21442 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21443 bool IsRelease = isReleaseOrStronger(Ord);
21444
21445 // Since the intrinsics must have legal type, the i64 intrinsics take two
21446 // parameters: "i32, i32". We must marshal Val into the appropriate form
21447 // before the call.
21448 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
21450 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21451 Type *Int32Ty = Type::getInt32Ty(M->getContext());
21452
21453 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21454 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
21455 if (!Subtarget->isLittle())
21456 std::swap(Lo, Hi);
21457 return Builder.CreateIntrinsic(Int, {Lo, Hi, Addr});
21458 }
21459
21460 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21461 Type *Tys[] = { Addr->getType() };
21463
21464 CallInst *CI = Builder.CreateCall(
21465 Strex, {Builder.CreateZExtOrBitCast(
21466 Val, Strex->getFunctionType()->getParamType(0)),
21467 Addr});
21468 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21469 Val->getType()));
21470 return CI;
21471}
21472
21473
21475 return Subtarget->isMClass();
21476}
21477
21478/// A helper function for determining the number of interleaved accesses we
21479/// will generate when lowering accesses of the given type.
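/// For example, a <16 x i32> vector (512 bits) is counted as
/// (512 + 127) / 128 = 4 interleaved accesses.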
21480unsigned
21481ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
21482 const DataLayout &DL) const {
21483 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21484}
21485
21486bool ARMTargetLowering::isLegalInterleavedAccessType(
21487 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21488 const DataLayout &DL) const {
21489
21490 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21491 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21492
21493 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21494 return false;
21495
21496 // Ensure the vector doesn't have f16 elements. Even though we could do an
21497 // i16 vldN, we can't hold the f16 vectors and will end up converting via
21498 // f32.
21499 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21500 return false;
21501 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21502 return false;
21503
21504 // Ensure the number of vector elements is greater than 1.
21505 if (VecTy->getNumElements() < 2)
21506 return false;
21507
21508 // Ensure the element type is legal.
21509 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21510 return false;
21511 // And check that the alignment is high enough under MVE.
21512 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21513 return false;
21514
21515 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21516 // 128 will be split into multiple interleaved accesses.
21517 if (Subtarget->hasNEON() && VecSize == 64)
21518 return true;
21519 return VecSize % 128 == 0;
21520}
21521
21522unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
21523 if (Subtarget->hasNEON())
21524 return 4;
21525 if (Subtarget->hasMVEIntegerOps())
21528}
21529
21530/// Lower an interleaved load into a vldN intrinsic.
21531///
21532/// E.g. Lower an interleaved load (Factor = 2):
21533/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21534/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21535/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21536///
21537/// Into:
21538/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21539/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
21540/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
21542 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
21543 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
21544 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21545 "Invalid interleave factor");
21546 assert(!Shuffles.empty() && "Empty shufflevector input");
21547 assert(Shuffles.size() == Indices.size() &&
21548 "Unmatched number of shufflevectors and indices");
21549
21550 auto *LI = dyn_cast<LoadInst>(Load);
21551 if (!LI)
21552 return false;
21553 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
21554
21555 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21556 Type *EltTy = VecTy->getElementType();
21557
21558 const DataLayout &DL = LI->getDataLayout();
21559 Align Alignment = LI->getAlign();
21560
21561 // Skip if we do not have NEON and skip illegal vector types. We can
21562 // "legalize" wide vector types into multiple interleaved accesses as long as
21563 // the vector types are divisible by 128.
21564 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21565 return false;
21566
21567 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21568
21569 // A pointer vector cannot be the return type of the ldN intrinsics. We need
21570 // to load integer vectors first and then convert to pointer vectors.
21571 if (EltTy->isPointerTy())
21572 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21573
21574 IRBuilder<> Builder(LI);
21575
21576 // The base address of the load.
21577 Value *BaseAddr = LI->getPointerOperand();
21578
21579 if (NumLoads > 1) {
21580 // If we're going to generate more than one load, reset the sub-vector type
21581 // to something legal.
21582 VecTy = FixedVectorType::get(VecTy->getElementType(),
21583 VecTy->getNumElements() / NumLoads);
21584 }
21585
21586 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21587
21588 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21589 if (Subtarget->hasNEON()) {
21590 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21591 Type *Tys[] = {VecTy, PtrTy};
21592 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21593 Intrinsic::arm_neon_vld3,
21594 Intrinsic::arm_neon_vld4};
21595
21597 Ops.push_back(BaseAddr);
21598 Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21599
21600 return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops,
21601 /*FMFSource=*/nullptr, "vldN");
21602 } else {
21603 assert((Factor == 2 || Factor == 4) &&
21604 "expected interleave factor of 2 or 4 for MVE");
21605 Intrinsic::ID LoadInts =
21606 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21607 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21608 Type *Tys[] = {VecTy, PtrTy};
21609
21611 Ops.push_back(BaseAddr);
21612 return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr,
21613 "vldN");
21614 }
21615 };
21616
21617 // Holds sub-vectors extracted from the load intrinsic return values. The
21618 // sub-vectors are associated with the shufflevector instructions they will
21619 // replace.
21621
21622 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21623 // If we're generating more than one load, compute the base address of
21624 // subsequent loads as an offset from the previous.
21625 if (LoadCount > 0)
21626 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21627 VecTy->getNumElements() * Factor);
21628
21629 CallInst *VldN = createLoadIntrinsic(BaseAddr);
21630
21631 // Replace uses of each shufflevector with the corresponding vector loaded
21632 // by ldN.
21633 for (unsigned i = 0; i < Shuffles.size(); i++) {
21634 ShuffleVectorInst *SV = Shuffles[i];
21635 unsigned Index = Indices[i];
21636
21637 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21638
21639 // Convert the integer vector to pointer vector if the element is pointer.
21640 if (EltTy->isPointerTy())
21641 SubVec = Builder.CreateIntToPtr(
21642 SubVec,
21644
21645 SubVecs[SV].push_back(SubVec);
21646 }
21647 }
21648
21649 // Replace uses of the shufflevector instructions with the sub-vectors
21650 // returned by the load intrinsic. If a shufflevector instruction is
21651 // associated with more than one sub-vector, those sub-vectors will be
21652 // concatenated into a single wide vector.
21653 for (ShuffleVectorInst *SVI : Shuffles) {
21654 auto &SubVec = SubVecs[SVI];
21655 auto *WideVec =
21656 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21657 SVI->replaceAllUsesWith(WideVec);
21658 }
21659
21660 return true;
21661}
21662
21663/// Lower an interleaved store into a vstN intrinsic.
21664///
21665/// E.g. Lower an interleaved store (Factor = 3):
21666/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21667/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21668/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21669///
21670/// Into:
21671/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21672/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21673/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21674/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21675///
21676/// Note that the new shufflevectors will be removed and we'll only generate one
21677/// vst3 instruction in CodeGen.
21678///
21679/// Example for a more general valid mask (Factor 3). Lower:
21680/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21681/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21682/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21683///
21684/// Into:
21685/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21686/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21687/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21688/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21690 Value *LaneMask,
21691 ShuffleVectorInst *SVI,
21692 unsigned Factor,
21693 const APInt &GapMask) const {
21694 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21695 "Invalid interleave factor");
21696 auto *SI = dyn_cast<StoreInst>(Store);
21697 if (!SI)
21698 return false;
21699 assert(!LaneMask && GapMask.popcount() == Factor &&
21700 "Unexpected mask on store");
21701
21702 auto *VecTy = cast<FixedVectorType>(SVI->getType());
21703 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21704
21705 unsigned LaneLen = VecTy->getNumElements() / Factor;
21706 Type *EltTy = VecTy->getElementType();
21707 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
21708
21709 const DataLayout &DL = SI->getDataLayout();
21710 Align Alignment = SI->getAlign();
21711
21712 // Skip if we do not have NEON and skip illegal vector types. We can
21713 // "legalize" wide vector types into multiple interleaved accesses as long as
21714 // the vector types are divisible by 128.
21715 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
21716 return false;
21717
21718 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
21719
21720 Value *Op0 = SVI->getOperand(0);
21721 Value *Op1 = SVI->getOperand(1);
21722 IRBuilder<> Builder(SI);
21723
21724 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21725 // vectors to integer vectors.
21726 if (EltTy->isPointerTy()) {
21727 Type *IntTy = DL.getIntPtrType(EltTy);
21728
21729 // Convert to the corresponding integer vector.
21730 auto *IntVecTy =
21732 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
21733 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
21734
21735 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
21736 }
21737
21738 // The base address of the store.
21739 Value *BaseAddr = SI->getPointerOperand();
21740
21741 if (NumStores > 1) {
21742 // If we're going to generate more than one store, reset the lane length
21743 // and sub-vector type to something legal.
21744 LaneLen /= NumStores;
21745 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
21746 }
21747
21748 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
21749
21750 auto Mask = SVI->getShuffleMask();
21751
21752 auto createStoreIntrinsic = [&](Value *BaseAddr,
21753 SmallVectorImpl<Value *> &Shuffles) {
21754 if (Subtarget->hasNEON()) {
21755 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
21756 Intrinsic::arm_neon_vst3,
21757 Intrinsic::arm_neon_vst4};
21758 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21759 Type *Tys[] = {PtrTy, SubVecTy};
21760
21762 Ops.push_back(BaseAddr);
21763 append_range(Ops, Shuffles);
21764 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
21765 Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops);
21766 } else {
21767 assert((Factor == 2 || Factor == 4) &&
21768 "expected interleave factor of 2 or 4 for MVE");
21769 Intrinsic::ID StoreInts =
21770 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
21771 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21772 Type *Tys[] = {PtrTy, SubVecTy};
21773
21775 Ops.push_back(BaseAddr);
21776 append_range(Ops, Shuffles);
21777 for (unsigned F = 0; F < Factor; F++) {
21778 Ops.push_back(Builder.getInt32(F));
21779 Builder.CreateIntrinsic(StoreInts, Tys, Ops);
21780 Ops.pop_back();
21781 }
21782 }
21783 };
21784
21785 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
21786 // If we're generating more than one store, we compute the base address of
21787 // subsequent stores as an offset from the previous.
21788 if (StoreCount > 0)
21789 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
21790 BaseAddr, LaneLen * Factor);
21791
21792 SmallVector<Value *, 4> Shuffles;
21793
21794 // Split the shufflevector operands into sub vectors for the new vstN call.
21795 for (unsigned i = 0; i < Factor; i++) {
21796 unsigned IdxI = StoreCount * LaneLen * Factor + i;
21797 if (Mask[IdxI] >= 0) {
21798 Shuffles.push_back(Builder.CreateShuffleVector(
21799 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
21800 } else {
21801 unsigned StartMask = 0;
21802 for (unsigned j = 1; j < LaneLen; j++) {
21803 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
21804 if (Mask[IdxJ * Factor + IdxI] >= 0) {
21805 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
21806 break;
21807 }
21808 }
21809 // Note: If all elements in a chunk are undefs, StartMask=0!
21810 // Note: Filling undef gaps with random elements is ok, since
21811 // those elements were being written anyway (with undefs).
21812 // In the case of all undefs we're defaulting to using elems from 0
21813 // Note: StartMask cannot be negative, it's checked in
21814 // isReInterleaveMask
21815 Shuffles.push_back(Builder.CreateShuffleVector(
21816 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
21817 }
21818 }
21819
21820 createStoreIntrinsic(BaseAddr, Shuffles);
21821 }
21822 return true;
21823}
21824
21832
21834 uint64_t &Members) {
21835 if (auto *ST = dyn_cast<StructType>(Ty)) {
21836 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
21837 uint64_t SubMembers = 0;
21838 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
21839 return false;
21840 Members += SubMembers;
21841 }
21842 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
21843 uint64_t SubMembers = 0;
21844 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
21845 return false;
21846 Members += SubMembers * AT->getNumElements();
21847 } else if (Ty->isFloatTy()) {
21848 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
21849 return false;
21850 Members = 1;
21851 Base = HA_FLOAT;
21852 } else if (Ty->isDoubleTy()) {
21853 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
21854 return false;
21855 Members = 1;
21856 Base = HA_DOUBLE;
21857 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
21858 Members = 1;
21859 switch (Base) {
21860 case HA_FLOAT:
21861 case HA_DOUBLE:
21862 return false;
21863 case HA_VECT64:
21864 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
21865 case HA_VECT128:
21866 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
21867 case HA_UNKNOWN:
21868 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
21869 case 64:
21870 Base = HA_VECT64;
21871 return true;
21872 case 128:
21873 Base = HA_VECT128;
21874 return true;
21875 default:
21876 return false;
21877 }
21878 }
21879 }
21880
21881 return (Members > 0 && Members <= 4);
21882}
21883
21884/// Return the correct alignment for the current calling convention.
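/// For example (illustrative): a <4 x i32> argument with a 16-byte ABI type
/// alignment is clamped to the 8-byte stack alignment that AAPCS targets
/// typically use, so it is passed with 8-byte alignment.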
21885Align ARMTargetLowering::getABIAlignmentForCallingConv(
21886 Type *ArgTy, const DataLayout &DL) const {
21887 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
21888 if (!ArgTy->isVectorTy())
21889 return ABITypeAlign;
21890
21891 // Avoid over-aligning vector parameters. It would require realigning the
21892 // stack and waste space for no real benefit.
21893 MaybeAlign StackAlign = DL.getStackAlignment();
21894 assert(StackAlign && "data layout string is missing stack alignment");
21895 return std::min(ABITypeAlign, *StackAlign);
21896}
21897
21898/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
21899/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
21900/// passing according to AAPCS rules.
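/// For example (illustrative): struct { float x, y, z; } is a homogeneous
/// aggregate of three floats, whereas struct { float f; double d; } mixes base
/// types and is not one.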
21901bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
21902 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
21903 const DataLayout &DL) const {
21904 if (getEffectiveCallingConv(CallConv, isVarArg) !=
21906 return false;
21907
21909 uint64_t Members = 0;
21910 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
21911 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
21912
21913 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
21914 return IsHA || IsIntArray;
21915}
21916
21918 const Constant *PersonalityFn) const {
21919 // Platforms which do not use SjLj EH may return values in these registers
21920 // via the personality function.
21922 return EM == ExceptionHandling::SjLj ? Register() : ARM::R0;
21923}
21924
21926 const Constant *PersonalityFn) const {
21927 // Platforms which do not use SjLj EH may return values in these registers
21928 // via the personality function.
21930 return EM == ExceptionHandling::SjLj ? Register() : ARM::R1;
21931}
21932
21933void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
21934 // Update IsSplitCSR in ARMFunctionInfo.
21935 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
21936 AFI->setIsSplitCSR(true);
21937}
21938
21939void ARMTargetLowering::insertCopiesSplitCSR(
21940 MachineBasicBlock *Entry,
21941 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
21942 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
21943 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
21944 if (!IStart)
21945 return;
21946
21947 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21948 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
21949 MachineBasicBlock::iterator MBBI = Entry->begin();
21950 for (const MCPhysReg *I = IStart; *I; ++I) {
21951 const TargetRegisterClass *RC = nullptr;
21952 if (ARM::GPRRegClass.contains(*I))
21953 RC = &ARM::GPRRegClass;
21954 else if (ARM::DPRRegClass.contains(*I))
21955 RC = &ARM::DPRRegClass;
21956 else
21957 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
21958
21959 Register NewVR = MRI->createVirtualRegister(RC);
21960 // Create copy from CSR to a virtual register.
21961 // FIXME: this currently does not emit CFI pseudo-instructions, it works
21962 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
21963 // nounwind. If we want to generalize this later, we may need to emit
21964 // CFI pseudo-instructions.
21965 assert(Entry->getParent()->getFunction().hasFnAttribute(
21966 Attribute::NoUnwind) &&
21967 "Function should be nounwind in insertCopiesSplitCSR!");
21968 Entry->addLiveIn(*I);
21969 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
21970 .addReg(*I);
21971
21972 // Insert the copy-back instructions right before the terminator.
21973 for (auto *Exit : Exits)
21974 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
21975 TII->get(TargetOpcode::COPY), *I)
21976 .addReg(NewVR);
21977 }
21978}
21979
21984
21986 return Subtarget->hasMVEIntegerOps();
21987}
21988
21991 auto *VTy = dyn_cast<FixedVectorType>(Ty);
21992 if (!VTy)
21993 return false;
21994
21995 auto *ScalarTy = VTy->getScalarType();
21996 unsigned NumElements = VTy->getNumElements();
21997
21998 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
21999 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
22000 return false;
22001
22002 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
22003 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
22004 return Subtarget->hasMVEFloatOps();
22005
22007 return false;
22008
22009 return Subtarget->hasMVEIntegerOps() &&
22010 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
22011 ScalarTy->isIntegerTy(32));
22012}
22013
22016 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
22017 Value *Accumulator) const {
22018
22020
22021 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
22022
22023 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
22024
22025 if (TyWidth > 128) {
22026 int Stride = Ty->getNumElements() / 2;
22027 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
22028 auto SplitSeqVec = llvm::to_vector(SplitSeq);
22029 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
22030 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
22031
22032 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
22033 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
22034 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
22035 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
22036 Value *LowerSplitAcc = nullptr;
22037 Value *UpperSplitAcc = nullptr;
22038
22039 if (Accumulator) {
22040 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
22041 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
22042 }
22043
22044 auto *LowerSplitInt = createComplexDeinterleavingIR(
22045 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
22046 auto *UpperSplitInt = createComplexDeinterleavingIR(
22047 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
22048
22049 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
22050 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
22051 }
22052
22053 auto *IntTy = Type::getInt32Ty(B.getContext());
22054
22055 ConstantInt *ConstRotation = nullptr;
22056 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
22057 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
22058
22059 if (Accumulator)
22060 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
22061 {ConstRotation, Accumulator, InputB, InputA});
22062 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
22063 {ConstRotation, InputB, InputA});
22064 }
22065
22066 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
22067 // 1 means the value is not halved.
22068 auto *ConstHalving = ConstantInt::get(IntTy, 1);
22069
22071 ConstRotation = ConstantInt::get(IntTy, 0);
22073 ConstRotation = ConstantInt::get(IntTy, 1);
22074
22075 if (!ConstRotation)
22076 return nullptr; // Invalid rotation for arm_mve_vcaddq
22077
22078 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
22079 {ConstHalving, ConstRotation, InputA, InputB});
22080 }
22081
22082 return nullptr;
22083}
unsigned const MachineRegisterInfo * MRI
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
return SDValue()
static const MCPhysReg GPRArgRegs[]
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static bool isNegatedInteger(SDValue Op)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static bool isConstant(const MachineInstr &MI)
constexpr LLT F64
constexpr LLT S1
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG)
static bool isStore(int Opcode)
static bool isThumb(const MCSubtargetInfo &STI)
static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT)
static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total value size to 64 bits.
static cl::opt< unsigned > ConstpoolPromotionMaxSize("arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), cl::init(64))
static bool isZeroOrAllOnes(SDValue N, bool AllOnes)
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isVTBLMask(ArrayRef< int > M, EVT VT)
static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
static cl::opt< bool > EnableConstpoolPromotion("arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), cl::init(false))
static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG)
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformExtractEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static const APInt * isPowerOf2Constant(SDValue V)
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) can replace combinations of ...
static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static bool isValidMVECond(unsigned CC, bool IsFloat)
static SDValue PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC)
IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
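As an illustration of the kind of mapping IntCCToARMCC performs, the sketch below shows the obvious integer translation. It is a hedged approximation, not the exact switch used in this file, and it assumes the ISD condition codes from llvm/CodeGen/ISDOpcodes.h, llvm_unreachable from llvm/Support/ErrorHandling.h, and the ARMCC enum from Utils/ARMBaseInfo.h.

// Hedged sketch only: an integer ISD::CondCode -> ARMCC::CondCodes mapping.
static ARMCC::CondCodes sketchIntCCToARMCC(ISD::CondCode CC) {
  switch (CC) {
  case ISD::SETEQ:  return ARMCC::EQ;
  case ISD::SETNE:  return ARMCC::NE;
  case ISD::SETGT:  return ARMCC::GT;   // signed comparisons
  case ISD::SETGE:  return ARMCC::GE;
  case ISD::SETLT:  return ARMCC::LT;
  case ISD::SETLE:  return ARMCC::LE;
  case ISD::SETUGT: return ARMCC::HI;   // unsigned comparisons
  case ISD::SETUGE: return ARMCC::HS;
  case ISD::SETULT: return ARMCC::LO;
  case ISD::SETULE: return ARMCC::LS;
  default:
    llvm_unreachable("unexpected integer condition code");
  }
}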
static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSTORECombine - Target-specific dag combine xforms for ISD::STORE.
static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, SelectionDAG &DAG)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isGTorGE(ISD::CondCode CC)
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) intrinsic,...
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask)
static bool isReverseMask(ArrayRef< int > M, EVT VT)
static bool isVZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of "vector_shuffle v,...
static SDValue PerformSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD) can replace combinations...
static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0, SDValue V1)
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG)
static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc)
static bool isVTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool CanInvertMVEVCMP(SDValue N)
static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG)
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
PerformShiftCombine - Checks for immediate versions of vector shifts and lowers them.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2)
FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static EVT getVectorTyFromPredicateVector(EVT VT)
static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG)
static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg, SelectionDAG &DAG, const SDLoc &DL)
static SDValue PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static bool isSRL16(const SDValue &Op)
static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC)
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, SDValue Inc, const SelectionDAG &DAG)
static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static Register genTPEntry(MachineBasicBlock *TpEntry, MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpExit, Register OpSizeReg, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI)
Adds logic in loop entry MBB to calculate loop iteration count and adds t2WhileLoopSetup and t2WhileL...
static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V)
static bool isLTorLE(ISD::CondCode CC)
static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, SelectionDAG &DAG)
static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static SDValue PerformBITCASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG)
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG)
static bool hasNormalLoadOperand(SDNode *N)
hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node are normal,...
static SDValue PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
PerformInsertEltCombine - Target-specific dag combine xforms for ISD::INSERT_VECTOR_ELT.
static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVDUPLANECombine - Target-specific dag combine xforms for ARMISD::VDUPLANE.
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static cl::opt< unsigned > ConstpoolPromotionMaxTotal("arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128))
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static RTLIB::Libcall getDivRemLibcall(const SDNode *N, MVT::SimpleValueType SVT)
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG)
SkipLoadExtensionForVMULL - return a load of the original vector size that does not do any sign/zero ...
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, SelectionDAG &DAG)
static bool isVZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformORCombineToSMULWBT(SDNode *OR, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isVTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of "vector_shuffle v,...
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue FindBFIToCombineWith(SDNode *N)
static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, SelectionDAG &DAG)
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps)
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isS16(const SDValue &Op, SelectionDAG &DAG)
static bool isSRA16(const SDValue &Op)
static SDValue AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue LowerInterruptReturn(SmallVectorImpl< SDValue > &RetOps, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG)
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2)
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isSHL16(const SDValue &Op)
static bool isVEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseVEXT, unsigned &Imm)
static SDValue PerformMVEVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
cl::opt< unsigned > ArmMaxBaseUpdatesToCheck("arm-max-base-updates-to-check", cl::Hidden, cl::desc("Maximum number of base-updates to check generating postindex."), cl::init(64))
static bool isTruncMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2)
Return the load opcode for a given load size.
static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
static bool isLegalMVEShuffleOp(unsigned PFEntry)
static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, SelectionDAG &DAG)
static bool isVUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG)
PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for ISD::VECTOR_SHUFFLE.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG)
SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, ANY_EXTEND,...
static bool isVMOVNTruncMask(ArrayRef< int > M, EVT ToVT, bool rev)
static SDValue PerformVQMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformAddcSubcCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static TargetLowering::ArgListTy getDivRemArgList(const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget)
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
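For orientation only: a target-independent way to build such a zero vector is a constant splat, as in the minimal sketch below; the helper in this file instead canonicalizes zeros through an ARM VMOV-immediate node. Requires llvm/CodeGen/SelectionDAG.h.

// Minimal sketch, not the ARM-specific lowering used by getZeroVector.
static SDValue buildZeroVectorSketch(SelectionDAG &DAG, EVT VT,
                                     const SDLoc &dl) {
  assert(VT.isVector() && "expected a vector type");
  // getConstant with a vector VT produces a splat of the scalar constant.
  return DAG.getConstant(0, dl, VT);
}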
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static ARMCC::CondCodes getVCMPCondCode(SDValue N)
static cl::opt< bool > ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true))
static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformORCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVSetCCToVCTPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isZeroVector(SDValue N)
static SDValue PerformAddeSubeCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K)
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG)
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment store operation with given size.
static bool isVMOVNMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, NEON load/store intrinsics,...
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMOVRRDCombine - Target-specific dag combine xforms for ARMISD::VMOVRRD.
static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain)
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMULCombine Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the special multi...
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformORCombine - Target-specific dag combine xforms for ISD::OR.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG)
static unsigned SelectPairHalf(unsigned Elements, ArrayRef< int > Mask, unsigned Index)
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment load operation with given size.
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG)
static bool isValidBaseUpdate(SDNode *N, SDNode *User)
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl)
static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op)
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
std::pair< unsigned, const TargetRegisterClass * > RCPair
static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes=false)
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,...
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type)
isVMOVModifiedImm - Check if the specified splat value corresponds to a valid vector constant for a N...
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG)
BC is a bitcast that is about to be turned into a VMOVDRR.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl)
static unsigned isNEONTwoResultShuffleMask(ArrayRef< int > ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF)
Check if ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), and return the corresponding AR...
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B)
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, struct BaseUpdateUser &User, bool SimpleConstIncOnly, TargetLowering::DAGCombinerInfo &DCI)
static bool allUsersAreInFunction(const Value *V, const Function *F)
Return true if all users of V are within function F, looking through ConstantExprs.
static bool isSingletonVEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG)
PerformVMOVDRRCombine - Target-specific dag combine xforms for ARMISD::VMOVDRR.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK)
static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
isLegalAddressImmediate - Return true if the integer value can be used as the offset of the target ad...
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isLegalT1AddressImmediate(int64_t V, EVT VT)
static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDECombine - Target-specific dag combine transform from ARMISD::ADDC, ARMISD::ADDE,...
static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
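A minimal sketch of the common case, assuming the operand is a plain ConstantFPSDNode; the helper in this file also handles other forms, such as zeros that have already been placed in the constant pool.

// Sketch of the ConstantFPSDNode case only.
static bool isPlusZeroFPConstantSketch(SDValue Op) {
  if (const auto *CFP = dyn_cast<ConstantFPSDNode>(Op))
    return CFP->getValueAPF().isPosZero();
  return false;
}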
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
This file provides a TargetTransformInfoImplBase conforming object specific to the ARM target machine.
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseMap class.
Module.h This file contains the declarations for the Module class.
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
#define MAKE_CASE(V)
This file contains some templates that are useful if you are working with the STL at all.
static cl::opt< unsigned > MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192), cl::desc("DAG combiner limit number of steps when searching DAG " "for predecessor nodes"))
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:114
This file describes how to lower LLVM code to machine code.
static constexpr roundingMode rmTowardZero
Definition APFloat.h:348
bool getExactInverse(APFloat *Inv) const
If this value is normal and has an exact, normal, multiplicative inverse, store it in inv and return ...
Definition APFloat.cpp:6002
APInt bitcastToAPInt() const
Definition APFloat.h:1335
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition APFloat.h:1314
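The APFloat entries above are generic ADT facilities. A small self-contained usage sketch follows (nothing ARM-specific; it uses the APSInt convenience overload of convertToInteger rather than the MutableArrayRef form shown above).

#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APSInt.h"
using namespace llvm;

void apFloatSketch() {
  APFloat F(2.0);                                  // IEEE double 2.0
  APInt Bits = F.bitcastToAPInt();                 // raw 64-bit IEEE-754 pattern

  APFloat Inv(0.0);
  bool ExactInv = F.getExactInverse(&Inv);         // true, Inv == 0.5

  APSInt Int(32, /*isUnsigned=*/false);
  bool IsExact = false;
  F.convertToInteger(Int, APFloat::rmTowardZero, &IsExact); // Int == 2
  (void)Bits; (void)ExactInv; (void)IsExact;
}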
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition APInt.h:1201
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
unsigned logBase2() const
Definition APInt.h:1761
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:475
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
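To make the APInt helpers above concrete, here is a short self-contained usage sketch, not tied to any particular combine in this file.

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

void apIntSketch() {
  APInt Mask = APInt::getLowBitsSet(32, 8);            // 0x000000FF
  assert(Mask.popcount() == 8 && Mask.countr_one() == 8);
  assert(Mask.isSubsetOf(APInt::getAllOnes(32)));

  APInt Bit = APInt::getOneBitSet(32, 12);             // 0x00001000
  assert(Bit.isPowerOf2() && Bit.logBase2() == 12 && Bit.countr_zero() == 12);

  APInt Wide = Mask.zextOrTrunc(64);                   // zero-extended to 64 bits
  assert(Wide.getBitWidth() == 64 && Wide.getZExtValue() == 0xFF);

  APInt Splat = APInt::getSplat(64, APInt(8, 0xAB));   // 0xABABABABABABABAB
  assert(Splat.lshr(56).getZExtValue() == 0xAB);
}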
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
virtual const ARMBaseRegisterInfo & getRegisterInfo() const =0
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
const Triple & getTargetTriple() const
const ARMBaseInstrInfo * getInstrInfo() const override
bool isThumb1Only() const
bool useFPVFMx() const
bool isThumb2() const
bool isTargetWindows() const
bool hasBaseDSP() const
const ARMTargetLowering * getTargetLowering() const override
const ARMBaseRegisterInfo * getRegisterInfo() const override
bool hasVFP2Base() const
bool useFPVFMx64() const
bool isLittle() const
bool useFPVFMx16() const
bool isMClass() const
bool useMulOps() const
Align getDualLoadStoreAlignment() const
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isReadOnly(const GlobalValue *GV) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode represented by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) and add (add x, 1), y. The variant with two add's is IR...
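The equivalence can be spot-checked directly; a tiny self-contained sketch in plain C++, using the fact that x ^ -1 is ~x, i.e. -x - 1.

#include <cassert>
#include <cstdint>

// Spot-check the identity behind preferIncOfAddToSubOfNot:
//   y - (x ^ -1) == (x + 1) + y
void incAddIdentitySketch() {
  for (int32_t x : {-7, 0, 42})
    for (int32_t y : {-1, 3, 1000})
      assert(y - (x ^ -1) == (x + 1) + y);
}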
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a vstN intrinsic.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a vldN intrinsic.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy,Idx).
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
const ARMBaseTargetMachine & getTM() const
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
bool isFloatingPointOperation() const
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
static LLVM_ABI BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
The address of a basic block.
Definition Constants.h:899
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
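A hedged sketch of how isConstantSplat is typically queried during lowering; BVN is assumed to be a BuildVectorSDNode* already in scope (for example, obtained via dyn_cast from a BUILD_VECTOR operand).

APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs) &&
    SplatBitSize <= 64) {
  // SplatBits holds the replicated element value; SplatUndef marks the bits
  // that came from undef lanes.
}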
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void addInRegsParamInfo(unsigned RegBegin, unsigned RegEnd)
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
unsigned getValNo() const
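A hedged sketch of the usual CCState/CCValAssign pattern for formal-argument lowering. CallConv, isVarArg, DAG, and Ins are assumed to be in scope as in a LowerFormalArguments implementation, and CC_ARM_AAPCS stands in for whichever assignment function CCAssignFnForCall would actually return; CCState and CCValAssign come from llvm/CodeGen/CallingConvLower.h.

SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
               *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CC_ARM_AAPCS);

for (const CCValAssign &VA : ArgLocs) {
  if (VA.isRegLoc()) {
    // The argument arrives in the register VA.getLocReg().
  } else {
    // The argument arrives on the stack at offset VA.getLocMemOffset().
  }
}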
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:715
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:277
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:207
bool isBigEndian() const
Definition DataLayout.h:208
MaybeAlign getStackAlignment() const
Returns the natural stack alignment, or MaybeAlign() if one wasn't specified.
Definition DataLayout.h:237
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
StringRef getPrivateGlobalPrefix() const
Definition DataLayout.h:295
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
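A small self-contained sketch of the DataLayout queries listed above.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
using namespace llvm;

void dataLayoutSketch(const Module &M, Type *Ty) {
  const DataLayout &DL = M.getDataLayout();
  bool Little = DL.isLittleEndian();          // layout endianness
  TypeSize Bytes = DL.getTypeAllocSize(Ty);   // size including alignment padding
  Align PrefAlign = DL.getPrefTypeAlign(Ty);  // preferred alignment
  (void)Little; (void)Bytes; (void)PrefAlign;
}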
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
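For example, an IR type such as <4 x i32>, which many of the NEON/MVE paths in this file operate on, can be built as in this sketch.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

void fixedVectorTypeSketch(LLVMContext &Ctx) {
  auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), /*NumElts=*/4);
  unsigned NumElts = VecTy->getNumElements();   // 4
  (void)NumElts;
}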
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
arg_iterator arg_begin()
Definition Function.h:866
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition Function.h:687
const Argument * const_arg_iterator
Definition Function.h:73
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition Function.h:227
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
bool isDSOLocal() const
bool hasExternalWeakLinkage() const
bool hasDLLImportStorageClass() const
Module * getParent()
Get the module that this global value is contained inside of...
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
LLVM_ABI bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
bool is64BitVector() const
Return true if this is a 64-bit vector type.
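The MVT factory methods above compose in the obvious way; as a small illustrative example (not tied to any particular lowering), a 4 x i32 vector type can be built from its parts:

#include "llvm/CodeGen/ValueTypes.h" // pulls in MVT / EVT

// Illustrative only: MVT::getVectorVT(MVT::getIntegerVT(32), 4) yields the
// same type as MVT::v4i32, and getSizeInBits() on it reports 128.
static llvm::MVT makeV4I32() {
  llvm::MVT Elt = llvm::MVT::getIntegerVT(32);
  return llvm::MVT::getVectorVT(Elt, 4);
}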
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI void moveAfter(MachineBasicBlock *NewBefore)
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
Properties which a MachineFunction may have at a given point in time.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
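These MachineInstrBuilder methods are normally chained off BuildMI (listed further below in this index). A hedged sketch of the pattern, with the opcode and registers supplied by the caller rather than taken from any real ARM instruction:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCInstrDesc.h"

// Illustrative only: emit "DestReg = <MCID> SrcReg, #Imm" in front of MI.
static void emitRegImmOp(llvm::MachineBasicBlock &MBB,
                         llvm::MachineBasicBlock::iterator MI,
                         const llvm::DebugLoc &DL, const llvm::MCInstrDesc &MCID,
                         llvm::Register DestReg, llvm::Register SrcReg,
                         int64_t Imm) {
  llvm::BuildMI(MBB, MI, DL, MCID, DestReg)
      .addReg(SrcReg)
      .addImm(Imm);
}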
Representation of each machine instruction.
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of a block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
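These flags are OR'd together when a memory operand is allocated through MachineFunction::getMachineMemOperand (listed above). A minimal sketch, assuming a fixed stack object FI and a 4-byte access (purely illustrative):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

// Illustrative only: describe a plain 4-byte, 4-aligned load from stack
// object FI so it can be attached to a machine instruction.
static llvm::MachineMemOperand *makeStackLoadMMO(llvm::MachineFunction &MF,
                                                 int FI) {
  llvm::MachinePointerInfo PtrInfo =
      llvm::MachinePointerInfo::getFixedStack(MF, FI);
  return MF.getMachineMemOperand(PtrInfo, llvm::MachineMemOperand::MOLoad,
                                 /*Size=*/4, llvm::Align(4));
}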
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
LLVM_ABI void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Wrapper class representing virtual and physical registers.
Definition Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags, bool AllowCommute=false)
Get the specified node if it's already available, or else return NULL.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
DenormalMode getDenormalMode(EVT VT) const
Return the current function's default denormal handling kind for the given floating point type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
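Most of the SelectionDAG entries above are used the same way: obtain or create value nodes, then combine them with getNode. A hedged, self-contained sketch (not the lowering of any specific ARM operation):

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAG.h"

// Illustrative only: build the DAG for (A + B) & ~Mask, reusing the helpers
// documented above (getNode, getNOT).
static llvm::SDValue buildMaskedAdd(llvm::SelectionDAG &DAG, const llvm::SDLoc &DL,
                                    llvm::EVT VT, llvm::SDValue A, llvm::SDValue B,
                                    llvm::SDValue Mask) {
  llvm::SDValue Sum = DAG.getNode(llvm::ISD::ADD, DL, VT, A, B);
  llvm::SDValue NotMask = DAG.getNOT(DL, Mask, VT);
  return DAG.getNode(llvm::ISD::AND, DL, VT, Sum, NotMask);
}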
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
bool empty() const
Definition SmallSet.h:168
bool erase(const T &V)
Definition SmallSet.h:199
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
const unsigned char * bytes_end() const
Definition StringRef.h:127
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
const unsigned char * bytes_begin() const
Definition StringRef.h:124
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl)
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purposes.
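These protected TargetLoweringBase setters are invoked from a target's TargetLowering constructor to describe which operations are natively supported. A hypothetical, illustrative constructor fragment (the XYZ target, its register class, and the specific action choices are all invented for the example and are not ARM's configuration):

// Hypothetical target used only for illustration.
XYZTargetLowering::XYZTargetLowering(const TargetMachine &TM,
                                     const XYZSubtarget &STI)
    : TargetLowering(TM) {
  addRegisterClass(MVT::i32, &XYZ::GPRRegClass);        // i32 values live in GPRs
  setOperationAction(ISD::SDIV, MVT::i32, Expand);      // no hardware divide
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); // lowered by hand
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setSchedulingPreference(Sched::RegPressure);
  computeRegisterProperties(STI.getRegisterInfo());
}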
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
bool isConstTrueVal(SDValue N) const
Return if the N is a constant or constant vector equal to the true value from getBooleanContents().
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
ExceptionHandling getExceptionModel() const
Return the ExceptionHandling to use, considering TargetOptions and the Triple's default.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition Triple.h:438
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:296
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
Base class of all SIMD vector types.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition ARMBaseInfo.h:48
@ SECREL
Section Relative (Windows TLS).
@ SBREL
Static Base Relative.
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into a shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
const unsigned FPStatusBits
const unsigned FPReservedBits
const unsigned RoundingBitsPos
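The encoding helpers above answer the recurring question of whether a 32-bit constant fits an ARM or Thumb-2 modified immediate. A hedged sketch of that check, also trying the bitwise complement (the heuristic is illustrative, not the backend's actual cost model):

#include "MCTargetDesc/ARMAddressingModes.h" // resolved within lib/Target/ARM
#include <cstdint>

// Illustrative only: true if Imm (or its complement) can be encoded as a
// single shifter_operand / Thumb-2 modified immediate.
static bool isCheapImmediate(uint32_t Imm, bool IsThumb2) {
  if (IsThumb2)
    return llvm::ARM_AM::getT2SOImmVal(Imm) != -1 ||
           llvm::ARM_AM::getT2SOImmVal(~Imm) != -1;
  return llvm::ARM_AM::getSOImmVal(Imm) != -1 ||
         llvm::ARM_AM::getSOImmVal(~Imm) != -1;
}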
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
@ CFGuard_Check
Special calling convention on Windows for calling the Control Flow Guard check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:531
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:744
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:712
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:662
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:815
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:958
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition ISDOpcodes.h:732
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:707
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:933
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:821
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:719
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
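These RTLIB queries pair naturally with TargetLowering::makeLibCall (documented earlier in this index) when a conversion has no native support. A hedged sketch of that softening pattern, not the ARM backend's exact code:

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include <cassert>

// Illustrative only: lower FP_TO_SINT by calling the matching runtime routine.
static llvm::SDValue softenFPToSInt(const llvm::TargetLowering &TLI,
                                    llvm::SelectionDAG &DAG, llvm::SDValue Op,
                                    const llvm::SDLoc &DL) {
  llvm::EVT SrcVT = Op.getOperand(0).getValueType();
  llvm::EVT RetVT = Op.getValueType();
  llvm::RTLIB::Libcall LC = llvm::RTLIB::getFPTOSINT(SrcVT, RetVT);
  assert(LC != llvm::RTLIB::UNKNOWN_LIBCALL && "unexpected fp-to-int types");
  llvm::TargetLowering::MakeLibCallOptions CallOptions;
  return TLI.makeLibCall(DAG, LC, RetVT, Op.getOperand(0), CallOptions, DL).first;
}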
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
void stable_sort(R &&Range)
Definition STLExtras.h:2058
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
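The STLExtras range helpers listed in this section (all_of, any_of, find, count_if, drop_begin, and friends) take a range directly instead of a begin/end pair. A minimal, self-contained sketch; the function name is illustrative only:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;
static void rangeHelperExamples() {
  SmallVector<int, 8> Vals = {3, 1, 4, 1, 5};
  bool AllPositive = all_of(Vals, [](int V) { return V > 0; });  // true
  bool HasEven = any_of(Vals, [](int V) { return V % 2 == 0; }); // true (the 4)
  auto It = find(Vals, 4);                                       // points at the 4
  auto NumOnes = count_if(Vals, [](int V) { return V == 1; });   // 2
  for (int V : drop_begin(Vals, 2)) { // visits 4, 1, 5
    (void)V;
  }
  (void)AllPositive; (void)HasEven; (void)It; (void)NumOnes;
}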
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition bit.h:279
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
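A CCAssignFn such as CC_ARM_AAPCS is not called directly; it is handed to a CCState, which drives it over every argument. A minimal sketch, assuming the usual LowerFormalArguments surroundings (CallConv, isVarArg, MF, DAG and Ins come from that context):
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CC_ARM_AAPCS);
for (CCValAssign &VA : ArgLocs) {
  if (VA.isRegLoc()) {
    // The argument arrives in the register VA.getLocReg().
  } else {
    // The argument arrives on the stack at offset VA.getLocMemOffset().
  }
}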
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
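The bit-manipulation predicates and counters referenced throughout this section (from MathExtras.h and bit.h) are pure value helpers. A minimal, self-contained sketch of what they return; the function name is illustrative only:
#include <cassert>
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;
static void bitMathExamples() {
  assert(isMask_32(0x0000FFFFu));        // contiguous ones starting at bit 0
  assert(isShiftedMask_32(0x00FF0000u)); // contiguous ones, possibly shifted
  assert(isPowerOf2_32(64u));
  assert(isUIntN(8, 255));               // 255 fits in 8 unsigned bits
  assert(countr_zero(8u) == 3);          // 0b1000 has three trailing zeros
  assert(countr_one(0x0Fu) == 4);        // 0b1111 has four trailing ones
  assert(Log2_32(32u) == 5);
}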
ExceptionHandling
Definition CodeGen.h:53
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1516
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
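predOps is how the ARM backend appends the two predicate operands (condition code plus predicate register) when building MachineInstrs; condCodeOp, listed further below, supplies the optional 'S'-bit operand. A minimal sketch, assuming MBB, InsertPt, DL, TII, DestReg and SrcReg come from the surrounding code:
// Build an unconditionally executed (ARMCC::AL) MOVr with no 'S' bit set.
BuildMI(MBB, InsertPt, DL, TII->get(ARM::MOVr), DestReg)
    .addReg(SrcReg)
    .add(predOps(ARMCC::AL))
    .add(condCodeOp());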
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit ver...
Definition MathExtras.h:267
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:186
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:333
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition bit.h:222
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
CombineLevel
Definition DAGCombine.h:15
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register,...
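These two cost helpers let lowering code compare alternative immediates before choosing an expansion. A minimal sketch, assuming Subtarget is the usual ARMSubtarget pointer from the surrounding code:
unsigned Imm = 0xFFFF00FF;
unsigned DirectCost = ConstantMaterializationCost(Imm, Subtarget);
// If the complement is cheaper to build, an MVN-based sequence of ~Imm
// followed by an inversion is preferable to materializing Imm directly.
bool PreferInverted =
    HasLowerConstantMaterializationCost(~Imm, Imm, Subtarget);
(void)DirectCost;
(void)PreferInverted;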
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
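isConstOrConstSplat (and its FP counterpart above) lets a combine treat a scalar constant and a splat-of-constant vector uniformly. A minimal sketch of a fold guard, assuming this fragment sits inside a hypothetical SDValue-returning combine where N is an ISD::ADD node:
SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
if (ConstantSDNode *C = isConstOrConstSplat(RHS))
  if (C->isZero())
    return LHS; // x + 0 --> x, for both scalars and splat vectors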
constexpr U AbsoluteValue(T X)
Return the absolute value of a signed integer, converted to the corresponding unsigned integer type.
Definition MathExtras.h:594
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
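Align, alignTo and commonAlignment cover the usual alignment arithmetic. A minimal, self-contained sketch; the function name is illustrative only:
#include <cassert>
#include "llvm/Support/Alignment.h"
using namespace llvm;
static void alignmentExamples() {
  Align A(16);                  // a known 16-byte alignment
  assert(alignTo(20, A) == 32); // next multiple of 16 that holds 20 bytes
  // A pointer that is 16-aligned, offset by 4 bytes, is only 4-aligned.
  assert(commonAlignment(A, /*Offset=*/4) == Align(4));
}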
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
unsigned gettBLXrOpcode(const MachineFunction &MF)
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
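createSequentialMask builds the shuffle masks used by the interleaving code: NumInts consecutive indices starting at Start, padded with NumUndefs undef (-1) entries. A minimal sketch:
// Produces {0, 1, 2, 3, -1, -1}.
SmallVector<int, 16> Mask =
    createSequentialMask(/*Start=*/0, /*NumInts=*/4, /*NumUndefs=*/2);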
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N-bit number shifted left by S.
Definition MathExtras.h:198
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isFixedLengthVector() const
Definition ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
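The EVT queries listed above form the core of the type checks in this file. A minimal, self-contained sketch of how a few of them compose; the function name is illustrative only:
#include <cassert>
#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;
static void evtExamples(LLVMContext &Ctx) {
  EVT V4i32 = EVT::getVectorVT(Ctx, MVT::i32, 4);
  assert(V4i32.isVector() && V4i32.is128BitVector());
  assert(V4i32.getVectorNumElements() == 4 &&
         V4i32.getScalarSizeInBits() == 32);
  EVT V2i32 = V4i32.getHalfNumVectorElementsVT(Ctx);   // v2i32
  EVT V4f32 = V4i32.changeVectorElementType(MVT::f32); // v4f32
  assert(V2i32.is64BitVector() && V4f32.isFloatingPoint());
}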
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
EVT ArgVT
Usually the non-legalized type of the argument, which is the EVT corresponding to the OrigTy IR type.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:135
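KnownBits tracks which bits of a value are provably 0 or 1 through computeKnownBits-style analyses. A minimal, self-contained sketch of the operations listed above; the function name is illustrative only:
#include <cassert>
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
static void knownBitsExamples() {
  // A fully known 8-bit constant, zero-extended to 16 bits.
  KnownBits K = KnownBits::makeConstant(APInt(8, 0x0F));
  KnownBits K16 = K.zext(16);
  assert(!K16.isUnknown() && K16.getBitWidth() == 16);
  // Intersecting with a value about which nothing is known keeps only
  // the facts common to both, i.e. nothing.
  KnownBits Nothing(16);
  assert(K16.intersectWith(Nothing).isUnknown());
}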
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
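These static constructors describe what a machine memory operand points at, which in turn feeds alias analysis. A minimal sketch, assuming MF and FI come from the surrounding lowering code:
MachinePointerInfo CPInfo = MachinePointerInfo::getConstantPool(MF);
MachinePointerInfo SlotInfo = MachinePointerInfo::getFixedStack(MF, FI);
MachinePointerInfo Slot4 = SlotInfo.getWithOffset(4); // same slot, +4 bytes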
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setDiscardResult(bool Value=true)
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
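CallLoweringInfo is filled in builder style before being handed to LowerCallTo. A minimal sketch, assuming it runs inside a TargetLowering subclass and that DAG, dl, Chain, RetTy, Callee and Args (an ArgListTy) come from the usual libcall-emission context:
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
    .setChain(Chain)
    .setLibCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
    .setDiscardResult(false);
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);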
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...