ARMISelLowering.cpp
1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
67#include "llvm/IR/Attributes.h"
68#include "llvm/IR/CallingConv.h"
69#include "llvm/IR/Constant.h"
70#include "llvm/IR/Constants.h"
71#include "llvm/IR/DataLayout.h"
72#include "llvm/IR/DebugLoc.h"
74#include "llvm/IR/Function.h"
75#include "llvm/IR/GlobalAlias.h"
76#include "llvm/IR/GlobalValue.h"
78#include "llvm/IR/IRBuilder.h"
79#include "llvm/IR/InlineAsm.h"
80#include "llvm/IR/Instruction.h"
83#include "llvm/IR/Intrinsics.h"
84#include "llvm/IR/IntrinsicsARM.h"
85#include "llvm/IR/Module.h"
87#include "llvm/IR/Type.h"
88#include "llvm/IR/User.h"
89#include "llvm/IR/Value.h"
90#include "llvm/MC/MCInstrDesc.h"
93#include "llvm/MC/MCSchedule.h"
100#include "llvm/Support/Debug.h"
108#include <algorithm>
109#include <cassert>
110#include <cstdint>
111#include <cstdlib>
112#include <iterator>
113#include <limits>
114#include <optional>
115#include <tuple>
116#include <utility>
117#include <vector>
118
119using namespace llvm;
120using namespace llvm::PatternMatch;
121
122#define DEBUG_TYPE "arm-isel"
123
124STATISTIC(NumTailCalls, "Number of tail calls");
125STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
126STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
127STATISTIC(NumConstpoolPromoted,
128 "Number of constants with their storage promoted into constant pools");
129
130static cl::opt<bool>
131ARMInterworking("arm-interworking", cl::Hidden,
132 cl::desc("Enable / disable ARM interworking (for debugging only)"),
133 cl::init(true));
134
136 "arm-promote-constant", cl::Hidden,
137 cl::desc("Enable / disable promotion of unnamed_addr constants into "
138 "constant pools"),
139 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
141 "arm-promote-constant-max-size", cl::Hidden,
142 cl::desc("Maximum size of constant to promote into a constant pool"),
143 cl::init(64));
145 "arm-promote-constant-max-total", cl::Hidden,
146 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
147 cl::init(128));
148
150MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
151 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
152 cl::init(2));
153
154// The APCS parameter registers.
155static const MCPhysReg GPRArgRegs[] = {
156 ARM::R0, ARM::R1, ARM::R2, ARM::R3
157};
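// Editor's note - illustrative sketch, not part of the upstream file: under
// APCS/AAPCS the first four 32-bit arguments travel in these registers and the
// rest spill to the stack. For a hypothetical prototype
//   int f(int a, int b, int c, int d, int e);
// a..d arrive in R0..R3 and e is loaded from the caller's outgoing-argument
// area.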
158
160 SelectionDAG &DAG, const SDLoc &DL) {
162 assert(Arg.ArgVT.bitsLT(MVT::i32));
163 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
164 SDValue Ext =
166 MVT::i32, Trunc);
167 return Ext;
168}
169
170void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
171 if (VT != PromotedLdStVT) {
173 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
174
176 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
177 }
178
179 MVT ElemTy = VT.getVectorElementType();
180 if (ElemTy != MVT::f64)
184 if (ElemTy == MVT::i32) {
189 } else {
194 }
203 if (VT.isInteger()) {
207 }
208
209 // Neon does not support vector divide/remainder operations.
218
219 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
220 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
222 setOperationAction(Opcode, VT, Legal);
223 if (!VT.isFloatingPoint())
224 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
225 setOperationAction(Opcode, VT, Legal);
226}
227
228void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
229 addRegisterClass(VT, &ARM::DPRRegClass);
230 addTypeForNEON(VT, MVT::f64);
231}
232
233void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
234 addRegisterClass(VT, &ARM::DPairRegClass);
235 addTypeForNEON(VT, MVT::v2f64);
236}
237
238void ARMTargetLowering::setAllExpand(MVT VT) {
239 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
240 setOperationAction(Opc, VT, Expand);
241
242 // We support these really simple operations even on types where all
243 // the actual arithmetic has to be broken down into simpler
244 // operations or turned into library calls.
249}
250
251void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
252 LegalizeAction Action) {
253 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
254 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
255 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
256}
257
258void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
259 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
260
261 for (auto VT : IntTypes) {
262 addRegisterClass(VT, &ARM::MQPRRegClass);
292
293 // No native support for these.
303
304 // Vector reductions
314
315 if (!HasMVEFP) {
320 } else {
323 }
324
325 // Pre and Post inc are supported on loads and stores
326 for (unsigned im = (unsigned)ISD::PRE_INC;
332 }
333 }
334
335 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
336 for (auto VT : FloatTypes) {
337 addRegisterClass(VT, &ARM::MQPRRegClass);
338 if (!HasMVEFP)
339 setAllExpand(VT);
340
341 // These are legal or custom whether we have MVE.fp or not
354
355 // Pre and Post inc are supported on loads and stores
356 for (unsigned im = (unsigned)ISD::PRE_INC;
362 }
363
364 if (HasMVEFP) {
372
373 // No native support for these.
388 }
389 }
390
 391 // Custom-expand smaller-than-legal vector reductions to prevent false zero
 392 // items from being added.
401
 402 // We 'support' these types up to bitcast/load/store level, regardless of
 403 // MVE integer-only / float support. Only FP data processing on the FP
 404 // vector types is inhibited at the integer-only level.
405 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
406 for (auto VT : LongTypes) {
407 addRegisterClass(VT, &ARM::MQPRRegClass);
408 setAllExpand(VT);
414 }
416
417 // We can do bitwise operations on v2i64 vectors
418 setOperationAction(ISD::AND, MVT::v2i64, Legal);
419 setOperationAction(ISD::OR, MVT::v2i64, Legal);
420 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
421
 422 // It is legal to extload from v8i8 to v8i16, and from v4i8 or v4i16 to v4i32.
423 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
424 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
425 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
426
427 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
433
434 // Some truncating stores are legal too.
435 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
436 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
437 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
438
439 // Pre and Post inc on these are legal, given the correct extends
440 for (unsigned im = (unsigned)ISD::PRE_INC;
442 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
447 }
448 }
449
450 // Predicate types
451 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
452 for (auto VT : pTypes) {
453 addRegisterClass(VT, &ARM::VCCRRegClass);
468
469 if (!HasMVEFP) {
474 }
475 }
479 setOperationAction(ISD::OR, MVT::v2i1, Expand);
485
494}
495
497 const ARMSubtarget &STI)
498 : TargetLowering(TM), Subtarget(&STI) {
499 RegInfo = Subtarget->getRegisterInfo();
500 Itins = Subtarget->getInstrItineraryData();
501
504
505 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
506 !Subtarget->isTargetWatchOS() && !Subtarget->isTargetDriverKit()) {
507 bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
508 for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
509 setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
510 IsHFTarget ? CallingConv::ARM_AAPCS_VFP
512 }
513
514 if (Subtarget->isTargetMachO()) {
515 // Uses VFP for Thumb libfuncs if available.
516 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
517 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
518 static const struct {
519 const RTLIB::Libcall Op;
520 const char * const Name;
521 const ISD::CondCode Cond;
522 } LibraryCalls[] = {
523 // Single-precision floating-point arithmetic.
524 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
525 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
526 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
527 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
528
529 // Double-precision floating-point arithmetic.
530 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
531 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
532 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
533 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
534
535 // Single-precision comparisons.
536 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
537 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
538 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
539 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
540 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
541 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
542 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
543
544 // Double-precision comparisons.
545 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
546 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
547 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
548 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
549 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
550 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
551 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
552
553 // Floating-point to integer conversions.
554 // i64 conversions are done via library routines even when generating VFP
555 // instructions, so use the same ones.
556 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
557 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
558 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
559 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
560
561 // Conversions between floating types.
562 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
563 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },
564
565 // Integer to floating-point conversions.
566 // i64 conversions are done via library routines even when generating VFP
567 // instructions, so use the same ones.
568 // FIXME: There appears to be some naming inconsistency in ARM libgcc:
569 // e.g., __floatunsidf vs. __floatunssidfvfp.
570 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
571 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
572 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
573 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
574 };
575
576 for (const auto &LC : LibraryCalls) {
577 setLibcallName(LC.Op, LC.Name);
578 if (LC.Cond != ISD::SETCC_INVALID)
579 setCmpLibcallCC(LC.Op, LC.Cond);
580 }
581 }
582 }
583
 584 // These libcalls are not available in 32-bit mode.
585 setLibcallName(RTLIB::SHL_I128, nullptr);
586 setLibcallName(RTLIB::SRL_I128, nullptr);
587 setLibcallName(RTLIB::SRA_I128, nullptr);
588 setLibcallName(RTLIB::MUL_I128, nullptr);
589 setLibcallName(RTLIB::MULO_I64, nullptr);
590 setLibcallName(RTLIB::MULO_I128, nullptr);
591
592 // RTLIB
593 if (Subtarget->isAAPCS_ABI() &&
594 (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
595 Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
596 static const struct {
597 const RTLIB::Libcall Op;
598 const char * const Name;
599 const CallingConv::ID CC;
600 const ISD::CondCode Cond;
601 } LibraryCalls[] = {
602 // Double-precision floating-point arithmetic helper functions
603 // RTABI chapter 4.1.2, Table 2
604 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
605 { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
606 { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
607 { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
608
609 // Double-precision floating-point comparison helper functions
610 // RTABI chapter 4.1.2, Table 3
611 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
612 { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
613 { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
614 { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
615 { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
616 { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
617 { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
618
619 // Single-precision floating-point arithmetic helper functions
620 // RTABI chapter 4.1.2, Table 4
621 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
622 { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
623 { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
624 { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
625
626 // Single-precision floating-point comparison helper functions
627 // RTABI chapter 4.1.2, Table 5
628 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
629 { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
630 { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
631 { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
632 { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
633 { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
634 { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
635
636 // Floating-point to integer conversions.
637 // RTABI chapter 4.1.2, Table 6
638 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
639 { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
640 { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
641 { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
642 { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
643 { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
644 { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
645 { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
646
647 // Conversions between floating types.
648 // RTABI chapter 4.1.2, Table 7
649 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
650 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
651 { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
652
653 // Integer to floating-point conversions.
654 // RTABI chapter 4.1.2, Table 8
655 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
656 { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
657 { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
658 { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
659 { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
660 { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
661 { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
662 { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
663
664 // Long long helper functions
665 // RTABI chapter 4.2, Table 9
666 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
667 { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
668 { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
669 { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
670
671 // Integer division functions
672 // RTABI chapter 4.3.1
673 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
674 { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
675 { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
676 { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
677 { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
678 { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
679 { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
680 { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
681 };
682
683 for (const auto &LC : LibraryCalls) {
684 setLibcallName(LC.Op, LC.Name);
685 setLibcallCallingConv(LC.Op, LC.CC);
686 if (LC.Cond != ISD::SETCC_INVALID)
687 setCmpLibcallCC(LC.Op, LC.Cond);
688 }
689
690 // EABI dependent RTLIB
691 if (TM.Options.EABIVersion == EABI::EABI4 ||
692 TM.Options.EABIVersion == EABI::EABI5) {
693 static const struct {
694 const RTLIB::Libcall Op;
695 const char *const Name;
696 const CallingConv::ID CC;
697 const ISD::CondCode Cond;
698 } MemOpsLibraryCalls[] = {
699 // Memory operations
700 // RTABI chapter 4.3.4
701 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
702 { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
703 { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
704 };
705
706 for (const auto &LC : MemOpsLibraryCalls) {
707 setLibcallName(LC.Op, LC.Name);
708 setLibcallCallingConv(LC.Op, LC.CC);
709 if (LC.Cond != ISD::SETCC_INVALID)
710 setCmpLibcallCC(LC.Op, LC.Cond);
711 }
712 }
713 }
714
715 if (Subtarget->isTargetWindows()) {
716 static const struct {
717 const RTLIB::Libcall Op;
718 const char * const Name;
719 const CallingConv::ID CC;
720 } LibraryCalls[] = {
721 { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
722 { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
723 { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
724 { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
725 { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
726 { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
727 { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
728 { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
729 };
730
731 for (const auto &LC : LibraryCalls) {
732 setLibcallName(LC.Op, LC.Name);
733 setLibcallCallingConv(LC.Op, LC.CC);
734 }
735 }
736
737 // Use divmod compiler-rt calls for iOS 5.0 and later.
738 if (Subtarget->isTargetMachO() &&
739 !(Subtarget->isTargetIOS() &&
740 Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
741 setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
742 setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
743 }
744
745 // The half <-> float conversion functions are always soft-float on
 746 // non-watchOS platforms, but are needed for some targets which use a
747 // hard-float calling convention by default.
748 if (!Subtarget->isTargetWatchABI()) {
749 if (Subtarget->isAAPCS_ABI()) {
750 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
751 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
752 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
753 } else {
754 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
755 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
756 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
757 }
758 }
759
760 // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
761 // a __gnu_ prefix (which is the default).
762 if (Subtarget->isTargetAEABI()) {
763 static const struct {
764 const RTLIB::Libcall Op;
765 const char * const Name;
766 const CallingConv::ID CC;
767 } LibraryCalls[] = {
768 { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
769 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
770 { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
771 };
772
773 for (const auto &LC : LibraryCalls) {
774 setLibcallName(LC.Op, LC.Name);
775 setLibcallCallingConv(LC.Op, LC.CC);
776 }
777 }
778
779 if (Subtarget->isThumb1Only())
780 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
781 else
782 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
783
784 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
785 Subtarget->hasFPRegs()) {
786 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
787 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
788
793
794 if (!Subtarget->hasVFP2Base())
795 setAllExpand(MVT::f32);
796 if (!Subtarget->hasFP64())
797 setAllExpand(MVT::f64);
798 }
799
800 if (Subtarget->hasFullFP16()) {
801 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
804
807 }
808
809 if (Subtarget->hasBF16()) {
810 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
811 setAllExpand(MVT::bf16);
812 if (!Subtarget->hasFullFP16())
814 }
815
817 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
818 setTruncStoreAction(VT, InnerVT, Expand);
819 addAllExtLoads(VT, InnerVT, Expand);
820 }
821
824
826 }
827
830
833
834 if (Subtarget->hasMVEIntegerOps())
835 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
836
837 // Combine low-overhead loop intrinsics so that we can lower i1 types.
838 if (Subtarget->hasLOB()) {
840 }
841
842 if (Subtarget->hasNEON()) {
843 addDRTypeForNEON(MVT::v2f32);
844 addDRTypeForNEON(MVT::v8i8);
845 addDRTypeForNEON(MVT::v4i16);
846 addDRTypeForNEON(MVT::v2i32);
847 addDRTypeForNEON(MVT::v1i64);
848
849 addQRTypeForNEON(MVT::v4f32);
850 addQRTypeForNEON(MVT::v2f64);
851 addQRTypeForNEON(MVT::v16i8);
852 addQRTypeForNEON(MVT::v8i16);
853 addQRTypeForNEON(MVT::v4i32);
854 addQRTypeForNEON(MVT::v2i64);
855
856 if (Subtarget->hasFullFP16()) {
857 addQRTypeForNEON(MVT::v8f16);
858 addDRTypeForNEON(MVT::v4f16);
859 }
860
861 if (Subtarget->hasBF16()) {
862 addQRTypeForNEON(MVT::v8bf16);
863 addDRTypeForNEON(MVT::v4bf16);
864 }
865 }
866
867 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
868 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
869 // none of Neon, MVE or VFP supports any arithmetic operations on it.
870 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
871 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
872 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
873 // FIXME: Code duplication: FDIV and FREM are expanded always, see
874 // ARMTargetLowering::addTypeForNEON method for details.
875 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
876 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
877 // FIXME: Create unittest.
 878 // In other words, find a case where "copysign" appears in a DAG with vector
879 // operands.
881 // FIXME: Code duplication: SETCC has custom operation action, see
882 // ARMTargetLowering::addTypeForNEON method for details.
884 // FIXME: Create unittest for FNEG and for FABS.
885 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
886 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
888 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
889 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
890 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
891 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
892 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
895 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
898 // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
904 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
905 }
906
907 if (Subtarget->hasNEON()) {
 908 // The same applies to v4f32, but keep in mind that vadd, vsub and vmul are natively
909 // supported for v4f32.
911 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
912 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
913 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
914 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
915 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
918 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
926
927 // Mark v2f32 intrinsics.
929 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
930 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
931 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
932 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
933 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
936 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
944
945 // Neon does not support some operations on v1i64 and v2i64 types.
946 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
947 // Custom handling for some quad-vector types to detect VMULL.
948 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
949 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
950 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
951 // Custom handling for some vector types to avoid expensive expansions
952 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
954 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
956 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
 957 // a destination type that is wider than the source, nor does
 958 // it have an FP_TO_[SU]INT instruction with a narrower destination than
959 // source.
968
971
 972 // NEON does not have a single-instruction CTPOP for vectors with element
 973 // types wider than 8 bits. However, custom lowering can leverage the
974 // v8i8/v16i8 vcnt instruction.
981
982 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
983 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
984
 985 // NEON does not have a single-instruction CTTZ for vectors.
987 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
988 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
989 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
990
991 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
992 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
993 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
994 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
995
1000
1005
1009 }
1010
1011 // NEON only has FMA instructions as of VFP4.
1012 if (!Subtarget->hasVFP4Base()) {
1013 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
1014 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
1015 }
1016
1019
1020 // It is legal to extload from v4i8 to v4i16 or v4i32.
1021 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
1022 MVT::v2i32}) {
1027 }
1028 }
1029
1030 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1031 MVT::v4i32}) {
1036 }
1037 }
1038
1039 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
1046 }
1047 if (Subtarget->hasMVEIntegerOps()) {
1050 ISD::SETCC});
1051 }
1052 if (Subtarget->hasMVEFloatOps()) {
1054 }
1055
1056 if (!Subtarget->hasFP64()) {
1057 // When targeting a floating-point unit with only single-precision
1058 // operations, f64 is legal for the few double-precision instructions which
 1059 // are present. However, no double-precision operations other than moves,
1060 // loads and stores are provided by the hardware.
1098 }
1099
1100 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
1103 if (Subtarget->hasFullFP16()) {
1106 }
1107 }
1108
1109 if (!Subtarget->hasFP16()) {
1112 }
1113
1115
1116 // ARM does not have floating-point extending loads.
1117 for (MVT VT : MVT::fp_valuetypes()) {
1118 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1119 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1120 }
1121
1122 // ... or truncating stores
1123 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
1124 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
1125 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
1126
1127 // ARM does not have i1 sign extending load.
1128 for (MVT VT : MVT::integer_valuetypes())
1129 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
1130
1131 // ARM supports all 4 flavors of integer indexed load / store.
1132 if (!Subtarget->isThumb1Only()) {
1133 for (unsigned im = (unsigned)ISD::PRE_INC;
1135 setIndexedLoadAction(im, MVT::i1, Legal);
1136 setIndexedLoadAction(im, MVT::i8, Legal);
1137 setIndexedLoadAction(im, MVT::i16, Legal);
1138 setIndexedLoadAction(im, MVT::i32, Legal);
1139 setIndexedStoreAction(im, MVT::i1, Legal);
1140 setIndexedStoreAction(im, MVT::i8, Legal);
1141 setIndexedStoreAction(im, MVT::i16, Legal);
1142 setIndexedStoreAction(im, MVT::i32, Legal);
1143 }
1144 } else {
1145 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
1148 }
1149
1154
1157 if (Subtarget->hasDSP()) {
1166 }
1167 if (Subtarget->hasBaseDSP()) {
1170 }
1171
1172 // i64 operation support.
1175 if (Subtarget->isThumb1Only()) {
1178 }
1179 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1180 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1182
1192
 1193 // MVE lowers 64-bit shifts to lsll and lsrl,
 1194 // assuming that ISD::SRL and SRA of i64 are already marked custom.
1195 if (Subtarget->hasMVEIntegerOps())
1197
1198 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1199 if (Subtarget->isThumb1Only()) {
1203 }
1204
1205 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1207
1208 // ARM does not have ROTL.
1213 }
1216 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1219 }
1220
1221 // @llvm.readcyclecounter requires the Performance Monitors extension.
1222 // Default to the 0 expansion on unsupported platforms.
1223 // FIXME: Technically there are older ARM CPUs that have
1224 // implementation-specific ways of obtaining this information.
1225 if (Subtarget->hasPerfMon())
1227
1228 // Only ARMv6 has BSWAP.
1229 if (!Subtarget->hasV6Ops())
1231
1232 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1233 : Subtarget->hasDivideInARMMode();
1234 if (!hasDivide) {
 1235 // These are expanded into libcalls if the CPU doesn't have a HW divider.
1238 }
1239
1240 if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
1243
1246 }
1247
1250
1251 // Register based DivRem for AEABI (RTABI 4.2)
1252 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
1253 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
1254 Subtarget->isTargetWindows()) {
1257 HasStandaloneRem = false;
1258
1259 if (Subtarget->isTargetWindows()) {
1260 const struct {
1261 const RTLIB::Libcall Op;
1262 const char * const Name;
1263 const CallingConv::ID CC;
1264 } LibraryCalls[] = {
1265 { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
1266 { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
1267 { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
1268 { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
1269
1270 { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
1271 { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
1272 { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
1273 { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
1274 };
1275
1276 for (const auto &LC : LibraryCalls) {
1277 setLibcallName(LC.Op, LC.Name);
1278 setLibcallCallingConv(LC.Op, LC.CC);
1279 }
1280 } else {
1281 const struct {
1282 const RTLIB::Libcall Op;
1283 const char * const Name;
1284 const CallingConv::ID CC;
1285 } LibraryCalls[] = {
1286 { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1287 { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1288 { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1289 { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
1290
1291 { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1292 { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1293 { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1294 { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
1295 };
1296
1297 for (const auto &LC : LibraryCalls) {
1298 setLibcallName(LC.Op, LC.Name);
1299 setLibcallCallingConv(LC.Op, LC.CC);
1300 }
1301 }
1302
1307 } else {
1310 }
1311
1312 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
1313 // MSVCRT doesn't have powi; fall back to pow
1314 setLibcallName(RTLIB::POWI_F32, nullptr);
1315 setLibcallName(RTLIB::POWI_F64, nullptr);
1316 }
1317
1322
1323 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1325
1326 // Use the default implementation.
1328 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1330 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1333
1334 if (Subtarget->isTargetWindows())
1336 else
1338
1339 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1340 // the default expansion.
1341 InsertFencesForAtomic = false;
1342 if (Subtarget->hasAnyDataBarrier() &&
1343 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1344 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1345 // to ldrex/strex loops already.
1347 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1349
1350 // On v8, we have particularly efficient implementations of atomic fences
1351 // if they can be combined with nearby atomic loads and stores.
1352 if (!Subtarget->hasAcquireRelease() ||
1353 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1354 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1355 InsertFencesForAtomic = true;
1356 }
1357 } else {
1358 // If there's anything we can use as a barrier, go through custom lowering
1359 // for ATOMIC_FENCE.
 1360 // If the target has DMB in Thumb, fences can be inserted.
1361 if (Subtarget->hasDataBarrier())
1362 InsertFencesForAtomic = true;
1363
1365 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1366
1367 // Set them all for libcall, which will force libcalls.
1380 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1381 // Unordered/Monotonic case.
1382 if (!InsertFencesForAtomic) {
1385 }
1386 }
1387
1388 // Compute supported atomic widths.
1389 if (Subtarget->isTargetLinux() ||
1390 (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1391 // For targets where __sync_* routines are reliably available, we use them
1392 // if necessary.
1393 //
1394 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1395 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1396 //
1397 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1398 // such targets should provide __sync_* routines, which use the ARM mode
1399 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1400 // encoding; see ARMISD::MEMBARRIER_MCR.)
1402 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1403 Subtarget->hasForced32BitAtomics()) {
1404 // Cortex-M (besides Cortex-M0) have 32-bit atomics.
1406 } else {
1407 // We can't assume anything about other targets; just use libatomic
1408 // routines.
1410 }
1411
1413
1415
1416 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1417 if (!Subtarget->hasV6Ops()) {
1420 }
1422
1423 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1424 !Subtarget->isThumb1Only()) {
1425 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1426 // iff target supports vfp2.
1436 }
1437
1438 // We want to custom lower some of our intrinsics.
1443 if (Subtarget->useSjLjEH())
1444 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1445
1455 if (Subtarget->hasFullFP16()) {
1459 }
1460
1462
1465 if (Subtarget->hasFullFP16())
1469 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1470
1471 // We don't support sin/cos/fmod/copysign/pow
1480 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1481 !Subtarget->isThumb1Only()) {
1484 }
1487
1488 if (!Subtarget->hasVFP4Base()) {
1491 }
1492
1493 // Various VFP goodness
1494 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1495 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1496 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1499 }
1500
1501 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1502 if (!Subtarget->hasFP16()) {
1505 }
1506
1507 // Strict floating-point comparisons need custom lowering.
1514 }
1515
1516 // Use __sincos_stret if available.
1517 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1518 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1521 }
1522
1523 // FP-ARMv8 implements a lot of rounding-like FP operations.
1524 if (Subtarget->hasFPARMv8Base()) {
1533 if (Subtarget->hasNEON()) {
1538 }
1539
1540 if (Subtarget->hasFP64()) {
1549 }
1550 }
1551
 1552 // FP16 ops often need to be promoted to call lib functions
1553 if (Subtarget->hasFullFP16()) {
1568
1570 }
1571
1572 if (Subtarget->hasNEON()) {
1573 // vmin and vmax aren't available in a scalar form, so we can use
1574 // a NEON instruction with an undef lane instead.
1583
1584 if (Subtarget->hasFullFP16()) {
1589
1594 }
1595 }
1596
1597 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1598 // it, but it's just a wrapper around ldexp.
1599 if (Subtarget->isTargetWindows()) {
1601 if (isOperationExpand(Op, MVT::f32))
1602 setOperationAction(Op, MVT::f32, Promote);
1603 }
1604
1605 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1606 // isn't legal.
1608 if (isOperationExpand(Op, MVT::f16))
1609 setOperationAction(Op, MVT::f16, Promote);
1610
1611 // We have target-specific dag combine patterns for the following nodes:
1612 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1615
1616 if (Subtarget->hasMVEIntegerOps())
1618
1619 if (Subtarget->hasV6Ops())
1621 if (Subtarget->isThumb1Only())
1623 // Attempt to lower smin/smax to ssat/usat
1624 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1625 Subtarget->isThumb2()) {
1627 }
1628
1630
1631 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1632 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1634 else
1636
1637 //// temporary - rewrite interface to use type
1640 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1642 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1644
1645 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1646 // are at least 4 bytes aligned.
1648
1649 // Prefer likely predicted branches to selects on out-of-order cores.
1650 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1651
1652 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
1654
1655 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1656}
1657
1659 return Subtarget->useSoftFloat();
1660}
1661
1662// FIXME: It might make sense to define the representative register class as the
1663// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
 1664// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1665// SPR's representative would be DPR_VFP2. This should work well if register
1666// pressure tracking were modified such that a register use would increment the
 1667// pressure of the register class's representative and all of its super
1668// classes' representatives transitively. We have not implemented this because
1669// of the difficulty prior to coalescing of modeling operand register classes
1670// due to the common occurrence of cross class copies and subregister insertions
1671// and extractions.
1672std::pair<const TargetRegisterClass *, uint8_t>
1674 MVT VT) const {
1675 const TargetRegisterClass *RRC = nullptr;
1676 uint8_t Cost = 1;
1677 switch (VT.SimpleTy) {
1678 default:
 1680 // Use DPR as the representative register class for all floating-point
 1681 // and vector types. Since there are 32 SPR registers and 32 DPR registers,
 1682 // the cost is 1 for both f32 and f64.
1683 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1684 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1685 RRC = &ARM::DPRRegClass;
1686 // When NEON is used for SP, only half of the register file is available
1687 // because operations that define both SP and DP results will be constrained
1688 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1689 // coalescing by double-counting the SP regs. See the FIXME above.
1690 if (Subtarget->useNEONForSinglePrecisionFP())
1691 Cost = 2;
1692 break;
1693 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1694 case MVT::v4f32: case MVT::v2f64:
1695 RRC = &ARM::DPRRegClass;
1696 Cost = 2;
1697 break;
1698 case MVT::v4i64:
1699 RRC = &ARM::DPRRegClass;
1700 Cost = 4;
1701 break;
1702 case MVT::v8i64:
1703 RRC = &ARM::DPRRegClass;
1704 Cost = 8;
1705 break;
1706 }
1707 return std::make_pair(RRC, Cost);
1708}
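// Editor's note - illustrative sketch, not part of the upstream file (the TRI
// parameter name is assumed from the overridden TargetLowering hook): given
// the switch above, a query such as
//   auto [RRC, Cost] = TLI.findRepresentativeClass(TRI, MVT::v4f32);
// returns {&ARM::DPRRegClass, 2}, i.e. a 128-bit vector is costed as two D
// registers for register-pressure purposes, while MVT::f32 costs 1 (or 2 when
// NEON is used for single-precision FP).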
1709
1710const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1711#define MAKE_CASE(V) \
1712 case V: \
1713 return #V;
1714 switch ((ARMISD::NodeType)Opcode) {
1716 break;
1919#undef MAKE_CASE
1920 }
1921 return nullptr;
1922}
1923
1925 EVT VT) const {
1926 if (!VT.isVector())
1927 return getPointerTy(DL);
1928
1929 // MVE has a predicate register.
1930 if ((Subtarget->hasMVEIntegerOps() &&
1931 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1932 VT == MVT::v16i8)) ||
1933 (Subtarget->hasMVEFloatOps() &&
1934 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1935 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1937}
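// Editor's note - illustrative sketch, not part of the upstream file (DL and
// Ctx denote the DataLayout and LLVMContext arguments of the hook): with MVE
// integer ops enabled, a compare of two v4i32 vectors yields a v4i1 predicate
// held in a VCCR register, e.g.
//   EVT CCVT = TLI.getSetCCResultType(DL, Ctx, EVT(MVT::v4i32)); // MVT::v4i1
// whereas scalar compares fall through to the pointer-sized default returned
// by getPointerTy(DL).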
1938
1939/// getRegClassFor - Return the register class that should be used for the
1940/// specified value type.
1941const TargetRegisterClass *
1942ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1943 (void)isDivergent;
1944 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1945 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1946 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1947 // MVE Q registers.
1948 if (Subtarget->hasNEON()) {
1949 if (VT == MVT::v4i64)
1950 return &ARM::QQPRRegClass;
1951 if (VT == MVT::v8i64)
1952 return &ARM::QQQQPRRegClass;
1953 }
1954 if (Subtarget->hasMVEIntegerOps()) {
1955 if (VT == MVT::v4i64)
1956 return &ARM::MQQPRRegClass;
1957 if (VT == MVT::v8i64)
1958 return &ARM::MQQQQPRRegClass;
1959 }
1961}
1962
 1963// memcpy and other memory intrinsics typically try to use LDM/STM if the
1964// source/dest is aligned and the copy size is large enough. We therefore want
1965// to align such objects passed to memory intrinsics.
1967 Align &PrefAlign) const {
1968 if (!isa<MemIntrinsic>(CI))
1969 return false;
1970 MinSize = 8;
1971 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1972 // cycle faster than 4-byte aligned LDM.
1973 PrefAlign =
1974 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1975 return true;
1976}
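// Editor's note - illustrative sketch, not part of the upstream file (CI is
// assumed to be the CallInst parameter of the hook above): for a memcpy
// intrinsic call the query behaves roughly as
//   unsigned MinSize; Align PrefAlign;
//   if (TLI.shouldAlignPointerArgs(CI, MinSize, PrefAlign))
//     ; // MinSize == 8, PrefAlign == Align(8) on ARM11+ non-M-class cores
// so sufficiently large buffers passed to memcpy may be over-aligned to 8
// bytes to benefit from the faster aligned LDM/STM sequences.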
1977
1978// Create a fast isel object.
1979FastISel *
1981 const TargetLibraryInfo *libInfo) const {
1982 return ARM::createFastISel(funcInfo, libInfo);
1983}
1984
1986 unsigned NumVals = N->getNumValues();
1987 if (!NumVals)
1988 return Sched::RegPressure;
1989
1990 for (unsigned i = 0; i != NumVals; ++i) {
1991 EVT VT = N->getValueType(i);
1992 if (VT == MVT::Glue || VT == MVT::Other)
1993 continue;
1994 if (VT.isFloatingPoint() || VT.isVector())
1995 return Sched::ILP;
1996 }
1997
1998 if (!N->isMachineOpcode())
1999 return Sched::RegPressure;
2000
 2001 // Loads are scheduled for latency even if their instruction itinerary
2002 // is not available.
2003 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2004 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
2005
2006 if (MCID.getNumDefs() == 0)
2007 return Sched::RegPressure;
2008 if (!Itins->isEmpty() &&
2009 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
2010 return Sched::ILP;
2011
2012 return Sched::RegPressure;
2013}
2014
2015//===----------------------------------------------------------------------===//
2016// Lowering Code
2017//===----------------------------------------------------------------------===//
2018
2019static bool isSRL16(const SDValue &Op) {
2020 if (Op.getOpcode() != ISD::SRL)
2021 return false;
2022 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2023 return Const->getZExtValue() == 16;
2024 return false;
2025}
2026
2027static bool isSRA16(const SDValue &Op) {
2028 if (Op.getOpcode() != ISD::SRA)
2029 return false;
2030 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2031 return Const->getZExtValue() == 16;
2032 return false;
2033}
2034
2035static bool isSHL16(const SDValue &Op) {
2036 if (Op.getOpcode() != ISD::SHL)
2037 return false;
2038 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2039 return Const->getZExtValue() == 16;
2040 return false;
2041}
2042
2043// Check for a signed 16-bit value. We special case SRA because it makes it
 2044// simpler when also looking for SRAs that aren't sign-extending a
2045// smaller value. Without the check, we'd need to take extra care with
2046// checking order for some operations.
2047static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
2048 if (isSRA16(Op))
2049 return isSHL16(Op.getOperand(0));
2050 return DAG.ComputeNumSignBits(Op) == 17;
2051}
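// Editor's note - illustrative sketch, not part of the upstream file: isS16
// accepts either a value the DAG already proves has 17 sign bits (i.e. it fits
// in 16 bits) or the canonical in-register sign-extension idiom
//   (sra (shl X, 16), 16)
// which the isSHL16/isSRA16 helpers above recognise.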
2052
2053/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
2055 switch (CC) {
2056 default: llvm_unreachable("Unknown condition code!");
2057 case ISD::SETNE: return ARMCC::NE;
2058 case ISD::SETEQ: return ARMCC::EQ;
2059 case ISD::SETGT: return ARMCC::GT;
2060 case ISD::SETGE: return ARMCC::GE;
2061 case ISD::SETLT: return ARMCC::LT;
2062 case ISD::SETLE: return ARMCC::LE;
2063 case ISD::SETUGT: return ARMCC::HI;
2064 case ISD::SETUGE: return ARMCC::HS;
2065 case ISD::SETULT: return ARMCC::LO;
2066 case ISD::SETULE: return ARMCC::LS;
2067 }
2068}
2069
2070/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
2072 ARMCC::CondCodes &CondCode2) {
2073 CondCode2 = ARMCC::AL;
2074 switch (CC) {
2075 default: llvm_unreachable("Unknown FP condition!");
2076 case ISD::SETEQ:
2077 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
2078 case ISD::SETGT:
2079 case ISD::SETOGT: CondCode = ARMCC::GT; break;
2080 case ISD::SETGE:
2081 case ISD::SETOGE: CondCode = ARMCC::GE; break;
2082 case ISD::SETOLT: CondCode = ARMCC::MI; break;
2083 case ISD::SETOLE: CondCode = ARMCC::LS; break;
2084 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
2085 case ISD::SETO: CondCode = ARMCC::VC; break;
2086 case ISD::SETUO: CondCode = ARMCC::VS; break;
2087 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
2088 case ISD::SETUGT: CondCode = ARMCC::HI; break;
2089 case ISD::SETUGE: CondCode = ARMCC::PL; break;
2090 case ISD::SETLT:
2091 case ISD::SETULT: CondCode = ARMCC::LT; break;
2092 case ISD::SETLE:
2093 case ISD::SETULE: CondCode = ARMCC::LE; break;
2094 case ISD::SETNE:
2095 case ISD::SETUNE: CondCode = ARMCC::NE; break;
2096 }
2097}
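// Editor's note - illustrative sketch, not part of the upstream file: some FP
// predicates need two ARM conditions, so callers also test CondCode2, e.g.
//   ARMCC::CondCodes CC1, CC2;
//   FPCCToARMCC(ISD::SETONE, CC1, CC2); // CC1 == ARMCC::MI, CC2 == ARMCC::GT
//   FPCCToARMCC(ISD::SETOEQ, CC1, CC2); // CC1 == ARMCC::EQ, CC2 == ARMCC::AL
// where ARMCC::AL means the second check is not needed.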
2098
2099//===----------------------------------------------------------------------===//
2100// Calling Convention Implementation
2101//===----------------------------------------------------------------------===//
2102
2103/// getEffectiveCallingConv - Get the effective calling convention, taking into
 2104/// account the presence of floating-point hardware and calling convention
2105/// limitations, such as support for variadic functions.
2107ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
2108 bool isVarArg) const {
2109 switch (CC) {
2110 default:
2111 report_fatal_error("Unsupported calling convention");
2114 case CallingConv::GHC:
2116 return CC;
2122 case CallingConv::Swift:
2125 case CallingConv::C:
2126 case CallingConv::Tail:
2127 if (!Subtarget->isAAPCS_ABI())
2128 return CallingConv::ARM_APCS;
2129 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
2130 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
2131 !isVarArg)
2133 else
2135 case CallingConv::Fast:
2137 if (!Subtarget->isAAPCS_ABI()) {
2138 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
2139 return CallingConv::Fast;
2140 return CallingConv::ARM_APCS;
2141 } else if (Subtarget->hasVFP2Base() &&
2142 !Subtarget->isThumb1Only() && !isVarArg)
2144 else
2146 }
2147}
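// Editor's note - illustrative sketch, not part of the upstream file: on a
// hard-float AAPCS target (FP registers present, not Thumb1,
// -mfloat-abi=hard) the mapping above gives
//   getEffectiveCallingConv(CallingConv::C, /*isVarArg=*/false); // ARM_AAPCS_VFP
//   getEffectiveCallingConv(CallingConv::C, /*isVarArg=*/true);  // ARM_AAPCS
// reflecting the AAPCS rule that variadic arguments never use VFP registers.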
2148
2150 bool isVarArg) const {
2151 return CCAssignFnForNode(CC, false, isVarArg);
2152}
2153
2155 bool isVarArg) const {
2156 return CCAssignFnForNode(CC, true, isVarArg);
2157}
2158
2159/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
2160/// CallingConvention.
2161CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
2162 bool Return,
2163 bool isVarArg) const {
2164 switch (getEffectiveCallingConv(CC, isVarArg)) {
2165 default:
2166 report_fatal_error("Unsupported calling convention");
2168 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
2170 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2172 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
2173 case CallingConv::Fast:
2174 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
2175 case CallingConv::GHC:
2176 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
2178 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2180 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2182 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
2183 }
2184}
2185
2186SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2187 MVT LocVT, MVT ValVT, SDValue Val) const {
2188 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
2189 Val);
2190 if (Subtarget->hasFullFP16()) {
2191 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
2192 } else {
2193 Val = DAG.getNode(ISD::TRUNCATE, dl,
2194 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2195 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
2196 }
2197 return Val;
2198}
2199
2200SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2201 MVT LocVT, MVT ValVT,
2202 SDValue Val) const {
2203 if (Subtarget->hasFullFP16()) {
2204 Val = DAG.getNode(ARMISD::VMOVrh, dl,
2205 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2206 } else {
2207 Val = DAG.getNode(ISD::BITCAST, dl,
2208 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2209 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
2210 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2211 }
2212 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
2213}
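// Editor's note - illustrative sketch, not part of the upstream file: without
// full FP16 support these two helpers model an f16 value living in the low 16
// bits of a 32-bit location, roughly
//   MoveToHPR:   i32/f32 loc -> bitcast i32 -> truncate i16 -> bitcast f16
//   MoveFromHPR: f16 -> bitcast i16 -> zero_extend i32 -> bitcast i32/f32 loc
// whereas with +fullfp16 a single ARMISD::VMOVhr / ARMISD::VMOVrh node is used
// instead.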
2214
2215/// LowerCallResult - Lower the result values of a call into the
2216/// appropriate copies out of appropriate physical registers.
2217SDValue ARMTargetLowering::LowerCallResult(
2218 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
2219 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2220 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2221 SDValue ThisVal, bool isCmseNSCall) const {
2222 // Assign locations to each value returned by this call.
2224 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2225 *DAG.getContext());
2226 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
2227
2228 // Copy all of the result registers out of their specified physreg.
2229 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2230 CCValAssign VA = RVLocs[i];
2231
2232 // Pass 'this' value directly from the argument to return value, to avoid
2233 // reg unit interference
2234 if (i == 0 && isThisReturn) {
2235 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
2236 "unexpected return calling convention register assignment");
2237 InVals.push_back(ThisVal);
2238 continue;
2239 }
2240
2241 SDValue Val;
2242 if (VA.needsCustom() &&
2243 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
2244 // Handle f64 or half of a v2f64.
2245 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2246 InGlue);
2247 Chain = Lo.getValue(1);
2248 InGlue = Lo.getValue(2);
2249 VA = RVLocs[++i]; // skip ahead to next loc
2250 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2251 InGlue);
2252 Chain = Hi.getValue(1);
2253 InGlue = Hi.getValue(2);
2254 if (!Subtarget->isLittle())
2255 std::swap (Lo, Hi);
2256 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2257
2258 if (VA.getLocVT() == MVT::v2f64) {
2259 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2260 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2261 DAG.getConstant(0, dl, MVT::i32));
2262
2263 VA = RVLocs[++i]; // skip ahead to next loc
2264 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2265 Chain = Lo.getValue(1);
2266 InGlue = Lo.getValue(2);
2267 VA = RVLocs[++i]; // skip ahead to next loc
2268 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2269 Chain = Hi.getValue(1);
2270 InGlue = Hi.getValue(2);
2271 if (!Subtarget->isLittle())
2272 std::swap (Lo, Hi);
2273 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2274 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2275 DAG.getConstant(1, dl, MVT::i32));
2276 }
2277 } else {
2278 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
2279 InGlue);
2280 Chain = Val.getValue(1);
2281 InGlue = Val.getValue(2);
2282 }
2283
2284 switch (VA.getLocInfo()) {
2285 default: llvm_unreachable("Unknown loc info!");
2286 case CCValAssign::Full: break;
2287 case CCValAssign::BCvt:
2288 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
2289 break;
2290 }
2291
2292 // f16 arguments have their size extended to 4 bytes and passed as if they
2293 // had been copied to the LSBs of a 32-bit register.
 2294 // For that, they are passed extended to i32 (soft ABI) or to f32 (hard ABI).
2295 if (VA.needsCustom() &&
2296 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
2297 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
2298
2299 // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
2300 // is less than 32 bits must be sign- or zero-extended after the call for
2301 // security reasons. Although the ABI mandates an extension done by the
2302 // callee, the latter cannot be trusted to follow the rules of the ABI.
2303 const ISD::InputArg &Arg = Ins[VA.getValNo()];
2304 if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
2305 VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
2306 Val = handleCMSEValue(Val, Arg, DAG, dl);
2307
2308 InVals.push_back(Val);
2309 }
2310
2311 return Chain;
2312}
2313
2314std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
2315 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
2316 bool IsTailCall, int SPDiff) const {
2317 SDValue DstAddr;
2318 MachinePointerInfo DstInfo;
2319 int32_t Offset = VA.getLocMemOffset();
2321
2322 if (IsTailCall) {
2323 Offset += SPDiff;
2324 auto PtrVT = getPointerTy(DAG.getDataLayout());
2325 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
2326 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
2327 DstAddr = DAG.getFrameIndex(FI, PtrVT);
2328 DstInfo =
2330 } else {
2331 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
2332 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2333 StackPtr, PtrOff);
2334 DstInfo =
2336 }
2337
2338 return std::make_pair(DstAddr, DstInfo);
2339}
2340
2341void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2342 SDValue Chain, SDValue &Arg,
2343 RegsToPassVector &RegsToPass,
2344 CCValAssign &VA, CCValAssign &NextVA,
2345 SDValue &StackPtr,
2346 SmallVectorImpl<SDValue> &MemOpChains,
2347 bool IsTailCall,
2348 int SPDiff) const {
2349 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2350 DAG.getVTList(MVT::i32, MVT::i32), Arg);
2351 unsigned id = Subtarget->isLittle() ? 0 : 1;
2352 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2353
2354 if (NextVA.isRegLoc())
2355 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2356 else {
2357 assert(NextVA.isMemLoc());
2358 if (!StackPtr.getNode())
2359 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2360 getPointerTy(DAG.getDataLayout()));
2361
2362 SDValue DstAddr;
2363 MachinePointerInfo DstInfo;
2364 std::tie(DstAddr, DstInfo) =
2365 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
2366 MemOpChains.push_back(
2367 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2368 }
2369}
2370
2371static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2372 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2373 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
2374}
2375
2376/// LowerCall - Lowering a call into a callseq_start <-
2377/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
2378/// nodes.
2379SDValue
2380ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2381 SmallVectorImpl<SDValue> &InVals) const {
2382 SelectionDAG &DAG = CLI.DAG;
2383 SDLoc &dl = CLI.DL;
2384 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2385 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2386 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2387 SDValue Chain = CLI.Chain;
2388 SDValue Callee = CLI.Callee;
2389 bool &isTailCall = CLI.IsTailCall;
2390 CallingConv::ID CallConv = CLI.CallConv;
2391 bool doesNotRet = CLI.DoesNotReturn;
2392 bool isVarArg = CLI.IsVarArg;
2393
2394 MachineFunction &MF = DAG.getMachineFunction();
2395 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2396 MachineFunction::CallSiteInfo CSInfo;
2397 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2398 bool isThisReturn = false;
2399 bool isCmseNSCall = false;
2400 bool isSibCall = false;
2401 bool PreferIndirect = false;
2402 bool GuardWithBTI = false;
2403
2404 // Analyze operands of the call, assigning locations to each operand.
2405 SmallVector<CCValAssign, 16> ArgLocs;
2406 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2407 *DAG.getContext());
2408 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2409
2410 // Lower 'returns_twice' calls to a pseudo-instruction.
2411 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2412 !Subtarget->noBTIAtReturnTwice())
2413 GuardWithBTI = AFI->branchTargetEnforcement();
2414
2415 // Determine whether this is a non-secure function call.
2416 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2417 isCmseNSCall = true;
2418
2419 // Disable tail calls if they're not supported.
2420 if (!Subtarget->supportsTailCall())
2421 isTailCall = false;
2422
2423 // For both the non-secure calls and the returns from a CMSE entry function,
2424 // the function needs to do some extra work after the call, or before the
2425 // return, respectively; thus it cannot end with a tail call.
2426 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2427 isTailCall = false;
2428
2429 if (isa<GlobalAddressSDNode>(Callee)) {
2430 // If we're optimizing for minimum size and the function is called three or
2431 // more times in this block, we can improve codesize by calling indirectly
2432 // as BLXr has a 16-bit encoding.
2433 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2434 if (CLI.CB) {
2435 auto *BB = CLI.CB->getParent();
2436 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2437 count_if(GV->users(), [&BB](const User *U) {
2438 return isa<Instruction>(U) &&
2439 cast<Instruction>(U)->getParent() == BB;
2440 }) > 2;
2441 }
2442 }
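// (count_if(...) > 2 above implements the "three or more times" heuristic
// described in the preceding comment.)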
2443 if (isTailCall) {
2444 // Check if it's really possible to do a tail call.
2445 isTailCall =
2446 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2447
2448 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2449 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2450 isSibCall = true;
2451
2452 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2453 // detected sibcalls.
2454 if (isTailCall)
2455 ++NumTailCalls;
2456 }
2457
2458 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2459 report_fatal_error("failed to perform tail call elimination on a call "
2460 "site marked musttail");
2461
2462 // Get a count of how many bytes are to be pushed on the stack.
2463 unsigned NumBytes = CCInfo.getStackSize();
2464
2465 // SPDiff is the byte offset of the call's argument area from the callee's.
2466 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2467 // by this amount for a tail call. In a sibling call it must be 0 because the
2468 // caller will deallocate the entire stack and the callee still expects its
2469 // arguments to begin at SP+0. Completely unused for non-tail calls.
2470 int SPDiff = 0;
2471
2472 if (isTailCall && !isSibCall) {
2473 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2474 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2475
2476 // Since the callee will pop the argument stack as a tail call, we must
2477 // keep the popped size 16-byte aligned.
2478 Align StackAlign = DAG.getDataLayout().getStackAlignment();
2479 NumBytes = alignTo(NumBytes, StackAlign);
2480
2481 // SPDiff will be negative if this tail call requires more space than we
2482 // would automatically have in our incoming argument space. Positive if we
2483 // can actually shrink the stack.
2484 SPDiff = NumReusableBytes - NumBytes;
2485
2486 // If this call requires more stack than we have available from
2487 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2488 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2489 AFI->setArgRegsSaveSize(-SPDiff);
2490 }
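// For example, if the caller's incoming stack arguments occupy 8 bytes
// (NumReusableBytes) but this tail call needs 16 bytes of stack arguments
// (NumBytes), SPDiff is -8 and the prologue must reserve 8 extra bytes of
// argument save area.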
2491
2492 if (isSibCall) {
2493 // For sibling tail calls, memory operands are available in our caller's stack.
2494 NumBytes = 0;
2495 } else {
2496 // Adjust the stack pointer for the new arguments...
2497 // These operations are automatically eliminated by the prolog/epilog pass
2498 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2499 }
2500
2501 SDValue StackPtr =
2502 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2503
2504 RegsToPassVector RegsToPass;
2505 SmallVector<SDValue, 8> MemOpChains;
2506
2507 // During a tail call, stores to the argument area must happen after all of
2508 // the function's incoming arguments have been loaded because they may alias.
2509 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2510 // there's no point in doing so repeatedly so this tracks whether that's
2511 // happened yet.
2512 bool AfterFormalArgLoads = false;
2513
2514 // Walk the register/memloc assignments, inserting copies/loads. In the case
2515 // of tail call optimization, arguments are handled later.
2516 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2517 i != e;
2518 ++i, ++realArgIdx) {
2519 CCValAssign &VA = ArgLocs[i];
2520 SDValue Arg = OutVals[realArgIdx];
2521 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2522 bool isByVal = Flags.isByVal();
2523
2524 // Promote the value if needed.
2525 switch (VA.getLocInfo()) {
2526 default: llvm_unreachable("Unknown loc info!");
2527 case CCValAssign::Full: break;
2528 case CCValAssign::SExt:
2529 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2530 break;
2531 case CCValAssign::ZExt:
2532 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2533 break;
2534 case CCValAssign::AExt:
2535 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2536 break;
2537 case CCValAssign::BCvt:
2538 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2539 break;
2540 }
2541
2542 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2543 Chain = DAG.getStackArgumentTokenFactor(Chain);
2544 AfterFormalArgLoads = true;
2545 }
2546
2547 // f16 arguments have their size extended to 4 bytes and passed as if they
2548 // had been copied to the LSBs of a 32-bit register.
2549 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2550 if (VA.needsCustom() &&
2551 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2552 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2553 } else {
2554 // f16 arguments could have been extended prior to argument lowering.
2555 // Mask them if this is a CMSE nonsecure call.
2556 auto ArgVT = Outs[realArgIdx].ArgVT;
2557 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2558 auto LocBits = VA.getLocVT().getSizeInBits();
2559 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2560 SDValue Mask =
2561 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2562 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2563 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2564 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2565 }
2566 }
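// The AND with MaskValue above clears the undefined upper bits of the
// extended f16 value, so no stale register contents are exposed to the
// non-secure callee.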
2567
2568 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2569 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2570 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2571 DAG.getConstant(0, dl, MVT::i32));
2572 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2573 DAG.getConstant(1, dl, MVT::i32));
2574
2575 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2576 StackPtr, MemOpChains, isTailCall, SPDiff);
2577
2578 VA = ArgLocs[++i]; // skip ahead to next loc
2579 if (VA.isRegLoc()) {
2580 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2581 StackPtr, MemOpChains, isTailCall, SPDiff);
2582 } else {
2583 assert(VA.isMemLoc());
2584 SDValue DstAddr;
2585 MachinePointerInfo DstInfo;
2586 std::tie(DstAddr, DstInfo) =
2587 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2588 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2589 }
2590 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2591 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2592 StackPtr, MemOpChains, isTailCall, SPDiff);
2593 } else if (VA.isRegLoc()) {
2594 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2595 Outs[0].VT == MVT::i32) {
2596 assert(VA.getLocVT() == MVT::i32 &&
2597 "unexpected calling convention register assignment");
2598 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2599 "unexpected use of 'returned'");
2600 isThisReturn = true;
2601 }
2602 const TargetOptions &Options = DAG.getTarget().Options;
2603 if (Options.EmitCallSiteInfo)
2604 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2605 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2606 } else if (isByVal) {
2607 assert(VA.isMemLoc());
2608 unsigned offset = 0;
2609
2610 // True if this byval aggregate will be split between registers
2611 // and memory.
2612 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2613 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2614
2615 if (CurByValIdx < ByValArgsCount) {
2616
2617 unsigned RegBegin, RegEnd;
2618 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2619
2620 EVT PtrVT =
2621 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
2622 unsigned int i, j;
2623 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2624 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2625 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
2626 SDValue Load =
2627 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2628 DAG.InferPtrAlign(AddArg));
2629 MemOpChains.push_back(Load.getValue(1));
2630 RegsToPass.push_back(std::make_pair(j, Load));
2631 }
2632
2633 // If the parameter size exceeds the register area, the "offset" value
2634 // helps us to calculate the stack slot for the remaining part properly.
2635 offset = RegEnd - RegBegin;
2636
2637 CCInfo.nextInRegsParam();
2638 }
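// At this point 'offset' is the number of GPRs consumed by this byval
// argument; any remainder beyond 4*offset bytes is copied to the stack below
// with an ARMISD::COPY_STRUCT_BYVAL node.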
2639
2640 if (Flags.getByValSize() > 4*offset) {
2641 auto PtrVT = getPointerTy(DAG.getDataLayout());
2642 SDValue Dst;
2643 MachinePointerInfo DstInfo;
2644 std::tie(Dst, DstInfo) =
2645 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2646 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2647 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
2648 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2649 MVT::i32);
2650 SDValue AlignNode =
2651 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2652
2653 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2654 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2655 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2656 Ops));
2657 }
2658 } else {
2659 assert(VA.isMemLoc());
2660 SDValue DstAddr;
2661 MachinePointerInfo DstInfo;
2662 std::tie(DstAddr, DstInfo) =
2663 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2664
2665 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2666 MemOpChains.push_back(Store);
2667 }
2668 }
2669
2670 if (!MemOpChains.empty())
2671 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2672
2673 // Build a sequence of copy-to-reg nodes chained together with token chain
2674 // and flag operands which copy the outgoing args into the appropriate regs.
2675 SDValue InGlue;
2676 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2677 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2678 RegsToPass[i].second, InGlue);
2679 InGlue = Chain.getValue(1);
2680 }
2681
2682 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2683 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2684 // node so that legalize doesn't hack it.
2685 bool isDirect = false;
2686
2687 const TargetMachine &TM = getTargetMachine();
2688 const GlobalValue *GVal = nullptr;
2689 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2690 GVal = G->getGlobal();
2691 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && Subtarget->isTargetMachO();
2692
2693 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2694 bool isLocalARMFunc = false;
2695 auto PtrVt = getPointerTy(DAG.getDataLayout());
2696
2697 if (Subtarget->genLongCalls()) {
2698 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2699 "long-calls codegen is not position independent!");
2700 // Handle a global address or an external symbol. If it's not one of
2701 // those, the target's already in a register, so we don't need to do
2702 // anything extra.
2703 if (isa<GlobalAddressSDNode>(Callee)) {
2704 if (Subtarget->genExecuteOnly()) {
2705 if (Subtarget->useMovt())
2706 ++NumMovwMovt;
2707 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2708 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2709 } else {
2710 // Create a constant pool entry for the callee address
2711 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2712 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2713 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2714
2715 // Get the address of the callee into a register
2716 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2717 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2718 Callee = DAG.getLoad(
2719 PtrVt, dl, DAG.getEntryNode(), Addr,
2720 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2721 }
2722 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2723 const char *Sym = S->getSymbol();
2724
2725 if (Subtarget->genExecuteOnly()) {
2726 if (Subtarget->useMovt())
2727 ++NumMovwMovt;
2728 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2729 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2730 } else {
2731 // Create a constant pool entry for the callee address
2732 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2733 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2734 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2735
2736 // Get the address of the callee into a register
2737 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2738 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2739 Callee = DAG.getLoad(
2740 PtrVt, dl, DAG.getEntryNode(), Addr,
2741 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2742 }
2743 }
2744 } else if (isa<GlobalAddressSDNode>(Callee)) {
2745 if (!PreferIndirect) {
2746 isDirect = true;
2747 bool isDef = GVal->isStrongDefinitionForLinker();
2748
2749 // ARM call to a local ARM function is predicable.
2750 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2751 // tBX takes a register source operand.
2752 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2753 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2754 Callee = DAG.getNode(
2755 ARMISD::WrapperPIC, dl, PtrVt,
2756 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2757 Callee = DAG.getLoad(
2758 PtrVt, dl, DAG.getEntryNode(), Callee,
2759 MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),
2760 MachineMemOperand::MODereferenceable |
2761 MachineMemOperand::MOInvariant);
2762 } else if (Subtarget->isTargetCOFF()) {
2763 assert(Subtarget->isTargetWindows() &&
2764 "Windows is the only supported COFF target");
2765 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2766 if (GVal->hasDLLImportStorageClass())
2767 TargetFlags = ARMII::MO_DLLIMPORT;
2768 else if (!TM.shouldAssumeDSOLocal(GVal))
2769 TargetFlags = ARMII::MO_COFFSTUB;
2770 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2771 TargetFlags);
2772 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2773 Callee =
2774 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2775 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2776 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2777 } else {
2778 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2779 }
2780 }
2781 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2782 isDirect = true;
2783 // tBX takes a register source operand.
2784 const char *Sym = S->getSymbol();
2785 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2786 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2787 ARMConstantPoolValue *CPV =
2788 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2789 ARMPCLabelIndex, 4);
2790 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2791 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2792 Callee = DAG.getLoad(
2793 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2794 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2795 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2796 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2797 } else {
2798 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2799 }
2800 }
2801
2802 if (isCmseNSCall) {
2803 assert(!isARMFunc && !isDirect &&
2804 "Cannot handle call to ARM function or direct call");
2805 if (NumBytes > 0) {
2807 "call to non-secure function would "
2808 "require passing arguments on stack",
2809 dl.getDebugLoc());
2810 DAG.getContext()->diagnose(Diag);
2811 }
2812 if (isStructRet) {
2815 "call to non-secure function would return value through pointer",
2816 dl.getDebugLoc());
2817 DAG.getContext()->diagnose(Diag);
2818 }
2819 }
2820
2821 // FIXME: handle tail calls differently.
2822 unsigned CallOpc;
2823 if (Subtarget->isThumb()) {
2824 if (GuardWithBTI)
2825 CallOpc = ARMISD::t2CALL_BTI;
2826 else if (isCmseNSCall)
2827 CallOpc = ARMISD::tSECALL;
2828 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2829 CallOpc = ARMISD::CALL_NOLINK;
2830 else
2831 CallOpc = ARMISD::CALL;
2832 } else {
2833 if (!isDirect && !Subtarget->hasV5TOps())
2834 CallOpc = ARMISD::CALL_NOLINK;
2835 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2836 // Emit regular call when code size is the priority
2837 !Subtarget->hasMinSize())
2838 // "mov lr, pc; b _foo" to avoid confusing the RSP
2839 CallOpc = ARMISD::CALL_NOLINK;
2840 else
2841 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2842 }
2843
2844 // We don't usually want to end the call-sequence here because we would tidy
2845 // the frame up *after* the call, however in the ABI-changing tail-call case
2846 // we've carefully laid out the parameters so that when sp is reset they'll be
2847 // in the correct location.
2848 if (isTailCall && !isSibCall) {
2849 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2850 InGlue = Chain.getValue(1);
2851 }
2852
2853 std::vector<SDValue> Ops;
2854 Ops.push_back(Chain);
2855 Ops.push_back(Callee);
2856
2857 if (isTailCall) {
2858 Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32));
2859 }
2860
2861 // Add argument registers to the end of the list so that they are known live
2862 // into the call.
2863 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2864 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2865 RegsToPass[i].second.getValueType()));
2866
2867 // Add a register mask operand representing the call-preserved registers.
2868 const uint32_t *Mask;
2869 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2870 if (isThisReturn) {
2871 // For 'this' returns, use the R0-preserving mask if applicable
2872 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2873 if (!Mask) {
2874 // Set isThisReturn to false if the calling convention is not one that
2875 // allows 'returned' to be modeled in this way, so LowerCallResult does
2876 // not try to pass 'this' straight through
2877 isThisReturn = false;
2878 Mask = ARI->getCallPreservedMask(MF, CallConv);
2879 }
2880 } else
2881 Mask = ARI->getCallPreservedMask(MF, CallConv);
2882
2883 assert(Mask && "Missing call preserved mask for calling convention");
2884 Ops.push_back(DAG.getRegisterMask(Mask));
2885
2886 if (InGlue.getNode())
2887 Ops.push_back(InGlue);
2888
2889 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2890 if (isTailCall) {
2891 MF.getFrameInfo().setHasTailCall();
2892 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2893 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2894 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2895 return Ret;
2896 }
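// A tail call terminates the function here: ARMISD::TC_RETURN replaces both
// the call and the return, so no CALLSEQ_END or result copies are emitted.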
2897
2898 // Returns a chain and a flag for retval copy to use.
2899 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2900 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2901 InGlue = Chain.getValue(1);
2902 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2903
2904 // If we're guaranteeing tail-calls will be honoured, the callee must
2905 // pop its own argument stack on return. But this call is *not* a tail call so
2906 // we need to undo that after it returns to restore the status-quo.
2907 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2908 uint64_t CalleePopBytes =
2909 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;
2910
2911 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2912 if (!Ins.empty())
2913 InGlue = Chain.getValue(1);
2914
2915 // Handle result values, copying them out of physregs into vregs that we
2916 // return.
2917 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2918 InVals, isThisReturn,
2919 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2920}
2921
2922/// HandleByVal - Every parameter *after* a byval parameter is passed
2923/// on the stack. Remember the next parameter register to allocate,
2924 /// and then confiscate the rest of the parameter registers to ensure
2925/// this.
2926void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2927 Align Alignment) const {
2928 // Byval (as with any stack) slots are always at least 4 byte aligned.
2929 Alignment = std::max(Alignment, Align(4));
2930
2931 unsigned Reg = State->AllocateReg(GPRArgRegs);
2932 if (!Reg)
2933 return;
2934
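// Skip ('waste') argument registers as needed so that the first register
// holding the byval satisfies its alignment; e.g. an 8-byte-aligned byval
// starts in an even-numbered register (r0 or r2).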
2935 unsigned AlignInRegs = Alignment.value() / 4;
2936 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2937 for (unsigned i = 0; i < Waste; ++i)
2938 Reg = State->AllocateReg(GPRArgRegs);
2939
2940 if (!Reg)
2941 return;
2942
2943 unsigned Excess = 4 * (ARM::R4 - Reg);
2944
2945 // Special case when NSAA != SP and the parameter size is greater than the
2946 // size of all remaining GPR regs. In that case we can't split the parameter;
2947 // we must send it to the stack. We also must set NCRN to R4, so all
2948 // remaining registers are wasted.
2949 const unsigned NSAAOffset = State->getStackSize();
2950 if (NSAAOffset != 0 && Size > Excess) {
2951 while (State->AllocateReg(GPRArgRegs))
2952 ;
2953 return;
2954 }
2955
2956 // The first register for the byval parameter is the first register that
2957 // wasn't allocated before this method call, so it would be "reg".
2958 // If the parameter is small enough to be saved in the range [reg, r4), then
2959 // the end (first after last) register would be reg + param-size-in-regs;
2960 // else the parameter would be split between registers and stack, and the
2961 // end register would be r4 in this case.
2962 unsigned ByValRegBegin = Reg;
2963 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2964 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2965 // Note, first register is allocated in the beginning of function already,
2966 // allocate remained amount of registers we need.
2967 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2968 State->AllocateReg(GPRArgRegs);
2969 // A byval parameter that is split between registers and memory needs its
2970 // size truncated here.
2971 // In the case where the entire structure fits in registers, we set the
2972 // size in memory to zero.
2973 Size = std::max<int>(Size - Excess, 0);
2974}
2975
2976/// MatchingStackOffset - Return true if the given stack call argument is
2977/// already available in the same position (relatively) of the caller's
2978/// incoming argument stack.
2979 static
2980 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2981 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2982 const TargetInstrInfo *TII) {
2983 unsigned Bytes = Arg.getValueSizeInBits() / 8;
2984 int FI = std::numeric_limits<int>::max();
2985 if (Arg.getOpcode() == ISD::CopyFromReg) {
2986 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2987 if (!VR.isVirtual())
2988 return false;
2989 MachineInstr *Def = MRI->getVRegDef(VR);
2990 if (!Def)
2991 return false;
2992 if (!Flags.isByVal()) {
2993 if (!TII->isLoadFromStackSlot(*Def, FI))
2994 return false;
2995 } else {
2996 return false;
2997 }
2998 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2999 if (Flags.isByVal())
3000 // ByVal argument is passed in as a pointer but it's now being
3001 // dereferenced. e.g.
3002 // define @foo(%struct.X* %A) {
3003 // tail call @bar(%struct.X* byval %A)
3004 // }
3005 return false;
3006 SDValue Ptr = Ld->getBasePtr();
3007 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3008 if (!FINode)
3009 return false;
3010 FI = FINode->getIndex();
3011 } else
3012 return false;
3013
3014 assert(FI != std::numeric_limits<int>::max());
3015 if (!MFI.isFixedObjectIndex(FI))
3016 return false;
3017 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
3018}
3019
3020/// IsEligibleForTailCallOptimization - Check whether the call is eligible
3021/// for tail call optimization. Targets which want to do tail call
3022/// optimization should implement this function. Note that this function also
3023/// processes musttail calls, so when this function returns false on a valid
3024/// musttail call, a fatal backend error occurs.
3025 bool ARMTargetLowering::IsEligibleForTailCallOptimization(
3026 TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
3027 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
3028 CallingConv::ID CalleeCC = CLI.CallConv;
3029 SDValue Callee = CLI.Callee;
3030 bool isVarArg = CLI.IsVarArg;
3031 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3032 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3033 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3034 const SelectionDAG &DAG = CLI.DAG;
3035 MachineFunction &MF = DAG.getMachineFunction();
3036 const Function &CallerF = MF.getFunction();
3037 CallingConv::ID CallerCC = CallerF.getCallingConv();
3038
3039 assert(Subtarget->supportsTailCall());
3040
3041 // Indirect tail calls cannot be optimized for Thumb1 if the args
3042 // to the call take up r0-r3. The reason is that there are no legal registers
3043 // left to hold the pointer to the function to be called.
3044 // Similarly, if the function uses return address sign and authentication,
3045 // r12 is needed to hold the PAC and is not available to hold the callee
3046 // address.
3047 if (Outs.size() >= 4 &&
3048 (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) {
3049 if (Subtarget->isThumb1Only())
3050 return false;
3051 // Conservatively assume the function spills LR.
3052 if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true))
3053 return false;
3054 }
3055
3056 // Look for obvious safe cases to perform tail call optimization that do not
3057 // require ABI changes. This is what gcc calls sibcall.
3058
3059 // Exception-handling functions need a special set of instructions to indicate
3060 // a return to the hardware. Tail-calling another function would probably
3061 // break this.
3062 if (CallerF.hasFnAttribute("interrupt"))
3063 return false;
3064
3065 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
3066 return CalleeCC == CallerCC;
3067
3068 // Also avoid sibcall optimization if either caller or callee uses struct
3069 // return semantics.
3070 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
3071 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
3072 if (isCalleeStructRet || isCallerStructRet)
3073 return false;
3074
3075 // Externally-defined functions with weak linkage should not be
3076 // tail-called on ARM when the OS does not support dynamic
3077 // pre-emption of symbols, as the AAELF spec requires normal calls
3078 // to undefined weak functions to be replaced with a NOP or jump to the
3079 // next instruction. The behaviour of branch instructions in this
3080 // situation (as used for tail calls) is implementation-defined, so we
3081 // cannot rely on the linker replacing the tail call with a return.
3082 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3083 const GlobalValue *GV = G->getGlobal();
3084 const Triple &TT = getTargetMachine().getTargetTriple();
3085 if (GV->hasExternalWeakLinkage() &&
3086 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3087 return false;
3088 }
3089
3090 // Check that the call results are passed in the same way.
3091 LLVMContext &C = *DAG.getContext();
3092 if (!CCState::resultsCompatible(
3093 getEffectiveCallingConv(CalleeCC, isVarArg),
3094 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
3095 CCAssignFnForReturn(CalleeCC, isVarArg),
3096 CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))
3097 return false;
3098 // The callee has to preserve all registers the caller needs to preserve.
3099 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3100 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3101 if (CalleeCC != CallerCC) {
3102 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3103 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3104 return false;
3105 }
3106
3107 // If the caller's vararg or byval argument has been split between registers
3108 // and stack, do not perform a tail call, since part of the argument is in
3109 // the caller's local frame.
3110 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
3111 if (AFI_Caller->getArgRegsSaveSize())
3112 return false;
3113
3114 // If the callee takes no arguments then go on to check the results of the
3115 // call.
3116 if (!Outs.empty()) {
3117 if (CCInfo.getStackSize()) {
3118 // Check if the arguments are already laid out in the right way as
3119 // the caller's fixed stack objects.
3120 MachineFrameInfo &MFI = MF.getFrameInfo();
3121 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3122 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3123 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
3124 i != e;
3125 ++i, ++realArgIdx) {
3126 CCValAssign &VA = ArgLocs[i];
3127 EVT RegVT = VA.getLocVT();
3128 SDValue Arg = OutVals[realArgIdx];
3129 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3130 if (VA.getLocInfo() == CCValAssign::Indirect)
3131 return false;
3132 if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
3133 // f64 and vector types are split into multiple registers or
3134 // register/stack-slot combinations. The types will not match
3135 // the registers; give up on memory f64 refs until we figure
3136 // out what to do about this.
3137 if (!VA.isRegLoc())
3138 return false;
3139 if (!ArgLocs[++i].isRegLoc())
3140 return false;
3141 if (RegVT == MVT::v2f64) {
3142 if (!ArgLocs[++i].isRegLoc())
3143 return false;
3144 if (!ArgLocs[++i].isRegLoc())
3145 return false;
3146 }
3147 } else if (!VA.isRegLoc()) {
3148 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3149 MFI, MRI, TII))
3150 return false;
3151 }
3152 }
3153 }
3154
3155 const MachineRegisterInfo &MRI = MF.getRegInfo();
3156 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3157 return false;
3158 }
3159
3160 return true;
3161}
3162
3163bool
3164ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
3165 MachineFunction &MF, bool isVarArg,
3166 const SmallVectorImpl<ISD::OutputArg> &Outs,
3167 LLVMContext &Context) const {
3168 SmallVector<CCValAssign, 16> RVLocs;
3169 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3170 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3171}
3172
3173 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
3174 const SDLoc &DL, SelectionDAG &DAG) {
3175 const MachineFunction &MF = DAG.getMachineFunction();
3176 const Function &F = MF.getFunction();
3177
3178 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
3179
3180 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
3181 // version of the "preferred return address". These offsets affect the return
3182 // instruction if this is a return from PL1 without hypervisor extensions.
3183 // IRQ/FIQ: +4 "subs pc, lr, #4"
3184 // SWI: 0 "subs pc, lr, #0"
3185 // ABORT: +4 "subs pc, lr, #4"
3186 // UNDEF: +4/+2 "subs pc, lr, #0"
3187 // UNDEF varies depending on whether the exception came from ARM or Thumb
3188 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
3189
3190 int64_t LROffset;
3191 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
3192 IntKind == "ABORT")
3193 LROffset = 4;
3194 else if (IntKind == "SWI" || IntKind == "UNDEF")
3195 LROffset = 0;
3196 else
3197 report_fatal_error("Unsupported interrupt attribute. If present, value "
3198 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
3199
3200 RetOps.insert(RetOps.begin() + 1,
3201 DAG.getConstant(LROffset, DL, MVT::i32, false));
3202
3203 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
3204}
3205
3206SDValue
3207ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3208 bool isVarArg,
3210 const SmallVectorImpl<SDValue> &OutVals,
3211 const SDLoc &dl, SelectionDAG &DAG) const {
3212 // CCValAssign - represent the assignment of the return value to a location.
3213 SmallVector<CCValAssign, 16> RVLocs;
3214
3215 // CCState - Info about the registers and stack slots.
3216 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3217 *DAG.getContext());
3218
3219 // Analyze outgoing return values.
3220 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3221
3222 SDValue Glue;
3223 SmallVector<SDValue, 4> RetOps;
3224 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3225 bool isLittleEndian = Subtarget->isLittle();
3226
3227 MachineFunction &MF = DAG.getMachineFunction();
3228 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3229 AFI->setReturnRegsCount(RVLocs.size());
3230
3231 // Report error if cmse entry function returns structure through first ptr arg.
3232 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
3233 // Note: using an empty SDLoc(), as the first line of the function is a
3234 // better place to report than the last line.
3237 "secure entry function would return value through pointer",
3238 SDLoc().getDebugLoc());
3239 DAG.getContext()->diagnose(Diag);
3240 }
3241
3242 // Copy the result values into the output registers.
3243 for (unsigned i = 0, realRVLocIdx = 0;
3244 i != RVLocs.size();
3245 ++i, ++realRVLocIdx) {
3246 CCValAssign &VA = RVLocs[i];
3247 assert(VA.isRegLoc() && "Can only return in registers!");
3248
3249 SDValue Arg = OutVals[realRVLocIdx];
3250 bool ReturnF16 = false;
3251
3252 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
3253 // Half-precision return values can be returned like this:
3254 //
3255 // t11 f16 = fadd ...
3256 // t12: i16 = bitcast t11
3257 // t13: i32 = zero_extend t12
3258 // t14: f32 = bitcast t13 <~~~~~~~ Arg
3259 //
3260 // to avoid code generation for bitcasts, we simply set Arg to the node
3261 // that produces the f16 value, t11 in this case.
3262 //
3263 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3264 SDValue ZE = Arg.getOperand(0);
3265 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3266 SDValue BC = ZE.getOperand(0);
3267 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3268 Arg = BC.getOperand(0);
3269 ReturnF16 = true;
3270 }
3271 }
3272 }
3273 }
3274
3275 switch (VA.getLocInfo()) {
3276 default: llvm_unreachable("Unknown loc info!");
3277 case CCValAssign::Full: break;
3278 case CCValAssign::BCvt:
3279 if (!ReturnF16)
3280 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3281 break;
3282 }
3283
3284 // Mask f16 arguments if this is a CMSE nonsecure entry.
3285 auto RetVT = Outs[realRVLocIdx].ArgVT;
3286 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3287 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3288 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
3289 } else {
3290 auto LocBits = VA.getLocVT().getSizeInBits();
3291 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
3292 SDValue Mask =
3293 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
3294 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
3295 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
3296 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3297 }
3298 }
3299
3300 if (VA.needsCustom() &&
3301 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3302 if (VA.getLocVT() == MVT::v2f64) {
3303 // Extract the first half and return it in two registers.
3304 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3305 DAG.getConstant(0, dl, MVT::i32));
3306 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3307 DAG.getVTList(MVT::i32, MVT::i32), Half);
3308
3309 Chain =
3310 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3311 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3312 Glue = Chain.getValue(1);
3313 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3314 VA = RVLocs[++i]; // skip ahead to next loc
3315 Chain =
3316 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3317 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3318 Glue = Chain.getValue(1);
3319 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3320 VA = RVLocs[++i]; // skip ahead to next loc
3321
3322 // Extract the 2nd half and fall through to handle it as an f64 value.
3323 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3324 DAG.getConstant(1, dl, MVT::i32));
3325 }
3326 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3327 // available.
3328 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3329 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3330 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3331 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3332 Glue = Chain.getValue(1);
3333 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3334 VA = RVLocs[++i]; // skip ahead to next loc
3335 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3336 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3337 } else
3338 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3339
3340 // Guarantee that all emitted copies are
3341 // stuck together, avoiding something bad.
3342 Glue = Chain.getValue(1);
3343 RetOps.push_back(DAG.getRegister(
3344 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3345 }
3346 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3347 const MCPhysReg *I =
3348 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3349 if (I) {
3350 for (; *I; ++I) {
3351 if (ARM::GPRRegClass.contains(*I))
3352 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3353 else if (ARM::DPRRegClass.contains(*I))
3354 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3355 else
3356 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3357 }
3358 }
3359
3360 // Update chain and glue.
3361 RetOps[0] = Chain;
3362 if (Glue.getNode())
3363 RetOps.push_back(Glue);
3364
3365 // CPUs which aren't M-class use a special sequence to return from
3366 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3367 // though we use "subs pc, lr, #N").
3368 //
3369 // M-class CPUs actually use a normal return sequence with a special
3370 // (hardware-provided) value in LR, so the normal code path works.
3371 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3372 !Subtarget->isMClass()) {
3373 if (Subtarget->isThumb1Only())
3374 report_fatal_error("interrupt attribute is not supported in Thumb1");
3375 return LowerInterruptReturn(RetOps, dl, DAG);
3376 }
3377
3378 ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_GLUE :
3379 ARMISD::RET_GLUE;
3380 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3381}
3382
3383bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3384 if (N->getNumValues() != 1)
3385 return false;
3386 if (!N->hasNUsesOfValue(1, 0))
3387 return false;
3388
3389 SDValue TCChain = Chain;
3390 SDNode *Copy = *N->use_begin();
3391 if (Copy->getOpcode() == ISD::CopyToReg) {
3392 // If the copy has a glue operand, we conservatively assume it isn't safe to
3393 // perform a tail call.
3394 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3395 return false;
3396 TCChain = Copy->getOperand(0);
3397 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3398 SDNode *VMov = Copy;
3399 // f64 returned in a pair of GPRs.
3400 SmallPtrSet<SDNode*, 2> Copies;
3401 for (SDNode *U : VMov->uses()) {
3402 if (U->getOpcode() != ISD::CopyToReg)
3403 return false;
3404 Copies.insert(U);
3405 }
3406 if (Copies.size() > 2)
3407 return false;
3408
3409 for (SDNode *U : VMov->uses()) {
3410 SDValue UseChain = U->getOperand(0);
3411 if (Copies.count(UseChain.getNode()))
3412 // Second CopyToReg
3413 Copy = U;
3414 else {
3415 // We are at the top of this chain.
3416 // If the copy has a glue operand, we conservatively assume it
3417 // isn't safe to perform a tail call.
3418 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3419 return false;
3420 // First CopyToReg
3421 TCChain = UseChain;
3422 }
3423 }
3424 } else if (Copy->getOpcode() == ISD::BITCAST) {
3425 // f32 returned in a single GPR.
3426 if (!Copy->hasOneUse())
3427 return false;
3428 Copy = *Copy->use_begin();
3429 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3430 return false;
3431 // If the copy has a glue operand, we conservatively assume it isn't safe to
3432 // perform a tail call.
3433 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3434 return false;
3435 TCChain = Copy->getOperand(0);
3436 } else {
3437 return false;
3438 }
3439
3440 bool HasRet = false;
3441 for (const SDNode *U : Copy->uses()) {
3442 if (U->getOpcode() != ARMISD::RET_GLUE &&
3443 U->getOpcode() != ARMISD::INTRET_GLUE)
3444 return false;
3445 HasRet = true;
3446 }
3447
3448 if (!HasRet)
3449 return false;
3450
3451 Chain = TCChain;
3452 return true;
3453}
3454
3455bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3456 if (!Subtarget->supportsTailCall())
3457 return false;
3458
3459 if (!CI->isTailCall())
3460 return false;
3461
3462 return true;
3463}
3464
3465 // Trying to write a 64-bit value, so we need to split it into two 32-bit
3466 // values first and pass the low and high parts through.
3467 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
3468 SDLoc DL(Op);
3469 SDValue WriteValue = Op->getOperand(2);
3470
3471 // This function is only supposed to be called for i64 type argument.
3472 assert(WriteValue.getValueType() == MVT::i64
3473 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3474
3475 SDValue Lo, Hi;
3476 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3477 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3478 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3479}
3480
3481// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3482// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3483 // one of the above-mentioned nodes. It has to be wrapped because otherwise
3484 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3485 // be used to form an addressing mode. These wrapped nodes will be selected
3486// into MOVi.
3487SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3488 SelectionDAG &DAG) const {
3489 EVT PtrVT = Op.getValueType();
3490 // FIXME there is no actual debug info here
3491 SDLoc dl(Op);
3492 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3493 SDValue Res;
3494
3495 // When generating execute-only code Constant Pools must be promoted to the
3496 // global data section. It's a bit ugly that we can't share them across basic
3497 // blocks, but this way we guarantee that execute-only behaves correctly with
3498 // position-independent addressing modes.
3499 if (Subtarget->genExecuteOnly()) {
3500 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3501 auto T = const_cast<Type*>(CP->getType());
3502 auto C = const_cast<Constant*>(CP->getConstVal());
3503 auto M = const_cast<Module*>(DAG.getMachineFunction().
3504 getFunction().getParent());
3505 auto GV = new GlobalVariable(
3506 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3507 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
3508 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3509 Twine(AFI->createPICLabelUId())
3510 );
3511 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
3512 dl, PtrVT);
3513 return LowerGlobalAddress(GA, DAG);
3514 }
3515
3516 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3517 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3518 Align CPAlign = CP->getAlign();
3519 if (Subtarget->isThumb1Only())
3520 CPAlign = std::max(CPAlign, Align(4));
3521 if (CP->isMachineConstantPoolEntry())
3522 Res =
3523 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3524 else
3525 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3526 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3527}
3528
3529 unsigned ARMTargetLowering::getJumpTableEncoding() const {
3530 // If we don't have a 32-bit pc-relative branch instruction then the jump
3531 // table consists of block addresses. Usually this is inline, but for
3532 // execute-only it must be placed out-of-line.
3533 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3534 return MachineJumpTableInfo::EK_BlockAddress;
3535 return MachineJumpTableInfo::EK_Inline;
3536}
3537
3538SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3539 SelectionDAG &DAG) const {
3540 MachineFunction &MF = DAG.getMachineFunction();
3541 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3542 unsigned ARMPCLabelIndex = 0;
3543 SDLoc DL(Op);
3544 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3545 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3546 SDValue CPAddr;
3547 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3548 if (!IsPositionIndependent) {
3549 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3550 } else {
3551 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3552 ARMPCLabelIndex = AFI->createPICLabelUId();
3553 ARMConstantPoolValue *CPV =
3554 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3555 ARMCP::CPBlockAddress, PCAdj);
3556 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3557 }
3558 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3559 SDValue Result = DAG.getLoad(
3560 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3561 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3562 if (!IsPositionIndependent)
3563 return Result;
3564 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3565 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3566}
3567
3568/// Convert a TLS address reference into the correct sequence of loads
3569/// and calls to compute the variable's address for Darwin, and return an
3570/// SDValue containing the final node.
3571
3572/// Darwin only has one TLS scheme which must be capable of dealing with the
3573/// fully general situation, in the worst case. This means:
3574/// + "extern __thread" declaration.
3575/// + Defined in a possibly unknown dynamic library.
3576///
3577/// The general system is that each __thread variable has a [3 x i32] descriptor
3578/// which contains information used by the runtime to calculate the address. The
3579/// only part of this the compiler needs to know about is the first word, which
3580/// contains a function pointer that must be called with the address of the
3581/// entire descriptor in "r0".
3582///
3583/// Since this descriptor may be in a different unit, in general access must
3584/// proceed along the usual ARM rules. A common sequence to produce is:
3585///
3586/// movw rT1, :lower16:_var$non_lazy_ptr
3587/// movt rT1, :upper16:_var$non_lazy_ptr
3588/// ldr r0, [rT1]
3589/// ldr rT2, [r0]
3590/// blx rT2
3591/// [...address now in r0...]
3592SDValue
3593ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3594 SelectionDAG &DAG) const {
3595 assert(Subtarget->isTargetDarwin() &&
3596 "This function expects a Darwin target");
3597 SDLoc DL(Op);
3598
3599 // First step is to get the address of the actual global symbol. This is where
3600 // the TLS descriptor lives.
3601 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3602
3603 // The first entry in the descriptor is a function pointer that we must call
3604 // to obtain the address of the variable.
3605 SDValue Chain = DAG.getEntryNode();
3606 SDValue FuncTLVGet = DAG.getLoad(
3607 MVT::i32, DL, Chain, DescAddr,
3608 MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4),
3609 MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
3610 MachineMemOperand::MOInvariant);
3611 Chain = FuncTLVGet.getValue(1);
3612
3613 MachineFunction &F = DAG.getMachineFunction();
3614 MachineFrameInfo &MFI = F.getFrameInfo();
3615 MFI.setAdjustsStack(true);
3616
3617 // TLS calls preserve all registers except those that absolutely must be
3618 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3619 // silly).
3620 auto TRI =
3621 getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
3622 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3623 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3624
3625 // Finally, we can make the call. This is just a degenerate version of a
3626 // normal ARM call node: r0 takes the address of the descriptor, and
3627 // returns the address of the variable in this thread.
3628 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3629 Chain =
3630 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3631 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3632 DAG.getRegisterMask(Mask), Chain.getValue(1));
3633 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3634}
3635
3636SDValue
3637ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3638 SelectionDAG &DAG) const {
3639 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3640
3641 SDValue Chain = DAG.getEntryNode();
3642 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3643 SDLoc DL(Op);
3644
3645 // Load the current TEB (thread environment block)
3646 SDValue Ops[] = {Chain,
3647 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3648 DAG.getTargetConstant(15, DL, MVT::i32),
3649 DAG.getTargetConstant(0, DL, MVT::i32),
3650 DAG.getTargetConstant(13, DL, MVT::i32),
3651 DAG.getTargetConstant(0, DL, MVT::i32),
3652 DAG.getTargetConstant(2, DL, MVT::i32)};
3653 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3654 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3655
3656 SDValue TEB = CurrentTEB.getValue(0);
3657 Chain = CurrentTEB.getValue(1);
3658
3659 // Load the ThreadLocalStoragePointer from the TEB
3660 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3661 SDValue TLSArray =
3662 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3663 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3664
3665 // The pointer to the thread's TLS data area is at offset TLS-index * 4 into
3666 // the TLSArray.
3667
3668 // Load the TLS index from the C runtime
3669 SDValue TLSIndex =
3670 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3671 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3672 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3673
3674 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3675 DAG.getConstant(2, DL, MVT::i32));
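// Each entry in the TLS array is a 4-byte pointer, so shifting _tls_index
// left by two converts the index into a byte offset into the array.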
3676 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3677 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3678 MachinePointerInfo());
3679
3680 // Get the offset of the start of the .tls section (section base)
3681 const auto *GA = cast<GlobalAddressSDNode>(Op);
3682 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3683 SDValue Offset = DAG.getLoad(
3684 PtrVT, DL, Chain,
3685 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3686 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3687 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3688
3689 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3690}
3691
3692// Lower ISD::GlobalTLSAddress using the "general dynamic" model
3693SDValue
3694ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3695 SelectionDAG &DAG) const {
3696 SDLoc dl(GA);
3697 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3698 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3699 MachineFunction &MF = DAG.getMachineFunction();
3700 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3701 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3702 ARMConstantPoolValue *CPV =
3703 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3704 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3705 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3706 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3707 Argument = DAG.getLoad(
3708 PtrVT, dl, DAG.getEntryNode(), Argument,
3709 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3710 SDValue Chain = Argument.getValue(1);
3711
3712 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3713 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3714
3715 // call __tls_get_addr.
3716 ArgListTy Args;
3717 ArgListEntry Entry;
3718 Entry.Node = Argument;
3719 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
3720 Args.push_back(Entry);
3721
3722 // FIXME: is there useful debug info available here?
3723 TargetLowering::CallLoweringInfo CLI(DAG);
3724 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3725 CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
3726 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3727
3728 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3729 return CallResult.first;
3730}
3731
3732// Lower ISD::GlobalTLSAddress using the "initial exec" or
3733// "local exec" model.
3734SDValue
3735ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3736 SelectionDAG &DAG,
3737 TLSModel::Model model) const {
3738 const GlobalValue *GV = GA->getGlobal();
3739 SDLoc dl(GA);
3740 SDValue Offset;
3741 SDValue Chain = DAG.getEntryNode();
3742 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3743 // Get the Thread Pointer
3744 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3745
3746 if (model == TLSModel::InitialExec) {
3747 MachineFunction &MF = DAG.getMachineFunction();
3748 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3749 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3750 // Initial exec model.
3751 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3752 ARMConstantPoolValue *CPV =
3753 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3754 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
3755 true);
3756 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3757 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3758 Offset = DAG.getLoad(
3759 PtrVT, dl, Chain, Offset,
3760 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3761 Chain = Offset.getValue(1);
3762
3763 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3764 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3765
3766 Offset = DAG.getLoad(
3767 PtrVT, dl, Chain, Offset,
3768 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3769 } else {
3770 // local exec model
3771 assert(model == TLSModel::LocalExec);
3772 ARMConstantPoolValue *CPV =
3773 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
3774 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3775 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3776 Offset = DAG.getLoad(
3777 PtrVT, dl, Chain, Offset,
3778 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3779 }
3780
3781 // The address of the thread local variable is the add of the thread
3782 // pointer with the offset of the variable.
3783 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3784}
3785
3786SDValue
3787ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3788 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3789 if (DAG.getTarget().useEmulatedTLS())
3790 return LowerToTLSEmulatedModel(GA, DAG);
3791
3792 if (Subtarget->isTargetDarwin())
3793 return LowerGlobalTLSAddressDarwin(Op, DAG);
3794
3795 if (Subtarget->isTargetWindows())
3796 return LowerGlobalTLSAddressWindows(Op, DAG);
3797
3798 // TODO: implement the "local dynamic" model
3799 assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3800 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
3801
3802 switch (model) {
3803 case TLSModel::GeneralDynamic:
3804 case TLSModel::LocalDynamic:
3805 return LowerToTLSGeneralDynamicModel(GA, DAG);
3806 case TLSModel::InitialExec:
3807 case TLSModel::LocalExec:
3808 return LowerToTLSExecModels(GA, DAG, model);
3809 }
3810 llvm_unreachable("bogus TLS model");
3811}
3812
3813/// Return true if all users of V are within function F, looking through
3814/// ConstantExprs.
3815static bool allUsersAreInFunction(const Value *V, const Function *F) {
3816 SmallVector<const User*,4> Worklist(V->users());
3817 while (!Worklist.empty()) {
3818 auto *U = Worklist.pop_back_val();
3819 if (isa<ConstantExpr>(U)) {
3820 append_range(Worklist, U->users());
3821 continue;
3822 }
3823
3824 auto *I = dyn_cast<Instruction>(U);
3825 if (!I || I->getParent()->getParent() != F)
3826 return false;
3827 }
3828 return true;
3829}
3830
3831 static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
3832 const GlobalValue *GV, SelectionDAG &DAG,
3833 EVT PtrVT, const SDLoc &dl) {
3834 // If we're creating a pool entry for a constant global with unnamed address,
3835 // and the global is small enough, we can emit it inline into the constant pool
3836 // to save ourselves an indirection.
3837 //
3838 // This is a win if the constant is only used in one function (so it doesn't
3839 // need to be duplicated) or duplicating the constant wouldn't increase code
3840 // size (implying the constant is no larger than 4 bytes).
3841 const Function &F = DAG.getMachineFunction().getFunction();
3842
3843 // We rely on this decision to inline being idempotent and unrelated to the
3844 // use-site. We know that if we inline a variable at one use site, we'll
3845 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3846 // doesn't know about this optimization, so bail out if it's enabled; else
3847 // we could decide to inline here (and thus never emit the GV) but require
3848 // the GV from fast-isel generated code.
3851 return SDValue();
3852
3853 auto *GVar = dyn_cast<GlobalVariable>(GV);
3854 if (!GVar || !GVar->hasInitializer() ||
3855 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3856 !GVar->hasLocalLinkage())
3857 return SDValue();
3858
3859 // If we inline a value that contains relocations, we move the relocations
3860 // from .data to .text. This is not allowed in position-independent code.
3861 auto *Init = GVar->getInitializer();
3862 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3863 Init->needsDynamicRelocation())
3864 return SDValue();
3865
3866 // The constant islands pass can only really deal with alignment requests
3867 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3868 // any type wanting greater alignment requirements than 4 bytes. We also
3869 // can only promote constants that are multiples of 4 bytes in size or
3870 // are paddable to a multiple of 4. Currently we only try to pad constants
3871 // that are strings, for simplicity.
3872 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3873 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3874 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
3875 unsigned RequiredPadding = 4 - (Size % 4);
3876 bool PaddingPossible =
3877 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3878 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3879 Size == 0)
3880 return SDValue();
3881
3882 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3885
3886 // We can't bloat the constant pool too much, else the ConstantIslands pass
3887 // may fail to converge. If we haven't promoted this global yet (it may have
3888 // multiple uses), and promoting it would increase the constant pool size (Sz
3889 // > 4), ensure we have space to do so up to MaxTotal.
3890 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3891 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3893 return SDValue();
3894
3895 // This is only valid if all users are in a single function; we can't clone
3896 // the constant in general. The LLVM IR unnamed_addr allows merging
3897 // constants, but not cloning them.
3898 //
3899 // We could potentially allow cloning if we could prove all uses of the
3900 // constant in the current function don't care about the address, like
3901 // printf format strings. But that isn't implemented for now.
3902 if (!allUsersAreInFunction(GVar, &F))
3903 return SDValue();
3904
3905 // We're going to inline this global. Pad it out if needed.
3906 if (RequiredPadding != 4) {
3907 StringRef S = CDAInit->getAsString();
3908
3910 std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3911 while (RequiredPadding--)
3912 V.push_back(0);
3914 }
3915
3916 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3917 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
3918 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3921 PaddedSize - 4);
3922 }
3923 ++NumConstpoolPromoted;
3924 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3925}
3926
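// isReadOnly: a global resolves to read-only storage if it (or the object it
// aliases) is a constant variable or a function.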
3928 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3929 if (!(GV = GA->getAliaseeObject()))
3930 return false;
3931 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3932 return V->isConstant();
3933 return isa<Function>(GV);
3934}
3935
3936SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3937 SelectionDAG &DAG) const {
3938 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3939 default: llvm_unreachable("unknown object format");
3940 case Triple::COFF:
3941 return LowerGlobalAddressWindows(Op, DAG);
3942 case Triple::ELF:
3943 return LowerGlobalAddressELF(Op, DAG);
3944 case Triple::MachO:
3945 return LowerGlobalAddressDarwin(Op, DAG);
3946 }
3947}
3948
3949SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3950 SelectionDAG &DAG) const {
3951 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3952 SDLoc dl(Op);
3953 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3954 bool IsRO = isReadOnly(GV);
3955
3956 // Promote to a constant pool only if we are not generating an execute-only (XO) text section.
3957 if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
3958 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3959 return V;
3960
3961 if (isPositionIndependent()) {
3963 GV, dl, PtrVT, 0, GV->isDSOLocal() ? 0 : ARMII::MO_GOT);
3964 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3965 if (!GV->isDSOLocal())
3966 Result =
3967 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3969 return Result;
3970 } else if (Subtarget->isROPI() && IsRO) {
3971 // PC-relative.
3972 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3973 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3974 return Result;
3975 } else if (Subtarget->isRWPI() && !IsRO) {
3976 // SB-relative.
3977 SDValue RelAddr;
3978 if (Subtarget->useMovt()) {
3979 ++NumMovwMovt;
3980 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3981 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3982 } else { // use literal pool for address constant
3985 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3986 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3987 RelAddr = DAG.getLoad(
3988 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3990 }
3991 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3992 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3993 return Result;
3994 }
3995
3996 // If we have T2 ops, we can materialize the address directly via movt/movw
3997 // pair; this is always cheaper. If we need to generate execute-only code and
3998 // only have Thumb1 available, we can't use a constant pool and are forced to
3999 // use immediate relocations.
4000 if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
4001 if (Subtarget->useMovt())
4002 ++NumMovwMovt;
4003 // FIXME: Once remat is capable of dealing with instructions with register
4004 // operands, expand this into two nodes.
4005 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
4006 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
4007 } else {
4008 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
4009 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
4010 return DAG.getLoad(
4011 PtrVT, dl, DAG.getEntryNode(), CPAddr,
4013 }
4014}
4015
4016SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
4017 SelectionDAG &DAG) const {
4018 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
4019 "ROPI/RWPI not currently supported for Darwin");
4020 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4021 SDLoc dl(Op);
4022 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
4023
4024 if (Subtarget->useMovt())
4025 ++NumMovwMovt;
4026
4027 // FIXME: Once remat is capable of dealing with instructions with register
4028 // operands, expand this into multiple nodes
4029 unsigned Wrapper =
4031
4032 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
4033 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
4034
4035 if (Subtarget->isGVIndirectSymbol(GV))
4036 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
4038 return Result;
4039}
4040
4041SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
4042 SelectionDAG &DAG) const {
4043 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
4044 assert(Subtarget->useMovt() &&
4045 "Windows on ARM expects to use movw/movt");
4046 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
4047 "ROPI/RWPI not currently supported for Windows");
4048
4050 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
4051 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
4052 if (GV->hasDLLImportStorageClass())
4053 TargetFlags = ARMII::MO_DLLIMPORT;
4054 else if (!TM.shouldAssumeDSOLocal(GV))
4055 TargetFlags = ARMII::MO_COFFSTUB;
4056 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4058 SDLoc DL(Op);
4059
4060 ++NumMovwMovt;
4061
4062 // FIXME: Once remat is capable of dealing with instructions with register
4063 // operands, expand this into two nodes.
4064 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
4065 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
4066 TargetFlags));
4067 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
4068 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
4070 return Result;
4071}
4072
4073SDValue
4074ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
4075 SDLoc dl(Op);
4076 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
4077 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
4078 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
4079 Op.getOperand(1), Val);
4080}
4081
4082SDValue
4083ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
4084 SDLoc dl(Op);
4085 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
4086 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
4087}
4088
4089SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
4090 SelectionDAG &DAG) const {
4091 SDLoc dl(Op);
4092 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
4093 Op.getOperand(0));
4094}
4095
4096SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
4097 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
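  // The intrinsic ID is operand 0, unless the node carries a chain; in that
  // case operand 0 is the chain and the ID is operand 1, hence the boolean index.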
4098 unsigned IntNo =
4099 Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
4100 switch (IntNo) {
4101 default:
4102 return SDValue(); // Don't custom lower most intrinsics.
4103 case Intrinsic::arm_gnu_eabi_mcount: {
4105 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4106 SDLoc dl(Op);
4107 SDValue Chain = Op.getOperand(0);
4108 // call "\01__gnu_mcount_nc"
4109 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
4110 const uint32_t *Mask =
4112 assert(Mask && "Missing call preserved mask for calling convention");
4113 // Mark LR as an implicit live-in.
4114 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
4115 SDValue ReturnAddress =
4116 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
4117 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
4118 SDValue Callee =
4119 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
4121 if (Subtarget->isThumb())
4122 return SDValue(
4123 DAG.getMachineNode(
4124 ARM::tBL_PUSHLR, dl, ResultTys,
4125 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
4126 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
4127 0);
4128 return SDValue(
4129 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
4130 {ReturnAddress, Callee, RegisterMask, Chain}),
4131 0);
4132 }
4133 }
4134}
4135
4136SDValue
4137ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
4138 const ARMSubtarget *Subtarget) const {
4139 unsigned IntNo = Op.getConstantOperandVal(0);
4140 SDLoc dl(Op);
4141 switch (IntNo) {
4142 default: return SDValue(); // Don't custom lower most intrinsics.
4143 case Intrinsic::thread_pointer: {
4144 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4145 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
4146 }
4147 case Intrinsic::arm_cls: {
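  // Expand cls(x) as ctlz(((x ^ (x >> 31)) << 1) | 1): the XOR zeroes every
  // leading bit that matches the sign bit, the shift drops the sign bit itself
  // from the count, and the final OR keeps the ctlz input nonzero.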
4148 const SDValue &Operand = Op.getOperand(1);
4149 const EVT VTy = Op.getValueType();
4150 SDValue SRA =
4151 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
4152 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
4153 SDValue SHL =
4154 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
4155 SDValue OR =
4156 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
4157 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
4158 return Result;
4159 }
4160 case Intrinsic::arm_cls64: {
4161 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
4162 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
4163 const SDValue &Operand = Op.getOperand(1);
4164 const EVT VTy = Op.getValueType();
4165 SDValue Lo, Hi;
4166 std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VTy, VTy);
4167 SDValue Constant0 = DAG.getConstant(0, dl, VTy);
4168 SDValue Constant1 = DAG.getConstant(1, dl, VTy);
4169 SDValue Constant31 = DAG.getConstant(31, dl, VTy);
4170 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
4171 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
4172 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
4173 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
4174 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
4175 SDValue CheckLo =
4176 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
4177 SDValue HiIsZero =
4178 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
4179 SDValue AdjustedLo =
4180 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
4181 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
4182 SDValue Result =
4183 DAG.getSelect(dl, VTy, CheckLo,
4184 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
4185 return Result;
4186 }
4187 case Intrinsic::eh_sjlj_lsda: {
4190 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
4191 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4192 SDValue CPAddr;
4193 bool IsPositionIndependent = isPositionIndependent();
4194 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
4196 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
4197 ARMCP::CPLSDA, PCAdj);
4198 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
4199 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
4200 SDValue Result = DAG.getLoad(
4201 PtrVT, dl, DAG.getEntryNode(), CPAddr,
4203
4204 if (IsPositionIndependent) {
4205 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
4206 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
4207 }
4208 return Result;
4209 }
4210 case Intrinsic::arm_neon_vabs:
4211 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
4212 Op.getOperand(1));
4213 case Intrinsic::arm_neon_vabds:
4214 if (Op.getValueType().isInteger())
4215 return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
4216 Op.getOperand(1), Op.getOperand(2));
4217 return SDValue();
4218 case Intrinsic::arm_neon_vabdu:
4219 return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
4220 Op.getOperand(1), Op.getOperand(2));
4221 case Intrinsic::arm_neon_vmulls:
4222 case Intrinsic::arm_neon_vmullu: {
4223 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
4225 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4226 Op.getOperand(1), Op.getOperand(2));
4227 }
4228 case Intrinsic::arm_neon_vminnm:
4229 case Intrinsic::arm_neon_vmaxnm: {
4230 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
4232 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4233 Op.getOperand(1), Op.getOperand(2));
4234 }
4235 case Intrinsic::arm_neon_vminu:
4236 case Intrinsic::arm_neon_vmaxu: {
4237 if (Op.getValueType().isFloatingPoint())
4238 return SDValue();
4239 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
4240 ? ISD::UMIN : ISD::UMAX;
4241 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4242 Op.getOperand(1), Op.getOperand(2));
4243 }
4244 case Intrinsic::arm_neon_vmins:
4245 case Intrinsic::arm_neon_vmaxs: {
4246 // v{min,max}s is overloaded between signed integers and floats.
4247 if (!Op.getValueType().isFloatingPoint()) {
4248 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4249 ? ISD::SMIN : ISD::SMAX;
4250 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4251 Op.getOperand(1), Op.getOperand(2));
4252 }
4253 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4255 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4256 Op.getOperand(1), Op.getOperand(2));
4257 }
4258 case Intrinsic::arm_neon_vtbl1:
4259 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
4260 Op.getOperand(1), Op.getOperand(2));
4261 case Intrinsic::arm_neon_vtbl2:
4262 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
4263 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4264 case Intrinsic::arm_mve_pred_i2v:
4265 case Intrinsic::arm_mve_pred_v2i:
4266 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
4267 Op.getOperand(1));
4268 case Intrinsic::arm_mve_vreinterpretq:
4269 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
4270 Op.getOperand(1));
4271 case Intrinsic::arm_mve_lsll:
4272 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
4273 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4274 case Intrinsic::arm_mve_asrl:
4275 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
4276 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4277 }
4278}
4279
4281 const ARMSubtarget *Subtarget) {
4282 SDLoc dl(Op);
4283 auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
4284 if (SSID == SyncScope::SingleThread)
4285 return Op;
4286
4287 if (!Subtarget->hasDataBarrier()) {
4288 // Some ARMv6 cpus can support data barriers with an mcr instruction.
4289 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
4290 // here.
4291 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
4292 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
4293 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
4294 DAG.getConstant(0, dl, MVT::i32));
4295 }
4296
4297 AtomicOrdering Ord =
4298 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
4300 if (Subtarget->isMClass()) {
4301 // Only a full system barrier exists in the M-class architectures.
4303 } else if (Subtarget->preferISHSTBarriers() &&
4304 Ord == AtomicOrdering::Release) {
4305 // Swift happens to implement ISHST barriers in a way that's compatible with
4306 // Release semantics but weaker than ISH so we'd be fools not to use
4307 // it. Beware: other processors probably don't!
4309 }
4310
4311 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4312 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4313 DAG.getConstant(Domain, dl, MVT::i32));
4314}
4315
4317 const ARMSubtarget *Subtarget) {
4318 // ARM pre-v5TE and Thumb1 do not have preload instructions.
4319 if (!(Subtarget->isThumb2() ||
4320 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4321 // Just preserve the chain.
4322 return Op.getOperand(0);
4323
4324 SDLoc dl(Op);
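  // For llvm.prefetch, operand 2 is the rw specifier (0 = read, 1 = write) and
  // operand 4 is the cache-type specifier (1 = data, 0 = instruction).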
4325 unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
4326 if (!isRead &&
4327 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4328 // ARMv7 with MP extension has PLDW.
4329 return Op.getOperand(0);
4330
4331 unsigned isData = Op.getConstantOperandVal(4);
4332 if (Subtarget->isThumb()) {
4333 // Invert the bits.
4334 isRead = ~isRead & 1;
4335 isData = ~isData & 1;
4336 }
4337
4338 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4339 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4340 DAG.getConstant(isData, dl, MVT::i32));
4341}
4342
4345 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4346
4347 // vastart just stores the address of the VarArgsFrameIndex slot into the
4348 // memory location argument.
4349 SDLoc dl(Op);
4351 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
4352 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4353 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4354 MachinePointerInfo(SV));
4355}
4356
4357SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4358 CCValAssign &NextVA,
4359 SDValue &Root,
4360 SelectionDAG &DAG,
4361 const SDLoc &dl) const {
4364
4365 const TargetRegisterClass *RC;
4366 if (AFI->isThumb1OnlyFunction())
4367 RC = &ARM::tGPRRegClass;
4368 else
4369 RC = &ARM::GPRRegClass;
4370
4371 // Transform the arguments stored in physical registers into virtual ones.
4372 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4373 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4374
4375 SDValue ArgValue2;
4376 if (NextVA.isMemLoc()) {
4377 MachineFrameInfo &MFI = MF.getFrameInfo();
4378 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4379
4380 // Create load node to retrieve arguments from the stack.
4381 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4382 ArgValue2 = DAG.getLoad(
4383 MVT::i32, dl, Root, FIN,
4385 } else {
4386 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4387 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4388 }
4389 if (!Subtarget->isLittle())
4390 std::swap (ArgValue, ArgValue2);
4391 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4392}
4393
4394// The remaining GPRs hold either the beginning of variable-argument
4395// data, or the beginning of an aggregate passed by value (usually
4396// byval). Either way, we allocate stack slots adjacent to the data
4397// provided by our caller, and store the unallocated registers there.
4398// If this is a variadic function, the va_list pointer will begin with
4399// these values; otherwise, this reassembles a (byval) structure that
4400// was split between registers and memory.
4401 // Return: The frame index the registers were stored into.
4402int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4403 const SDLoc &dl, SDValue &Chain,
4404 const Value *OrigArg,
4405 unsigned InRegsParamRecordIdx,
4406 int ArgOffset, unsigned ArgSize) const {
4407 // Currently, two use cases are possible:
4408 // Case #1. Non-varargs function, and we meet the first byval parameter.
4409 // Set up the first unallocated register as the first byval register;
4410 // eat all remaining registers
4411 // (these two actions are performed by the HandleByVal method).
4412 // Then, here, we initialize the stack frame with
4413 // "store-reg" instructions.
4414 // Case #2. Varargs function that doesn't contain byval parameters.
4415 // The same: eat all remaining unallocated registers and
4416 // initialize the stack frame.
4417
4419 MachineFrameInfo &MFI = MF.getFrameInfo();
4421 unsigned RBegin, REnd;
4422 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4423 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4424 } else {
4425 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4426 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4427 REnd = ARM::R4;
4428 }
4429
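  // The spilled registers occupy 4 bytes each, laid out immediately below the
  // incoming stack arguments, hence the negative offset of 4 * (R4 - RBegin).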
4430 if (REnd != RBegin)
4431 ArgOffset = -4 * (ARM::R4 - RBegin);
4432
4433 auto PtrVT = getPointerTy(DAG.getDataLayout());
4434 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4435 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4436
4438 const TargetRegisterClass *RC =
4439 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4440
4441 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4442 Register VReg = MF.addLiveIn(Reg, RC);
4443 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4444 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4445 MachinePointerInfo(OrigArg, 4 * i));
4446 MemOps.push_back(Store);
4447 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4448 }
4449
4450 if (!MemOps.empty())
4451 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4452 return FrameIndex;
4453}
4454
4455 // Set up the stack frame that the va_list pointer will start from.
4456void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4457 const SDLoc &dl, SDValue &Chain,
4458 unsigned ArgOffset,
4459 unsigned TotalArgRegsSaveSize,
4460 bool ForceMutable) const {
4463
4464 // Try to store any remaining integer argument regs
4465 // to their spots on the stack so that they may be loaded by dereferencing
4466 // the result of va_next.
4467 // If there are no regs to be stored, just point the address after the last
4468 // argument passed via the stack.
4469 int FrameIndex = StoreByValRegs(
4470 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4471 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4472 AFI->setVarArgsFrameIndex(FrameIndex);
4473}
4474
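// An f16/bf16 value passed or returned in an f32 register part occupies the
// low 16 bits of that register: bitcast to i16, any-extend to i32, then
// bitcast the 32-bit value to f32.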
4475bool ARMTargetLowering::splitValueIntoRegisterParts(
4476 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4477 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4478 EVT ValueVT = Val.getValueType();
4479 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4480 unsigned ValueBits = ValueVT.getSizeInBits();
4481 unsigned PartBits = PartVT.getSizeInBits();
4482 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4483 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4484 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4485 Parts[0] = Val;
4486 return true;
4487 }
4488 return false;
4489}
4490
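// Inverse of the split above: recover an f16/bf16 value from the low 16 bits
// of an f32 part by bitcasting to i32, truncating to i16, and bitcasting to
// the original value type.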
4491SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4492 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4493 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4494 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4495 unsigned ValueBits = ValueVT.getSizeInBits();
4496 unsigned PartBits = PartVT.getSizeInBits();
4497 SDValue Val = Parts[0];
4498
4499 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4500 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4501 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4502 return Val;
4503 }
4504 return SDValue();
4505}
4506
4507SDValue ARMTargetLowering::LowerFormalArguments(
4508 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4509 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4510 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4512 MachineFrameInfo &MFI = MF.getFrameInfo();
4513
4515
4516 // Assign locations to all of the incoming arguments.
4518 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4519 *DAG.getContext());
4520 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4521
4523 unsigned CurArgIdx = 0;
4524
4525 // Initially ArgRegsSaveSize is zero.
4526 // Then we increase this value each time we meet a byval parameter.
4527 // We also increase this value in the case of a varargs function.
4528 AFI->setArgRegsSaveSize(0);
4529
4530 // Calculate the amount of stack space that we need to allocate to store
4531 // byval and variadic arguments that are passed in registers.
4532 // We need to know this before we allocate the first byval or variadic
4533 // argument, as they will be allocated a stack slot below the CFA (Canonical
4534 // Frame Address, the stack pointer at entry to the function).
4535 unsigned ArgRegBegin = ARM::R4;
4536 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4537 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4538 break;
4539
4540 CCValAssign &VA = ArgLocs[i];
4541 unsigned Index = VA.getValNo();
4542 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4543 if (!Flags.isByVal())
4544 continue;
4545
4546 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4547 unsigned RBegin, REnd;
4548 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4549 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4550
4551 CCInfo.nextInRegsParam();
4552 }
4553 CCInfo.rewindByValRegsInfo();
4554
4555 int lastInsIndex = -1;
4556 if (isVarArg && MFI.hasVAStart()) {
4557 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4558 if (RegIdx != std::size(GPRArgRegs))
4559 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4560 }
4561
4562 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4563 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4564 auto PtrVT = getPointerTy(DAG.getDataLayout());
4565
4566 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4567 CCValAssign &VA = ArgLocs[i];
4568 if (Ins[VA.getValNo()].isOrigArg()) {
4569 std::advance(CurOrigArg,
4570 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4571 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4572 }
4573 // Arguments stored in registers.
4574 if (VA.isRegLoc()) {
4575 EVT RegVT = VA.getLocVT();
4576 SDValue ArgValue;
4577
4578 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4579 // f64 and vector types are split up into multiple registers or
4580 // combinations of registers and stack slots.
4581 SDValue ArgValue1 =
4582 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4583 VA = ArgLocs[++i]; // skip ahead to next loc
4584 SDValue ArgValue2;
4585 if (VA.isMemLoc()) {
4586 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4587 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4588 ArgValue2 = DAG.getLoad(
4589 MVT::f64, dl, Chain, FIN,
4591 } else {
4592 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4593 }
4594 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4595 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4596 ArgValue1, DAG.getIntPtrConstant(0, dl));
4597 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4598 ArgValue2, DAG.getIntPtrConstant(1, dl));
4599 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4600 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4601 } else {
4602 const TargetRegisterClass *RC;
4603
4604 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4605 RC = &ARM::HPRRegClass;
4606 else if (RegVT == MVT::f32)
4607 RC = &ARM::SPRRegClass;
4608 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4609 RegVT == MVT::v4bf16)
4610 RC = &ARM::DPRRegClass;
4611 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4612 RegVT == MVT::v8bf16)
4613 RC = &ARM::QPRRegClass;
4614 else if (RegVT == MVT::i32)
4615 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4616 : &ARM::GPRRegClass;
4617 else
4618 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4619
4620 // Transform the arguments in physical registers into virtual ones.
4621 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4622 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4623
4624 // If this value is passed in r0 and has the returned attribute (e.g.
4625 // C++ 'structors), record this fact for later use.
4626 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4627 AFI->setPreservesR0();
4628 }
4629 }
4630
4631 // If this is an 8 or 16-bit value, it is really passed promoted
4632 // to 32 bits. Insert an assert[sz]ext to capture this, then
4633 // truncate to the right size.
4634 switch (VA.getLocInfo()) {
4635 default: llvm_unreachable("Unknown loc info!");
4636 case CCValAssign::Full: break;
4637 case CCValAssign::BCvt:
4638 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4639 break;
4640 }
4641
4642 // f16 arguments have their size extended to 4 bytes and are passed as if
4643 // they had been copied to the LSBs of a 32-bit register.
4644 // For that, they are passed extended to i32 (soft ABI) or to f32 (hard ABI).
4645 if (VA.needsCustom() &&
4646 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4647 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4648
4649 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4650 // less than 32 bits must be sign- or zero-extended in the callee for
4651 // security reasons. Although the ABI mandates an extension done by the
4652 // caller, the latter cannot be trusted to follow the rules of the ABI.
4653 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4654 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4655 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4656 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4657
4658 InVals.push_back(ArgValue);
4659 } else { // VA.isRegLoc()
4660 // Only arguments passed on the stack should make it here.
4661 assert(VA.isMemLoc());
4662 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4663
4664 int index = VA.getValNo();
4665
4666 // Some Ins[] entries become multiple ArgLoc[] entries.
4667 // Process them only once.
4668 if (index != lastInsIndex)
4669 {
4670 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4671 // FIXME: For now, all byval parameter objects are marked mutable.
4672 // This can be changed with more analysis.
4673 // In the case of tail call optimization, mark all arguments mutable,
4674 // since they could be overwritten by the lowering of arguments for
4675 // a tail call.
4676 if (Flags.isByVal()) {
4677 assert(Ins[index].isOrigArg() &&
4678 "Byval arguments cannot be implicit");
4679 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4680
4681 int FrameIndex = StoreByValRegs(
4682 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4683 VA.getLocMemOffset(), Flags.getByValSize());
4684 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4685 CCInfo.nextInRegsParam();
4686 } else {
4687 unsigned FIOffset = VA.getLocMemOffset();
4688 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4689 FIOffset, true);
4690
4691 // Create load nodes to retrieve arguments from the stack.
4692 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4693 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4695 DAG.getMachineFunction(), FI)));
4696 }
4697 lastInsIndex = index;
4698 }
4699 }
4700 }
4701
4702 // varargs
4703 if (isVarArg && MFI.hasVAStart()) {
4704 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4705 TotalArgRegsSaveSize);
4706 if (AFI->isCmseNSEntryFunction()) {
4709 "secure entry function must not be variadic", dl.getDebugLoc());
4710 DAG.getContext()->diagnose(Diag);
4711 }
4712 }
4713
4714 unsigned StackArgSize = CCInfo.getStackSize();
4715 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4716 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4717 // The only way to guarantee a tail call is if the callee restores its
4718 // argument area, but it must also keep the stack aligned when doing so.
4719 const DataLayout &DL = DAG.getDataLayout();
4720 StackArgSize = alignTo(StackArgSize, DL.getStackAlignment());
4721
4722 AFI->setArgumentStackToRestore(StackArgSize);
4723 }
4724 AFI->setArgumentStackSize(StackArgSize);
4725
4726 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4729 "secure entry function requires arguments on stack", dl.getDebugLoc());
4730 DAG.getContext()->diagnose(Diag);
4731 }
4732
4733 return Chain;
4734}
4735
4736/// isFloatingPointZero - Return true if this is +0.0.
4738 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
4739 return CFP->getValueAPF().isPosZero();
4740 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4741 // Maybe this has already been legalized into the constant pool?
4742 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4743 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4744 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
4745 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4746 return CFP->getValueAPF().isPosZero();
4747 }
4748 } else if (Op->getOpcode() == ISD::BITCAST &&
4749 Op->getValueType(0) == MVT::f64) {
4750 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4751 // created by LowerConstantFP().
4752 SDValue BitcastOp = Op->getOperand(0);
4753 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4754 isNullConstant(BitcastOp->getOperand(0)))
4755 return true;
4756 }
4757 return false;
4758}
4759
4760 /// Returns the appropriate ARM CMP (cmp) and corresponding condition code for
4761/// the given operands.
4762SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4763 SDValue &ARMcc, SelectionDAG &DAG,
4764 const SDLoc &dl) const {
4765 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4766 unsigned C = RHSC->getZExtValue();
4767 if (!isLegalICmpImmediate((int32_t)C)) {
4768 // Constant does not fit, try adjusting it by one.
4769 switch (CC) {
4770 default: break;
4771 case ISD::SETLT:
4772 case ISD::SETGE:
4773 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4775 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4776 }
4777 break;
4778 case ISD::SETULT:
4779 case ISD::SETUGE:
4780 if (C != 0 && isLegalICmpImmediate(C-1)) {
4782 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4783 }
4784 break;
4785 case ISD::SETLE:
4786 case ISD::SETGT:
4787 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4789 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4790 }
4791 break;
4792 case ISD::SETULE:
4793 case ISD::SETUGT:
4794 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4796 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4797 }
4798 break;
4799 }
4800 }
4801 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4803 // In ARM and Thumb-2, the compare instructions can shift their second
4804 // operand.
4806 std::swap(LHS, RHS);
4807 }
4808
4809 // Thumb1 has very limited immediate modes, so turning an "and" into a
4810 // shift can save multiple instructions.
4811 //
4812 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4813 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4814 // own. If it's the operand to an unsigned comparison with an immediate,
4815 // we can eliminate one of the shifts: we transform
4816 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4817 //
4818 // We avoid transforming cases which aren't profitable due to encoding
4819 // details:
4820 //
4821 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4822 // would not; in that case, we're essentially trading one immediate load for
4823 // another.
4824 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4825 // 3. C2 is zero; we have other code for this special case.
4826 //
4827 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4828 // instruction, since the AND is always one instruction anyway, but we could
4829 // use narrow instructions in some cases.
4830 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4831 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4832 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4833 !isSignedIntSetCC(CC)) {
4834 unsigned Mask = LHS.getConstantOperandVal(1);
4835 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4836 uint64_t RHSV = RHSC->getZExtValue();
4837 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4838 unsigned ShiftBits = llvm::countl_zero(Mask);
4839 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4840 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4841 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4842 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4843 }
4844 }
4845 }
4846
4847 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4848 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4849 // way a cmp would.
4850 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4851 // some tweaks to the heuristics for the previous and->shift transform.
4852 // FIXME: Optimize cases where the LHS isn't a shift.
4853 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4854 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4855 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4856 LHS.getConstantOperandVal(1) < 31) {
4857 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4858 SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
4859 DAG.getVTList(MVT::i32, MVT::i32),
4860 LHS.getOperand(0),
4861 DAG.getConstant(ShiftAmt, dl, MVT::i32));
4862 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
4863 Shift.getValue(1), SDValue());
4864 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4865 return Chain.getValue(1);
4866 }
4867
4869
4870 // If the RHS is a constant zero then the V (overflow) flag will never be
4871 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4872 // simpler for other passes (like the peephole optimiser) to deal with.
4873 if (isNullConstant(RHS)) {
4874 switch (CondCode) {
4875 default: break;
4876 case ARMCC::GE:
4878 break;
4879 case ARMCC::LT:
4881 break;
4882 }
4883 }
4884
4885 ARMISD::NodeType CompareType;
4886 switch (CondCode) {
4887 default:
4888 CompareType = ARMISD::CMP;
4889 break;
4890 case ARMCC::EQ:
4891 case ARMCC::NE:
4892 // Uses only Z Flag
4893 CompareType = ARMISD::CMPZ;
4894 break;
4895 }
4896 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4897 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
4898}
4899
4900 /// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4901SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4902 SelectionDAG &DAG, const SDLoc &dl,
4903 bool Signaling) const {
4904 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4905 SDValue Cmp;
4906 if (!isFloatingPointZero(RHS))
4907 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP,
4908 dl, MVT::Glue, LHS, RHS);
4909 else
4910 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0,
4911 dl, MVT::Glue, LHS);
4912 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
4913}
4914
4915/// duplicateCmp - Glue values can have only one use, so this function
4916/// duplicates a comparison node.
4917SDValue
4918ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
4919 unsigned Opc = Cmp.getOpcode();
4920 SDLoc DL(Cmp);
4921 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
4922 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
4923
4924 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
4925 Cmp = Cmp.getOperand(0);
4926 Opc = Cmp.getOpcode();
4927 if (Opc == ARMISD::CMPFP)
4928 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
4929 else {
4930 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
4931 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
4932 }
4933 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
4934}
4935
4936// This function returns three things: the arithmetic computation itself
4937// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4938// comparison and the condition code define the case in which the arithmetic
4939// computation *does not* overflow.
4940std::pair<SDValue, SDValue>
4941ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4942 SDValue &ARMcc) const {
4943 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4944
4945 SDValue Value, OverflowCmp;
4946 SDValue LHS = Op.getOperand(0);
4947 SDValue RHS = Op.getOperand(1);
4948 SDLoc dl(Op);
4949
4950 // FIXME: We are currently always generating CMPs because we don't support
4951 // generating CMN through the backend. This is not as good as the natural
4952 // CMP case because it causes a register dependency and cannot be folded
4953 // later.
4954
4955 switch (Op.getOpcode()) {
4956 default:
4957 llvm_unreachable("Unknown overflow instruction!");
4958 case ISD::SADDO:
4959 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4960 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4961 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4962 break;
4963 case ISD::UADDO:
4964 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4965 // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4966 // We do not use it in the USUBO case as Value may not be used.
4967 Value = DAG.getNode(ARMISD::ADDC, dl,
4968 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4969 .getValue(0);
4970 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4971 break;
4972 case ISD::SSUBO:
4973 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4974 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4975 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4976 break;
4977 case ISD::USUBO:
4978 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4979 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4980 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4981 break;
4982 case ISD::UMULO:
4983 // We generate a UMUL_LOHI and then check if the high word is 0.
4984 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4985 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4986 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4987 LHS, RHS);
4988 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
4989 DAG.getConstant(0, dl, MVT::i32));
4990 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4991 break;
4992 case ISD::SMULO:
4993 // We generate a SMUL_LOHI and then check if all the bits of the high word
4994 // are the same as the sign bit of the low word.
4995 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4996 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4997 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4998 LHS, RHS);
4999 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
5000 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
5001 Value.getValue(0),
5002 DAG.getConstant(31, dl, MVT::i32)));
5003 Value = Value.getValue(0); // We only want the low 32 bits for the result.
5004 break;
5005 } // switch (...)
5006
5007 return std::make_pair(Value, OverflowCmp);
5008}
5009
5010SDValue
5011ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
5012 // Let legalize expand this if it isn't a legal type yet.
5013 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
5014 return SDValue();
5015
5016 SDValue Value, OverflowCmp;
5017 SDValue ARMcc;
5018 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
5019 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5020 SDLoc dl(Op);
5021 // We use 0 and 1 as false and true values.
5022 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
5023 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
5024 EVT VT = Op.getValueType();
5025
5026 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
5027 ARMcc, CCR, OverflowCmp);
5028
5029 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
5030 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
5031}
5032
5034 SelectionDAG &DAG) {
5035 SDLoc DL(BoolCarry);
5036 EVT CarryVT = BoolCarry.getValueType();
5037
5038 // This converts the boolean value carry into the carry flag by doing
5039 // ARMISD::SUBC Carry, 1
5040 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
5041 DAG.getVTList(CarryVT, MVT::i32),
5042 BoolCarry, DAG.getConstant(1, DL, CarryVT));
5043 return Carry.getValue(1);
5044}
5045
5047 SelectionDAG &DAG) {
5048 SDLoc DL(Flags);
5049
5050 // Now convert the carry flag into a boolean carry. We do this
5051 // using ARMISD::ADDE 0, 0, Carry.
5052 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
5053 DAG.getConstant(0, DL, MVT::i32),
5054 DAG.getConstant(0, DL, MVT::i32), Flags);
5055}
5056
5057SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
5058 SelectionDAG &DAG) const {
5059 // Let legalize expand this if it isn't a legal type yet.
5060 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
5061 return SDValue();
5062
5063 SDValue LHS = Op.getOperand(0);
5064 SDValue RHS = Op.getOperand(1);
5065 SDLoc dl(Op);
5066
5067 EVT VT = Op.getValueType();
5068 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
5069 SDValue Value;
5070 SDValue Overflow;
5071 switch (Op.getOpcode()) {
5072 default:
5073 llvm_unreachable("Unknown overflow instruction!");
5074 case ISD::UADDO:
5075 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
5076 // Convert the carry flag into a boolean value.
5077 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
5078 break;
5079 case ISD::USUBO: {
5080 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
5081 // Convert the carry flag into a boolean value.
5082 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
5083 // ARMISD::SUBC returns 0 when we have to borrow, so compute 1 - C to
5084 // turn it into an overflow value.
5085 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
5086 DAG.getConstant(1, dl, MVT::i32), Overflow);
5087 break;
5088 }
5089 }
5090
5091 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
5092}
5093
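// Lower i8/i16 saturating add/sub using the ARM DSP saturating-arithmetic
// nodes (ARMISD::QADD8b, UQADD16b, etc.): extend the operands to i32, apply
// the saturating node, and truncate the result back to the original type.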
5095 const ARMSubtarget *Subtarget) {
5096 EVT VT = Op.getValueType();
5097 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
5098 return SDValue();
5099 if (!VT.isSimple())
5100 return SDValue();
5101
5102 unsigned NewOpcode;
5103 switch (VT.getSimpleVT().SimpleTy) {
5104 default:
5105 return SDValue();
5106 case MVT::i8:
5107 switch (Op->getOpcode()) {
5108 case ISD::UADDSAT:
5109 NewOpcode = ARMISD::UQADD8b;
5110 break;
5111 case ISD::SADDSAT:
5112 NewOpcode = ARMISD::QADD8b;
5113 break;
5114 case ISD::USUBSAT:
5115 NewOpcode = ARMISD::UQSUB8b;
5116 break;
5117 case ISD::SSUBSAT:
5118 NewOpcode = ARMISD::QSUB8b;
5119 break;
5120 }
5121 break;
5122 case MVT::i16:
5123 switch (Op->getOpcode()) {
5124 case ISD::UADDSAT:
5125 NewOpcode = ARMISD::UQADD16b;
5126 break;
5127 case ISD::SADDSAT:
5128 NewOpcode = ARMISD::QADD16b;
5129 break;
5130 case ISD::USUBSAT:
5131 NewOpcode = ARMISD::UQSUB16b;
5132 break;
5133 case ISD::SSUBSAT:
5134 NewOpcode = ARMISD::QSUB16b;
5135 break;
5136 }
5137 break;
5138 }
5139
5140 SDLoc dl(Op);
5141 SDValue Add =
5142 DAG.getNode(NewOpcode, dl, MVT::i32,
5143 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
5144 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
5145 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
5146}
5147
5148SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
5149 SDValue Cond = Op.getOperand(0);
5150 SDValue SelectTrue = Op.getOperand(1);
5151 SDValue SelectFalse = Op.getOperand(2);
5152 SDLoc dl(Op);
5153 unsigned Opc = Cond.getOpcode();
5154
5155 if (Cond.getResNo() == 1 &&
5156 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5157 Opc == ISD::USUBO)) {
5158 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
5159 return SDValue();
5160
5161 SDValue Value, OverflowCmp;
5162 SDValue ARMcc;
5163 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5164 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5165 EVT VT = Op.getValueType();
5166
5167 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
5168 OverflowCmp, DAG);
5169 }
5170
5171 // Convert:
5172 //
5173 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
5174 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
5175 //
5176 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
5177 const ConstantSDNode *CMOVTrue =
5178 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
5179 const ConstantSDNode *CMOVFalse =
5180 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
5181
5182 if (CMOVTrue && CMOVFalse) {
5183 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
5184 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
5185
5186 SDValue True;
5187 SDValue False;
5188 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
5189 True = SelectTrue;
5190 False = SelectFalse;
5191 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
5192 True = SelectFalse;
5193 False = SelectTrue;
5194 }
5195
5196 if (True.getNode() && False.getNode()) {
5197 EVT VT = Op.getValueType();
5198 SDValue ARMcc = Cond.getOperand(2);
5199 SDValue CCR = Cond.getOperand(3);
5200 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
5201 assert(True.getValueType() == VT);
5202 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
5203 }
5204 }
5205 }
5206
5207 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
5208 // undefined bits before doing a full-word comparison with zero.
5209 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
5210 DAG.getConstant(1, dl, Cond.getValueType()));
5211
5212 return DAG.getSelectCC(dl, Cond,
5213 DAG.getConstant(0, dl, Cond.getValueType()),
5214 SelectTrue, SelectFalse, ISD::SETNE);
5215}
5216
5218 bool &swpCmpOps, bool &swpVselOps) {
5219 // Start by selecting the GE condition code for opcodes that return true for
5220 // 'equality'
5221 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
5222 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
5223 CondCode = ARMCC::GE;
5224
5225 // and GT for opcodes that return false for 'equality'.
5226 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
5227 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
5228 CondCode = ARMCC::GT;
5229
5230 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
5231 // to swap the compare operands.
5232 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
5233 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
5234 swpCmpOps = true;
5235
5236 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
5237 // If we have an unordered opcode, we need to swap the operands to the VSEL
5238 // instruction (effectively negating the condition).
5239 //
5240 // This also has the effect of swapping which one of 'less' or 'greater'
5241 // returns true, so we also swap the compare operands. It also switches
5242 // whether we return true for 'equality', so we compensate by picking the
5243 // opposite condition code to our original choice.
5244 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
5245 CC == ISD::SETUGT) {
5246 swpCmpOps = !swpCmpOps;
5247 swpVselOps = !swpVselOps;
5248 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
5249 }
5250
5251 // 'ordered' is 'anything but unordered', so use the VS condition code and
5252 // swap the VSEL operands.
5253 if (CC == ISD::SETO) {
5254 CondCode = ARMCC::VS;
5255 swpVselOps = true;
5256 }
5257
5258 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
5259 // code and swap the VSEL operands. Also do this if we don't care about the
5260 // unordered case.
5261 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
5262 CondCode = ARMCC::EQ;
5263 swpVselOps = true;
5264 }
5265}
5266
5267SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
5268 SDValue TrueVal, SDValue ARMcc, SDValue CCR,
5269 SDValue Cmp, SelectionDAG &DAG) const {
5270 if (!Subtarget->hasFP64() && VT == MVT::f64) {
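    // Without double-precision VFP, materialize the f64 select as two i32
    // halves: CMOV each half separately (duplicating the compare for the
    // second use of the glue) and reassemble the result with VMOVDRR.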
5272 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
5274 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
5275
5276 SDValue TrueLow = TrueVal.getValue(0);
5277 SDValue TrueHigh = TrueVal.getValue(1);
5278 SDValue FalseLow = FalseVal.getValue(0);
5279 SDValue FalseHigh = FalseVal.getValue(1);
5280
5281 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
5282 ARMcc, CCR, Cmp);
5283 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
5284 ARMcc, CCR, duplicateCmp(Cmp, DAG));
5285
5286 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
5287 } else {
5288 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
5289 Cmp);
5290 }
5291}
5292
5294 return CC == ISD::SETGT || CC == ISD::SETGE;
5295}
5296
5298 return CC == ISD::SETLT || CC == ISD::SETLE;
5299}
5300
5301// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5302// All of these conditions (and their <= and >= counterparts) will do:
5303// x < k ? k : x
5304// x > k ? x : k
5305// k < x ? x : k
5306// k > x ? k : x
5307static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5308 const SDValue TrueVal, const SDValue FalseVal,
5309 const ISD::CondCode CC, const SDValue K) {
5310 return (isGTorGE(CC) &&
5311 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5312 (isLTorLE(CC) &&
5313 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5314}
5315
5316// Check if two chained conditionals could be converted into SSAT or USAT.
5317//
5318// SSAT can replace a set of two conditional selectors that bound a number to an
5319 // interval of the form [~k, k] when k + 1 is a power of 2. Here are some examples:
5320//
5321// x < -k ? -k : (x > k ? k : x)
5322// x < -k ? -k : (x < k ? x : k)
5323// x > -k ? (x > k ? k : x) : -k
5324// x < k ? (x < -k ? -k : x) : k
5325// etc.
5326//
5327// LLVM canonicalizes these to either a min(max()) or a max(min())
5328 // pattern. This function tries to match one of these and will return an SSAT
5329// node if successful.
5330//
5331 // USAT works similarly to SSAT, but bounds the value to the interval [0, k] where k + 1
5332// is a power of 2.
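// For example, with k = 127 the two selects clamp x to [-128, 127] and can be
// replaced by a single signed saturate of x to 8 bits.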
5334 EVT VT = Op.getValueType();
5335 SDValue V1 = Op.getOperand(0);
5336 SDValue K1 = Op.getOperand(1);
5337 SDValue TrueVal1 = Op.getOperand(2);
5338 SDValue FalseVal1 = Op.getOperand(3);
5339 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5340
5341 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
5342 if (Op2.getOpcode() != ISD::SELECT_CC)
5343 return SDValue();
5344
5345 SDValue V2 = Op2.getOperand(0);
5346 SDValue K2 = Op2.getOperand(1);
5347 SDValue TrueVal2 = Op2.getOperand(2);
5348 SDValue FalseVal2 = Op2.getOperand(3);
5349 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
5350
5351 SDValue V1Tmp = V1;
5352 SDValue V2Tmp = V2;
5353
5354 // Check that the registers and the constants match a max(min()) or min(max())
5355 // pattern
5356 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5357 K2 != FalseVal2 ||
5358 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5359 return SDValue();
5360
5361 // Check that the constant in the lower-bound check is
5362 // the opposite of the constant in the upper-bound check
5363 // in 1's complement.
5364 if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
5365 return SDValue();
5366
5367 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5368 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5369 int64_t PosVal = std::max(Val1, Val2);
5370 int64_t NegVal = std::min(Val1, Val2);
5371
5372 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5373 !isPowerOf2_64(PosVal + 1))
5374 return SDValue();
5375
5376 // Handle the difference between USAT (unsigned) and SSAT (signed)
5377 // saturation
5378 // At this point, PosVal is guaranteed to be positive
5379 uint64_t K = PosVal;
5380 SDLoc dl(Op);
5381 if (Val1 == ~Val2)
5382 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5383 DAG.getConstant(llvm::countr_one(K), dl, VT));
5384 if (NegVal == 0)
5385 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5386 DAG.getConstant(llvm::countr_one(K), dl, VT));
5387
5388 return SDValue();
5389}
5390
5391// Check if a condition of the type x < k ? k : x can be converted into a
5392// bit operation instead of conditional moves.
5393// Currently this is allowed given:
5394// - The conditions and values match up
5395// - k is 0 or -1 (all ones)
5396 // This function will not check the last condition, that's up to the caller.
5397// It returns true if the transformation can be made, and in such case
5398// returns x in V, and k in SatK.
5399 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
5400 SDValue &SatK)
5401 {
5402 SDValue LHS = Op.getOperand(0);
5403 SDValue RHS = Op.getOperand(1);
5404 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5405 SDValue TrueVal = Op.getOperand(2);
5406 SDValue FalseVal = Op.getOperand(3);
5407
5408 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
5409 ? &RHS
5410 : nullptr;
5411
5412 // No constant operation in comparison, early out
5413 if (!K)
5414 return false;
5415
5416 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5417 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5418 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5419
5420 // If the constant in the comparison does not match the constant being
5421 // selected, or the compared variable does not match the selected one, early out.
5422 if (*K != KTmp || V != VTmp)
5423 return false;
5424
5425 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5426 SatK = *K;
5427 return true;
5428 }
5429
5430 return false;
5431}
5432
5433bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5434 if (VT == MVT::f32)
5435 return !Subtarget->hasVFP2Base();
5436 if (VT == MVT::f64)
5437 return !Subtarget->hasFP64();
5438 if (VT == MVT::f16)
5439 return !Subtarget->hasFullFP16();
5440 return false;
5441}
5442
5443SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5444 EVT VT = Op.getValueType();
5445 SDLoc dl(Op);
5446
5447 // Try to convert two saturating conditional selects into a single SSAT
5448 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5449 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5450 return SatValue;
5451
5452 // Try to convert expressions of the form x < k ? k : x (and similar forms)
5453 // into more efficient bit operations, which is possible when k is 0 or -1
5454 // On ARM and Thumb-2, which have a flexible second operand, this will result
5455 // in a single instruction. On Thumb the shift and the bit operation will be
5456 // two instructions.
5457 // Only allow this transformation on full-width (32-bit) operations
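// A minimal sketch of the intended folds for a 32-bit x (assuming the
// conditions and values match as described above):
//   x < 0 ? 0 : x    -->  x & ~(x >> 31)   (arithmetic shift)
//   x > -1 ? x : -1  -->  x | (x >> 31)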
5458 SDValue LowerSatConstant;
5459 SDValue SatValue;
5460 if (VT == MVT::i32 &&
5461 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
5462 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5463 DAG.getConstant(31, dl, VT));
5464 if (isNullConstant(LowerSatConstant)) {
5465 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5466 DAG.getAllOnesConstant(dl, VT));
5467 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5468 } else if (isAllOnesConstant(LowerSatConstant))
5469 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5470 }
5471
5472 SDValue LHS = Op.getOperand(0);
5473 SDValue RHS = Op.getOperand(1);
5474 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5475 SDValue TrueVal = Op.getOperand(2);
5476 SDValue FalseVal = Op.getOperand(3);
5477 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5478 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5479
5480 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
5481 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
5482 unsigned TVal = CTVal->getZExtValue();
5483 unsigned FVal = CFVal->getZExtValue();
5484 unsigned Opcode = 0;
5485
5486 if (TVal == ~FVal) {
5487 Opcode = ARMISD::CSINV;
5488 } else if (TVal == ~FVal + 1) {
5489 Opcode = ARMISD::CSNEG;
5490 } else if (TVal + 1 == FVal) {
5491 Opcode = ARMISD::CSINC;
5492 } else if (TVal == FVal + 1) {
5493 Opcode = ARMISD::CSINC;
5494 std::swap(TrueVal, FalseVal);
5495 std::swap(TVal, FVal);
5496 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5497 }
5498
5499 if (Opcode) {
5500 // If one of the constants is cheaper than another, materialise the
5501 // cheaper one and let the csel generate the other.
5502 if (Opcode != ARMISD::CSINC &&
5503 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5504 std::swap(TrueVal, FalseVal);
5505 std::swap(TVal, FVal);
5506 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5507 }
5508
5509 // Attempt to use ZR checking TVal is 0, possibly inverting the condition
5510 // to get there. CSINC is not invertible like the other two (~(~a) == a,
5511 // -(-a) == a, but (a+1)+1 != a).
5512 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5513 std::swap(TrueVal, FalseVal);
5514 std::swap(TVal, FVal);
5515 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5516 }
5517
5518 // Drops F's value because we can get it by inverting/negating TVal.
5519 FalseVal = TrueVal;
5520
5521 SDValue ARMcc;
5522 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5523 EVT VT = TrueVal.getValueType();
5524 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5525 }
5526 }
5527
5528 if (isUnsupportedFloatingType(LHS.getValueType())) {
5529 DAG.getTargetLoweringInfo().softenSetCCOperands(
5530 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5531
5532 // If softenSetCCOperands only returned one value, we should compare it to
5533 // zero.
5534 if (!RHS.getNode()) {
5535 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5536 CC = ISD::SETNE;
5537 }
5538 }
5539
5540 if (LHS.getValueType() == MVT::i32) {
5541 // Try to generate VSEL on ARMv8.
5542 // The VSEL instruction can't use all the usual ARM condition
5543 // codes: it only has two bits to select the condition code, so it's
5544 // constrained to use only GE, GT, VS and EQ.
5545 //
5546 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5547 // swap the operands of the previous compare instruction (effectively
5548 // inverting the compare condition, swapping 'less' and 'greater') and
5549 // sometimes need to swap the operands to the VSEL (which inverts the
5550 // condition in the sense of firing whenever the previous condition didn't)
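// For instance (an informal sketch): select(a setlt b, x, y) can be rewritten
// as select(a setge b, y, x); GE is one of the codes VSEL does support, so
// only the compare condition and the select operands need to change.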
5551 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5552 TrueVal.getValueType() == MVT::f32 ||
5553 TrueVal.getValueType() == MVT::f64)) {
5554 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5555 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5556 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5557 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5558 std::swap(TrueVal, FalseVal);
5559 }
5560 }
5561
5562 SDValue ARMcc;
5563 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5564 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5565 // Choose GE over PL, which vsel does not support
5566 if (ARMcc->getAsZExtVal() == ARMCC::PL)
5567 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5568 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
5569 }
5570
5571 ARMCC::CondCodes CondCode, CondCode2;
5572 FPCCToARMCC(CC, CondCode, CondCode2);
5573
5574 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5575 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5576 // must use VSEL (limited condition codes), due to not having conditional f16
5577 // moves.
5578 if (Subtarget->hasFPARMv8Base() &&
5579 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5580 (TrueVal.getValueType() == MVT::f16 ||
5581 TrueVal.getValueType() == MVT::f32 ||
5582 TrueVal.getValueType() == MVT::f64)) {
5583 bool swpCmpOps = false;
5584 bool swpVselOps = false;
5585 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5586
5587 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5588 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5589 if (swpCmpOps)
5590 std::swap(LHS, RHS);
5591 if (swpVselOps)
5592 std::swap(TrueVal, FalseVal);
5593 }
5594 }
5595
5596 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5597 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5598 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5599 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
5600 if (CondCode2 != ARMCC::AL) {
5601 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5602 // FIXME: Needs another CMP because flag can have but one use.
5603 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
5604 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
5605 }
5606 return Result;
5607}
5608
5609/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5610/// to morph to an integer compare sequence.
5611static bool canChangeToInt(SDValue Op, bool &SeenZero,
5612 const ARMSubtarget *Subtarget) {
5613 SDNode *N = Op.getNode();
5614 if (!N->hasOneUse())
5615 // Otherwise it requires moving the value from fp to integer registers.
5616 return false;
5617 if (!N->getNumValues())
5618 return false;
5619 EVT VT = Op.getValueType();
5620 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5621 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5622 // vmrs are very slow, e.g. cortex-a8.
5623 return false;
5624
5625 if (isFloatingPointZero(Op)) {
5626 SeenZero = true;
5627 return true;
5628 }
5629 return ISD::isNormalLoad(N);
5630}
5631
5632 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
5633 if (isFloatingPointZero(Op))
5634 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5635
5636 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
5637 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5638 Ld->getPointerInfo(), Ld->getAlign(),
5639 Ld->getMemOperand()->getFlags());
5640
5641 llvm_unreachable("Unknown VFP cmp argument!");
5642}
5643
5644 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
5645 SDValue &RetVal1, SDValue &RetVal2) {
5646 SDLoc dl(Op);
5647
5648 if (isFloatingPointZero(Op)) {
5649 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5650 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5651 return;
5652 }
5653
5654 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5655 SDValue Ptr = Ld->getBasePtr();
5656 RetVal1 =
5657 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5658 Ld->getAlign(), Ld->getMemOperand()->getFlags());
5659
5660 EVT PtrType = Ptr.getValueType();
5661 SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5662 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5663 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5664 Ld->getPointerInfo().getWithOffset(4),
5665 commonAlignment(Ld->getAlign(), 4),
5666 Ld->getMemOperand()->getFlags());
5667 return;
5668 }
5669
5670 llvm_unreachable("Unknown VFP cmp argument!");
5671}
5672
5673/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
5674/// f32 and even f64 comparisons to integer ones.
5675SDValue
5676ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5677 SDValue Chain = Op.getOperand(0);
5678 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5679 SDValue LHS = Op.getOperand(2);
5680 SDValue RHS = Op.getOperand(3);
5681 SDValue Dest = Op.getOperand(4);
5682 SDLoc dl(Op);
5683
5684 bool LHSSeenZero = false;
5685 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5686 bool RHSSeenZero = false;
5687 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5688 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5689 // If unsafe fp math optimization is enabled and there are no other uses of
5690 // the CMP operands, and the condition code is EQ or NE, we can optimize it
5691 // to an integer comparison.
5692 if (CC == ISD::SETOEQ)
5693 CC = ISD::SETEQ;
5694 else if (CC == ISD::SETUNE)
5695 CC = ISD::SETNE;
5696
5697 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5698 SDValue ARMcc;
5699 if (LHS.getValueType() == MVT::f32) {
5700 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5701 bitcastf32Toi32(LHS, DAG), Mask);
5702 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5703 bitcastf32Toi32(RHS, DAG), Mask);
5704 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5705 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5706 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
5707 Chain, Dest, ARMcc, CCR, Cmp);
5708 }
5709
5710 SDValue LHS1, LHS2;
5711 SDValue RHS1, RHS2;
5712 expandf64Toi32(LHS, DAG, LHS1, LHS2);
5713 expandf64Toi32(RHS, DAG, RHS1, RHS2);
5714 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5715 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5716 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5717 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5718 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
5719 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5720 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
5721 }
5722
5723 return SDValue();
5724}
5725
5726SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5727 SDValue Chain = Op.getOperand(0);
5728 SDValue Cond = Op.getOperand(1);
5729 SDValue Dest = Op.getOperand(2);
5730 SDLoc dl(Op);
5731
5732 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5733 // instruction.
5734 unsigned Opc = Cond.getOpcode();
5735 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5736 !Subtarget->isThumb1Only();
5737 if (Cond.getResNo() == 1 &&
5738 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5739 Opc == ISD::USUBO || OptimizeMul)) {
5740 // Only lower legal XALUO ops.
5741 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
5742 return SDValue();
5743
5744 // The actual operation with overflow check.
5745 SDValue Value, OverflowCmp;
5746 SDValue ARMcc;
5747 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5748
5749 // Reverse the condition code.
5750 ARMCC::CondCodes CondCode =
5751 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5752 CondCode = ARMCC::getOppositeCondition(CondCode);
5753 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5754 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5755
5756 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
5757 OverflowCmp);
5758 }
5759
5760 return SDValue();
5761}
5762
5763SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5764 SDValue Chain = Op.getOperand(0);
5765 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5766 SDValue LHS = Op.getOperand(2);
5767 SDValue RHS = Op.getOperand(3);
5768 SDValue Dest = Op.getOperand(4);
5769 SDLoc dl(Op);
5770
5771 if (isUnsupportedFloatingType(LHS.getValueType())) {
5772 DAG.getTargetLoweringInfo().softenSetCCOperands(
5773 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5774
5775 // If softenSetCCOperands only returned one value, we should compare it to
5776 // zero.
5777 if (!RHS.getNode()) {
5778 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5779 CC = ISD::SETNE;
5780 }
5781 }
5782
5783 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5784 // instruction.
5785 unsigned Opc = LHS.getOpcode();
5786 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5787 !Subtarget->isThumb1Only();
5788 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5789 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5790 Opc == ISD::USUBO || OptimizeMul) &&
5791 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5792 // Only lower legal XALUO ops.
5793 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
5794 return SDValue();
5795
5796 // The actual operation with overflow check.
5797 SDValue Value, OverflowCmp;
5798 SDValue ARMcc;
5799 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5800
5801 if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5802 // Reverse the condition code.
5803 ARMCC::CondCodes CondCode =
5804 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5805 CondCode = ARMCC::getOppositeCondition(CondCode);
5806 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5807 }
5808 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5809
5810 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
5811 OverflowCmp);
5812 }
5813
5814 if (LHS.getValueType() == MVT::i32) {
5815 SDValue ARMcc;
5816 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5817 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5818 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
5819 Chain, Dest, ARMcc, CCR, Cmp);
5820 }
5821
5822 if (getTargetMachine().Options.UnsafeFPMath &&
5823 (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
5824 CC == ISD::SETNE || CC == ISD::SETUNE)) {
5825 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5826 return Result;
5827 }
5828
5829 ARMCC::CondCodes CondCode, CondCode2;
5830 FPCCToARMCC(CC, CondCode, CondCode2);
5831
5832 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5833 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5834 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5835 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
5836 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
5837 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
5838 if (CondCode2 != ARMCC::AL) {
5839 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5840 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
5841 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
5842 }
5843 return Res;
5844}
5845
5846SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5847 SDValue Chain = Op.getOperand(0);
5848 SDValue Table = Op.getOperand(1);
5849 SDValue Index = Op.getOperand(2);
5850 SDLoc dl(Op);
5851
5852 EVT PTy = getPointerTy(DAG.getDataLayout());
5853 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5854 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5855 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5856 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5857 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5858 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5859 // Thumb2 and ARMv8-M use a two-level jump. That is, they jump into the jump
5860 // table, which does another jump to the destination. This also makes it
5861 // easier to translate it to TBB / TBH later (Thumb2 only).
5862 // FIXME: This might not work if the function is extremely large.
5863 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5864 Addr, Op.getOperand(2), JTI);
5865 }
5866 if (isPositionIndependent() || Subtarget->isROPI()) {
5867 Addr =
5868 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5869 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5870 Chain = Addr.getValue(1);
5871 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5872 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5873 } else {
5874 Addr =
5875 DAG.getLoad(PTy, dl, Chain, Addr,
5876 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5877 Chain = Addr.getValue(1);
5878 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5879 }
5880}
5881
5882 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
5883 EVT VT = Op.getValueType();
5884 SDLoc dl(Op);
5885
5886 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5887 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5888 return Op;
5889 return DAG.UnrollVectorOp(Op.getNode());
5890 }
5891
5892 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5893
5894 EVT NewTy;
5895 const EVT OpTy = Op.getOperand(0).getValueType();
5896 if (OpTy == MVT::v4f32)
5897 NewTy = MVT::v4i32;
5898 else if (OpTy == MVT::v4f16 && HasFullFP16)
5899 NewTy = MVT::v4i16;
5900 else if (OpTy == MVT::v8f16 && HasFullFP16)
5901 NewTy = MVT::v8i16;
5902 else
5903 llvm_unreachable("Invalid type for custom lowering!");
5904
5905 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5906 return DAG.UnrollVectorOp(Op.getNode());
5907
5908 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5909 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5910}
5911
5912SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5913 EVT VT = Op.getValueType();
5914 if (VT.isVector())
5915 return LowerVectorFP_TO_INT(Op, DAG);
5916
5917 bool IsStrict = Op->isStrictFPOpcode();
5918 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5919
5920 if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5921 RTLIB::Libcall LC;
5922 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5923 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5924 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5925 Op.getValueType());
5926 else
5927 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5928 Op.getValueType());
5929 SDLoc Loc(Op);
5930 MakeLibCallOptions CallOptions;
5931 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
5932 SDValue Result;
5933 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5934 CallOptions, Loc, Chain);
5935 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5936 }
5937
5938 // FIXME: Remove this when we have strict fp instruction selection patterns
5939 if (IsStrict) {
5940 SDLoc Loc(Op);
5941 SDValue Result =
5942 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
5943 : ISD::FP_TO_UINT,
5944 Loc, Op.getValueType(), SrcVal);
5945 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5946 }
5947
5948 return Op;
5949}
5950
5951 static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
5952 const ARMSubtarget *Subtarget) {
5953 EVT VT = Op.getValueType();
5954 EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5955 EVT FromVT = Op.getOperand(0).getValueType();
5956
5957 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
5958 return Op;
5959 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
5960 Subtarget->hasFP64())
5961 return Op;
5962 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
5963 Subtarget->hasFullFP16())
5964 return Op;
5965 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
5966 Subtarget->hasMVEFloatOps())
5967 return Op;
5968 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
5969 Subtarget->hasMVEFloatOps())
5970 return Op;
5971
5972 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
5973 return SDValue();
5974
5975 SDLoc DL(Op);
5976 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
5977 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
5978 SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
5979 DAG.getValueType(VT.getScalarType()));
5980 SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
5981 DAG.getConstant((1 << BW) - 1, DL, VT));
5982 if (IsSigned)
5983 Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
5984 DAG.getConstant(-(1 << BW), DL, VT));
5985 return Max;
5986}
5987
5988 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5989 EVT VT = Op.getValueType();
5990 SDLoc dl(Op);
5991
5992 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5993 if (VT.getVectorElementType() == MVT::f32)
5994 return Op;
5995 return DAG.UnrollVectorOp(Op.getNode());
5996 }
5997
5998 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5999 Op.getOperand(0).getValueType() == MVT::v8i16) &&
6000 "Invalid type for custom lowering!");
6001
6002 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
6003
6004 EVT DestVecType;
6005 if (VT == MVT::v4f32)
6006 DestVecType = MVT::v4i32;
6007 else if (VT == MVT::v4f16 && HasFullFP16)
6008 DestVecType = MVT::v4i16;
6009 else if (VT == MVT::v8f16 && HasFullFP16)
6010 DestVecType = MVT::v8i16;
6011 else
6012 return DAG.UnrollVectorOp(Op.getNode());
6013
6014 unsigned CastOpc;
6015 unsigned Opc;
6016 switch (Op.getOpcode()) {
6017 default: llvm_unreachable("Invalid opcode!");
6018 case ISD::SINT_TO_FP:
6019 CastOpc = ISD::SIGN_EXTEND;
6020 Opc = ISD::SINT_TO_FP;
6021 break;
6022 case ISD::UINT_TO_FP:
6023 CastOpc = ISD::ZERO_EXTEND;
6024 Opc = ISD::UINT_TO_FP;
6025 break;
6026 }
6027
6028 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
6029 return DAG.getNode(Opc, dl, VT, Op);
6030}
6031
6032SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
6033 EVT VT = Op.getValueType();
6034 if (VT.isVector())
6035 return LowerVectorINT_TO_FP(Op, DAG);
6036 if (isUnsupportedFloatingType(VT)) {
6037 RTLIB::Libcall LC;
6038 if (Op.getOpcode() == ISD::SINT_TO_FP)
6039 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
6040 Op.getValueType());
6041 else
6042 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
6043 Op.getValueType());
6044 MakeLibCallOptions CallOptions;
6045 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
6046 CallOptions, SDLoc(Op)).first;
6047 }
6048
6049 return Op;
6050}
6051
6052SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
6053 // Implement fcopysign with a fabs and a conditional fneg.
6054 SDValue Tmp0 = Op.getOperand(0);
6055 SDValue Tmp1 = Op.getOperand(1);
6056 SDLoc dl(Op);
6057 EVT VT = Op.getValueType();
6058 EVT SrcVT = Tmp1.getValueType();
6059 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
6060 Tmp0.getOpcode() == ARMISD::VMOVDRR;
6061 bool UseNEON = !InGPR && Subtarget->hasNEON();
6062
6063 if (UseNEON) {
6064 // Use VBSL to copy the sign bit.
6065 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
6066 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
6067 DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
6068 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
6069 if (VT == MVT::f64)
6070 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
6071 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
6072 DAG.getConstant(32, dl, MVT::i32));
6073 else /*if (VT == MVT::f32)*/
6074 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
6075 if (SrcVT == MVT::f32) {
6076 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
6077 if (VT == MVT::f64)
6078 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
6079 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
6080 DAG.getConstant(32, dl, MVT::i32));
6081 } else if (VT == MVT::f32)
6082 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
6083 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
6084 DAG.getConstant(32, dl, MVT::i32));
6085 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
6086 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
6087
6088 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
6089 dl, MVT::i32);
6090 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
6091 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
6092 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
6093
6094 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
6095 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
6096 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
6097 if (VT == MVT::f32) {
6098 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
6099 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
6100 DAG.getConstant(0, dl, MVT::i32));
6101 } else {
6102 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
6103 }
6104
6105 return Res;
6106 }
6107
6108 // Bitcast operand 1 to i32.
6109 if (SrcVT == MVT::f64)
6110 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6111 Tmp1).getValue(1);
6112 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
6113
6114 // Or in the signbit with integer operations.
6115 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
6116 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
6117 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
6118 if (VT == MVT::f32) {
6119 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
6120 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
6121 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
6122 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
6123 }
6124
6125 // f64: Or the high part with signbit and then combine two parts.
6126 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6127 Tmp0);
6128 SDValue Lo = Tmp0.getValue(0);
6129 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
6130 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
6131 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
6132}
6133
6134SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
6135 MachineFunction &MF = DAG.getMachineFunction();
6136 MachineFrameInfo &MFI = MF.getFrameInfo();
6137 MFI.setReturnAddressIsTaken(true);
6138
6139 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
6140 return SDValue();
6141
6142 EVT VT = Op.getValueType();
6143 SDLoc dl(Op);
6144 unsigned Depth = Op.getConstantOperandVal(0);
6145 if (Depth) {
6146 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6147 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
6148 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
6149 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
6150 MachinePointerInfo());
6151 }
6152
6153 // Return LR, which contains the return address. Mark it an implicit live-in.
6154 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
6155 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
6156}
6157
6158SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
6159 const ARMBaseRegisterInfo &ARI =
6160 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
6161 MachineFunction &MF = DAG.getMachineFunction();
6162 MachineFrameInfo &MFI = MF.getFrameInfo();
6163 MFI.setFrameAddressIsTaken(true);
6164
6165 EVT VT = Op.getValueType();
6166 SDLoc dl(Op); // FIXME probably not meaningful
6167 unsigned Depth = Op.getConstantOperandVal(0);
6168 Register FrameReg = ARI.getFrameRegister(MF);
6169 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
6170 while (Depth--)
6171 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
6172 MachinePointerInfo());
6173 return FrameAddr;
6174}
6175
6176// FIXME? Maybe this could be a TableGen attribute on some registers and
6177// this table could be generated automatically from RegInfo.
6178Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
6179 const MachineFunction &MF) const {
6180 Register Reg = StringSwitch<unsigned>(RegName)
6181 .Case("sp", ARM::SP)
6182 .Default(0);
6183 if (Reg)
6184 return Reg;
6185 report_fatal_error(Twine("Invalid register name \""
6186 + StringRef(RegName) + "\"."));
6187}
6188
6189// Result is 64 bit value so split into two 32 bit values and return as a
6190// pair of values.
6191 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
6192 SelectionDAG &DAG) {
6193 SDLoc DL(N);
6194
6195 // This function is only supposed to be called for i64 type destination.
6196 assert(N->getValueType(0) == MVT::i64
6197 && "ExpandREAD_REGISTER called for non-i64 type result.");
6198
6199 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
6200 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
6201 N->getOperand(0),
6202 N->getOperand(1));
6203
6204 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
6205 Read.getValue(1)));
6206 Results.push_back(Read.getOperand(0));
6207}
6208
6209/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
6210/// When \p DstVT, the destination type of \p BC, is on the vector
6211/// register bank and the source of bitcast, \p Op, operates on the same bank,
6212/// it might be possible to combine them, such that everything stays on the
6213/// vector register bank.
6214 /// \returns The node that would replace \p BC, if the combine
6215 /// is possible.
6216 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
6217 SelectionDAG &DAG) {
6218 SDValue Op = BC->getOperand(0);
6219 EVT DstVT = BC->getValueType(0);
6220
6221 // The only vector instruction that can produce a scalar (remember,
6222 // since the bitcast was about to be turned into VMOVDRR, the source
6223 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
6224 // Moreover, we can do this combine only if there is one use.
6225 // Finally, if the destination type is not a vector, there is not
6226 // much point on forcing everything on the vector bank.
6227 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6228 !Op.hasOneUse())
6229 return SDValue();
6230
6231 // If the index is not constant, we will introduce an additional
6232 // multiply that will stick.
6233 // Give up in that case.
6234 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6235 if (!Index)
6236 return SDValue();
6237 unsigned DstNumElt = DstVT.getVectorNumElements();
6238
6239 // Compute the new index.
6240 const APInt &APIntIndex = Index->getAPIntValue();
6241 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
6242 NewIndex *= APIntIndex;
6243 // Check if the new constant index fits into i32.
6244 if (NewIndex.getBitWidth() > 32)
6245 return SDValue();
6246
6247 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
6248 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
6249 SDLoc dl(Op);
6250 SDValue ExtractSrc = Op.getOperand(0);
6251 EVT VecVT = EVT::getVectorVT(
6252 *DAG.getContext(), DstVT.getScalarType(),
6253 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
6254 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
6255 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
6256 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
6257}
6258
6259/// ExpandBITCAST - If the target supports VFP, this function is called to
6260/// expand a bit convert where either the source or destination type is i64 to
6261/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
6262/// operand type is illegal (e.g., v2f32 for a target that doesn't support
6263/// vectors), since the legalizer won't know what to do with that.
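/// For example (a rough sketch of the intent): an i64-to-f64 bitcast becomes
/// VMOVDRR(lo32, hi32) built from the split scalar, while an f64-to-i64
/// bitcast becomes a BUILD_PAIR of the two i32 results of VMOVRRD.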
6264SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
6265 const ARMSubtarget *Subtarget) const {
6266 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6267 SDLoc dl(N);
6268 SDValue Op = N->getOperand(0);
6269
6270 // This function is only supposed to be called for i16 and i64 types, either
6271 // as the source or destination of the bit convert.
6272 EVT SrcVT = Op.getValueType();
6273 EVT DstVT = N->getValueType(0);
6274
6275 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
6276 (DstVT == MVT::f16 || DstVT == MVT::bf16))
6277 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
6278 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
6279
6280 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
6281 (SrcVT == MVT::f16 || SrcVT == MVT::bf16))
6282 return DAG.getNode(
6283 ISD::TRUNCATE, SDLoc(N), DstVT,
6284 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
6285
6286 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
6287 return SDValue();
6288
6289 // Turn i64->f64 into VMOVDRR.
6290 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
6291 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
6292 // if we can combine the bitcast with its source.
6293 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
6294 return Val;
6295 SDValue Lo, Hi;
6296 std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
6297 return DAG.getNode(ISD::BITCAST, dl, DstVT,
6298 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
6299 }
6300
6301 // Turn f64->i64 into VMOVRRD.
6302 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
6303 SDValue Cvt;
6304 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
6305 SrcVT.getVectorNumElements() > 1)
6306 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6307 DAG.getVTList(MVT::i32, MVT::i32),
6308 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6309 else
6310 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6311 DAG.getVTList(MVT::i32, MVT::i32), Op);
6312 // Merge the pieces into a single i64 value.
6313 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6314 }
6315
6316 return SDValue();
6317}
6318
6319/// getZeroVector - Returns a vector of specified type with all zero elements.
6320/// Zero vectors are used to represent vector negation and in those cases
6321/// will be implemented with the NEON VNEG instruction. However, VNEG does
6322/// not support i64 elements, so sometimes the zero vectors will need to be
6323/// explicitly constructed. Regardless, use a canonical VMOV to create the
6324/// zero vector.
6325static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6326 assert(VT.isVector() && "Expected a vector type");
6327 // The canonical modified immediate encoding of a zero vector is....0!
6328 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6329 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6330 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6331 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6332}
6333
6334 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
6335 /// i32 values and takes a 2 x i32 value to shift plus a shift amount.
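/// A sketch of the decomposition for a 64-bit logical shift right by Amt,
/// with the input split into (Lo, Hi):
///   Lo = Amt < 32 ? (Lo >> Amt) | (Hi << (32 - Amt)) : Hi >> (Amt - 32)
///   Hi = Amt < 32 ? Hi >> Amt : 0
/// For SRA_PARTS the big-shift cases use an arithmetic shift and a sign fill
/// (Hi >> 31) instead.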
6336SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6337 SelectionDAG &DAG) const {
6338 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6339 EVT VT = Op.getValueType();
6340 unsigned VTBits = VT.getSizeInBits();
6341 SDLoc dl(Op);
6342 SDValue ShOpLo = Op.getOperand(0);
6343 SDValue ShOpHi = Op.getOperand(1);
6344 SDValue ShAmt = Op.getOperand(2);
6345 SDValue ARMcc;
6346 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6347 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6348
6349 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6350
6351 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6352 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6353 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6354 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6355 DAG.getConstant(VTBits, dl, MVT::i32));
6356 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6357 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6358 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6359 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6360 ISD::SETGE, ARMcc, DAG, dl);
6361 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
6362 ARMcc, CCR, CmpLo);
6363
6364 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6365 SDValue HiBigShift = Opc == ISD::SRA
6366 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6367 DAG.getConstant(VTBits - 1, dl, VT))
6368 : DAG.getConstant(0, dl, VT);
6369 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6370 ISD::SETGE, ARMcc, DAG, dl);
6371 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
6372 ARMcc, CCR, CmpHi);
6373
6374 SDValue Ops[2] = { Lo, Hi };
6375 return DAG.getMergeValues(Ops, dl);
6376}
6377
6378 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6379 /// i32 values and takes a 2 x i32 value to shift plus a shift amount.
6380SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6381 SelectionDAG &DAG) const {
6382 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6383 EVT VT = Op.getValueType();
6384 unsigned VTBits = VT.getSizeInBits();
6385 SDLoc dl(Op);
6386 SDValue ShOpLo = Op.getOperand(0);
6387 SDValue ShOpHi = Op.getOperand(1);
6388 SDValue ShAmt = Op.getOperand(2);
6389 SDValue ARMcc;
6390 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6391
6392 assert(Op.getOpcode() == ISD::SHL_PARTS);
6393 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6394 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6395 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6396 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6397 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6398
6399 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6400 DAG.getConstant(VTBits, dl, MVT::i32));
6401 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6402 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6403 ISD::SETGE, ARMcc, DAG, dl);
6404 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
6405 ARMcc, CCR, CmpHi);
6406
6407 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6408 ISD::SETGE, ARMcc, DAG, dl);
6409 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6410 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6411 DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);
6412
6413 SDValue Ops[2] = { Lo, Hi };
6414 return DAG.getMergeValues(Ops, dl);
6415}
6416
6417SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6418 SelectionDAG &DAG) const {
6419 // The rounding mode is in bits 23:22 of the FPSCR.
6420 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
6421 // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3,
6422 // so that the shift and the AND get folded into a bitfield extract.
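// Worked example (informal): if FPSCR[23:22] = 0b11 (round toward zero), then
// adding 1 << 22 carries out of the field, ((FPSCR + (1 << 22)) >> 22) & 3
// yields 0, and FLT_ROUNDS value 0 indeed means "toward zero".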
6423 SDLoc dl(Op);
6424 SDValue Chain = Op.getOperand(0);
6425 SDValue Ops[] = {Chain,
6426 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6427
6428 SDValue FPSCR =
6429 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6430 Chain = FPSCR.getValue(1);
6431 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6432 DAG.getConstant(1U << 22, dl, MVT::i32));
6433 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6434 DAG.getConstant(22, dl, MVT::i32));
6435 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6436 DAG.getConstant(3, dl, MVT::i32));
6437 return DAG.getMergeValues({And, Chain}, dl);
6438}
6439
6440SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6441 SelectionDAG &DAG) const {
6442 SDLoc DL(Op);
6443 SDValue Chain = Op->getOperand(0);
6444 SDValue RMValue = Op->getOperand(1);
6445
6446 // The rounding mode is in bits 23:22 of the FPSCR.
6447 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6448 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6449 // ((arg - 1) & 3) << 22.
6450 //
6451 // It is expected that the argument of llvm.set.rounding is within the
6452 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is the
6453 // responsibility of the code that generates llvm.set.rounding to ensure this
6454 // condition.
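// Worked example (informal): llvm.set.rounding(0), i.e. round toward zero,
// gives (0 - 1) & 3 = 3, which is the ARM RZ encoding, and that value is then
// shifted into FPSCR[23:22].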
6455
6456 // Calculate new value of FPSCR[23:22].
6457 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6458 DAG.getConstant(1, DL, MVT::i32));
6459 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6460 DAG.getConstant(0x3, DL, MVT::i32));
6461 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6462 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6463
6464 // Get current value of FPSCR.
6465 SDValue Ops[] = {Chain,
6466 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6467 SDValue FPSCR =
6468 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6469 Chain = FPSCR.getValue(1);
6470 FPSCR = FPSCR.getValue(0);
6471
6472 // Put new rounding mode into FPSCR[23:22].
6473 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6474 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6475 DAG.getConstant(RMMask, DL, MVT::i32));
6476 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6477 SDValue Ops2[] = {
6478 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6479 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6480}
6481
6482SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6483 SelectionDAG &DAG) const {
6484 SDLoc DL(Op);
6485 SDValue Chain = Op->getOperand(0);
6486 SDValue Mode = Op->getOperand(1);
6487
6488 // Generate nodes to build:
6489 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6490 SDValue Ops[] = {Chain,
6491 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6492 SDValue FPSCR =
6493 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6494 Chain = FPSCR.getValue(1);
6495 FPSCR = FPSCR.getValue(0);
6496
6497 SDValue FPSCRMasked =
6498 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6499 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6500 SDValue InputMasked =
6501 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6502 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6503 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6504
6505 SDValue Ops2[] = {
6506 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6507 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6508}
6509
6510SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
6511 SelectionDAG &DAG) const {
6512 SDLoc DL(Op);
6513 SDValue Chain = Op->getOperand(0);
6514
6515 // To get the default FP mode all control bits are cleared:
6516 // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
6517 SDValue Ops[] = {Chain,
6518 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6519 SDValue FPSCR =
6520 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6521 Chain = FPSCR.getValue(1);
6522 FPSCR = FPSCR.getValue(0);
6523
6524 SDValue FPSCRMasked = DAG.getNode(
6525 ISD::AND, DL, MVT::i32, FPSCR,
6526 DAG.getConstant(ARM::FPStatusBits | ARM::FPReservedBits, DL, MVT::i32));
6527 SDValue Ops2[] = {Chain,
6528 DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
6529 FPSCRMasked};
6530 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6531}
6532
6533 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
6534 const ARMSubtarget *ST) {
6535 SDLoc dl(N);
6536 EVT VT = N->getValueType(0);
6537 if (VT.isVector() && ST->hasNEON()) {
6538
6539 // Compute the least significant set bit: LSB = X & -X
6540 SDValue X = N->getOperand(0);
6541 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6542 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
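// Informal example: for x = 0b0110100, -x = 0b...1001100 in two's complement,
// so x & -x = 0b0000100, a mask of just the least significant set bit.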
6543
6544 EVT ElemTy = VT.getVectorElementType();
6545
6546 if (ElemTy == MVT::i8) {
6547 // Compute with: cttz(x) = ctpop(lsb - 1)
6548 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6549 DAG.getTargetConstant(1, dl, ElemTy));
6550 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6551 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6552 }
6553
6554 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6555 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6556 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
6557 unsigned NumBits = ElemTy.getSizeInBits();
6558 SDValue WidthMinus1 =
6559 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6560 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6561 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6562 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6563 }
6564
6565 // Compute with: cttz(x) = ctpop(lsb - 1)
6566
6567 // Compute LSB - 1.
6568 SDValue Bits;
6569 if (ElemTy == MVT::i64) {
6570 // Load constant 0xffff'ffff'ffff'ffff to register.
6571 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6572 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6573 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6574 } else {
6575 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6576 DAG.getTargetConstant(1, dl, ElemTy));
6577 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6578 }
6579 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6580 }
6581
6582 if (!ST->hasV6T2Ops())
6583 return SDValue();
6584
6585 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6586 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6587}
6588
6589 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
6590 const ARMSubtarget *ST) {
6591 EVT VT = N->getValueType(0);
6592 SDLoc DL(N);
6593
6594 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6595 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6596 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6597 "Unexpected type for custom ctpop lowering");
6598
6599 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6600 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6601 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6602 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6603
6604 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
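// For example (a sketch): for VT = v4i32 the v16i8 per-byte counts are
// pairwise-added into v8i16 and then again into v4i32, so each i32 lane ends
// up holding the popcount of its four source bytes.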
6605 unsigned EltSize = 8;
6606 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6607 while (EltSize != VT.getScalarSizeInBits()) {
6608 SmallVector<SDValue, 8> Ops;
6609 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6610 TLI.getPointerTy(DAG.getDataLayout())));
6611 Ops.push_back(Res);
6612
6613 EltSize *= 2;
6614 NumElts /= 2;
6615 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6616 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6617 }
6618
6619 return Res;
6620}
6621
6622 /// getVShiftImm - Check if this is a valid build_vector for the immediate
6623/// operand of a vector shift operation, where all the elements of the
6624/// build_vector must have the same constant integer value.
6625static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6626 // Ignore bit_converts.
6627 while (Op.getOpcode() == ISD::BITCAST)
6628 Op = Op.getOperand(0);
6629 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
6630 APInt SplatBits, SplatUndef;
6631 unsigned SplatBitSize;
6632 bool HasAnyUndefs;
6633 if (!BVN ||
6634 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6635 ElementBits) ||
6636 SplatBitSize > ElementBits)
6637 return false;
6638 Cnt = SplatBits.getSExtValue();
6639 return true;
6640}
6641
6642/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6643/// operand of a vector shift left operation. That value must be in the range:
6644/// 0 <= Value < ElementBits for a left shift; or
6645/// 0 <= Value <= ElementBits for a long left shift.
6646static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6647 assert(VT.isVector() && "vector shift count is not a vector type");
6648 int64_t ElementBits = VT.getScalarSizeInBits();
6649 if (!getVShiftImm(Op, ElementBits, Cnt))
6650 return false;
6651 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6652}
6653
6654/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6655/// operand of a vector shift right operation. For a shift opcode, the value
6656/// is positive, but for an intrinsic the value count must be negative. The
6657/// absolute value must be in the range:
6658/// 1 <= |Value| <= ElementBits for a right shift; or
6659/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6660static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6661 int64_t &Cnt) {
6662 assert(VT.isVector() && "vector shift count is not a vector type");
6663 int64_t ElementBits = VT.getScalarSizeInBits();
6664 if (!getVShiftImm(Op, ElementBits, Cnt))
6665 return false;
6666 if (!isIntrinsic)
6667 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6668 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6669 Cnt = -Cnt;
6670 return true;
6671 }
6672 return false;
6673}
6674
6675 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
6676 const ARMSubtarget *ST) {
6677 EVT VT = N->getValueType(0);
6678 SDLoc dl(N);
6679 int64_t Cnt;
6680
6681 if (!VT.isVector())
6682 return SDValue();
6683
6684 // We essentially have two forms here. Shift by an immediate and shift by a
6685 // vector register (there are also shift by a gpr, but that is just handled
6686 // with a tablegen pattern). We cannot easily match shift by an immediate in
6687 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6688 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6689 // signed or unsigned, and a negative shift indicates a shift right).
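// For example (informal): a v4i32 'x srl amt' with a non-constant vector
// amount becomes VSHLu(x, 0 - amt), i.e. a left shift by the negated amount,
// which the hardware treats as a right shift.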
6690 if (N->getOpcode() == ISD::SHL) {
6691 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6692 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6693 DAG.getConstant(Cnt, dl, MVT::i32));
6694 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6695 N->getOperand(1));
6696 }
6697
6698 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6699 "unexpected vector shift opcode");
6700
6701 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6702 unsigned VShiftOpc =
6703 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6704 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6705 DAG.getConstant(Cnt, dl, MVT::i32));
6706 }
6707
6708 // Other right shifts we don't have operations for (we use a shift left by a
6709 // negative number).
6710 EVT ShiftVT = N->getOperand(1).getValueType();
6711 SDValue NegatedCount = DAG.getNode(
6712 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6713 unsigned VShiftOpc =
6714 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6715 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6716}
6717
6718 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
6719 const ARMSubtarget *ST) {
6720 EVT VT = N->getValueType(0);
6721 SDLoc dl(N);
6722
6723 // We can get here for a node like i32 = ISD::SHL i32, i64
6724 if (VT != MVT::i64)
6725 return SDValue();
6726
6727 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6728 N->getOpcode() == ISD::SHL) &&
6729 "Unknown shift to lower!");
6730
6731 unsigned ShOpc = N->getOpcode();
6732 if (ST->hasMVEIntegerOps()) {
6733 SDValue ShAmt = N->getOperand(1);
6734 unsigned ShPartsOpc = ARMISD::LSLL;
6735 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
6736
6737 // If the shift amount is zero, is 32 or more, or has a bitwidth greater than
6738 // 64, then do the default optimisation.
6739 if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
6740 (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
6741 return SDValue();
6742
6743 // Extract the lower 32 bits of the shift amount if it's not an i32
6744 if (ShAmt->getValueType(0) != MVT::i32)
6745 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6746
6747 if (ShOpc == ISD::SRL) {
6748 if (!Con)
6749 // There is no t2LSRLr instruction so negate and perform an lsll if the
6750 // shift amount is in a register, emulating a right shift.
6751 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6752 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6753 else
6754 // Else generate an lsrl on the immediate shift amount
6755 ShPartsOpc = ARMISD::LSRL;
6756 } else if (ShOpc == ISD::SRA)
6757 ShPartsOpc = ARMISD::ASRL;
6758
6759 // Split Lower/Upper 32 bits of the destination/source
6760 SDValue Lo, Hi;
6761 std::tie(Lo, Hi) =
6762 DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6763 // Generate the shift operation as computed above
6764 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6765 ShAmt);
6766 // The upper 32 bits come from the second return value of lsll
6767 Hi = SDValue(Lo.getNode(), 1);
6768 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6769 }
6770
6771 // We only lower SRA, SRL of 1 here, all others use generic lowering.
6772 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6773 return SDValue();
6774
6775 // If we are in thumb mode, we don't have RRX.
6776 if (ST->isThumb1Only())
6777 return SDValue();
6778
6779 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
6780 SDValue Lo, Hi;
6781 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6782
6783 // First, build a SRA_GLUE/SRL_GLUE op, which shifts the top part by one and
6784 // captures the result into a carry flag.
6785 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_GLUE:ARMISD::SRA_GLUE;
6786 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
6787
6788 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6789 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6790
6791 // Merge the pieces into a single i64 value.
6792 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6793}
6794
6795 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
6796 const ARMSubtarget *ST) {
6797 bool Invert = false;
6798 bool Swap = false;
6799 unsigned Opc = ARMCC::AL;
6800
6801 SDValue Op0 = Op.getOperand(0);
6802 SDValue Op1 = Op.getOperand(1);
6803 SDValue CC = Op.getOperand(2);
6804 EVT VT = Op.getValueType();
6805 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6806 SDLoc dl(Op);
6807
6808 EVT CmpVT;
6809 if (ST->hasNEON())
6810 CmpVT = VT.changeVectorElementTypeToInteger();
6811 else {
6812 assert(ST->hasMVEIntegerOps() &&
6813 "No hardware support for integer vector comparison!");
6814
6815 if (Op.getValueType().getVectorElementType() != MVT::i1)
6816 return SDValue();
6817
6818 // Make sure we expand floating point setcc to scalar if we do not have
6819 // mve.fp, so that we can handle them from there.
6820 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6821 return SDValue();
6822
6823 CmpVT = VT;
6824 }
6825
6826 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6827 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6828 // Special-case integer 64-bit equality comparisons. They aren't legal,
6829 // but they can be lowered with a few vector instructions.
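    // For illustration: with v2i64 operands the comparison is done as v4i32.
    // Each i64 lane is equal iff both of its i32 halves compare equal, so the
    // v4i32 SETEQ mask is ANDed with its VREV64.32 (which swaps the two i32
    // results within each 64-bit lane), leaving a lane all-ones only when both
    // halves matched.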
6830 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6831 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6832 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6833 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6834 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6835 DAG.getCondCode(ISD::SETEQ));
6836 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6837 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6838 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6839 if (SetCCOpcode == ISD::SETNE)
6840 Merged = DAG.getNOT(dl, Merged, CmpVT);
6841 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6842 return Merged;
6843 }
6844
6845 if (CmpVT.getVectorElementType() == MVT::i64)
6846 // 64-bit comparisons are not legal in general.
6847 return SDValue();
6848
6849 if (Op1.getValueType().isFloatingPoint()) {
6850 switch (SetCCOpcode) {
6851 default: llvm_unreachable("Illegal FP comparison");
6852 case ISD::SETUNE:
6853 case ISD::SETNE:
6854 if (ST->hasMVEFloatOps()) {
6855 Opc = ARMCC::NE; break;
6856 } else {
6857 Invert = true; [[fallthrough]];
6858 }
6859 case ISD::SETOEQ:
6860 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6861 case ISD::SETOLT:
6862 case ISD::SETLT: Swap = true; [[fallthrough]];
6863 case ISD::SETOGT:
6864 case ISD::SETGT: Opc = ARMCC::GT; break;
6865 case ISD::SETOLE:
6866 case ISD::SETLE: Swap = true; [[fallthrough]];
6867 case ISD::SETOGE:
6868 case ISD::SETGE: Opc = ARMCC::GE; break;
6869 case ISD::SETUGE: Swap = true; [[fallthrough]];
6870 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6871 case ISD::SETUGT: Swap = true; [[fallthrough]];
6872 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6873 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6874 case ISD::SETONE: {
6875 // Expand this to (OLT | OGT).
6876 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6877 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6878 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6879 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6880 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6881 if (Invert)
6882 Result = DAG.getNOT(dl, Result, VT);
6883 return Result;
6884 }
6885 case ISD::SETUO: Invert = true; [[fallthrough]];
6886 case ISD::SETO: {
6887 // Expand this to (OLT | OGE).
6888 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6889 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6890 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6891 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6892 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6893 if (Invert)
6894 Result = DAG.getNOT(dl, Result, VT);
6895 return Result;
6896 }
6897 }
6898 } else {
6899 // Integer comparisons.
6900 switch (SetCCOpcode) {
6901 default: llvm_unreachable("Illegal integer comparison");
6902 case ISD::SETNE:
6903 if (ST->hasMVEIntegerOps()) {
6904 Opc = ARMCC::NE; break;
6905 } else {
6906 Invert = true; [[fallthrough]];
6907 }
6908 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6909 case ISD::SETLT: Swap = true; [[fallthrough]];
6910 case ISD::SETGT: Opc = ARMCC::GT; break;
6911 case ISD::SETLE: Swap = true; [[fallthrough]];
6912 case ISD::SETGE: Opc = ARMCC::GE; break;
6913 case ISD::SETULT: Swap = true; [[fallthrough]];
6914 case ISD::SETUGT: Opc = ARMCC::HI; break;
6915 case ISD::SETULE: Swap = true; [[fallthrough]];
6916 case ISD::SETUGE: Opc = ARMCC::HS; break;
6917 }
6918
6919 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6920 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6921 SDValue AndOp;
6922 if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6923 AndOp = Op0;
6924 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6925 AndOp = Op1;
6926
6927 // Ignore bitconvert.
6928 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6929 AndOp = AndOp.getOperand(0);
6930
6931 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6932 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6933 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6934 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6935 if (!Invert)
6936 Result = DAG.getNOT(dl, Result, VT);
6937 return Result;
6938 }
6939 }
6940 }
6941
6942 if (Swap)
6943 std::swap(Op0, Op1);
6944
6945 // If one of the operands is a constant vector zero, attempt to fold the
6946 // comparison to a specialized compare-against-zero form.
6947 if (ISD::isBuildVectorAllZeros(Op0.getNode()) &&
6948 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6949 Opc == ARMCC::NE)) {
6950 if (Opc == ARMCC::GE)
6951 Opc = ARMCC::LE;
6952 else if (Opc == ARMCC::GT)
6953 Opc = ARMCC::LT;
6954 std::swap(Op0, Op1);
6955 }
6956
6957 SDValue Result;
6958 if (ISD::isBuildVectorAllZeros(Op1.getNode()) &&
6959 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6960 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6961 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6962 DAG.getConstant(Opc, dl, MVT::i32));
6963 else
6964 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6965 DAG.getConstant(Opc, dl, MVT::i32));
6966
6967 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6968
6969 if (Invert)
6970 Result = DAG.getNOT(dl, Result, VT);
6971
6972 return Result;
6973}
6974
6975 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
6976 SDValue LHS = Op.getOperand(0);
6977 SDValue RHS = Op.getOperand(1);
6978 SDValue Carry = Op.getOperand(2);
6979 SDValue Cond = Op.getOperand(3);
6980 SDLoc DL(Op);
6981
6982 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6983
6984 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6985 // have to invert the carry first.
6986 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6987 DAG.getConstant(1, DL, MVT::i32), Carry);
6988 // This converts the boolean value carry into the carry flag.
6989 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6990
6991 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6992 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6993
6994 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6995 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6996 SDValue ARMcc = DAG.getConstant(
6997 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6998 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6999 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
7000 Cmp.getValue(1), SDValue());
7001 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
7002 CCR, Chain.getValue(1));
7003}
7004
7005/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
7006/// valid vector constant for a NEON or MVE instruction with a "modified
7007/// immediate" operand (e.g., VMOV). If so, return the encoded value.
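// For illustration: a v8i16 splat of 0x00ab hits the 16-bit "0x00nn" case
// below (OpCmode = 0x8, Imm = 0xab), while a splat of 0xab00 hits the
// "0xnn00" case (OpCmode = 0xa, Imm = 0xab); the pair is then packed by
// ARM_AM::createVMOVModImm into the target constant the VMOV-style
// instructions expect.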
7008static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
7009 unsigned SplatBitSize, SelectionDAG &DAG,
7010 const SDLoc &dl, EVT &VT, EVT VectorVT,
7011 VMOVModImmType type) {
7012 unsigned OpCmode, Imm;
7013 bool is128Bits = VectorVT.is128BitVector();
7014
7015 // SplatBitSize is set to the smallest size that splats the vector, so a
7016 // zero vector will always have SplatBitSize == 8. However, NEON modified
7017 // immediate instructions other than VMOV do not support the 8-bit encoding
7018 // of a zero vector, and the default encoding of zero is supposed to be the
7019 // 32-bit version.
7020 if (SplatBits == 0)
7021 SplatBitSize = 32;
7022
7023 switch (SplatBitSize) {
7024 case 8:
7025 if (type != VMOVModImm)
7026 return SDValue();
7027 // Any 1-byte value is OK. Op=0, Cmode=1110.
7028 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
7029 OpCmode = 0xe;
7030 Imm = SplatBits;
7031 VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
7032 break;
7033
7034 case 16:
7035 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
7036 VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
7037 if ((SplatBits & ~0xff) == 0) {
7038 // Value = 0x00nn: Op=x, Cmode=100x.
7039 OpCmode = 0x8;
7040 Imm = SplatBits;
7041 break;
7042 }
7043 if ((SplatBits & ~0xff00) == 0) {
7044 // Value = 0xnn00: Op=x, Cmode=101x.
7045 OpCmode = 0xa;
7046 Imm = SplatBits >> 8;
7047 break;
7048 }
7049 return SDValue();
7050
7051 case 32:
7052 // NEON's 32-bit VMOV supports splat values where:
7053 // * only one byte is nonzero, or
7054 // * the least significant byte is 0xff and the second byte is nonzero, or
7055 // * the least significant 2 bytes are 0xff and the third is nonzero.
7056 VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
7057 if ((SplatBits & ~0xff) == 0) {
7058 // Value = 0x000000nn: Op=x, Cmode=000x.
7059 OpCmode = 0;
7060 Imm = SplatBits;
7061 break;
7062 }
7063 if ((SplatBits & ~0xff00) == 0) {
7064 // Value = 0x0000nn00: Op=x, Cmode=001x.
7065 OpCmode = 0x2;
7066 Imm = SplatBits >> 8;
7067 break;
7068 }
7069 if ((SplatBits & ~0xff0000) == 0) {
7070 // Value = 0x00nn0000: Op=x, Cmode=010x.
7071 OpCmode = 0x4;
7072 Imm = SplatBits >> 16;
7073 break;
7074 }
7075 if ((SplatBits & ~0xff000000) == 0) {
7076 // Value = 0xnn000000: Op=x, Cmode=011x.
7077 OpCmode = 0x6;
7078 Imm = SplatBits >> 24;
7079 break;
7080 }
7081
7082 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
7083 if (type == OtherModImm) return SDValue();
7084
7085 if ((SplatBits & ~0xffff) == 0 &&
7086 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
7087 // Value = 0x0000nnff: Op=x, Cmode=1100.
7088 OpCmode = 0xc;
7089 Imm = SplatBits >> 8;
7090 break;
7091 }
7092
7093 // cmode == 0b1101 is not supported for MVE VMVN
7094 if (type == MVEVMVNModImm)
7095 return SDValue();
7096
7097 if ((SplatBits & ~0xffffff) == 0 &&
7098 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
7099 // Value = 0x00nnffff: Op=x, Cmode=1101.
7100 OpCmode = 0xd;
7101 Imm = SplatBits >> 16;
7102 break;
7103 }
7104
7105 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
7106 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
7107 // VMOV.I32. A (very) minor optimization would be to replicate the value
7108 // and fall through here to test for a valid 64-bit splat. But, then the
7109 // caller would also need to check and handle the change in size.
7110 return SDValue();
7111
7112 case 64: {
7113 if (type != VMOVModImm)
7114 return SDValue();
7115 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
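    // For illustration: a splat of 0x00ff00ff00ff00ff sets the immediate bit
    // for bytes 0, 2, 4 and 6, giving Imm = 0b01010101 = 0x55.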
7116 uint64_t BitMask = 0xff;
7117 unsigned ImmMask = 1;
7118 Imm = 0;
7119 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
7120 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
7121 Imm |= ImmMask;
7122 } else if ((SplatBits & BitMask) != 0) {
7123 return SDValue();
7124 }
7125 BitMask <<= 8;
7126 ImmMask <<= 1;
7127 }
7128
7129 if (DAG.getDataLayout().isBigEndian()) {
7130 // Reverse the order of elements within the vector.
7131 unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;
7132 unsigned Mask = (1 << BytesPerElem) - 1;
7133 unsigned NumElems = 8 / BytesPerElem;
7134 unsigned NewImm = 0;
7135 for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
7136 unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);
7137 NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;
7138 }
7139 Imm = NewImm;
7140 }
7141
7142 // Op=1, Cmode=1110.
7143 OpCmode = 0x1e;
7144 VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
7145 break;
7146 }
7147
7148 default:
7149 llvm_unreachable("unexpected size for isVMOVModifiedImm");
7150 }
7151
7152 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
7153 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
7154}
7155
7156SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
7157 const ARMSubtarget *ST) const {
7158 EVT VT = Op.getValueType();
7159 bool IsDouble = (VT == MVT::f64);
7160 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
7161 const APFloat &FPVal = CFP->getValueAPF();
7162
7163 // Prevent floating-point constants from using literal loads
7164 // when execute-only is enabled.
7165 if (ST->genExecuteOnly()) {
7166 // We shouldn't trigger this for v6m execute-only
7167 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
7168 "Unexpected architecture");
7169
7170 // If we can represent the constant as an immediate, don't lower it
7171 if (isFPImmLegal(FPVal, VT))
7172 return Op;
7173 // Otherwise, construct as integer, and move to float register
7174 APInt INTVal = FPVal.bitcastToAPInt();
7175 SDLoc DL(CFP);
7176 switch (VT.getSimpleVT().SimpleTy) {
7177 default:
7178 llvm_unreachable("Unknown floating point type!");
7179 break;
7180 case MVT::f64: {
7181 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
7182 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
7183 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
7184 }
7185 case MVT::f32:
7186 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
7187 DAG.getConstant(INTVal, DL, MVT::i32));
7188 }
7189 }
7190
7191 if (!ST->hasVFP3Base())
7192 return SDValue();
7193
7194 // Use the default (constant pool) lowering for double constants when we have
7195 // an SP-only FPU
7196 if (IsDouble && !Subtarget->hasFP64())
7197 return SDValue();
7198
7199 // Try splatting with a VMOV.f32...
7200 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
7201
7202 if (ImmVal != -1) {
7203 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
7204 // We have code in place to select a valid ConstantFP already, no need to
7205 // do any mangling.
7206 return Op;
7207 }
7208
7209 // It's a float and we are trying to use NEON operations where
7210 // possible. Lower it to a splat followed by an extract.
7211 SDLoc DL(Op);
7212 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
7213 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
7214 NewVal);
7215 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
7216 DAG.getConstant(0, DL, MVT::i32));
7217 }
7218
7219 // The rest of our options are NEON only, make sure that's allowed before
7220 // proceeding..
7221 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
7222 return SDValue();
7223
7224 EVT VMovVT;
7225 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
7226
7227 // It wouldn't really be worth bothering for doubles except for one very
7228 // important value, which does happen to match: 0.0. So make sure we don't do
7229 // anything stupid.
7230 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
7231 return SDValue();
7232
7233 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
7234 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
7235 VMovVT, VT, VMOVModImm);
7236 if (NewVal != SDValue()) {
7237 SDLoc DL(Op);
7238 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
7239 NewVal);
7240 if (IsDouble)
7241 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7242
7243 // It's a float: cast and extract a vector element.
7244 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7245 VecConstant);
7246 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7247 DAG.getConstant(0, DL, MVT::i32));
7248 }
7249
7250 // Finally, try a VMVN.i32
7251 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
7252 VT, VMVNModImm);
7253 if (NewVal != SDValue()) {
7254 SDLoc DL(Op);
7255 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
7256
7257 if (IsDouble)
7258 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7259
7260 // It's a float: cast and extract a vector element.
7261 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7262 VecConstant);
7263 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7264 DAG.getConstant(0, DL, MVT::i32));
7265 }
7266
7267 return SDValue();
7268}
7269
7270 // Check whether a VEXT instruction can handle the shuffle mask when the
7271// vector sources of the shuffle are the same.
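// For illustration: for a single-source v8i8 shuffle the mask
// <3, 4, 5, 6, 7, 0, 1, 2> is accepted with Imm = 3, i.e. a rotation of the
// vector by three bytes that VEXT can perform with both operands set to the
// same register.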
7272static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7273 unsigned NumElts = VT.getVectorNumElements();
7274
7275 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7276 if (M[0] < 0)
7277 return false;
7278
7279 Imm = M[0];
7280
7281 // If this is a VEXT shuffle, the immediate value is the index of the first
7282 // element. The other shuffle indices must be the successive elements after
7283 // the first one.
7284 unsigned ExpectedElt = Imm;
7285 for (unsigned i = 1; i < NumElts; ++i) {
7286 // Increment the expected index. If it wraps around, just follow it
7287 // back to index zero and keep going.
7288 ++ExpectedElt;
7289 if (ExpectedElt == NumElts)
7290 ExpectedElt = 0;
7291
7292 if (M[i] < 0) continue; // ignore UNDEF indices
7293 if (ExpectedElt != static_cast<unsigned>(M[i]))
7294 return false;
7295 }
7296
7297 return true;
7298}
7299
7300static bool isVEXTMask(ArrayRef<int> M, EVT VT,
7301 bool &ReverseVEXT, unsigned &Imm) {
7302 unsigned NumElts = VT.getVectorNumElements();
7303 ReverseVEXT = false;
7304
7305 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7306 if (M[0] < 0)
7307 return false;
7308
7309 Imm = M[0];
7310
7311 // If this is a VEXT shuffle, the immediate value is the index of the first
7312 // element. The other shuffle indices must be the successive elements after
7313 // the first one.
7314 unsigned ExpectedElt = Imm;
7315 for (unsigned i = 1; i < NumElts; ++i) {
7316 // Increment the expected index. If it wraps around, it may still be
7317 // a VEXT but the source vectors must be swapped.
7318 ExpectedElt += 1;
7319 if (ExpectedElt == NumElts * 2) {
7320 ExpectedElt = 0;
7321 ReverseVEXT = true;
7322 }
7323
7324 if (M[i] < 0) continue; // ignore UNDEF indices
7325 if (ExpectedElt != static_cast<unsigned>(M[i]))
7326 return false;
7327 }
7328
7329 // Adjust the index value if the source operands will be swapped.
7330 if (ReverseVEXT)
7331 Imm -= NumElts;
7332
7333 return true;
7334}
7335
7336static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7337 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7338 // range, then 0 is placed into the resulting vector. So pretty much any mask
7339 // of 8 elements can work here.
7340 return VT == MVT::v8i8 && M.size() == 8;
7341}
7342
7343static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7344 unsigned Index) {
7345 if (Mask.size() == Elements * 2)
7346 return Index / Elements;
7347 return Mask[Index] == 0 ? 0 : 1;
7348}
7349
7350// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7351// checking that pairs of elements in the shuffle mask represent the same index
7352// in each vector, incrementing the expected index by 2 at each step.
7353// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7354// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7355// v2={e,f,g,h}
7356// WhichResult gives the offset for each element in the mask based on which
7357// of the two results it belongs to.
7358//
7359// The transpose can be represented either as:
7360// result1 = shufflevector v1, v2, result1_shuffle_mask
7361// result2 = shufflevector v1, v2, result2_shuffle_mask
7362// where v1/v2 and the shuffle masks have the same number of elements
7363// (here WhichResult (see below) indicates which result is being checked)
7364//
7365// or as:
7366// results = shufflevector v1, v2, shuffle_mask
7367// where both results are returned in one vector and the shuffle mask has twice
7368 // as many elements as v1/v2 (in this form WhichResult will always be 0 if the
7369 // mask matches); here we check the low half and the high half of the shuffle
7370 // mask as if each were a mask of the first form.
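// For illustration, with v1 = {a,b,c,d} and v2 = {e,f,g,h} (v4i32):
//   WhichResult == 0: mask <0, 4, 2, 6> -> {a, e, c, g}
//   WhichResult == 1: mask <1, 5, 3, 7> -> {b, f, d, h}
//   combined form:    mask <0, 4, 2, 6, 1, 5, 3, 7> (WhichResult reported as 0)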
7371static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7372 unsigned EltSz = VT.getScalarSizeInBits();
7373 if (EltSz == 64)
7374 return false;
7375
7376 unsigned NumElts = VT.getVectorNumElements();
7377 if (M.size() != NumElts && M.size() != NumElts*2)
7378 return false;
7379
7380 // If the mask is twice as long as the input vector then we need to check the
7381 // upper and lower parts of the mask with a matching value for WhichResult
7382 // FIXME: A mask with only even values will be rejected in case the first
7383 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7384 // M[0] is used to determine WhichResult
7385 for (unsigned i = 0; i < M.size(); i += NumElts) {
7386 WhichResult = SelectPairHalf(NumElts, M, i);
7387 for (unsigned j = 0; j < NumElts; j += 2) {
7388 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7389 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7390 return false;
7391 }
7392 }
7393
7394 if (M.size() == NumElts*2)
7395 WhichResult = 0;
7396
7397 return true;
7398}
7399
7400/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7401/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7402/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7403static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7404 unsigned EltSz = VT.getScalarSizeInBits();
7405 if (EltSz == 64)
7406 return false;
7407
7408 unsigned NumElts = VT.getVectorNumElements();
7409 if (M.size() != NumElts && M.size() != NumElts*2)
7410 return false;
7411
7412 for (unsigned i = 0; i < M.size(); i += NumElts) {
7413 WhichResult = SelectPairHalf(NumElts, M, i);
7414 for (unsigned j = 0; j < NumElts; j += 2) {
7415 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7416 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7417 return false;
7418 }
7419 }
7420
7421 if (M.size() == NumElts*2)
7422 WhichResult = 0;
7423
7424 return true;
7425}
7426
7427// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7428// that the mask elements are either all even and in steps of size 2 or all odd
7429// and in steps of size 2.
7430// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7431// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7432// v2={e,f,g,h}
7433 // Requires similar checks to those in isVTRNMask with respect to how the
7434 // results are returned.
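// For illustration, with the same v1/v2 as above:
//   WhichResult == 0: mask <0, 2, 4, 6> -> {a, c, e, g}
//   WhichResult == 1: mask <1, 3, 5, 7> -> {b, d, f, h}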
7435static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7436 unsigned EltSz = VT.getScalarSizeInBits();
7437 if (EltSz == 64)
7438 return false;
7439
7440 unsigned NumElts = VT.getVectorNumElements();
7441 if (M.size() != NumElts && M.size() != NumElts*2)
7442 return false;
7443
7444 for (unsigned i = 0; i < M.size(); i += NumElts) {
7445 WhichResult = SelectPairHalf(NumElts, M, i);
7446 for (unsigned j = 0; j < NumElts; ++j) {
7447 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7448 return false;
7449 }
7450 }
7451
7452 if (M.size() == NumElts*2)
7453 WhichResult = 0;
7454
7455 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7456 if (VT.is64BitVector() && EltSz == 32)
7457 return false;
7458
7459 return true;
7460}
7461
7462/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7463/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7464 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
7465static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7466 unsigned EltSz = VT.getScalarSizeInBits();
7467 if (EltSz == 64)
7468 return false;
7469
7470 unsigned NumElts = VT.getVectorNumElements();
7471 if (M.size() != NumElts && M.size() != NumElts*2)
7472 return false;
7473
7474 unsigned Half = NumElts / 2;
7475 for (unsigned i = 0; i < M.size(); i += NumElts) {
7476 WhichResult = SelectPairHalf(NumElts, M, i);
7477 for (unsigned j = 0; j < NumElts; j += Half) {
7478 unsigned Idx = WhichResult;
7479 for (unsigned k = 0; k < Half; ++k) {
7480 int MIdx = M[i + j + k];
7481 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7482 return false;
7483 Idx += 2;
7484 }
7485 }
7486 }
7487
7488 if (M.size() == NumElts*2)
7489 WhichResult = 0;
7490
7491 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7492 if (VT.is64BitVector() && EltSz == 32)
7493 return false;
7494
7495 return true;
7496}
7497
7498// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7499// that pairs of elements of the shufflemask represent the same index in each
7500// vector incrementing sequentially through the vectors.
7501// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7502// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7503// v2={e,f,g,h}
7504 // Requires similar checks to those in isVTRNMask with respect to how the
7505 // results are returned.
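// For illustration, with the same v1/v2 as above:
//   WhichResult == 0: mask <0, 4, 1, 5> -> {a, e, b, f}
//   WhichResult == 1: mask <2, 6, 3, 7> -> {c, g, d, h}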
7506static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7507 unsigned EltSz = VT.getScalarSizeInBits();
7508 if (EltSz == 64)
7509 return false;
7510
7511 unsigned NumElts = VT.getVectorNumElements();
7512 if (M.size() != NumElts && M.size() != NumElts*2)
7513 return false;
7514
7515 for (unsigned i = 0; i < M.size(); i += NumElts) {
7516 WhichResult = SelectPairHalf(NumElts, M, i);
7517 unsigned Idx = WhichResult * NumElts / 2;
7518 for (unsigned j = 0; j < NumElts; j += 2) {
7519 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7520 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7521 return false;
7522 Idx += 1;
7523 }
7524 }
7525
7526 if (M.size() == NumElts*2)
7527 WhichResult = 0;
7528
7529 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7530 if (VT.is64BitVector() && EltSz == 32)
7531 return false;
7532
7533 return true;
7534}
7535
7536/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7537/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7538/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7539static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7540 unsigned EltSz = VT.getScalarSizeInBits();
7541 if (EltSz == 64)
7542 return false;
7543
7544 unsigned NumElts = VT.getVectorNumElements();
7545 if (M.size() != NumElts && M.size() != NumElts*2)
7546 return false;
7547
7548 for (unsigned i = 0; i < M.size(); i += NumElts) {
7549 WhichResult = SelectPairHalf(NumElts, M, i);
7550 unsigned Idx = WhichResult * NumElts / 2;
7551 for (unsigned j = 0; j < NumElts; j += 2) {
7552 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7553 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7554 return false;
7555 Idx += 1;
7556 }
7557 }
7558
7559 if (M.size() == NumElts*2)
7560 WhichResult = 0;
7561
7562 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7563 if (VT.is64BitVector() && EltSz == 32)
7564 return false;
7565
7566 return true;
7567}
7568
7569/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7570/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7571static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7572 unsigned &WhichResult,
7573 bool &isV_UNDEF) {
7574 isV_UNDEF = false;
7575 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7576 return ARMISD::VTRN;
7577 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7578 return ARMISD::VUZP;
7579 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7580 return ARMISD::VZIP;
7581
7582 isV_UNDEF = true;
7583 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7584 return ARMISD::VTRN;
7585 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7586 return ARMISD::VUZP;
7587 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7588 return ARMISD::VZIP;
7589
7590 return 0;
7591}
7592
7593 /// \return true if this is a reverse operation on a vector.
7594static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7595 unsigned NumElts = VT.getVectorNumElements();
7596 // Make sure the mask has the right size.
7597 if (NumElts != M.size())
7598 return false;
7599
7600 // Look for <15, ..., 3, -1, 1, 0>.
7601 for (unsigned i = 0; i != NumElts; ++i)
7602 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7603 return false;
7604
7605 return true;
7606}
7607
7608static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7609 unsigned NumElts = VT.getVectorNumElements();
7610 // Make sure the mask has the right size.
7611 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7612 return false;
7613
7614 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7615 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7616 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7617 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7618 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7619 int Ofs = Top ? 1 : 0;
7620 int Upper = SingleSource ? 0 : NumElts;
7621 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7622 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7623 return false;
7624 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7625 return false;
7626 }
7627 return true;
7628}
7629
7630static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7631 unsigned NumElts = VT.getVectorNumElements();
7632 // Make sure the mask has the right size.
7633 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7634 return false;
7635
7636 // If Top
7637 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7638 // This inserts Input2 into Input1
7639 // else if not Top
7640 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7641 // This inserts Input1 into Input2
7642 unsigned Offset = Top ? 0 : 1;
7643 unsigned N = SingleSource ? 0 : NumElts;
7644 for (unsigned i = 0; i < NumElts; i += 2) {
7645 if (M[i] >= 0 && M[i] != (int)i)
7646 return false;
7647 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7648 return false;
7649 }
7650
7651 return true;
7652}
7653
7654static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7655 unsigned NumElts = ToVT.getVectorNumElements();
7656 if (NumElts != M.size())
7657 return false;
7658
7659 // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
7660 // looking for patterns of:
7661 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7662 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7663
7664 unsigned Off0 = rev ? NumElts / 2 : 0;
7665 unsigned Off1 = rev ? 0 : NumElts / 2;
7666 for (unsigned i = 0; i < NumElts; i += 2) {
7667 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7668 return false;
7669 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7670 return false;
7671 }
7672
7673 return true;
7674}
7675
7676// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7677// from a pair of inputs. For example:
7678// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7679// FP_ROUND(EXTRACT_ELT(Y, 0),
7680// FP_ROUND(EXTRACT_ELT(X, 1),
7681// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
7682 static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
7683 const ARMSubtarget *ST) {
7684 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7685 if (!ST->hasMVEFloatOps())
7686 return SDValue();
7687
7688 SDLoc dl(BV);
7689 EVT VT = BV.getValueType();
7690 if (VT != MVT::v8f16)
7691 return SDValue();
7692
7693 // We are looking for a buildvector of fptrunc elements, where all the
7694 // elements are interleavingly extracted from two sources. Check the first two
7695 // items are valid enough and extract some info from them (they are checked
7696 // properly in the loop below).
7697 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7698 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7699 BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
7700 return SDValue();
7701 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7702 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7703 BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
7704 return SDValue();
7705 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7706 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7707 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7708 return SDValue();
7709
7710 // Check all the values in the BuildVector line up with our expectations.
7711 for (unsigned i = 1; i < 4; i++) {
7712 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7713 return Trunc.getOpcode() == ISD::FP_ROUND &&
7714 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7715 Trunc.getOperand(0).getOperand(0) == Op &&
7716 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7717 };
7718 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7719 return SDValue();
7720 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7721 return SDValue();
7722 }
7723
7724 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7725 DAG.getConstant(0, dl, MVT::i32));
7726 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7727 DAG.getConstant(1, dl, MVT::i32));
7728}
7729
7730// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7731// from a single input on alternating lanes. For example:
7732 // BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0)),
7733 // FP_EXTEND(EXTRACT_ELT(X, 2)),
7734 // FP_EXTEND(EXTRACT_ELT(X, 4)), ...)
7735 static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
7736 const ARMSubtarget *ST) {
7737 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7738 if (!ST->hasMVEFloatOps())
7739 return SDValue();
7740
7741 SDLoc dl(BV);
7742 EVT VT = BV.getValueType();
7743 if (VT != MVT::v4f32)
7744 return SDValue();
7745
7746 // We are looking for a buildvector of fpext elements, where all the
7747 // elements are alternating lanes from a single source. For example <0,2,4,6>
7748 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7749 // info from them (they are checked properly in the loop below).
7750 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7751 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7752 return SDValue();
7753 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7754 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
7755 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7756 return SDValue();
7757
7758 // Check all the values in the BuildVector line up with our expectations.
7759 for (unsigned i = 1; i < 4; i++) {
7760 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7761 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7762 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7763 Trunc.getOperand(0).getOperand(0) == Op &&
7764 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7765 };
7766 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7767 return SDValue();
7768 }
7769
7770 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7771 DAG.getConstant(Offset, dl, MVT::i32));
7772}
7773
7774// If N is an integer constant that can be moved into a register in one
7775// instruction, return an SDValue of such a constant (will become a MOV
7776// instruction). Otherwise return null.
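// For illustration: in ARM mode 0x00ab0000 qualifies (an 8-bit value rotated
// right by 16, so a single MOV), as does 0xffffff00 (a single MVN of 0xff);
// 0x00ab00cd does not and falls through to the default lowering.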
7777 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
7778 const ARMSubtarget *ST, const SDLoc &dl) {
7779 uint64_t Val;
7780 if (!isa<ConstantSDNode>(N))
7781 return SDValue();
7782 Val = N->getAsZExtVal();
7783
7784 if (ST->isThumb1Only()) {
7785 if (Val <= 255 || ~Val <= 255)
7786 return DAG.getConstant(Val, dl, MVT::i32);
7787 } else {
7788 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7789 return DAG.getConstant(Val, dl, MVT::i32);
7790 }
7791 return SDValue();
7792}
7793
7794 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
7795 const ARMSubtarget *ST) {
7796 SDLoc dl(Op);
7797 EVT VT = Op.getValueType();
7798
7799 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7800
7801 unsigned NumElts = VT.getVectorNumElements();
7802 unsigned BoolMask;
7803 unsigned BitsPerBool;
7804 if (NumElts == 2) {
7805 BitsPerBool = 8;
7806 BoolMask = 0xff;
7807 } else if (NumElts == 4) {
7808 BitsPerBool = 4;
7809 BoolMask = 0xf;
7810 } else if (NumElts == 8) {
7811 BitsPerBool = 2;
7812 BoolMask = 0x3;
7813 } else if (NumElts == 16) {
7814 BitsPerBool = 1;
7815 BoolMask = 0x1;
7816 } else
7817 return SDValue();
7818
7819 // If this is a single value copied into all lanes (a splat), we can just sign
7820 // extend that single value
7821 SDValue FirstOp = Op.getOperand(0);
7822 if (!isa<ConstantSDNode>(FirstOp) &&
7823 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7824 return U.get().isUndef() || U.get() == FirstOp;
7825 })) {
7826 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7827 DAG.getValueType(MVT::i1));
7828 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7829 }
7830
7831 // First create base with bits set where known
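  // For illustration: for a v4i1 build_vector of {1, 0, 1, undef},
  // BitsPerBool == 4 and BoolMask == 0xf, so lanes 0 and 2 contribute
  // 0xf << 0 and 0xf << 8, giving Bits32 == 0x00000f0f; non-constant lanes
  // are patched in afterwards with INSERT_VECTOR_ELT.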
7832 unsigned Bits32 = 0;
7833 for (unsigned i = 0; i < NumElts; ++i) {
7834 SDValue V = Op.getOperand(i);
7835 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7836 continue;
7837 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7838 if (BitSet)
7839 Bits32 |= BoolMask << (i * BitsPerBool);
7840 }
7841
7842 // Add in unknown nodes
7843 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7844 DAG.getConstant(Bits32, dl, MVT::i32));
7845 for (unsigned i = 0; i < NumElts; ++i) {
7846 SDValue V = Op.getOperand(i);
7847 if (isa<ConstantSDNode>(V) || V.isUndef())
7848 continue;
7849 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7850 DAG.getConstant(i, dl, MVT::i32));
7851 }
7852
7853 return Base;
7854}
7855
7856 static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
7857 const ARMSubtarget *ST) {
7858 if (!ST->hasMVEIntegerOps())
7859 return SDValue();
7860
7861 // We are looking for a buildvector where each element is Op[0] + i*N
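  // For illustration: build_vector(x, x+2, x+4, x+6) matches with N == 2 and
  // becomes a VIDUP starting at x with a step of 2.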
7862 EVT VT = Op.getValueType();
7863 SDValue Op0 = Op.getOperand(0);
7864 unsigned NumElts = VT.getVectorNumElements();
7865
7866 // Get the increment value from operand 1
7867 SDValue Op1 = Op.getOperand(1);
7868 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7869 !isa<ConstantSDNode>(Op1.getOperand(1)))
7870 return SDValue();
7871 unsigned N = Op1.getConstantOperandVal(1);
7872 if (N != 1 && N != 2 && N != 4 && N != 8)
7873 return SDValue();
7874
7875 // Check that each other operand matches
7876 for (unsigned I = 2; I < NumElts; I++) {
7877 SDValue OpI = Op.getOperand(I);
7878 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7879 !isa<ConstantSDNode>(OpI.getOperand(1)) ||
7880 OpI.getConstantOperandVal(1) != I * N)
7881 return SDValue();
7882 }
7883
7884 SDLoc DL(Op);
7885 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7886 DAG.getConstant(N, DL, MVT::i32));
7887}
7888
7889// Returns true if the operation N can be treated as qr instruction variant at
7890// operand Op.
7891static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7892 switch (N->getOpcode()) {
7893 case ISD::ADD:
7894 case ISD::MUL:
7895 case ISD::SADDSAT:
7896 case ISD::UADDSAT:
7897 return true;
7898 case ISD::SUB:
7899 case ISD::SSUBSAT:
7900 case ISD::USUBSAT:
7901 return N->getOperand(1).getNode() == Op;
7902 case ISD::INTRINSIC_WO_CHAIN:
7903 switch (N->getConstantOperandVal(0)) {
7904 case Intrinsic::arm_mve_add_predicated:
7905 case Intrinsic::arm_mve_mul_predicated:
7906 case Intrinsic::arm_mve_qadd_predicated:
7907 case Intrinsic::arm_mve_vhadd:
7908 case Intrinsic::arm_mve_hadd_predicated:
7909 case Intrinsic::arm_mve_vqdmulh:
7910 case Intrinsic::arm_mve_qdmulh_predicated:
7911 case Intrinsic::arm_mve_vqrdmulh:
7912 case Intrinsic::arm_mve_qrdmulh_predicated:
7913 case Intrinsic::arm_mve_vqdmull:
7914 case Intrinsic::arm_mve_vqdmull_predicated:
7915 return true;
7916 case Intrinsic::arm_mve_sub_predicated:
7917 case Intrinsic::arm_mve_qsub_predicated:
7918 case Intrinsic::arm_mve_vhsub:
7919 case Intrinsic::arm_mve_hsub_predicated:
7920 return N->getOperand(2).getNode() == Op;
7921 default:
7922 return false;
7923 }
7924 default:
7925 return false;
7926 }
7927}
7928
7929// If this is a case we can't handle, return null and let the default
7930// expansion code take care of it.
7931SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7932 const ARMSubtarget *ST) const {
7933 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7934 SDLoc dl(Op);
7935 EVT VT = Op.getValueType();
7936
7937 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7938 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7939
7940 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7941 return R;
7942
7943 APInt SplatBits, SplatUndef;
7944 unsigned SplatBitSize;
7945 bool HasAnyUndefs;
7946 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7947 if (SplatUndef.isAllOnes())
7948 return DAG.getUNDEF(VT);
7949
7950 // If all the users of this constant splat are qr instruction variants,
7951 // generate a vdup of the constant.
7952 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7953 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7954 all_of(BVN->uses(),
7955 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7956 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7957 : SplatBitSize == 16 ? MVT::v8i16
7958 : MVT::v16i8;
7959 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7960 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7961 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7962 }
7963
7964 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7965 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7966 // Check if an immediate VMOV works.
7967 EVT VmovVT;
7968 SDValue Val =
7969 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7970 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7971
7972 if (Val.getNode()) {
7973 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7974 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
7975 }
7976
7977 // Try an immediate VMVN.
7978 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7979 Val = isVMOVModifiedImm(
7980 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7981 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7982 if (Val.getNode()) {
7983 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7984 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
7985 }
7986
7987 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7988 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7989 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7990 if (ImmVal != -1) {
7991 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7992 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7993 }
7994 }
7995
7996 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7997 // type.
7998 if (ST->hasMVEIntegerOps() &&
7999 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
8000 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
8001 : SplatBitSize == 16 ? MVT::v8i16
8002 : MVT::v16i8;
8003 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
8004 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
8005 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
8006 }
8007 }
8008 }
8009
8010 // Scan through the operands to see if only one value is used.
8011 //
8012 // As an optimisation, even if more than one value is used it may be more
8013 // profitable to splat with one value then change some lanes.
8014 //
8015 // Heuristically we decide to do this if the vector has a "dominant" value,
8016 // defined as splatted to more than half of the lanes.
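  // For illustration: a v4i32 build_vector of {a, a, a, b} has a dominant
  // value 'a' (three of four lanes); for non-constant values it is splatted
  // with a VDUP and 'b' is then inserted into lane 3.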
8017 unsigned NumElts = VT.getVectorNumElements();
8018 bool isOnlyLowElement = true;
8019 bool usesOnlyOneValue = true;
8020 bool hasDominantValue = false;
8021 bool isConstant = true;
8022
8023 // Map of the number of times a particular SDValue appears in the
8024 // element list.
8025 DenseMap<SDValue, unsigned> ValueCounts;
8026 SDValue Value;
8027 for (unsigned i = 0; i < NumElts; ++i) {
8028 SDValue V = Op.getOperand(i);
8029 if (V.isUndef())
8030 continue;
8031 if (i > 0)
8032 isOnlyLowElement = false;
8033 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
8034 isConstant = false;
8035
8036 ValueCounts.insert(std::make_pair(V, 0));
8037 unsigned &Count = ValueCounts[V];
8038
8039 // Is this value dominant? (takes up more than half of the lanes)
8040 if (++Count > (NumElts / 2)) {
8041 hasDominantValue = true;
8042 Value = V;
8043 }
8044 }
8045 if (ValueCounts.size() != 1)
8046 usesOnlyOneValue = false;
8047 if (!Value.getNode() && !ValueCounts.empty())
8048 Value = ValueCounts.begin()->first;
8049
8050 if (ValueCounts.empty())
8051 return DAG.getUNDEF(VT);
8052
8053 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
8054 // Keep going if we are hitting this case.
8055 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
8056 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
8057
8058 unsigned EltSize = VT.getScalarSizeInBits();
8059
8060 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
8061 // i32 and try again.
8062 if (hasDominantValue && EltSize <= 32) {
8063 if (!isConstant) {
8064 SDValue N;
8065
8066 // If we are VDUPing a value that comes directly from a vector, that will
8067 // cause an unnecessary move to and from a GPR, where instead we could
8068 // just use VDUPLANE. We can only do this if the lane being extracted
8069 // is at a constant index, as the VDUP from lane instructions only have
8070 // constant-index forms.
8071 ConstantSDNode *constIndex;
8072 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8073 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
8074 // We need to create a new undef vector to use for the VDUPLANE if the
8075 // size of the vector from which we get the value is different than the
8076 // size of the vector that we need to create. We will insert the element
8077 // such that the register coalescer will remove unnecessary copies.
8078 if (VT != Value->getOperand(0).getValueType()) {
8079 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
8080 VT.getVectorNumElements();
8081 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8082 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
8083 Value, DAG.getConstant(index, dl, MVT::i32)),
8084 DAG.getConstant(index, dl, MVT::i32));
8085 } else
8086 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8087 Value->getOperand(0), Value->getOperand(1));
8088 } else
8089 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
8090
8091 if (!usesOnlyOneValue) {
8092 // The dominant value was splatted as 'N', but we now have to insert
8093 // all differing elements.
8094 for (unsigned I = 0; I < NumElts; ++I) {
8095 if (Op.getOperand(I) == Value)
8096 continue;
8097 SmallVector<SDValue, 3> Ops;
8098 Ops.push_back(N);
8099 Ops.push_back(Op.getOperand(I));
8100 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
8101 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
8102 }
8103 }
8104 return N;
8105 }
8106 if (VT.getVectorElementType().isFloatingPoint()) {
8107 SmallVector<SDValue, 8> Ops;
8108 MVT FVT = VT.getVectorElementType().getSimpleVT();
8109 assert(FVT == MVT::f32 || FVT == MVT::f16);
8110 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
8111 for (unsigned i = 0; i < NumElts; ++i)
8112 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
8113 Op.getOperand(i)));
8114 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
8115 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
8116 Val = LowerBUILD_VECTOR(Val, DAG, ST);
8117 if (Val.getNode())
8118 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8119 }
8120 if (usesOnlyOneValue) {
8121 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
8122 if (isConstant && Val.getNode())
8123 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
8124 }
8125 }
8126
8127 // If all elements are constants and the case above didn't get hit, fall back
8128 // to the default expansion, which will generate a load from the constant
8129 // pool.
8130 if (isConstant)
8131 return SDValue();
8132
8133 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
8134 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
8135 // length <= 2.
8136 if (NumElts >= 4)
8137 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
8138 return shuffle;
8139
8140 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
8141 // VCVT's
8142 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
8143 return VCVT;
8144 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
8145 return VCVT;
8146
8147 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
8148 // If we haven't found an efficient lowering, try splitting a 128-bit vector
8149 // into two 64-bit vectors; we might discover a better way to lower it.
8150 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
8151 EVT ExtVT = VT.getVectorElementType();
8152 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
8153 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
8154 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
8155 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
8156 SDValue Upper =
8157 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
8158 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
8159 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
8160 if (Lower && Upper)
8161 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
8162 }
8163
8164 // Vectors with 32- or 64-bit elements can be built by directly assigning
8165 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
8166 // will be legalized.
8167 if (EltSize >= 32) {
8168 // Do the expansion with floating-point types, since that is what the VFP
8169 // registers are defined to use, and since i64 is not legal.
8170 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8171 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8172 SmallVector<SDValue, 8> Ops;
8173 for (unsigned i = 0; i < NumElts; ++i)
8174 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
8175 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8176 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8177 }
8178
8179 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
8180 // know the default expansion would otherwise fall back on something even
8181 // worse. For a vector with one or two non-undef values, that's
8182 // scalar_to_vector for the elements followed by a shuffle (provided the
8183 // shuffle is valid for the target) and materialization element by element
8184 // on the stack followed by a load for everything else.
8185 if (!isConstant && !usesOnlyOneValue) {
8186 SDValue Vec = DAG.getUNDEF(VT);
8187 for (unsigned i = 0 ; i < NumElts; ++i) {
8188 SDValue V = Op.getOperand(i);
8189 if (V.isUndef())
8190 continue;
8191 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
8192 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
8193 }
8194 return Vec;
8195 }
8196
8197 return SDValue();
8198}
8199
8200// Gather data to see if the operation can be modelled as a
8201// shuffle in combination with VEXTs.
8202SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
8203 SelectionDAG &DAG) const {
8204 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
8205 SDLoc dl(Op);
8206 EVT VT = Op.getValueType();
8207 unsigned NumElts = VT.getVectorNumElements();
8208
8209 struct ShuffleSourceInfo {
8210 SDValue Vec;
8211 unsigned MinElt = std::numeric_limits<unsigned>::max();
8212 unsigned MaxElt = 0;
8213
8214 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
8215 // be compatible with the shuffle we intend to construct. As a result
8216 // ShuffleVec will be some sliding window into the original Vec.
8217 SDValue ShuffleVec;
8218
8219 // Code should guarantee that element i in Vec starts at element "WindowBase
8220 // + i * WindowScale in ShuffleVec".
8221 int WindowBase = 0;
8222 int WindowScale = 1;
8223
8224 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
8225
8226 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
8227 };
8228
8229 // First gather all vectors used as an immediate source for this BUILD_VECTOR
8230 // node.
8231 SmallVector<ShuffleSourceInfo, 2> Sources;
8232 for (unsigned i = 0; i < NumElts; ++i) {
8233 SDValue V = Op.getOperand(i);
8234 if (V.isUndef())
8235 continue;
8236 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
8237 // A shuffle can only come from building a vector from various
8238 // elements of other vectors.
8239 return SDValue();
8240 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
8241 // Furthermore, shuffles require a constant mask, whereas extractelts
8242 // accept variable indices.
8243 return SDValue();
8244 }
8245
8246 // Add this element source to the list if it's not already there.
8247 SDValue SourceVec = V.getOperand(0);
8248 auto Source = llvm::find(Sources, SourceVec);
8249 if (Source == Sources.end())
8250 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
8251
8252 // Update the minimum and maximum lane number seen.
8253 unsigned EltNo = V.getConstantOperandVal(1);
8254 Source->MinElt = std::min(Source->MinElt, EltNo);
8255 Source->MaxElt = std::max(Source->MaxElt, EltNo);
8256 }
8257
8258 // Currently only do something sane when at most two source vectors
8259 // are involved.
8260 if (Sources.size() > 2)
8261 return SDValue();
8262
8263 // Find out the smallest element size among result and two sources, and use
8264 // it as element size to build the shuffle_vector.
8265 EVT SmallestEltTy = VT.getVectorElementType();
8266 for (auto &Source : Sources) {
8267 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8268 if (SrcEltTy.bitsLT(SmallestEltTy))
8269 SmallestEltTy = SrcEltTy;
8270 }
8271 unsigned ResMultiplier =
8272 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
8273 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
8274 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
8275
8276 // If the source vector is too wide or too narrow, we may nevertheless be able
8277 // to construct a compatible shuffle either by concatenating it with UNDEF or
8278 // extracting a suitable range of elements.
8279 for (auto &Src : Sources) {
8280 EVT SrcVT = Src.ShuffleVec.getValueType();
8281
8282 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8283 uint64_t VTSize = VT.getFixedSizeInBits();
8284 if (SrcVTSize == VTSize)
8285 continue;
8286
8287 // This stage of the search produces a source with the same element type as
8288 // the original, but with a total width matching the BUILD_VECTOR output.
8289 EVT EltVT = SrcVT.getVectorElementType();
8290 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8291 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8292
8293 if (SrcVTSize < VTSize) {
8294 if (2 * SrcVTSize != VTSize)
8295 return SDValue();
8296 // We can pad out the smaller vector for free, so if it's part of a
8297 // shuffle...
8298 Src.ShuffleVec =
8299 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8300 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8301 continue;
8302 }
8303
8304 if (SrcVTSize != 2 * VTSize)
8305 return SDValue();
8306
8307 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8308 // Span too large for a VEXT to cope
8309 return SDValue();
8310 }
8311
8312 if (Src.MinElt >= NumSrcElts) {
8313 // The extraction can just take the second half
8314 Src.ShuffleVec =
8315 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8316 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8317 Src.WindowBase = -NumSrcElts;
8318 } else if (Src.MaxElt < NumSrcElts) {
8319 // The extraction can just take the first half
8320 Src.ShuffleVec =
8321 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8322 DAG.getConstant(0, dl, MVT::i32));
8323 } else {
8324 // An actual VEXT is needed
8325 SDValue VEXTSrc1 =
8326 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8327 DAG.getConstant(0, dl, MVT::i32));
8328 SDValue VEXTSrc2 =
8329 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8330 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8331
8332 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8333 VEXTSrc2,
8334 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8335 Src.WindowBase = -Src.MinElt;
8336 }
8337 }
8338
8339 // Another possible incompatibility occurs from the vector element types. We
8340 // can fix this by bitcasting the source vectors to the same type we intend
8341 // for the shuffle.
8342 for (auto &Src : Sources) {
8343 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8344 if (SrcEltTy == SmallestEltTy)
8345 continue;
8346 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8347 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
8348 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8349 Src.WindowBase *= Src.WindowScale;
8350 }
8351
8352 // Final check before we try to actually produce a shuffle.
8353 LLVM_DEBUG(for (auto Src
8354 : Sources)
8355 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
8356
8357 // The stars all align, our next step is to produce the mask for the shuffle.
8358 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8359 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8360 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8361 SDValue Entry = Op.getOperand(i);
8362 if (Entry.isUndef())
8363 continue;
8364
8365 auto Src = llvm::find(Sources, Entry.getOperand(0));
8366 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8367
8368 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8369 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8370 // segment.
8371 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8372 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8373 VT.getScalarSizeInBits());
8374 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8375
8376 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8377 // starting at the appropriate offset.
8378 int *LaneMask = &Mask[i * ResMultiplier];
8379
8380 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8381 ExtractBase += NumElts * (Src - Sources.begin());
8382 for (int j = 0; j < LanesDefined; ++j)
8383 LaneMask[j] = ExtractBase + j;
8384 }
8385
8386
8387 // We can't handle more than two sources. This should have already
8388 // been checked before this point.
8389 assert(Sources.size() <= 2 && "Too many sources!");
8390
8391 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8392 for (unsigned i = 0; i < Sources.size(); ++i)
8393 ShuffleOps[i] = Sources[i].ShuffleVec;
8394
8395 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8396 ShuffleOps[1], Mask, DAG);
8397 if (!Shuffle)
8398 return SDValue();
8399 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8400}
8401
8402 enum ShuffleOpCodes {
8403 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8404 OP_VREV,
8405 OP_VDUP0,
8406 OP_VDUP1,
8407 OP_VDUP2,
8408 OP_VDUP3,
8409 OP_VEXT1,
8410 OP_VEXT2,
8411 OP_VEXT3,
8412 OP_VUZPL, // VUZP, left result
8413 OP_VUZPR, // VUZP, right result
8414 OP_VZIPL, // VZIP, left result
8415 OP_VZIPR, // VZIP, right result
8416 OP_VTRNL, // VTRN, left result
8417 OP_VTRNR // VTRN, right result
8418 };
8419
8420static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8421 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8422 switch (OpNum) {
8423 case OP_COPY:
8424 case OP_VREV:
8425 case OP_VDUP0:
8426 case OP_VDUP1:
8427 case OP_VDUP2:
8428 case OP_VDUP3:
8429 return true;
8430 }
8431 return false;
8432}
8433
8434/// isShuffleMaskLegal - Targets can use this to indicate that they only
8435/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8436/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8437/// are assumed to be legal.
8438 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
8439 if (VT.getVectorNumElements() == 4 &&
8440 (VT.is128BitVector() || VT.is64BitVector())) {
8441 unsigned PFIndexes[4];
8442 for (unsigned i = 0; i != 4; ++i) {
8443 if (M[i] < 0)
8444 PFIndexes[i] = 8;
8445 else
8446 PFIndexes[i] = M[i];
8447 }
8448
8449 // Compute the index in the perfect shuffle table.
8450 unsigned PFTableIndex =
8451 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8452 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8453 unsigned Cost = (PFEntry >> 30);
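// For example, the identity mask <0,1,2,3> gives PFTableIndex
// 0*729 + 1*81 + 2*9 + 3 = 102, with 8 acting as the sentinel for undef lanes.
// Each table entry packs its cost into the top two bits and its opcode into
// bits 26-29, which is what isLegalMVEShuffleOp above inspects.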
8454
8455 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8456 return true;
8457 }
8458
8459 bool ReverseVEXT, isV_UNDEF;
8460 unsigned Imm, WhichResult;
8461
8462 unsigned EltSize = VT.getScalarSizeInBits();
8463 if (EltSize >= 32 ||
8464 ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
8465 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8466 isVREVMask(M, VT, 64) ||
8467 isVREVMask(M, VT, 32) ||
8468 isVREVMask(M, VT, 16))
8469 return true;
8470 else if (Subtarget->hasNEON() &&
8471 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8472 isVTBLMask(M, VT) ||
8473 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8474 return true;
8475 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8476 isReverseMask(M, VT))
8477 return true;
8478 else if (Subtarget->hasMVEIntegerOps() &&
8479 (isVMOVNMask(M, VT, true, false) ||
8480 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8481 return true;
8482 else if (Subtarget->hasMVEIntegerOps() &&
8483 (isTruncMask(M, VT, false, false) ||
8484 isTruncMask(M, VT, false, true) ||
8485 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8486 return true;
8487 else
8488 return false;
8489}
8490
8491/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8492/// the specified operations to build the shuffle.
8493static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
8494 SDValue RHS, SelectionDAG &DAG,
8495 const SDLoc &dl) {
8496 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8497 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8498 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8499
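// For OP_COPY the 13-bit operand IDs encode masks in base 9:
// (1*9+2)*9+3 = 102 is the identity <0,1,2,3> (take LHS unchanged) and
// ((4*9+5)*9+6)*9+7 = 3382 is <4,5,6,7> (take RHS unchanged).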
8500 if (OpNum == OP_COPY) {
8501 if (LHSID == (1*9+2)*9+3) return LHS;
8502 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8503 return RHS;
8504 }
8505
8506 SDValue OpLHS, OpRHS;
8507 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8508 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8509 EVT VT = OpLHS.getValueType();
8510
8511 switch (OpNum) {
8512 default: llvm_unreachable("Unknown shuffle opcode!");
8513 case OP_VREV:
8514 // VREV divides the vector in half and swaps within the half.
8515 if (VT.getScalarSizeInBits() == 32)
8516 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8517 // vrev <4 x i16> -> VREV32
8518 if (VT.getScalarSizeInBits() == 16)
8519 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8520 // vrev <4 x i8> -> VREV16
8521 assert(VT.getScalarSizeInBits() == 8);
8522 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8523 case OP_VDUP0:
8524 case OP_VDUP1:
8525 case OP_VDUP2:
8526 case OP_VDUP3:
8527 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8528 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8529 case OP_VEXT1:
8530 case OP_VEXT2:
8531 case OP_VEXT3:
8532 return DAG.getNode(ARMISD::VEXT, dl, VT,
8533 OpLHS, OpRHS,
8534 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8535 case OP_VUZPL:
8536 case OP_VUZPR:
8537 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8538 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8539 case OP_VZIPL:
8540 case OP_VZIPR:
8541 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8542 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8543 case OP_VTRNL:
8544 case OP_VTRNR:
8545 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8546 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8547 }
8548}
8549
8550 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
8551 ArrayRef<int> ShuffleMask,
8552 SelectionDAG &DAG) {
8553 // Check to see if we can use the VTBL instruction.
8554 SDValue V1 = Op.getOperand(0);
8555 SDValue V2 = Op.getOperand(1);
8556 SDLoc DL(Op);
8557
8558 SmallVector<SDValue, 8> VTBLMask;
8559 for (int I : ShuffleMask)
8560 VTBLMask.push_back(DAG.getConstant(I, DL, MVT::i32));
8561
8562 if (V2.getNode()->isUndef())
8563 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8564 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8565
8566 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8567 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8568}
8569
8570 static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
8571 SDLoc DL(Op);
8572 EVT VT = Op.getValueType();
8573
8574 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8575 "Expect an v8i16/v16i8 type");
8576 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8577 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8578 // extract the first 8 bytes into the top double word and the last 8 bytes
8579 // into the bottom double word, through a new vector shuffle that will be
8580 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
8581 std::vector<int> NewMask;
8582 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8583 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8584 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8585 NewMask.push_back(i);
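// For a v16i8 shuffle this builds NewMask = <8..15, 0..7>, swapping the two
// double words of the VREV64 result.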
8586 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8587}
8588
8589 static EVT getVectorTyFromPredicateVector(EVT VT) {
8590 switch (VT.getSimpleVT().SimpleTy) {
8591 case MVT::v2i1:
8592 return MVT::v2f64;
8593 case MVT::v4i1:
8594 return MVT::v4i32;
8595 case MVT::v8i1:
8596 return MVT::v8i16;
8597 case MVT::v16i1:
8598 return MVT::v16i8;
8599 default:
8600 llvm_unreachable("Unexpected vector predicate type");
8601 }
8602}
8603
8604 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
8605 SelectionDAG &DAG) {
8606 // Converting from boolean predicates to integers involves creating a vector
8607 // of all ones or all zeroes and selecting the lanes based upon the real
8608 // predicate.
8609 SDValue AllOnes =
8610 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8611 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8612
8613 SDValue AllZeroes =
8614 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8615 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8616
8617 // Get full vector type from predicate type
8618 EVT NewVT = getVectorTyFromPredicateVector(VT);
8619
8620 SDValue RecastV1;
8621 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
8622 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8623 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8624 // since we know in hardware the sizes are really the same.
8625 if (VT != MVT::v16i1)
8626 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8627 else
8628 RecastV1 = Pred;
8629
8630 // Select either all ones or zeroes depending upon the real predicate bits.
8631 SDValue PredAsVector =
8632 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8633
8634 // Recast our new predicate-as-integer v16i8 vector into something
8635 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8636 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8637}
8638
8639 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
8640 const ARMSubtarget *ST) {
8641 EVT VT = Op.getValueType();
8642 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8643 ArrayRef<int> ShuffleMask = SVN->getMask();
8644
8645 assert(ST->hasMVEIntegerOps() &&
8646 "No support for vector shuffle of boolean predicates");
8647
8648 SDValue V1 = Op.getOperand(0);
8649 SDValue V2 = Op.getOperand(1);
8650 SDLoc dl(Op);
8651 if (isReverseMask(ShuffleMask, VT)) {
8652 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8653 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8654 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8655 DAG.getConstant(16, dl, MVT::i32));
8656 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8657 }
8658
8659 // Until we can come up with optimised cases for every single vector
8660 // shuffle in existence we have chosen the least painful strategy. This is
8661 // to essentially promote the boolean predicate to an 8-bit integer, where
8662 // each predicate represents a byte. Then we fall back on a normal integer
8663 // vector shuffle and convert the result back into a predicate vector. In
8664 // many cases the generated code might be even better than scalar code
8665 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8666 // fields in a register into 8 other arbitrary 2-bit fields!
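// For example, a v8i1 predicate becomes a v8i16 whose lanes are 0xffff or
// 0x0000; that v8i16 is shuffled as an ordinary vector and converted back to
// a v8i1 by the compare-against-zero below.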
8667 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8668 EVT NewVT = PredAsVector1.getValueType();
8669 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8670 : PromoteMVEPredVector(dl, V2, VT, DAG);
8671 assert(PredAsVector2.getValueType() == NewVT &&
8672 "Expected identical vector type in expanded i1 shuffle!");
8673
8674 // Do the shuffle!
8675 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8676 PredAsVector2, ShuffleMask);
8677
8678 // Now return the result of comparing the shuffled vector with zero,
8679 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8680 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8681 if (VT == MVT::v2i1) {
8682 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8683 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8684 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8685 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8686 }
8687 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8688 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8689}
8690
8691 static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
8692 ArrayRef<int> ShuffleMask,
8693 SelectionDAG &DAG) {
8694 // Attempt to lower the vector shuffle using as many whole register movs as
8695 // possible. This is useful for types smaller than 32 bits, which would
8696 // often otherwise become a series of GPR movs.
8697 SDLoc dl(Op);
8698 EVT VT = Op.getValueType();
8699 if (VT.getScalarSizeInBits() >= 32)
8700 return SDValue();
8701
8702 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8703 "Unexpected vector type");
8704 int NumElts = VT.getVectorNumElements();
8705 int QuarterSize = NumElts / 4;
8706 // The four final parts of the vector, as i32's
8707 SDValue Parts[4];
8708
8709 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8710 // <u,u,u,u>), returning the vmov lane index
8711 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8712 // Detect which mov lane this would be from the first non-undef element.
8713 int MovIdx = -1;
8714 for (int i = 0; i < Length; i++) {
8715 if (ShuffleMask[Start + i] >= 0) {
8716 if (ShuffleMask[Start + i] % Length != i)
8717 return -1;
8718 MovIdx = ShuffleMask[Start + i] / Length;
8719 break;
8720 }
8721 }
8722 // If all items are undef, leave this for other combines
8723 if (MovIdx == -1)
8724 return -1;
8725 // Check the remaining values are the correct part of the same mov
8726 for (int i = 1; i < Length; i++) {
8727 if (ShuffleMask[Start + i] >= 0 &&
8728 (ShuffleMask[Start + i] / Length != MovIdx ||
8729 ShuffleMask[Start + i] % Length != i))
8730 return -1;
8731 }
8732 return MovIdx;
8733 };
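// For a v16i8 shuffle each part covers 4 bytes, so a mask segment of
// <4,5,6,7> yields MovIdx 1: the whole part is 32-bit lane 1 of the first
// input and can be copied with a single lane move.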
8734
8735 for (int Part = 0; Part < 4; ++Part) {
8736 // Does this part look like a mov
8737 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8738 if (Elt != -1) {
8739 SDValue Input = Op->getOperand(0);
8740 if (Elt >= 4) {
8741 Input = Op->getOperand(1);
8742 Elt -= 4;
8743 }
8744 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8745 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8746 DAG.getConstant(Elt, dl, MVT::i32));
8747 }
8748 }
8749
8750 // Nothing interesting found, just return
8751 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8752 return SDValue();
8753
8754 // The other parts need to be built with the old shuffle vector, cast to a
8755 // v4i32 and extract_vector_elts
8756 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8757 SmallVector<int, 16> NewShuffleMask;
8758 for (int Part = 0; Part < 4; ++Part)
8759 for (int i = 0; i < QuarterSize; i++)
8760 NewShuffleMask.push_back(
8761 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8762 SDValue NewShuffle = DAG.getVectorShuffle(
8763 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8764 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8765
8766 for (int Part = 0; Part < 4; ++Part)
8767 if (!Parts[Part])
8768 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8769 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8770 }
8771 // Build a vector out of the various parts and bitcast it back to the original
8772 // type.
8773 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8774 return DAG.getBitcast(VT, NewVec);
8775}
8776
8777 static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
8778 ArrayRef<int> ShuffleMask,
8779 SelectionDAG &DAG) {
8780 SDValue V1 = Op.getOperand(0);
8781 SDValue V2 = Op.getOperand(1);
8782 EVT VT = Op.getValueType();
8783 unsigned NumElts = VT.getVectorNumElements();
8784
8785 // A One-Off Identity mask is one that is mostly an identity mask from a
8786 // single source but contains a single element out-of-place, either from a
8787 // different vector or from another position in the same vector. Rather than
8788 // lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert
8789 // pair directly.
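// For example, on v4i32 the mask <0,1,2,7> is an identity of V1 except for
// lane 3, which comes from lane 3 of V2; it becomes a single
// EXTRACT_VECTOR_ELT from V2 plus an INSERT_VECTOR_ELT into V1.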
8790 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8791 int &OffElement) {
8792 OffElement = -1;
8793 int NonUndef = 0;
8794 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8795 if (Mask[i] == -1)
8796 continue;
8797 NonUndef++;
8798 if (Mask[i] != i + BaseOffset) {
8799 if (OffElement == -1)
8800 OffElement = i;
8801 else
8802 return false;
8803 }
8804 }
8805 return NonUndef > 2 && OffElement != -1;
8806 };
8807 int OffElement;
8808 SDValue VInput;
8809 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8810 VInput = V1;
8811 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8812 VInput = V2;
8813 else
8814 return SDValue();
8815
8816 SDLoc dl(Op);
8817 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8818 ? MVT::i32
8819 : VT.getScalarType();
8820 SDValue Elt = DAG.getNode(
8821 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8822 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8823 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8824 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8825 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8826}
8827
8828 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
8829 const ARMSubtarget *ST) {
8830 SDValue V1 = Op.getOperand(0);
8831 SDValue V2 = Op.getOperand(1);
8832 SDLoc dl(Op);
8833 EVT VT = Op.getValueType();
8834 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8835 unsigned EltSize = VT.getScalarSizeInBits();
8836
8837 if (ST->hasMVEIntegerOps() && EltSize == 1)
8838 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8839
8840 // Convert shuffles that are directly supported on NEON to target-specific
8841 // DAG nodes, instead of keeping them as shuffles and matching them again
8842 // during code selection. This is more efficient and avoids the possibility
8843 // of inconsistencies between legalization and selection.
8844 // FIXME: floating-point vectors should be canonicalized to integer vectors
8845 // of the same size so that they get CSEd properly.
8846 ArrayRef<int> ShuffleMask = SVN->getMask();
8847
8848 if (EltSize <= 32) {
8849 if (SVN->isSplat()) {
8850 int Lane = SVN->getSplatIndex();
8851 // If this is undef splat, generate it via "just" vdup, if possible.
8852 if (Lane == -1) Lane = 0;
8853
8854 // Test if V1 is a SCALAR_TO_VECTOR.
8855 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8856 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8857 }
8858 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8859 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8860 // reaches it).
8861 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8862 !isa<ConstantSDNode>(V1.getOperand(0))) {
8863 bool IsScalarToVector = true;
8864 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8865 if (!V1.getOperand(i).isUndef()) {
8866 IsScalarToVector = false;
8867 break;
8868 }
8869 if (IsScalarToVector)
8870 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8871 }
8872 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8873 DAG.getConstant(Lane, dl, MVT::i32));
8874 }
8875
8876 bool ReverseVEXT = false;
8877 unsigned Imm = 0;
8878 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8879 if (ReverseVEXT)
8880 std::swap(V1, V2);
8881 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8882 DAG.getConstant(Imm, dl, MVT::i32));
8883 }
8884
8885 if (isVREVMask(ShuffleMask, VT, 64))
8886 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8887 if (isVREVMask(ShuffleMask, VT, 32))
8888 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8889 if (isVREVMask(ShuffleMask, VT, 16))
8890 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8891
8892 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8893 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8894 DAG.getConstant(Imm, dl, MVT::i32));
8895 }
8896
8897 // Check for Neon shuffles that modify both input vectors in place.
8898 // If both results are used, i.e., if there are two shuffles with the same
8899 // source operands and with masks corresponding to both results of one of
8900 // these operations, DAG memoization will ensure that a single node is
8901 // used for both shuffles.
8902 unsigned WhichResult = 0;
8903 bool isV_UNDEF = false;
8904 if (ST->hasNEON()) {
8905 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8906 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8907 if (isV_UNDEF)
8908 V2 = V1;
8909 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8910 .getValue(WhichResult);
8911 }
8912 }
8913 if (ST->hasMVEIntegerOps()) {
8914 if (isVMOVNMask(ShuffleMask, VT, false, false))
8915 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8916 DAG.getConstant(0, dl, MVT::i32));
8917 if (isVMOVNMask(ShuffleMask, VT, true, false))
8918 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8919 DAG.getConstant(1, dl, MVT::i32));
8920 if (isVMOVNMask(ShuffleMask, VT, true, true))
8921 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8922 DAG.getConstant(1, dl, MVT::i32));
8923 }
8924
8925 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8926 // shuffles that produce a result larger than their operands with:
8927 // shuffle(concat(v1, undef), concat(v2, undef))
8928 // ->
8929 // shuffle(concat(v1, v2), undef)
8930 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8931 //
8932 // This is useful in the general case, but there are special cases where
8933 // native shuffles produce larger results: the two-result ops.
8934 //
8935 // Look through the concat when lowering them:
8936 // shuffle(concat(v1, v2), undef)
8937 // ->
8938 // concat(VZIP(v1, v2):0, :1)
8939 //
8940 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8941 SDValue SubV1 = V1->getOperand(0);
8942 SDValue SubV2 = V1->getOperand(1);
8943 EVT SubVT = SubV1.getValueType();
8944
8945 // We expect these to have been canonicalized to -1.
8946 assert(llvm::all_of(ShuffleMask, [&](int i) {
8947 return i < (int)VT.getVectorNumElements();
8948 }) && "Unexpected shuffle index into UNDEF operand!");
8949
8950 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8951 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8952 if (isV_UNDEF)
8953 SubV2 = SubV1;
8954 assert((WhichResult == 0) &&
8955 "In-place shuffle of concat can only have one result!");
8956 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8957 SubV1, SubV2);
8958 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8959 Res.getValue(1));
8960 }
8961 }
8962 }
8963
8964 if (ST->hasMVEIntegerOps() && EltSize <= 32) {
8965 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8966 return V;
8967
8968 for (bool Top : {false, true}) {
8969 for (bool SingleSource : {false, true}) {
8970 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8971 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8972 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8973 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8974 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8975 SingleSource ? V1 : V2);
8976 if (Top) {
8977 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8978 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8979 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8980 }
8981 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8982 }
8983 }
8984 }
8985 }
8986
8987 // If the shuffle is not directly supported and it has 4 elements, use
8988 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8989 unsigned NumElts = VT.getVectorNumElements();
8990 if (NumElts == 4) {
8991 unsigned PFIndexes[4];
8992 for (unsigned i = 0; i != 4; ++i) {
8993 if (ShuffleMask[i] < 0)
8994 PFIndexes[i] = 8;
8995 else
8996 PFIndexes[i] = ShuffleMask[i];
8997 }
8998
8999 // Compute the index in the perfect shuffle table.
9000 unsigned PFTableIndex =
9001 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
9002 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
9003 unsigned Cost = (PFEntry >> 30);
9004
9005 if (Cost <= 4) {
9006 if (ST->hasNEON())
9007 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
9008 else if (isLegalMVEShuffleOp(PFEntry)) {
9009 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
9010 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
9011 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
9012 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
9013 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
9014 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
9015 }
9016 }
9017 }
9018
9019 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
9020 if (EltSize >= 32) {
9021 // Do the expansion with floating-point types, since that is what the VFP
9022 // registers are defined to use, and since i64 is not legal.
9023 EVT EltVT = EVT::getFloatingPointVT(EltSize);
9024 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
9025 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
9026 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
9027 SmallVector<SDValue, 8> Ops;
9028 for (unsigned i = 0; i < NumElts; ++i) {
9029 if (ShuffleMask[i] < 0)
9030 Ops.push_back(DAG.getUNDEF(EltVT));
9031 else
9032 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
9033 ShuffleMask[i] < (int)NumElts ? V1 : V2,
9034 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
9035 dl, MVT::i32)));
9036 }
9037 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
9038 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
9039 }
9040
9041 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
9042 isReverseMask(ShuffleMask, VT))
9043 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
9044
9045 if (ST->hasNEON() && VT == MVT::v8i8)
9046 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
9047 return NewOp;
9048
9049 if (ST->hasMVEIntegerOps())
9050 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
9051 return NewOp;
9052
9053 return SDValue();
9054}
9055
9056 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
9057 const ARMSubtarget *ST) {
9058 EVT VecVT = Op.getOperand(0).getValueType();
9059 SDLoc dl(Op);
9060
9061 assert(ST->hasMVEIntegerOps() &&
9062 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
9063
9064 SDValue Conv =
9065 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
9066 unsigned Lane = Op.getConstantOperandVal(2);
9067 unsigned LaneWidth =
9068 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
9069 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
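// For example, inserting into lane 2 of a v4i1 (LaneWidth 4) targets bits
// 8..11 of the i32 predicate value; the BFI below replaces exactly those bits
// with the sign-extended i1 element.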
9070 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
9071 Op.getOperand(1), DAG.getValueType(MVT::i1));
9072 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
9073 DAG.getConstant(~Mask, dl, MVT::i32));
9074 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
9075}
9076
9077SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
9078 SelectionDAG &DAG) const {
9079 // INSERT_VECTOR_ELT is legal only for immediate indexes.
9080 SDValue Lane = Op.getOperand(2);
9081 if (!isa<ConstantSDNode>(Lane))
9082 return SDValue();
9083
9084 SDValue Elt = Op.getOperand(1);
9085 EVT EltVT = Elt.getValueType();
9086
9087 if (Subtarget->hasMVEIntegerOps() &&
9088 Op.getValueType().getScalarSizeInBits() == 1)
9089 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
9090
9091 if (getTypeAction(*DAG.getContext(), EltVT) ==
9092 TargetLowering::TypePromoteFloat) {
9093 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
9094 // but the type system will try to do that if we don't intervene.
9095 // Reinterpret any such vector-element insertion as one with the
9096 // corresponding integer types.
9097
9098 SDLoc dl(Op);
9099
9100 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
9101 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
9102 TargetLowering::TypePromoteFloat);
9103
9104 SDValue VecIn = Op.getOperand(0);
9105 EVT VecVT = VecIn.getValueType();
9106 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
9107 VecVT.getVectorNumElements());
9108
9109 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
9110 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
9111 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
9112 IVecIn, IElt, Lane);
9113 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
9114 }
9115
9116 return Op;
9117}
9118
9119 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
9120 const ARMSubtarget *ST) {
9121 EVT VecVT = Op.getOperand(0).getValueType();
9122 SDLoc dl(Op);
9123
9124 assert(ST->hasMVEIntegerOps() &&
9125 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
9126
9127 SDValue Conv =
9128 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
9129 unsigned Lane = Op.getConstantOperandVal(1);
9130 unsigned LaneWidth =
9131 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
9132 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
9133 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
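// The extracted predicate now sits in the low bits of the i32; e.g. lane 2 of
// a v8i1 (LaneWidth 2) is shifted right by 4 so that bit 0 holds the result.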
9134 return Shift;
9135}
9136
9137 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
9138 const ARMSubtarget *ST) {
9139 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
9140 SDValue Lane = Op.getOperand(1);
9141 if (!isa<ConstantSDNode>(Lane))
9142 return SDValue();
9143
9144 SDValue Vec = Op.getOperand(0);
9145 EVT VT = Vec.getValueType();
9146
9147 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9148 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
9149
9150 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
9151 SDLoc dl(Op);
9152 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
9153 }
9154
9155 return Op;
9156}
9157
9158 static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
9159 const ARMSubtarget *ST) {
9160 SDLoc dl(Op);
9161 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
9162 "Unexpected custom CONCAT_VECTORS lowering");
9164 "Unexpected custom CONCAT_VECTORS lowering");
9165 assert(ST->hasMVEIntegerOps() &&
9166 "CONCAT_VECTORS lowering only supported for MVE");
9167
9168 auto ConcatPair = [&](SDValue V1, SDValue V2) {
9169 EVT Op1VT = V1.getValueType();
9170 EVT Op2VT = V2.getValueType();
9171 assert(Op1VT == Op2VT && "Operand types don't match!");
9172 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
9173 "Unexpected i1 concat operations!");
9174 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
9175
9176 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9177 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
9178
9179 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
9180 // promoted to v8i16, etc.
9181 MVT ElType =
9182 getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
9183 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
9184
9185 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
9186 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
9187 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
9188 // ConcatVT.
9189 SDValue ConVec =
9190 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
9191 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9192 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9193 }
9194
9195 // Extract the vector elements from Op1 and Op2 one by one and truncate them
9196 // to be the right size for the destination. For example, if Op1 is v4i1
9197 // then the promoted vector is v4i32. The result of concatenation gives a
9198 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
9199 // needs truncating to i16 and inserting in the result.
9200 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
9201 EVT NewVT = NewV.getValueType();
9202 EVT ConcatVT = ConVec.getValueType();
9203 unsigned ExtScale = 1;
9204 if (NewVT == MVT::v2f64) {
9205 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
9206 ExtScale = 2;
9207 }
9208 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
9209 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
9210 DAG.getIntPtrConstant(i * ExtScale, dl));
9211 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
9212 DAG.getConstant(j, dl, MVT::i32));
9213 }
9214 return ConVec;
9215 };
9216 unsigned j = 0;
9217 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
9218 ConVec = ExtractInto(NewV1, ConVec, j);
9219 ConVec = ExtractInto(NewV2, ConVec, j);
9220
9221 // Now return the result of comparing the subvector with zero, which will
9222 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9223 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9224 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9225 };
9226
9227 // Concat each pair of subvectors and pack into the lower half of the array.
9228 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
9229 while (ConcatOps.size() > 1) {
9230 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
9231 SDValue V1 = ConcatOps[I];
9232 SDValue V2 = ConcatOps[I + 1];
9233 ConcatOps[I / 2] = ConcatPair(V1, V2);
9234 }
9235 ConcatOps.resize(ConcatOps.size() / 2);
9236 }
9237 return ConcatOps[0];
9238}
9239
9240 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9241 const ARMSubtarget *ST) {
9242 EVT VT = Op->getValueType(0);
9243 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9244 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
9245
9246 // The only time a CONCAT_VECTORS operation can have legal types is when
9247 // two 64-bit vectors are concatenated to a 128-bit vector.
9248 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
9249 "unexpected CONCAT_VECTORS");
9250 SDLoc dl(Op);
9251 SDValue Val = DAG.getUNDEF(MVT::v2f64);
9252 SDValue Op0 = Op.getOperand(0);
9253 SDValue Op1 = Op.getOperand(1);
9254 if (!Op0.isUndef())
9255 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9256 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
9257 DAG.getIntPtrConstant(0, dl));
9258 if (!Op1.isUndef())
9259 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9260 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
9261 DAG.getIntPtrConstant(1, dl));
9262 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
9263}
9264
9265 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
9266 const ARMSubtarget *ST) {
9267 SDValue V1 = Op.getOperand(0);
9268 SDValue V2 = Op.getOperand(1);
9269 SDLoc dl(Op);
9270 EVT VT = Op.getValueType();
9271 EVT Op1VT = V1.getValueType();
9272 unsigned NumElts = VT.getVectorNumElements();
9273 unsigned Index = V2->getAsZExtVal();
9274
9275 assert(VT.getScalarSizeInBits() == 1 &&
9276 "Unexpected custom EXTRACT_SUBVECTOR lowering");
9277 assert(ST->hasMVEIntegerOps() &&
9278 "EXTRACT_SUBVECTOR lowering only supported for MVE");
9279
9280 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9281
9282 // We now have Op1 promoted to a vector of integers, where v8i1 gets
9283 // promoted to v8i16, etc.
9284
9285 MVT ElType =
9286 getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
9287 if (NumElts == 2) {
9288 EVT SubVT = MVT::v4i32;
9289 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9290 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
9291 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9292 DAG.getIntPtrConstant(i, dl));
9293 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9294 DAG.getConstant(j, dl, MVT::i32));
9295 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9296 DAG.getConstant(j + 1, dl, MVT::i32));
9297 }
9298 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
9299 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9300 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
9301 }
9302
9303 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
9304 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9305 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
9306 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9307 DAG.getIntPtrConstant(i, dl));
9308 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9309 DAG.getConstant(j, dl, MVT::i32));
9310 }
9311
9312 // Now return the result of comparing the subvector with zero,
9313 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9314 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
9315 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9316}
9317
9318 // Turn a truncate to a predicate (an i1 vector) into setcc(and(x, 1), 0, ne).
9319 static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
9320 const ARMSubtarget *ST) {
9321 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
9322 EVT VT = N->getValueType(0);
9323 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9324 "Expected a vector i1 type!");
9325 SDValue Op = N->getOperand(0);
9326 EVT FromVT = Op.getValueType();
9327 SDLoc DL(N);
9328
9329 SDValue And =
9330 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9331 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9332 DAG.getCondCode(ISD::SETNE));
9333}
9334
9335 static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
9336 const ARMSubtarget *Subtarget) {
9337 if (!Subtarget->hasMVEIntegerOps())
9338 return SDValue();
9339
9340 EVT ToVT = N->getValueType(0);
9341 if (ToVT.getScalarType() == MVT::i1)
9342 return LowerTruncatei1(N, DAG, Subtarget);
9343
9344 // MVE does not have a single instruction to perform the truncation of a v4i32
9345 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9346 // Most of the instructions in MVE follow the 'Beats' system, where moving
9347 // values from different lanes is usually something that the instructions
9348 // avoid.
9349 //
9350 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9351 // which take the top/bottom half of a larger lane and extend it (or do the
9352 // opposite, truncating into the top/bottom lane from a larger lane). Note
9353 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9354 // bottom 16bits from each vector lane. This works really well with T/B
9355 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9356 // to be reordered.
9357 //
9358 // But truncates and sext/zext are always going to be fairly common from llvm.
9359 // We have several options for how to deal with them:
9360 // - Wherever possible combine them into an instruction that makes them
9361 // "free". This includes loads/stores, which can perform the trunc as part
9362 // of the memory operation. Or certain shuffles that can be turned into
9363 // VMOVN/VMOVL.
9364 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9365 // trunc(mul(sext(a), sext(b))) may become
9366 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9367 // this case can use VMULL). This is performed in the
9368 // MVELaneInterleavingPass.
9369 // - Otherwise we have an option. By default we would expand the
9370 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9371 // registers. One for each vector lane in the vector. This can obviously be
9372 // very expensive.
9373 // - The other option is to use the fact that loads/store can extend/truncate
9374 // to turn a trunc into two truncating stack stores and a stack reload. This
9375 // becomes 3 back-to-back memory operations, but at least that is less than
9376 // all the insert/extracts.
9377 //
9378 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9379 // are either optimized where they can be, or eventually lowered into stack
9380 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9381 // too early, where other instructions would be better, and stops us from
9382 // having to reconstruct multiple buildvector shuffles into loads/stores.
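// For example, a trunc from v8i32 to v8i16 is split into its two v4i32 halves
// below and emitted as a single MVETRUNC(lo, hi), to be combined away or
// expanded via the stack later.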
9383 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9384 return SDValue();
9385 EVT FromVT = N->getOperand(0).getValueType();
9386 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9387 return SDValue();
9388
9389 SDValue Lo, Hi;
9390 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9391 SDLoc DL(N);
9392 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9393}
9394
9395 static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,
9396 const ARMSubtarget *Subtarget) {
9397 if (!Subtarget->hasMVEIntegerOps())
9398 return SDValue();
9399
9400 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9401
9402 EVT ToVT = N->getValueType(0);
9403 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9404 return SDValue();
9405 SDValue Op = N->getOperand(0);
9406 EVT FromVT = Op.getValueType();
9407 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9408 return SDValue();
9409
9410 SDLoc DL(N);
9411 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
9412 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9413 ExtVT = MVT::v8i16;
9414
9415 unsigned Opcode =
9416 N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
9417 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9418 SDValue Ext1 = Ext.getValue(1);
9419
9420 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9421 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9422 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9423 }
9424
9425 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9426}
9427
9428/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9429/// element has been zero/sign-extended, depending on the isSigned parameter,
9430/// from an integer type half its size.
9431 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
9432 bool isSigned) {
9433 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9434 EVT VT = N->getValueType(0);
9435 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9436 SDNode *BVN = N->getOperand(0).getNode();
9437 if (BVN->getValueType(0) != MVT::v4i32 ||
9438 BVN->getOpcode() != ISD::BUILD_VECTOR)
9439 return false;
9440 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9441 unsigned HiElt = 1 - LoElt;
9442 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
9443 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
9444 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
9445 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
9446 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9447 return false;
9448 if (isSigned) {
9449 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9450 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9451 return true;
9452 } else {
9453 if (Hi0->isZero() && Hi1->isZero())
9454 return true;
9455 }
9456 return false;
9457 }
9458
9459 if (N->getOpcode() != ISD::BUILD_VECTOR)
9460 return false;
9461
9462 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9463 SDNode *Elt = N->getOperand(i).getNode();
9464 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
9465 unsigned EltSize = VT.getScalarSizeInBits();
9466 unsigned HalfSize = EltSize / 2;
9467 if (isSigned) {
9468 if (!isIntN(HalfSize, C->getSExtValue()))
9469 return false;
9470 } else {
9471 if (!isUIntN(HalfSize, C->getZExtValue()))
9472 return false;
9473 }
9474 continue;
9475 }
9476 return false;
9477 }
9478
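// At this point every element fits in half the element width; e.g. a v4i16
// constant BUILD_VECTOR of <0x12, 0x34, 0x56, 0x78> qualifies because each
// value fits in 8 bits.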
9479 return true;
9480}
9481
9482/// isSignExtended - Check if a node is a vector value that is sign-extended
9483/// or a constant BUILD_VECTOR with sign-extended elements.
9484 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
9485 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9486 return true;
9487 if (isExtendedBUILD_VECTOR(N, DAG, true))
9488 return true;
9489 return false;
9490}
9491
9492/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9493/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
9494 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
9495 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
9496 ISD::isZEXTLoad(N))
9497 return true;
9498 if (isExtendedBUILD_VECTOR(N, DAG, false))
9499 return true;
9500 return false;
9501}
9502
9503static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9504 if (OrigVT.getSizeInBits() >= 64)
9505 return OrigVT;
9506
9507 assert(OrigVT.isSimple() && "Expecting a simple value type");
9508
9509 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9510 switch (OrigSimpleTy) {
9511 default: llvm_unreachable("Unexpected Vector Type");
9512 case MVT::v2i8:
9513 case MVT::v2i16:
9514 return MVT::v2i32;
9515 case MVT::v4i8:
9516 return MVT::v4i16;
9517 }
9518}
9519
9520/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9521/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9522/// We insert the required extension here to get the vector to fill a D register.
9523 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
9524 const EVT &OrigTy,
9525 const EVT &ExtTy,
9526 unsigned ExtOpcode) {
9527 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9528 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9529 // 64-bits we need to insert a new extension so that it will be 64-bits.
9530 assert(ExtTy.is128BitVector() && "Unexpected extension size");
9531 if (OrigTy.getSizeInBits() >= 64)
9532 return N;
9533
9534 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9535 EVT NewVT = getExtensionTo64Bits(OrigTy);
9536
9537 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9538}
9539
9540/// SkipLoadExtensionForVMULL - return a load of the original vector size that
9541/// does not do any sign/zero extension. If the original vector is less
9542/// than 64 bits, an appropriate extension will be added after the load to
9543/// reach a total size of 64 bits. We have to add the extension separately
9544/// because ARM does not have a sign/zero extending load for vectors.
9545 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG) {
9546 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9547
9548 // The load already has the right type.
9549 if (ExtendedTy == LD->getMemoryVT())
9550 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9551 LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
9552 LD->getMemOperand()->getFlags());
9553
9554 // We need to create a zextload/sextload. We cannot just create a load
9555 // followed by a sext/zext node because LowerMUL is also run during normal
9556 // operation legalization where we can't create illegal types.
9557 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9558 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9559 LD->getMemoryVT(), LD->getAlign(),
9560 LD->getMemOperand()->getFlags());
9561}
9562
9563/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9564/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9565/// the unextended value. The unextended vector should be 64 bits so that it can
9566/// be used as an operand to a VMULL instruction. If the original vector size
9567 /// before extension is less than 64 bits we add an extension to resize
9568/// the vector to 64 bits.
9569 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
9570 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9571 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9572 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9573 N->getOperand(0)->getValueType(0),
9574 N->getValueType(0),
9575 N->getOpcode());
9576
9577 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9578 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9579 "Expected extending load");
9580
9581 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9582 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9583 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9584 SDValue extLoad =
9585 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
9586 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9587
9588 return newLoad;
9589 }
9590
9591 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9592 // have been legalized as a BITCAST from v4i32.
9593 if (N->getOpcode() == ISD::BITCAST) {
9594 SDNode *BVN = N->getOperand(0).getNode();
9595 assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
9596 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
9597 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9598 return DAG.getBuildVector(
9599 MVT::v2i32, SDLoc(N),
9600 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9601 }
9602 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9603 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9604 EVT VT = N->getValueType(0);
9605 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9606 unsigned NumElts = VT.getVectorNumElements();
9607 MVT TruncVT = MVT::getIntegerVT(EltSize);
9608 SmallVector<SDValue, 8> Ops;
9609 SDLoc dl(N);
9610 for (unsigned i = 0; i != NumElts; ++i) {
9611 const APInt &CInt = N->getConstantOperandAPInt(i);
9612 // Element types smaller than 32 bits are not legal, so use i32 elements.
9613 // The values are implicitly truncated so sext vs. zext doesn't matter.
9614 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9615 }
9616 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
9617}
9618
9619static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9620 unsigned Opcode = N->getOpcode();
9621 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9622 SDNode *N0 = N->getOperand(0).getNode();
9623 SDNode *N1 = N->getOperand(1).getNode();
9624 return N0->hasOneUse() && N1->hasOneUse() &&
9625 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9626 }
9627 return false;
9628}
9629
9630static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9631 unsigned Opcode = N->getOpcode();
9632 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9633 SDNode *N0 = N->getOperand(0).getNode();
9634 SDNode *N1 = N->getOperand(1).getNode();
9635 return N0->hasOneUse() && N1->hasOneUse() &&
9636 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9637 }
9638 return false;
9639}
9640
9641 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
9642 // Multiplications are only custom-lowered for 128-bit vectors so that
9643 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
9644 EVT VT = Op.getValueType();
9645 assert(VT.is128BitVector() && VT.isInteger() &&
9646 "unexpected type for custom-lowering ISD::MUL");
9647 SDNode *N0 = Op.getOperand(0).getNode();
9648 SDNode *N1 = Op.getOperand(1).getNode();
9649 unsigned NewOpc = 0;
9650 bool isMLA = false;
9651 bool isN0SExt = isSignExtended(N0, DAG);
9652 bool isN1SExt = isSignExtended(N1, DAG);
9653 if (isN0SExt && isN1SExt)
9654 NewOpc = ARMISD::VMULLs;
9655 else {
9656 bool isN0ZExt = isZeroExtended(N0, DAG);
9657 bool isN1ZExt = isZeroExtended(N1, DAG);
9658 if (isN0ZExt && isN1ZExt)
9659 NewOpc = ARMISD::VMULLu;
9660 else if (isN1SExt || isN1ZExt) {
9661 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9662 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
9663 if (isN1SExt && isAddSubSExt(N0, DAG)) {
9664 NewOpc = ARMISD::VMULLs;
9665 isMLA = true;
9666 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
9667 NewOpc = ARMISD::VMULLu;
9668 isMLA = true;
9669 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
9670 std::swap(N0, N1);
9671 NewOpc = ARMISD::VMULLu;
9672 isMLA = true;
9673 }
9674 }
9675
9676 if (!NewOpc) {
9677 if (VT == MVT::v2i64)
9678 // Fall through to expand this. It is not legal.
9679 return SDValue();
9680 else
9681 // Other vector multiplications are legal.
9682 return Op;
9683 }
9684 }
9685
9686 // Legalize to a VMULL instruction.
9687 SDLoc DL(Op);
9688 SDValue Op0;
9689 SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
9690 if (!isMLA) {
9691 Op0 = SkipExtensionForVMULL(N0, DAG);
9692 assert(Op0.getValueType().is64BitVector() &&
9693 Op1.getValueType().is64BitVector() &&
9694 "unexpected types for extended operands to VMULL");
9695 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
9696 }
9697
9698 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
9699 // isel lowering to take advantage of no-stall back to back vmul + vmla.
9700 // vmull q0, d4, d6
9701 // vmlal q0, d5, d6
9702 // is faster than
9703 // vaddl q0, d4, d5
9704 // vmovl q1, d6
9705 // vmul q0, q0, q1
9706 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
9707 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
9708 EVT Op1VT = Op1.getValueType();
9709 return DAG.getNode(N0->getOpcode(), DL, VT,
9710 DAG.getNode(NewOpc, DL, VT,
9711 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
9712 DAG.getNode(NewOpc, DL, VT,
9713 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
9714}
9715
9716 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
9717 SelectionDAG &DAG) {
9718 // TODO: Should this propagate fast-math-flags?
9719
9720 // Convert to float
9721 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9722 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9723 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9724 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9725 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9726 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9727 // Get reciprocal estimate.
9728 // float4 recip = vrecpeq_f32(yf);
9729 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9730 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9731 Y);
9732 // Because char has a smaller range than uchar, we can actually get away
9733 // without any newton steps. This requires that we use a weird bias
9734 // of 0xb000, however (again, this has been exhaustively tested).
9735 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
9736 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
9737 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9738 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9739 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9740 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9741 // Convert back to short.
9742 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9743 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9744 return X;
9745}
9746
9747 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
9748 SelectionDAG &DAG) {
9749 // TODO: Should this propagate fast-math-flags?
9750
9751 SDValue N2;
9752 // Convert to float.
9753 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9754 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9755 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9756 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9757 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9758 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9759
9760 // Use reciprocal estimate and one refinement step.
9761 // float4 recip = vrecpeq_f32(yf);
9762 // recip *= vrecpsq_f32(yf, recip);
9763 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9764 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9765 N1);
9766 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9767 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9768 N1, N2);
9769 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9770 // Because short has a smaller range than ushort, we can actually get away
9771 // with only a single newton step. This requires that we use a weird bias
9772 // of 0x89, however (again, this has been exhaustively tested).
9773 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9774 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9775 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9776 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9777 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9778 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9779 // Convert back to integer and return.
9780 // return vmovn_s32(vcvt_s32_f32(result));
9781 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9782 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9783 return N0;
9784}
9785
9786 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
9787 const ARMSubtarget *ST) {
9788 EVT VT = Op.getValueType();
9789 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9790 "unexpected type for custom-lowering ISD::SDIV");
9791
9792 SDLoc dl(Op);
9793 SDValue N0 = Op.getOperand(0);
9794 SDValue N1 = Op.getOperand(1);
9795 SDValue N2, N3;
9796
9797 if (VT == MVT::v8i8) {
9798 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9799 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9800
9801 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9802 DAG.getIntPtrConstant(4, dl));
9803 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9804 DAG.getIntPtrConstant(4, dl));
9805 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9806 DAG.getIntPtrConstant(0, dl));
9807 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9808 DAG.getIntPtrConstant(0, dl));
9809
9810 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9811 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9812
9813 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9814 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9815
9816 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9817 return N0;
9818 }
9819 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9820}
9821
9822 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
9823 const ARMSubtarget *ST) {
9824 // TODO: Should this propagate fast-math-flags?
9825 EVT VT = Op.getValueType();
9826 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9827 "unexpected type for custom-lowering ISD::UDIV");
9828
9829 SDLoc dl(Op);
9830 SDValue N0 = Op.getOperand(0);
9831 SDValue N1 = Op.getOperand(1);
9832 SDValue N2, N3;
9833
9834 if (VT == MVT::v8i8) {
9835 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9836 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9837
9838 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9839 DAG.getIntPtrConstant(4, dl));
9840 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9841 DAG.getIntPtrConstant(4, dl));
9842 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9843 DAG.getIntPtrConstant(0, dl));
9844 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9845 DAG.getIntPtrConstant(0, dl));
9846
9847 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9848 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9849
9850 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9851 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9852
9853 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9854 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9855 MVT::i32),
9856 N0);
9857 return N0;
9858 }
9859
9860 // v4i16 sdiv ... Convert to float.
9861 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9862 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9863 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9864 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9865 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9866 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9867
9868 // Use reciprocal estimate and two refinement steps.
9869 // float4 recip = vrecpeq_f32(yf);
9870 // recip *= vrecpsq_f32(yf, recip);
9871 // recip *= vrecpsq_f32(yf, recip);
9872 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9873 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9874 BN1);
9875 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9876 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9877 BN1, N2);
9878 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9879 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9880 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9881 BN1, N2);
9882 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9883 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9884 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9885 // and that it will never cause us to return an answer too large).
9886 // float4 result = as_float4(as_int4(xf*recip) + 2);
9887 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9888 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9889 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9890 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9891 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9892 // Convert back to integer and return.
9893 // return vmovn_u32(vcvt_s32_f32(result));
9894 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9895 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9896 return N0;
9897}
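// The VRECPS-based refinement used above is a Newton-Raphson iteration for
// 1/y: each vrecpsq_f32 computes (2 - y * recip) and the following FMUL folds
// that back into the estimate. A scalar sketch of the refinement loop
// (illustrative only):
static float SketchRefineRecip(float y, float recip, int steps) {
  for (int i = 0; i < steps; ++i)
    recip = recip * (2.0f - y * recip); // one vrecps + fmul pair per step
  return recip;
}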
9898
9900 SDNode *N = Op.getNode();
9901 EVT VT = N->getValueType(0);
9902 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9903
9904 SDValue Carry = Op.getOperand(2);
9905
9906 SDLoc DL(Op);
9907
9908 SDValue Result;
9909 if (Op.getOpcode() == ISD::UADDO_CARRY) {
9910 // This converts the boolean value carry into the carry flag.
9911 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9912
9913 // Do the addition proper using the carry flag we wanted.
9914 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
9915 Op.getOperand(1), Carry);
9916
9917 // Now convert the carry flag into a boolean value.
9918 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9919 } else {
9920 // ARMISD::SUBE expects a carry, not the borrow that ISD::USUBO_CARRY
9921 // provides, so we have to invert the carry first.
9922 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9923 DAG.getConstant(1, DL, MVT::i32), Carry);
9924 // This converts the boolean value carry into the carry flag.
9925 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9926
9927 // Do the subtraction proper using the carry flag we wanted.
9928 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
9929 Op.getOperand(1), Carry);
9930
9931 // Now convert the carry flag into a boolean value.
9932 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9933 // But the carry returned by ARMISD::SUBE is not a borrow as expected
9934 // by ISD::USUBO_CARRY, so compute 1 - C.
9935 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9936 DAG.getConstant(1, DL, MVT::i32), Carry);
9937 }
9938
9939 // Return both values.
9940 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
9941}
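// ARM's flag convention for subtraction is "carry set means no borrow", which
// is why the USUBO_CARRY path above converts with 1 - C on the way in and on
// the way out. A scalar model of that conversion (illustrative only, assumes
// <cstdint>):
static uint32_t SketchUSubWithBorrow(uint32_t a, uint32_t b, uint32_t borrowIn,
                                     uint32_t &borrowOut) {
  uint32_t carryIn = 1u - borrowIn;                    // invert: borrow -> carry
  uint32_t result = a - b - (1u - carryIn);            // what ARMISD::SUBE computes
  uint32_t carryOut = ((uint64_t)a >= (uint64_t)b + borrowIn) ? 1u : 0u;
  borrowOut = 1u - carryOut;                           // invert: carry -> borrow
  return result;
}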
9942
9943SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
9944 assert(Subtarget->isTargetDarwin());
9945
9946 // For iOS, we want to call an alternative entry point: __sincos_stret;
9947 // the sin and cos results are returned via sret.
9948 SDLoc dl(Op);
9949 SDValue Arg = Op.getOperand(0);
9950 EVT ArgVT = Arg.getValueType();
9951 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
9952 auto PtrVT = getPointerTy(DAG.getDataLayout());
9953
9955 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9956
9957 // Pair of floats / doubles used to pass the result.
9958 Type *RetTy = StructType::get(ArgTy, ArgTy);
9959 auto &DL = DAG.getDataLayout();
9960
9962 bool ShouldUseSRet = Subtarget->isAPCS_ABI();
9963 SDValue SRet;
9964 if (ShouldUseSRet) {
9965 // Create stack object for sret.
9966 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
9967 const Align StackAlign = DL.getPrefTypeAlign(RetTy);
9968 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
9969 SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
9970
9971 ArgListEntry Entry;
9972 Entry.Node = SRet;
9973 Entry.Ty = PointerType::getUnqual(RetTy->getContext());
9974 Entry.IsSExt = false;
9975 Entry.IsZExt = false;
9976 Entry.IsSRet = true;
9977 Args.push_back(Entry);
9979 }
9980
9981 ArgListEntry Entry;
9982 Entry.Node = Arg;
9983 Entry.Ty = ArgTy;
9984 Entry.IsSExt = false;
9985 Entry.IsZExt = false;
9986 Args.push_back(Entry);
9987
9988 RTLIB::Libcall LC =
9989 (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
9990 const char *LibcallName = getLibcallName(LC);
9992 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
9993
9995 CLI.setDebugLoc(dl)
9996 .setChain(DAG.getEntryNode())
9997 .setCallee(CC, RetTy, Callee, std::move(Args))
9998 .setDiscardResult(ShouldUseSRet);
9999 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
10000
10001 if (!ShouldUseSRet)
10002 return CallResult.first;
10003
10004 SDValue LoadSin =
10005 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
10006
10007 // Address of cos field.
10008 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
10009 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
10010 SDValue LoadCos =
10011 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
10012
10013 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
10014 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
10015 LoadSin.getValue(0), LoadCos.getValue(0));
10016}
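// After the call, the sin and cos results are read back from the sret buffer
// at byte offsets 0 and ArgVT.getStoreSize(), i.e. the layout of a plain
// struct { T sin; T cos; }. A scalar sketch of that readback (illustrative
// only, assumes <cstring>):
template <typename T>
static void SketchReadSinCos(const char *SRetBuf, T &Sin, T &Cos) {
  std::memcpy(&Sin, SRetBuf, sizeof(T));               // load at offset 0
  std::memcpy(&Cos, SRetBuf + sizeof(T), sizeof(T));   // load at offset sizeof(T)
}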
10017
10018SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
10019 bool Signed,
10020 SDValue &Chain) const {
10021 EVT VT = Op.getValueType();
10022 assert((VT == MVT::i32 || VT == MVT::i64) &&
10023 "unexpected type for custom lowering DIV");
10024 SDLoc dl(Op);
10025
10026 const auto &DL = DAG.getDataLayout();
10027 const auto &TLI = DAG.getTargetLoweringInfo();
10028
10029 const char *Name = nullptr;
10030 if (Signed)
10031 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
10032 else
10033 Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
10034
10036
10038
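// The operands are pushed in {1, 0} order below so that the divisor becomes
// the first argument, which is the argument order the Windows __rt_* division
// helpers take.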
10039 for (auto AI : {1, 0}) {
10040 ArgListEntry Arg;
10041 Arg.Node = Op.getOperand(AI);
10042 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
10043 Args.push_back(Arg);
10044 }
10045
10046 CallLoweringInfo CLI(DAG);
10047 CLI.setDebugLoc(dl)
10048 .setChain(Chain)
10050 ES, std::move(Args));
10051
10052 return LowerCallTo(CLI).first;
10053}
10054
10055// This is a code size optimisation: return the original SDIV node to
10056// DAGCombiner when we don't want to expand SDIV into a sequence of
10057// instructions, and an empty node otherwise which will cause the
10058// SDIV to be expanded in DAGCombine.
10059SDValue
10060ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
10061 SelectionDAG &DAG,
10062 SmallVectorImpl<SDNode *> &Created) const {
10063 // TODO: Support SREM
10064 if (N->getOpcode() != ISD::SDIV)
10065 return SDValue();
10066
10067 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
10068 const bool MinSize = ST.hasMinSize();
10069 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
10070 : ST.hasDivideInARMMode();
10071
10072 // Don't touch vector types; rewriting this may lead to scalarizing
10073 // the int divs.
10074 if (N->getOperand(0).getValueType().isVector())
10075 return SDValue();
10076
10077 // Bail if MinSize is not set; in both ARM and Thumb mode we also need
10078 // hwdiv support for this to be really profitable.
10079 if (!(MinSize && HasDivide))
10080 return SDValue();
10081
10082 // ARM mode is a bit simpler than Thumb: we can handle large power
10083 // of 2 immediates with 1 mov instruction; no further checks required,
10084 // just return the sdiv node.
10085 if (!ST.isThumb())
10086 return SDValue(N, 0);
10087
10088 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
10089 // and thus lose the code size benefit of a MOVS that requires only 2 bytes.
10090 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
10091 // but since this check does exactly that, it's not worth the trouble to get TTI.
10092 if (Divisor.sgt(128))
10093 return SDValue();
10094
10095 return SDValue(N, 0);
10096}
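// For example (Thumb2, hwdiv, minsize): "sdiv i32 %x, 8" is kept as an SDIV
// node here, while "sdiv i32 %x, 65536" returns an empty SDValue so that
// DAGCombiner expands it into a shift-based sequence instead.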
10097
10098SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
10099 bool Signed) const {
10100 assert(Op.getValueType() == MVT::i32 &&
10101 "unexpected type for custom lowering DIV");
10102 SDLoc dl(Op);
10103
10104 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
10105 DAG.getEntryNode(), Op.getOperand(1));
10106
10107 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
10108}
10109
10111 SDLoc DL(N);
10112 SDValue Op = N->getOperand(1);
10113 if (N->getValueType(0) == MVT::i32)
10114 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
10115 SDValue Lo, Hi;
10116 std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
10117 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
10118 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
10119}
10120
10121void ARMTargetLowering::ExpandDIV_Windows(
10122 SDValue Op, SelectionDAG &DAG, bool Signed,
10124 const auto &DL = DAG.getDataLayout();
10125 const auto &TLI = DAG.getTargetLoweringInfo();
10126
10127 assert(Op.getValueType() == MVT::i64 &&
10128 "unexpected type for custom lowering DIV");
10129 SDLoc dl(Op);
10130
10131 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
10132
10133 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
10134
10135 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
10136 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
10137 DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
10138 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
10139
10140 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
10141}
10142
10144 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
10145 EVT MemVT = LD->getMemoryVT();
10146 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10147 MemVT == MVT::v16i1) &&
10148 "Expected a predicate type!");
10149 assert(MemVT == Op.getValueType());
10150 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
10151 "Expected a non-extending load");
10152 assert(LD->isUnindexed() && "Expected an unindexed load");
10153
10154 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16-bit
10155 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
10156 // need to make sure that the 8/4/2 bits are actually loaded into the correct
10157 // place, which means loading the value and then shuffling the values into
10158 // the bottom bits of the predicate.
10159 // Equally, a VLDR of a v16i1 actually loads 32 bits (and so would be
10160 // incorrect for BE).
10161 // For BE, the rest of LLVM apparently assumes the reverse of the order that a
10162 // natural VMSR(load) would produce, so the loaded bits need to be reversed.
10163
10164 SDLoc dl(Op);
10165 SDValue Load = DAG.getExtLoad(
10166 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
10168 LD->getMemOperand());
10169 SDValue Val = Load;
10170 if (DAG.getDataLayout().isBigEndian())
10171 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
10172 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
10173 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
10174 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
10175 if (MemVT != MVT::v16i1)
10176 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
10177 DAG.getConstant(0, dl, MVT::i32));
10178 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
10179}
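// A scalar model of the big-endian fixup above (illustrative only, assumes
// <cstdint>): the loaded word is bit-reversed and then shifted so that the
// MemVT.getSizeInBits() predicate bits land in the low bits.
static uint32_t SketchPredicateLoadBEFixup(uint32_t loaded, unsigned memBits) {
  uint32_t rev = 0;
  for (unsigned i = 0; i < 32; ++i)        // ISD::BITREVERSE
    rev |= ((loaded >> i) & 1u) << (31 - i);
  return rev >> (32 - memBits);            // ISD::SRL by (32 - MemVT bits)
}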
10180
10181void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
10182 SelectionDAG &DAG) const {
10183 LoadSDNode *LD = cast<LoadSDNode>(N);
10184 EVT MemVT = LD->getMemoryVT();
10185 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
10186
10187 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10188 !Subtarget->isThumb1Only() && LD->isVolatile() &&
10189 LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10190 SDLoc dl(N);
10192 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
10193 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
10194 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
10195 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
10196 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
10197 Results.append({Pair, Result.getValue(2)});
10198 }
10199}
10200
10202 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10203 EVT MemVT = ST->getMemoryVT();
10204 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10205 MemVT == MVT::v16i1) &&
10206 "Expected a predicate type!");
10207 assert(MemVT == ST->getValue().getValueType());
10208 assert(!ST->isTruncatingStore() && "Expected a non-extending store");
10209 assert(ST->isUnindexed() && "Expected an unindexed store");
10210
10211 // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with
10212 // top bits unset and a scalar store.
10213 SDLoc dl(Op);
10214 SDValue Build = ST->getValue();
10215 if (MemVT != MVT::v16i1) {
10217 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
10218 unsigned Elt = DAG.getDataLayout().isBigEndian()
10219 ? MemVT.getVectorNumElements() - I - 1
10220 : I;
10221 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
10222 DAG.getConstant(Elt, dl, MVT::i32)));
10223 }
10224 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
10225 Ops.push_back(DAG.getUNDEF(MVT::i32));
10226 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
10227 }
10228 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
10229 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
10230 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
10231 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
10232 DAG.getConstant(16, dl, MVT::i32));
10233 return DAG.getTruncStore(
10234 ST->getChain(), dl, GRP, ST->getBasePtr(),
10236 ST->getMemOperand());
10237}
10238
10240 const ARMSubtarget *Subtarget) {
10241 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10242 EVT MemVT = ST->getMemoryVT();
10243 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
10244
10245 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10246 !Subtarget->isThumb1Only() && ST->isVolatile() &&
10247 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10248 SDNode *N = Op.getNode();
10249 SDLoc dl(N);
10250
10251 SDValue Lo = DAG.getNode(
10252 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10253 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
10254 MVT::i32));
10255 SDValue Hi = DAG.getNode(
10256 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10257 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
10258 MVT::i32));
10259
10260 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
10261 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
10262 MemVT, ST->getMemOperand());
10263 } else if (Subtarget->hasMVEIntegerOps() &&
10264 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10265 MemVT == MVT::v16i1))) {
10266 return LowerPredicateStore(Op, DAG);
10267 }
10268
10269 return SDValue();
10270}
10271
10272static bool isZeroVector(SDValue N) {
10273 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
10274 (N->getOpcode() == ARMISD::VMOVIMM &&
10275 isNullConstant(N->getOperand(0))));
10276}
10277
10279 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
10280 MVT VT = Op.getSimpleValueType();
10281 SDValue Mask = N->getMask();
10282 SDValue PassThru = N->getPassThru();
10283 SDLoc dl(Op);
10284
10285 if (isZeroVector(PassThru))
10286 return Op;
10287
10288 // MVE Masked loads use zero as the passthru value. Here we convert undef to
10289 // zero too, and other values are lowered to a select.
10290 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
10291 DAG.getTargetConstant(0, dl, MVT::i32));
10292 SDValue NewLoad = DAG.getMaskedLoad(
10293 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
10294 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
10295 N->getExtensionType(), N->isExpandingLoad());
10296 SDValue Combo = NewLoad;
10297 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
10298 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
10299 isZeroVector(PassThru->getOperand(0));
10300 if (!PassThru.isUndef() && !PassThruIsCastZero)
10301 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
10302 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
10303}
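// Element-wise model of the passthru handling above (illustrative only): the
// emitted masked load always uses a zero passthru, and any passthru that is
// neither undef nor zero is merged back in afterwards with a VSELECT.
static int32_t SketchMaskedLoadLane(bool maskBit, int32_t loadedValue,
                                    int32_t passThru) {
  int32_t mveLane = maskBit ? loadedValue : 0; // MVE masked load, zero passthru
  return maskBit ? mveLane : passThru;         // the VSELECT merge
}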
10304
10306 const ARMSubtarget *ST) {
10307 if (!ST->hasMVEIntegerOps())
10308 return SDValue();
10309
10310 SDLoc dl(Op);
10311 unsigned BaseOpcode = 0;
10312 switch (Op->getOpcode()) {
10313 default: llvm_unreachable("Expected VECREDUCE opcode");
10314 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
10315 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
10316 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
10317 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
10318 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
10319 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
10320 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
10321 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
10322 }
10323
10324 SDValue Op0 = Op->getOperand(0);
10325 EVT VT = Op0.getValueType();
10326 EVT EltVT = VT.getVectorElementType();
10327 unsigned NumElts = VT.getVectorNumElements();
10328 unsigned NumActiveLanes = NumElts;
10329
10330 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10331 NumActiveLanes == 2) &&
10332 "Only expected a power-of-2 vector size");
10333
10334 // Combine Op(X, Rev(X)) until 4 items remain. Going down to 4 vector
10335 // elements allows us to easily extract vector elements from the lanes.
10336 while (NumActiveLanes > 4) {
10337 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
10338 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
10339 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
10340 NumActiveLanes /= 2;
10341 }
10342
10343 SDValue Res;
10344 if (NumActiveLanes == 4) {
10345 // The remaining 4 elements are combined sequentially.
10346 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10347 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
10348 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10349 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
10350 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10351 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
10352 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10353 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
10354 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10355 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
10356 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
10357 } else {
10358 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10359 DAG.getConstant(0, dl, MVT::i32));
10360 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10361 DAG.getConstant(1, dl, MVT::i32));
10362 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10363 }
10364
10365 // Result type may be wider than element type.
10366 if (EltVT != Op->getValueType(0))
10367 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
10368 return Res;
10369}
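// Scalar model of the reduction strategy above for a v8i16-style 8-lane
// vector (illustrative only): one VREV32 step combines each lane with its
// neighbour, leaving four partial results in lanes 0, 2, 4 and 6, which are
// then combined sequentially.
static int SketchReduceAdd8(const int v[8]) {
  int t[8];
  for (int i = 0; i < 8; ++i)
    t[i] = v[i] + v[i ^ 1];          // lane combined with its VREV32 neighbour
  return t[0] + t[2] + t[4] + t[6];  // extract at NumElts/4 strides and combine
}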
10370
10372 const ARMSubtarget *ST) {
10373 if (!ST->hasMVEFloatOps())
10374 return SDValue();
10375 return LowerVecReduce(Op, DAG, ST);
10376}
10377
10379 const ARMSubtarget *ST) {
10380 if (!ST->hasNEON())
10381 return SDValue();
10382
10383 SDLoc dl(Op);
10384 SDValue Op0 = Op->getOperand(0);
10385 EVT VT = Op0.getValueType();
10386 EVT EltVT = VT.getVectorElementType();
10387
10388 unsigned PairwiseIntrinsic = 0;
10389 switch (Op->getOpcode()) {
10390 default:
10391 llvm_unreachable("Expected VECREDUCE opcode");
10393 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10394 break;
10396 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10397 break;
10399 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10400 break;
10402 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10403 break;
10404 }
10405 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10406
10407 unsigned NumElts = VT.getVectorNumElements();
10408 unsigned NumActiveLanes = NumElts;
10409
10410 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10411 NumActiveLanes == 2) &&
10412 "Only expected a power-of-2 vector size");
10413
10414 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10415 if (VT.is128BitVector()) {
10416 SDValue Lo, Hi;
10417 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10418 VT = Lo.getValueType();
10419 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10420 NumActiveLanes /= 2;
10421 }
10422
10423 // Use pairwise reductions until one lane remains
10424 while (NumActiveLanes > 1) {
10425 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10426 NumActiveLanes /= 2;
10427 }
10428
10429 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10430 DAG.getConstant(0, dl, MVT::i32));
10431
10432 // Result type may be wider than element type.
10433 if (EltVT != Op.getValueType()) {
10434 unsigned Extend = 0;
10435 switch (Op->getOpcode()) {
10436 default:
10437 llvm_unreachable("Expected VECREDUCE opcode");
10440 Extend = ISD::ZERO_EXTEND;
10441 break;
10444 Extend = ISD::SIGN_EXTEND;
10445 break;
10446 }
10447 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10448 }
10449 return Res;
10450}
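// Scalar model of the NEON pairwise reduction above for a 4-lane vector
// (illustrative only, assumes <algorithm>): vpmin combines adjacent lanes, so
// feeding the same register in twice halves the number of useful lanes each
// step, and the final result is read from lane 0.
static unsigned SketchPairwiseUMin4(const unsigned v[4]) {
  unsigned p0 = std::min(v[0], v[1]);  // vpmin(v, v), lane 0
  unsigned p1 = std::min(v[2], v[3]);  // vpmin(v, v), lane 1
  return std::min(p0, p1);             // second vpmin, then extract lane 0
}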
10451
10453 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10454 // Acquire/Release load/store is not legal for targets without a dmb or
10455 // equivalent available.
10456 return SDValue();
10457
10458 // Monotonic load/store is legal for all targets.
10459 return Op;
10460}
10461
10464 SelectionDAG &DAG,
10465 const ARMSubtarget *Subtarget) {
10466 SDLoc DL(N);
10467 // Under Power Management extensions, the cycle-count is:
10468 // mrc p15, #0, <Rt>, c9, c13, #0
10469 SDValue Ops[] = { N->getOperand(0), // Chain
10470 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10471 DAG.getTargetConstant(15, DL, MVT::i32),
10472 DAG.getTargetConstant(0, DL, MVT::i32),
10473 DAG.getTargetConstant(9, DL, MVT::i32),
10474 DAG.getTargetConstant(13, DL, MVT::i32),
10475 DAG.getTargetConstant(0, DL, MVT::i32)
10476 };
10477
10478 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10479 DAG.getVTList(MVT::i32, MVT::Other), Ops);
10480 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10481 DAG.getConstant(0, DL, MVT::i32)));
10482 Results.push_back(Cycles32.getValue(1));
10483}
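// The same cycle-count read expressed as GNU inline assembly (a sketch of the
// MRC encoding built above; the PMU cycle counter still has to be enabled and
// made accessible from the current exception level for this to return useful
// values; assumes <cstdint>):
static inline uint32_t SketchReadCycleCounter() {
  uint32_t Cycles;
  asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(Cycles));
  return Cycles;
}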
10484
10486 SDLoc dl(V.getNode());
10487 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10488 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10489 if (isBigEndian)
10490 std::swap (VLo, VHi);
10491 SDValue RegClass =
10492 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10493 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10494 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10495 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
10496 return SDValue(
10497 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10498}
10499
10502 SelectionDAG &DAG) {
10503 assert(N->getValueType(0) == MVT::i64 &&
10504 "AtomicCmpSwap on types less than 64 should be legal");
10505 SDValue Ops[] = {N->getOperand(1),
10506 createGPRPairNode(DAG, N->getOperand(2)),
10507 createGPRPairNode(DAG, N->getOperand(3)),
10508 N->getOperand(0)};
10509 SDNode *CmpSwap = DAG.getMachineNode(
10510 ARM::CMP_SWAP_64, SDLoc(N),
10511 DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
10512
10513 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10514 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10515
10516 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10517
10518 SDValue Lo =
10519 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10520 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10521 SDValue Hi =
10522 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10523 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10524 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10525 Results.push_back(SDValue(CmpSwap, 2));
10526}
10527
10528SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10529 SDLoc dl(Op);
10530 EVT VT = Op.getValueType();
10531 SDValue Chain = Op.getOperand(0);
10532 SDValue LHS = Op.getOperand(1);
10533 SDValue RHS = Op.getOperand(2);
10534 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10535 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10536
10537 // If we don't have instructions of this float type then soften to a libcall
10538 // and use SETCC instead.
10539 if (isUnsupportedFloatingType(LHS.getValueType())) {
10541 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling);
10542 if (!RHS.getNode()) {
10543 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10544 CC = ISD::SETNE;
10545 }
10546 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10547 DAG.getCondCode(CC));
10548 return DAG.getMergeValues({Result, Chain}, dl);
10549 }
10550
10551 ARMCC::CondCodes CondCode, CondCode2;
10552 FPCCToARMCC(CC, CondCode, CondCode2);
10553
10554 // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit
10555 // in CMPFP and CMPFPE, but instead it should be made explicit by these
10556 // instructions using a chain instead of glue. This would also fix the problem
10557 // here (and also in LowerSELECT_CC) where we generate two comparisons when
10558 // CondCode2 != AL.
10559 SDValue True = DAG.getConstant(1, dl, VT);
10560 SDValue False = DAG.getConstant(0, dl, VT);
10561 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10562 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
10563 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10564 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG);
10565 if (CondCode2 != ARMCC::AL) {
10566 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10567 Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10568 Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG);
10569 }
10570 return DAG.getMergeValues({Result, Chain}, dl);
10571}
10572
10573SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10575
10576 EVT VT = getPointerTy(DAG.getDataLayout());
10577 SDLoc DL(Op);
10578 int FI = MFI.CreateFixedObject(4, 0, false);
10579 return DAG.getFrameIndex(FI, VT);
10580}
10581
10583 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10584 switch (Op.getOpcode()) {
10585 default: llvm_unreachable("Don't know how to custom lower this!");
10586 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10587 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10588 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10589 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10590 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10591 case ISD::SELECT: return LowerSELECT(Op, DAG);
10592 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10593 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10594 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10595 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10596 case ISD::VASTART: return LowerVASTART(Op, DAG);
10597 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10598 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10599 case ISD::SINT_TO_FP:
10600 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10603 case ISD::FP_TO_SINT:
10604 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10606 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10607 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10608 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10609 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10610 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10611 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10612 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10613 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10614 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10615 Subtarget);
10616 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10617 case ISD::SHL:
10618 case ISD::SRL:
10619 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10620 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10621 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10622 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10623 case ISD::SRL_PARTS:
10624 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10625 case ISD::CTTZ:
10626 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10627 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10628 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10629 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10630 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10631 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10632 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10633 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10634 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10635 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10636 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10637 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10638 case ISD::SIGN_EXTEND:
10639 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10640 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10641 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10642 case ISD::SET_FPMODE:
10643 return LowerSET_FPMODE(Op, DAG);
10644 case ISD::RESET_FPMODE:
10645 return LowerRESET_FPMODE(Op, DAG);
10646 case ISD::MUL: return LowerMUL(Op, DAG);
10647 case ISD::SDIV:
10648 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10649 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10650 return LowerSDIV(Op, DAG, Subtarget);
10651 case ISD::UDIV:
10652 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10653 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10654 return LowerUDIV(Op, DAG, Subtarget);
10655 case ISD::UADDO_CARRY:
10656 case ISD::USUBO_CARRY:
10657 return LowerUADDSUBO_CARRY(Op, DAG);
10658 case ISD::SADDO:
10659 case ISD::SSUBO:
10660 return LowerSignedALUO(Op, DAG);
10661 case ISD::UADDO:
10662 case ISD::USUBO:
10663 return LowerUnsignedALUO(Op, DAG);
10664 case ISD::SADDSAT:
10665 case ISD::SSUBSAT:
10666 case ISD::UADDSAT:
10667 case ISD::USUBSAT:
10668 return LowerADDSUBSAT(Op, DAG, Subtarget);
10669 case ISD::LOAD:
10670 return LowerPredicateLoad(Op, DAG);
10671 case ISD::STORE:
10672 return LowerSTORE(Op, DAG, Subtarget);
10673 case ISD::MLOAD:
10674 return LowerMLOAD(Op, DAG);
10675 case ISD::VECREDUCE_MUL:
10676 case ISD::VECREDUCE_AND:
10677 case ISD::VECREDUCE_OR:
10678 case ISD::VECREDUCE_XOR:
10679 return LowerVecReduce(Op, DAG, Subtarget);
10684 return LowerVecReduceF(Op, DAG, Subtarget);
10689 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10690 case ISD::ATOMIC_LOAD:
10691 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
10692 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
10693 case ISD::SDIVREM:
10694 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10696 if (Subtarget->isTargetWindows())
10697 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10698 llvm_unreachable("Don't know how to custom lower this!");
10700 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10702 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10703 case ISD::STRICT_FSETCC:
10704 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10705 case ISD::SPONENTRY:
10706 return LowerSPONENTRY(Op, DAG);
10707 case ARMISD::WIN__DBZCHK: return SDValue();
10708 }
10709}
10710
10712 SelectionDAG &DAG) {
10713 unsigned IntNo = N->getConstantOperandVal(0);
10714 unsigned Opc = 0;
10715 if (IntNo == Intrinsic::arm_smlald)
10716 Opc = ARMISD::SMLALD;
10717 else if (IntNo == Intrinsic::arm_smlaldx)
10718 Opc = ARMISD::SMLALDX;
10719 else if (IntNo == Intrinsic::arm_smlsld)
10720 Opc = ARMISD::SMLSLD;
10721 else if (IntNo == Intrinsic::arm_smlsldx)
10722 Opc = ARMISD::SMLSLDX;
10723 else
10724 return;
10725
10726 SDLoc dl(N);
10727 SDValue Lo, Hi;
10728 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10729
10730 SDValue LongMul = DAG.getNode(Opc, dl,
10731 DAG.getVTList(MVT::i32, MVT::i32),
10732 N->getOperand(1), N->getOperand(2),
10733 Lo, Hi);
10734 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10735 LongMul.getValue(0), LongMul.getValue(1)));
10736}
10737
10738/// ReplaceNodeResults - Replace the results of node with an illegal result
10739/// type with new values built out of custom code.
10742 SelectionDAG &DAG) const {
10743 SDValue Res;
10744 switch (N->getOpcode()) {
10745 default:
10746 llvm_unreachable("Don't know how to custom expand this!");
10747 case ISD::READ_REGISTER:
10749 break;
10750 case ISD::BITCAST:
10751 Res = ExpandBITCAST(N, DAG, Subtarget);
10752 break;
10753 case ISD::SRL:
10754 case ISD::SRA:
10755 case ISD::SHL:
10756 Res = Expand64BitShift(N, DAG, Subtarget);
10757 break;
10758 case ISD::SREM:
10759 case ISD::UREM:
10760 Res = LowerREM(N, DAG);
10761 break;
10762 case ISD::SDIVREM:
10763 case ISD::UDIVREM:
10764 Res = LowerDivRem(SDValue(N, 0), DAG);
10765 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10766 Results.push_back(Res.getValue(0));
10767 Results.push_back(Res.getValue(1));
10768 return;
10769 case ISD::SADDSAT:
10770 case ISD::SSUBSAT:
10771 case ISD::UADDSAT:
10772 case ISD::USUBSAT:
10773 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10774 break;
10776 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10777 return;
10778 case ISD::UDIV:
10779 case ISD::SDIV:
10780 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
10781 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10782 Results);
10785 return;
10787 return ReplaceLongIntrinsic(N, Results, DAG);
10788 case ISD::LOAD:
10789 LowerLOAD(N, Results, DAG);
10790 break;
10791 case ISD::TRUNCATE:
10792 Res = LowerTruncate(N, DAG, Subtarget);
10793 break;
10794 case ISD::SIGN_EXTEND:
10795 case ISD::ZERO_EXTEND:
10796 Res = LowerVectorExtend(N, DAG, Subtarget);
10797 break;
10800 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10801 break;
10802 }
10803 if (Res.getNode())
10804 Results.push_back(Res);
10805}
10806
10807//===----------------------------------------------------------------------===//
10808// ARM Scheduler Hooks
10809//===----------------------------------------------------------------------===//
10810
10811/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10812/// registers the function context.
10813void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10815 MachineBasicBlock *DispatchBB,
10816 int FI) const {
10817 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10818 "ROPI/RWPI not currently supported with SjLj");
10819 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10820 DebugLoc dl = MI.getDebugLoc();
10821 MachineFunction *MF = MBB->getParent();
10825 const Function &F = MF->getFunction();
10826
10827 bool isThumb = Subtarget->isThumb();
10828 bool isThumb2 = Subtarget->isThumb2();
10829
10830 unsigned PCLabelId = AFI->createPICLabelUId();
10831 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10833 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10834 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10835
10836 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10837 : &ARM::GPRRegClass;
10838
10839 // Grab constant pool and fixed stack memory operands.
10840 MachineMemOperand *CPMMO =
10843
10844 MachineMemOperand *FIMMOSt =
10847
10848 // Load the address of the dispatch MBB into the jump buffer.
10849 if (isThumb2) {
10850 // Incoming value: jbuf
10851 // ldr.n r5, LCPI1_1
10852 // orr r5, r5, #1
10853 // add r5, pc
10854 // str r5, [$jbuf, #+4] ; &jbuf[1]
10855 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10856 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10858 .addMemOperand(CPMMO)
10860 // Set the low bit because of thumb mode.
10861 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10862 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10863 .addReg(NewVReg1, RegState::Kill)
10864 .addImm(0x01)
10866 .add(condCodeOp());
10867 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10868 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10869 .addReg(NewVReg2, RegState::Kill)
10870 .addImm(PCLabelId);
10871 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10872 .addReg(NewVReg3, RegState::Kill)
10873 .addFrameIndex(FI)
10874 .addImm(36) // &jbuf[1] :: pc
10875 .addMemOperand(FIMMOSt)
10877 } else if (isThumb) {
10878 // Incoming value: jbuf
10879 // ldr.n r1, LCPI1_4
10880 // add r1, pc
10881 // mov r2, #1
10882 // orrs r1, r2
10883 // add r2, $jbuf, #+4 ; &jbuf[1]
10884 // str r1, [r2]
10885 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10886 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10888 .addMemOperand(CPMMO)
10890 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10891 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10892 .addReg(NewVReg1, RegState::Kill)
10893 .addImm(PCLabelId);
10894 // Set the low bit because of thumb mode.
10895 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10896 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10897 .addReg(ARM::CPSR, RegState::Define)
10898 .addImm(1)
10900 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10901 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10902 .addReg(ARM::CPSR, RegState::Define)
10903 .addReg(NewVReg2, RegState::Kill)
10904 .addReg(NewVReg3, RegState::Kill)
10906 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10907 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10908 .addFrameIndex(FI)
10909 .addImm(36); // &jbuf[1] :: pc
10910 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10911 .addReg(NewVReg4, RegState::Kill)
10912 .addReg(NewVReg5, RegState::Kill)
10913 .addImm(0)
10914 .addMemOperand(FIMMOSt)
10916 } else {
10917 // Incoming value: jbuf
10918 // ldr r1, LCPI1_1
10919 // add r1, pc, r1
10920 // str r1, [$jbuf, #+4] ; &jbuf[1]
10921 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10922 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10924 .addImm(0)
10925 .addMemOperand(CPMMO)
10927 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10928 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10929 .addReg(NewVReg1, RegState::Kill)
10930 .addImm(PCLabelId)
10932 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10933 .addReg(NewVReg2, RegState::Kill)
10934 .addFrameIndex(FI)
10935 .addImm(36) // &jbuf[1] :: pc
10936 .addMemOperand(FIMMOSt)
10938 }
10939}
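// In all three paths above (Thumb2, Thumb1, ARM) the sequence is the same:
// materialize the address of DispatchBB via a constant-pool entry and a
// pc-relative add, set the low bit when the dispatch code is Thumb, and store
// the result at offset 36 of the function context (the jbuf[1] / pc slot).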
10940
10941void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10942 MachineBasicBlock *MBB) const {
10943 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10944 DebugLoc dl = MI.getDebugLoc();
10945 MachineFunction *MF = MBB->getParent();
10947 MachineFrameInfo &MFI = MF->getFrameInfo();
10948 int FI = MFI.getFunctionContextIndex();
10949
10950 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10951 : &ARM::GPRnopcRegClass;
10952
10953 // Get a mapping of the call site numbers to all of the landing pads they're
10954 // associated with.
10956 unsigned MaxCSNum = 0;
10957 for (MachineBasicBlock &BB : *MF) {
10958 if (!BB.isEHPad())
10959 continue;
10960
10961 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10962 // pad.
10963 for (MachineInstr &II : BB) {
10964 if (!II.isEHLabel())
10965 continue;
10966
10967 MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10968 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10969
10970 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10971 for (unsigned Idx : CallSiteIdxs) {
10972 CallSiteNumToLPad[Idx].push_back(&BB);
10973 MaxCSNum = std::max(MaxCSNum, Idx);
10974 }
10975 break;
10976 }
10977 }
10978
10979 // Get an ordered list of the machine basic blocks for the jump table.
10980 std::vector<MachineBasicBlock*> LPadList;
10982 LPadList.reserve(CallSiteNumToLPad.size());
10983 for (unsigned I = 1; I <= MaxCSNum; ++I) {
10984 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10985 for (MachineBasicBlock *MBB : MBBList) {
10986 LPadList.push_back(MBB);
10987 InvokeBBs.insert(MBB->pred_begin(), MBB->pred_end());
10988 }
10989 }
10990
10991 assert(!LPadList.empty() &&
10992 "No landing pad destinations for the dispatch jump table!");
10993
10994 // Create the jump table and associated information.
10996 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
10997 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
10998
10999 // Create the MBBs for the dispatch code.
11000
11001 // Shove the dispatch's address into the return slot in the function context.
11002 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
11003 DispatchBB->setIsEHPad();
11004
11005 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11006 unsigned trap_opcode;
11007 if (Subtarget->isThumb())
11008 trap_opcode = ARM::tTRAP;
11009 else
11010 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
11011
11012 BuildMI(TrapBB, dl, TII->get(trap_opcode));
11013 DispatchBB->addSuccessor(TrapBB);
11014
11015 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
11016 DispatchBB->addSuccessor(DispContBB);
11017
11018 // Insert the new MBBs into the function.
11019 MF->insert(MF->end(), DispatchBB);
11020 MF->insert(MF->end(), DispContBB);
11021 MF->insert(MF->end(), TrapBB);
11022
11023 // Insert code into the entry block that creates and registers the function
11024 // context.
11025 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
11026
11027 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
11030
11032 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
11033
11034 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
11035 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
11036
11037 // Add a register mask with no preserved registers. This results in all
11038 // registers being marked as clobbered. This can't work if the dispatch block
11039 // is in a Thumb1 function and is linked with ARM code which uses the FP
11040 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
11042
11043 bool IsPositionIndependent = isPositionIndependent();
11044 unsigned NumLPads = LPadList.size();
11045 if (Subtarget->isThumb2()) {
11046 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11047 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
11048 .addFrameIndex(FI)
11049 .addImm(4)
11050 .addMemOperand(FIMMOLd)
11052
11053 if (NumLPads < 256) {
11054 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
11055 .addReg(NewVReg1)
11056 .addImm(LPadList.size())
11058 } else {
11059 Register VReg1 = MRI->createVirtualRegister(TRC);
11060 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
11061 .addImm(NumLPads & 0xFFFF)
11063
11064 unsigned VReg2 = VReg1;
11065 if ((NumLPads & 0xFFFF0000) != 0) {
11066 VReg2 = MRI->createVirtualRegister(TRC);
11067 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
11068 .addReg(VReg1)
11069 .addImm(NumLPads >> 16)
11071 }
11072
11073 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
11074 .addReg(NewVReg1)
11075 .addReg(VReg2)
11077 }
11078
11079 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
11080 .addMBB(TrapBB)
11082 .addReg(ARM::CPSR);
11083
11084 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11085 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
11086 .addJumpTableIndex(MJTI)
11088
11089 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11090 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
11091 .addReg(NewVReg3, RegState::Kill)
11092 .addReg(NewVReg1)
11095 .add(condCodeOp());
11096
11097 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
11098 .addReg(NewVReg4, RegState::Kill)
11099 .addReg(NewVReg1)
11100 .addJumpTableIndex(MJTI);
11101 } else if (Subtarget->isThumb()) {
11102 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11103 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
11104 .addFrameIndex(FI)
11105 .addImm(1)
11106 .addMemOperand(FIMMOLd)
11108
11109 if (NumLPads < 256) {
11110 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
11111 .addReg(NewVReg1)
11112 .addImm(NumLPads)
11114 } else {
11115 MachineConstantPool *ConstantPool = MF->getConstantPool();
11116 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11117 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11118
11119 // MachineConstantPool wants an explicit alignment.
11120 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11121 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11122
11123 Register VReg1 = MRI->createVirtualRegister(TRC);
11124 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
11125 .addReg(VReg1, RegState::Define)
11128 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
11129 .addReg(NewVReg1)
11130 .addReg(VReg1)
11132 }
11133
11134 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
11135 .addMBB(TrapBB)
11137 .addReg(ARM::CPSR);
11138
11139 Register NewVReg2 = MRI->createVirtualRegister(TRC);
11140 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
11141 .addReg(ARM::CPSR, RegState::Define)
11142 .addReg(NewVReg1)
11143 .addImm(2)
11145
11146 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11147 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
11148 .addJumpTableIndex(MJTI)
11150
11151 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11152 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
11153 .addReg(ARM::CPSR, RegState::Define)
11154 .addReg(NewVReg2, RegState::Kill)
11155 .addReg(NewVReg3)
11157
11158 MachineMemOperand *JTMMOLd =
11159 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11161
11162 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11163 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
11164 .addReg(NewVReg4, RegState::Kill)
11165 .addImm(0)
11166 .addMemOperand(JTMMOLd)
11168
11169 unsigned NewVReg6 = NewVReg5;
11170 if (IsPositionIndependent) {
11171 NewVReg6 = MRI->createVirtualRegister(TRC);
11172 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
11173 .addReg(ARM::CPSR, RegState::Define)
11174 .addReg(NewVReg5, RegState::Kill)
11175 .addReg(NewVReg3)
11177 }
11178
11179 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
11180 .addReg(NewVReg6, RegState::Kill)
11181 .addJumpTableIndex(MJTI);
11182 } else {
11183 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11184 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
11185 .addFrameIndex(FI)
11186 .addImm(4)
11187 .addMemOperand(FIMMOLd)
11189
11190 if (NumLPads < 256) {
11191 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
11192 .addReg(NewVReg1)
11193 .addImm(NumLPads)
11195 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
11196 Register VReg1 = MRI->createVirtualRegister(TRC);
11197 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
11198 .addImm(NumLPads & 0xFFFF)
11200
11201 unsigned VReg2 = VReg1;
11202 if ((NumLPads & 0xFFFF0000) != 0) {
11203 VReg2 = MRI->createVirtualRegister(TRC);
11204 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
11205 .addReg(VReg1)
11206 .addImm(NumLPads >> 16)
11208 }
11209
11210 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11211 .addReg(NewVReg1)
11212 .addReg(VReg2)
11214 } else {
11215 MachineConstantPool *ConstantPool = MF->getConstantPool();
11216 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11217 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11218
11219 // MachineConstantPool wants an explicit alignment.
11220 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11221 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11222
11223 Register VReg1 = MRI->createVirtualRegister(TRC);
11224 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
11225 .addReg(VReg1, RegState::Define)
11227 .addImm(0)
11229 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11230 .addReg(NewVReg1)
11231 .addReg(VReg1, RegState::Kill)
11233 }
11234
11235 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
11236 .addMBB(TrapBB)
11238 .addReg(ARM::CPSR);
11239
11240 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11241 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
11242 .addReg(NewVReg1)
11245 .add(condCodeOp());
11246 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11247 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
11248 .addJumpTableIndex(MJTI)
11250
11251 MachineMemOperand *JTMMOLd =
11252 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11254 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11255 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11256 .addReg(NewVReg3, RegState::Kill)
11257 .addReg(NewVReg4)
11258 .addImm(0)
11259 .addMemOperand(JTMMOLd)
11261
11262 if (IsPositionIndependent) {
11263 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11264 .addReg(NewVReg5, RegState::Kill)
11265 .addReg(NewVReg4)
11266 .addJumpTableIndex(MJTI);
11267 } else {
11268 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11269 .addReg(NewVReg5, RegState::Kill)
11270 .addJumpTableIndex(MJTI);
11271 }
11272 }
11273
11274 // Add the jump table entries as successors to the MBB.
11276 for (MachineBasicBlock *CurMBB : LPadList) {
11277 if (SeenMBBs.insert(CurMBB).second)
11278 DispContBB->addSuccessor(CurMBB);
11279 }
11280
11281 // N.B. the order the invoke BBs are processed in doesn't matter here.
11282 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11284 for (MachineBasicBlock *BB : InvokeBBs) {
11285
11286 // Remove the landing pad successor from the invoke block and replace it
11287 // with the new dispatch block.
11288 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11289 while (!Successors.empty()) {
11290 MachineBasicBlock *SMBB = Successors.pop_back_val();
11291 if (SMBB->isEHPad()) {
11292 BB->removeSuccessor(SMBB);
11293 MBBLPads.push_back(SMBB);
11294 }
11295 }
11296
11297 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11298 BB->normalizeSuccProbs();
11299
11300 // Find the invoke call and mark all of the callee-saved registers as
11301 // 'implicit defined' so that they're spilled. This prevents later passes
11302 // from moving instructions to before the EH block, where they would never
11303 // be executed.
11305 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11306 if (!II->isCall()) continue;
11307
11310 OI = II->operands_begin(), OE = II->operands_end();
11311 OI != OE; ++OI) {
11312 if (!OI->isReg()) continue;
11313 DefRegs[OI->getReg()] = true;
11314 }
11315
11316 MachineInstrBuilder MIB(*MF, &*II);
11317
11318 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11319 unsigned Reg = SavedRegs[i];
11320 if (Subtarget->isThumb2() &&
11321 !ARM::tGPRRegClass.contains(Reg) &&
11322 !ARM::hGPRRegClass.contains(Reg))
11323 continue;
11324 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11325 continue;
11326 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11327 continue;
11328 if (!DefRegs[Reg])
11330 }
11331
11332 break;
11333 }
11334 }
11335
11336 // Mark all former landing pads as non-landing pads. The dispatch is the only
11337 // landing pad now.
11338 for (MachineBasicBlock *MBBLPad : MBBLPads)
11339 MBBLPad->setIsEHPad(false);
11340
11341 // The instruction is gone now.
11342 MI.eraseFromParent();
11343}
11344
11345static
11347 for (MachineBasicBlock *S : MBB->successors())
11348 if (S != Succ)
11349 return S;
11350 llvm_unreachable("Expecting a BB with two successors!");
11351}
11352
11353/// Return the load opcode for a given load size. If the load size is >= 8, a
11354/// NEON opcode will be returned.
11355static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11356 if (LdSize >= 8)
11357 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11358 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11359 if (IsThumb1)
11360 return LdSize == 4 ? ARM::tLDRi
11361 : LdSize == 2 ? ARM::tLDRHi
11362 : LdSize == 1 ? ARM::tLDRBi : 0;
11363 if (IsThumb2)
11364 return LdSize == 4 ? ARM::t2LDR_POST
11365 : LdSize == 2 ? ARM::t2LDRH_POST
11366 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11367 return LdSize == 4 ? ARM::LDR_POST_IMM
11368 : LdSize == 2 ? ARM::LDRH_POST
11369 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11370}
11371
11372/// Return the store opcode for a given store size. If the store size is >= 8,
11373/// a NEON opcode will be returned.
11374static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11375 if (StSize >= 8)
11376 return StSize == 16 ? ARM::VST1q32wb_fixed
11377 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11378 if (IsThumb1)
11379 return StSize == 4 ? ARM::tSTRi
11380 : StSize == 2 ? ARM::tSTRHi
11381 : StSize == 1 ? ARM::tSTRBi : 0;
11382 if (IsThumb2)
11383 return StSize == 4 ? ARM::t2STR_POST
11384 : StSize == 2 ? ARM::t2STRH_POST
11385 : StSize == 1 ? ARM::t2STRB_POST : 0;
11386 return StSize == 4 ? ARM::STR_POST_IMM
11387 : StSize == 2 ? ARM::STRH_POST
11388 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11389}
11390
11391/// Emit a post-increment load operation with given size. The instructions
11392/// will be added to BB at Pos.
11394 const TargetInstrInfo *TII, const DebugLoc &dl,
11395 unsigned LdSize, unsigned Data, unsigned AddrIn,
11396 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11397 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11398 assert(LdOpc != 0 && "Should have a load opcode");
11399 if (LdSize >= 8) {
11400 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11401 .addReg(AddrOut, RegState::Define)
11402 .addReg(AddrIn)
11403 .addImm(0)
11405 } else if (IsThumb1) {
11406 // load + update AddrIn
11407 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11408 .addReg(AddrIn)
11409 .addImm(0)
11411 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11412 .add(t1CondCodeOp())
11413 .addReg(AddrIn)
11414 .addImm(LdSize)
11416 } else if (IsThumb2) {
11417 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11418 .addReg(AddrOut, RegState::Define)
11419 .addReg(AddrIn)
11420 .addImm(LdSize)
11422 } else { // arm
11423 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11424 .addReg(AddrOut, RegState::Define)
11425 .addReg(AddrIn)
11426 .addReg(0)
11427 .addImm(LdSize)
11429 }
11430}
11431
11432/// Emit a post-increment store operation with given size. The instructions
11433/// will be added to BB at Pos.
11435 const TargetInstrInfo *TII, const DebugLoc &dl,
11436 unsigned StSize, unsigned Data, unsigned AddrIn,
11437 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11438 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11439 assert(StOpc != 0 && "Should have a store opcode");
11440 if (StSize >= 8) {
11441 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11442 .addReg(AddrIn)
11443 .addImm(0)
11444 .addReg(Data)
11446 } else if (IsThumb1) {
11447 // store + update AddrIn
11448 BuildMI(*BB, Pos, dl, TII->get(StOpc))
11449 .addReg(Data)
11450 .addReg(AddrIn)
11451 .addImm(0)
11453 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11454 .add(t1CondCodeOp())
11455 .addReg(AddrIn)
11456 .addImm(StSize)
11458 } else if (IsThumb2) {
11459 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11460 .addReg(Data)
11461 .addReg(AddrIn)
11462 .addImm(StSize)
11464 } else { // arm
11465 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11466 .addReg(Data)
11467 .addReg(AddrIn)
11468 .addReg(0)
11469 .addImm(StSize)
11471 }
11472}
11473
11474MachineBasicBlock *
11475ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11476 MachineBasicBlock *BB) const {
11477 // This pseudo instruction has 3 operands: dst, src, size
11478 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11479 // Otherwise, we will generate unrolled scalar copies.
11480 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11481 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11482  MachineFunction::iterator It = ++BB->getIterator();
11483
11484 Register dest = MI.getOperand(0).getReg();
11485 Register src = MI.getOperand(1).getReg();
11486 unsigned SizeVal = MI.getOperand(2).getImm();
11487 unsigned Alignment = MI.getOperand(3).getImm();
11488 DebugLoc dl = MI.getDebugLoc();
11489
11490 MachineFunction *MF = BB->getParent();
11491  MachineRegisterInfo &MRI = MF->getRegInfo();
11492 unsigned UnitSize = 0;
11493 const TargetRegisterClass *TRC = nullptr;
11494 const TargetRegisterClass *VecTRC = nullptr;
11495
11496 bool IsThumb1 = Subtarget->isThumb1Only();
11497 bool IsThumb2 = Subtarget->isThumb2();
11498 bool IsThumb = Subtarget->isThumb();
11499
11500 if (Alignment & 1) {
11501 UnitSize = 1;
11502 } else if (Alignment & 2) {
11503 UnitSize = 2;
11504 } else {
11505 // Check whether we can use NEON instructions.
11506 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11507 Subtarget->hasNEON()) {
11508 if ((Alignment % 16 == 0) && SizeVal >= 16)
11509 UnitSize = 16;
11510 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11511 UnitSize = 8;
11512 }
11513 // Can't use NEON instructions.
11514 if (UnitSize == 0)
11515 UnitSize = 4;
11516 }
11517
11518 // Select the correct opcode and register class for unit size load/store
11519 bool IsNeon = UnitSize >= 8;
11520 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11521 if (IsNeon)
11522 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11523 : UnitSize == 8 ? &ARM::DPRRegClass
11524 : nullptr;
11525
11526 unsigned BytesLeft = SizeVal % UnitSize;
11527 unsigned LoopSize = SizeVal - BytesLeft;
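  // Worked example: a NEON-capable, 16-byte aligned copy with SizeVal = 37
  // selects UnitSize = 16, giving LoopSize = 32 and BytesLeft = 5; the main
  // copy moves 32 bytes in 16-byte units and the tail copies 5 single bytes.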
11528
11529 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11530 // Use LDR and STR to copy.
11531 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11532 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11533 unsigned srcIn = src;
11534 unsigned destIn = dest;
11535 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11536 Register srcOut = MRI.createVirtualRegister(TRC);
11537 Register destOut = MRI.createVirtualRegister(TRC);
11538 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11539 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11540 IsThumb1, IsThumb2);
11541 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11542 IsThumb1, IsThumb2);
11543 srcIn = srcOut;
11544 destIn = destOut;
11545 }
11546
11547 // Handle the leftover bytes with LDRB and STRB.
11548 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11549 // [destOut] = STRB_POST(scratch, destIn, 1)
11550 for (unsigned i = 0; i < BytesLeft; i++) {
11551 Register srcOut = MRI.createVirtualRegister(TRC);
11552 Register destOut = MRI.createVirtualRegister(TRC);
11553 Register scratch = MRI.createVirtualRegister(TRC);
11554 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11555 IsThumb1, IsThumb2);
11556 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11557 IsThumb1, IsThumb2);
11558 srcIn = srcOut;
11559 destIn = destOut;
11560 }
11561 MI.eraseFromParent(); // The instruction is gone now.
11562 return BB;
11563 }
11564
11565 // Expand the pseudo op to a loop.
11566 // thisMBB:
11567 // ...
11568 // movw varEnd, # --> with thumb2
11569 // movt varEnd, #
11570 // ldrcp varEnd, idx --> without thumb2
11571 // fallthrough --> loopMBB
11572 // loopMBB:
11573 // PHI varPhi, varEnd, varLoop
11574 // PHI srcPhi, src, srcLoop
11575 // PHI destPhi, dst, destLoop
11576 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11577 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11578 // subs varLoop, varPhi, #UnitSize
11579 // bne loopMBB
11580 // fallthrough --> exitMBB
11581 // exitMBB:
11582 // epilogue to handle left-over bytes
11583 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11584 // [destOut] = STRB_POST(scratch, destLoop, 1)
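  // The loop below executes LoopSize / UnitSize iterations, decrementing
  // varLoop by UnitSize each time until it reaches zero; the epilogue in
  // exitMBB then copies the remaining BytesLeft bytes one at a time.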
11585 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11586 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11587 MF->insert(It, loopMBB);
11588 MF->insert(It, exitMBB);
11589
11590 // Set the call frame size on entry to the new basic blocks.
11591 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11592 loopMBB->setCallFrameSize(CallFrameSize);
11593 exitMBB->setCallFrameSize(CallFrameSize);
11594
11595 // Transfer the remainder of BB and its successor edges to exitMBB.
11596 exitMBB->splice(exitMBB->begin(), BB,
11597 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11598  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
11599
11600 // Load an immediate to varEnd.
11601 Register varEnd = MRI.createVirtualRegister(TRC);
11602 if (Subtarget->useMovt()) {
11603 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11604 varEnd)
11605 .addImm(LoopSize);
11606 } else if (Subtarget->genExecuteOnly()) {
11607 assert(IsThumb && "Non-thumb expected to have used movt");
11608 BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11609 } else {
11610    MachineConstantPool *ConstantPool = MF->getConstantPool();
11611 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11612 const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11613
11614 // MachineConstantPool wants an explicit alignment.
11615 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11616 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11617 MachineMemOperand *CPMMO =
11618        MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
11619                                 MachineMemOperand::MOLoad, 4, Align(4));
11620
11621 if (IsThumb)
11622 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11623 .addReg(varEnd, RegState::Define)
11626 .addMemOperand(CPMMO);
11627 else
11628 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11629 .addReg(varEnd, RegState::Define)
11631 .addImm(0)
11633 .addMemOperand(CPMMO);
11634 }
11635 BB->addSuccessor(loopMBB);
11636
11637 // Generate the loop body:
11638 // varPhi = PHI(varLoop, varEnd)
11639 // srcPhi = PHI(srcLoop, src)
11640 // destPhi = PHI(destLoop, dst)
11641 MachineBasicBlock *entryBB = BB;
11642 BB = loopMBB;
11643 Register varLoop = MRI.createVirtualRegister(TRC);
11644 Register varPhi = MRI.createVirtualRegister(TRC);
11645 Register srcLoop = MRI.createVirtualRegister(TRC);
11646 Register srcPhi = MRI.createVirtualRegister(TRC);
11647 Register destLoop = MRI.createVirtualRegister(TRC);
11648 Register destPhi = MRI.createVirtualRegister(TRC);
11649
11650 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11651 .addReg(varLoop).addMBB(loopMBB)
11652 .addReg(varEnd).addMBB(entryBB);
11653 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11654 .addReg(srcLoop).addMBB(loopMBB)
11655 .addReg(src).addMBB(entryBB);
11656 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11657 .addReg(destLoop).addMBB(loopMBB)
11658 .addReg(dest).addMBB(entryBB);
11659
11660 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11661 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11662 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11663 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11664 IsThumb1, IsThumb2);
11665 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11666 IsThumb1, IsThumb2);
11667
11668 // Decrement loop variable by UnitSize.
11669 if (IsThumb1) {
11670 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11671 .add(t1CondCodeOp())
11672 .addReg(varPhi)
11673 .addImm(UnitSize)
11675 } else {
11676    MachineInstrBuilder MIB =
11677        BuildMI(*BB, BB->end(), dl,
11678 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11679 MIB.addReg(varPhi)
11680 .addImm(UnitSize)
11682 .add(condCodeOp());
11683 MIB->getOperand(5).setReg(ARM::CPSR);
11684 MIB->getOperand(5).setIsDef(true);
11685 }
11686 BuildMI(*BB, BB->end(), dl,
11687 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11688 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11689
11690 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11691 BB->addSuccessor(loopMBB);
11692 BB->addSuccessor(exitMBB);
11693
11694 // Add epilogue to handle BytesLeft.
11695 BB = exitMBB;
11696 auto StartOfExit = exitMBB->begin();
11697
11698 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11699 // [destOut] = STRB_POST(scratch, destLoop, 1)
11700 unsigned srcIn = srcLoop;
11701 unsigned destIn = destLoop;
11702 for (unsigned i = 0; i < BytesLeft; i++) {
11703 Register srcOut = MRI.createVirtualRegister(TRC);
11704 Register destOut = MRI.createVirtualRegister(TRC);
11705 Register scratch = MRI.createVirtualRegister(TRC);
11706 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11707 IsThumb1, IsThumb2);
11708 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11709 IsThumb1, IsThumb2);
11710 srcIn = srcOut;
11711 destIn = destOut;
11712 }
11713
11714 MI.eraseFromParent(); // The instruction is gone now.
11715 return BB;
11716}
11717
11718MachineBasicBlock *
11719ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11720 MachineBasicBlock *MBB) const {
11722 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11723 DebugLoc DL = MI.getDebugLoc();
11724
11725 assert(Subtarget->isTargetWindows() &&
11726 "__chkstk is only supported on Windows");
11727 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11728
11729 // __chkstk takes the number of words to allocate on the stack in R4, and
11730 // returns the stack adjustment in number of bytes in R4. This will not
11731 // clobber any other registers (other than the obvious lr).
11732 //
11733 // Although, technically, IP should be considered a register which may be
11734 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11735 // thumb-2 environment, so there is no interworking required. As a result, we
11736 // do not expect a veneer to be emitted by the linker, clobbering IP.
11737 //
11738 // Each module receives its own copy of __chkstk, so no import thunk is
11739 // required, again, ensuring that IP is not clobbered.
11740 //
11741 // Finally, although some linkers may theoretically provide a trampoline for
11742 // out of range calls (which is quite common due to a 32M range limitation of
11743 // branches for Thumb), we can generate the long-call version via
11744 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11745 // IP.
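  // For example, a 4096-byte stack probe passes 1024 (the word count) in R4;
  // after the call, R4 holds the adjustment in bytes, which the t2SUBrr
  // emitted below subtracts from SP.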
11746
11747 switch (TM.getCodeModel()) {
11748 case CodeModel::Tiny:
11749 llvm_unreachable("Tiny code model not available on ARM.");
11750 case CodeModel::Small:
11751 case CodeModel::Medium:
11752 case CodeModel::Kernel:
11753 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11755 .addExternalSymbol("__chkstk")
11758 .addReg(ARM::R12,
11760 .addReg(ARM::CPSR,
11762 break;
11763 case CodeModel::Large: {
11765 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11766
11767 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11768 .addExternalSymbol("__chkstk");
11771 .addReg(Reg, RegState::Kill)
11774 .addReg(ARM::R12,
11776 .addReg(ARM::CPSR,
11778 break;
11779 }
11780 }
11781
11782 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11783 .addReg(ARM::SP, RegState::Kill)
11784 .addReg(ARM::R4, RegState::Kill)
11787 .add(condCodeOp());
11788
11789 MI.eraseFromParent();
11790 return MBB;
11791}
11792
11793MachineBasicBlock *
11794ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11795 MachineBasicBlock *MBB) const {
11796 DebugLoc DL = MI.getDebugLoc();
11797 MachineFunction *MF = MBB->getParent();
11798 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11799
11800  MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
11801 MF->insert(++MBB->getIterator(), ContBB);
11802 ContBB->splice(ContBB->begin(), MBB,
11803 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11804  ContBB->transferSuccessorsAndUpdatePHIs(MBB);
11805 MBB->addSuccessor(ContBB);
11806
11807  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11808 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11809 MF->push_back(TrapBB);
11810 MBB->addSuccessor(TrapBB);
11811
11812 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11813 .addReg(MI.getOperand(0).getReg())
11814 .addImm(0)
11816 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11817 .addMBB(TrapBB)
11819 .addReg(ARM::CPSR);
11820
11821 MI.eraseFromParent();
11822 return ContBB;
11823}
11824
11825// The CPSR operand of SelectItr might be missing a kill marker
11826// because there were multiple uses of CPSR, and ISel didn't know
11827// which to mark. Figure out whether SelectItr should have had a
11828// kill marker, and set it if it should. Returns the correct kill
11829// marker value.
11830static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
11831                                   MachineBasicBlock* BB,
11832 const TargetRegisterInfo* TRI) {
11833 // Scan forward through BB for a use/def of CPSR.
11834 MachineBasicBlock::iterator miI(std::next(SelectItr));
11835 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11836 const MachineInstr& mi = *miI;
11837 if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11838 return false;
11839 if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11840 break; // Should have kill-flag - update below.
11841 }
11842
11843 // If we hit the end of the block, check whether CPSR is live into a
11844 // successor.
11845 if (miI == BB->end()) {
11846 for (MachineBasicBlock *Succ : BB->successors())
11847 if (Succ->isLiveIn(ARM::CPSR))
11848 return false;
11849 }
11850
11851 // We found a def, or hit the end of the basic block and CPSR wasn't live
11852 // out. SelectMI should have a kill flag on CPSR.
11853 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11854 return true;
11855}
11856
11857/// Adds logic in the loop entry MBB to calculate the loop iteration count and
11858/// adds t2WhileLoopSetup and t2WhileLoopStart to generate a WLS loop.
11859static Register genTPEntry(MachineBasicBlock *TpEntry,
11860                           MachineBasicBlock *TpLoopBody,
11861 MachineBasicBlock *TpExit, Register OpSizeReg,
11862 const TargetInstrInfo *TII, DebugLoc Dl,
11863                           MachineRegisterInfo &MRI) {
11864 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
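  // e.g. OpSizeReg = 100 bytes -> (100 + 15) >> 4 = 7 iterations, and
  // OpSizeReg = 16 -> 1 iteration; the t2WhileLoopStart below branches to
  // TpExit when the computed count is zero.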
11865 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11866 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11867 .addUse(OpSizeReg)
11868 .addImm(15)
11870 .addReg(0);
11871
11872 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11873 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11874 .addUse(AddDestReg, RegState::Kill)
11875 .addImm(4)
11877 .addReg(0);
11878
11879 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11880 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11881 .addUse(LsrDestReg, RegState::Kill);
11882
11883 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11884 .addUse(TotalIterationsReg)
11885 .addMBB(TpExit);
11886
11887 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11888 .addMBB(TpLoopBody)
11890
11891 return TotalIterationsReg;
11892}
11893
11894/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
11895/// t2DoLoopEnd. These are used by later passes to generate tail predicated
11896/// loops.
11897static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11898 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11899 const TargetInstrInfo *TII, DebugLoc Dl,
11900 MachineRegisterInfo &MRI, Register OpSrcReg,
11901 Register OpDestReg, Register ElementCountReg,
11902 Register TotalIterationsReg, bool IsMemcpy) {
11903 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
11904 // array, loop iteration counter, predication counter.
11905
11906 Register SrcPhiReg, CurrSrcReg;
11907 if (IsMemcpy) {
11908 // Current position in the src array
11909 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11910 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11911 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11912 .addUse(OpSrcReg)
11913 .addMBB(TpEntry)
11914 .addUse(CurrSrcReg)
11915 .addMBB(TpLoopBody);
11916 }
11917
11918 // Current position in the dest array
11919 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11920 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11921 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11922 .addUse(OpDestReg)
11923 .addMBB(TpEntry)
11924 .addUse(CurrDestReg)
11925 .addMBB(TpLoopBody);
11926
11927 // Current loop counter
11928 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11929 Register RemainingLoopIterationsReg =
11930 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11931 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11932 .addUse(TotalIterationsReg)
11933 .addMBB(TpEntry)
11934 .addUse(RemainingLoopIterationsReg)
11935 .addMBB(TpLoopBody);
11936
11937 // Predication counter
11938 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11939 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11940 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11941 .addUse(ElementCountReg)
11942 .addMBB(TpEntry)
11943 .addUse(RemainingElementsReg)
11944 .addMBB(TpLoopBody);
11945
11946 // Pass predication counter to VCTP
11947 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11948 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11949 .addUse(PredCounterPhiReg)
11951 .addReg(0)
11952 .addReg(0);
11953
11954 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11955 .addUse(PredCounterPhiReg)
11956 .addImm(16)
11958 .addReg(0);
11959
11960 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11961 Register SrcValueReg;
11962 if (IsMemcpy) {
11963 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11964 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11965 .addDef(CurrSrcReg)
11966 .addDef(SrcValueReg)
11967 .addReg(SrcPhiReg)
11968 .addImm(16)
11970 .addUse(VccrReg)
11971 .addReg(0);
11972 } else
11973 SrcValueReg = OpSrcReg;
11974
11975 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11976 .addDef(CurrDestReg)
11977 .addUse(SrcValueReg)
11978 .addReg(DestPhiReg)
11979 .addImm(16)
11981 .addUse(VccrReg)
11982 .addReg(0);
11983
11984 // Add the pseudo instructions for decrementing the loop counter and marking
11985 // the end: t2DoLoopDec and t2DoLoopEnd.
11986 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11987 .addUse(LoopCounterPhiReg)
11988 .addImm(1);
11989
11990 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
11991 .addUse(RemainingLoopIterationsReg)
11992 .addMBB(TpLoopBody);
11993
11994 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
11995 .addMBB(TpExit)
11997}
11998
11999MachineBasicBlock *
12000ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
12001 MachineBasicBlock *BB) const {
12002 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
12003 DebugLoc dl = MI.getDebugLoc();
12004 bool isThumb2 = Subtarget->isThumb2();
12005 switch (MI.getOpcode()) {
12006 default: {
12007 MI.print(errs());
12008 llvm_unreachable("Unexpected instr type to insert");
12009 }
12010
12011 // Thumb1 post-indexed loads are really just single-register LDMs.
12012 case ARM::tLDR_postidx: {
12013 MachineOperand Def(MI.getOperand(1));
12014 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
12015 .add(Def) // Rn_wb
12016 .add(MI.getOperand(2)) // Rn
12017 .add(MI.getOperand(3)) // PredImm
12018 .add(MI.getOperand(4)) // PredReg
12019 .add(MI.getOperand(0)) // Rt
12020 .cloneMemRefs(MI);
12021 MI.eraseFromParent();
12022 return BB;
12023 }
12024
12025 case ARM::MVE_MEMCPYLOOPINST:
12026 case ARM::MVE_MEMSETLOOPINST: {
12027
12028 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
12029 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
12030 // the iteration count = ceil(size_in_bytes/16) in the TP entry block and
12031 // adds the relevant instructions in the TP loop Body for generation of a
12032 // WLSTP loop.
12033
12034 // Below is relevant portion of the CFG after the transformation.
12035 // The Machine Basic Blocks are shown along with branch conditions (in
12036 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
12037 // portion of the CFG and may not necessarily be the entry/exit of the
12038 // function.
12039
12040 // (Relevant) CFG after transformation:
12041 // TP entry MBB
12042 // |
12043 // |-----------------|
12044 // (n <= 0) (n > 0)
12045 // | |
12046 // | TP loop Body MBB<--|
12047 // | | |
12048 // \ |___________|
12049 // \ /
12050 // TP exit MBB
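    // In the code below, TpEntry receives the iteration-count computation and
    // the t2WhileLoopStart (the "n <= 0" edge to TpExit), TpLoopBody receives
    // the predicated VCTP/load/store plus t2LoopDec/t2LoopEnd (the back edge),
    // and the successor lists are wired up to match this diagram.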
12051
12052 MachineFunction *MF = BB->getParent();
12053 MachineFunctionProperties &Properties = MF->getProperties();
12054    MachineRegisterInfo &MRI = MF->getRegInfo();
12055
12056 Register OpDestReg = MI.getOperand(0).getReg();
12057 Register OpSrcReg = MI.getOperand(1).getReg();
12058 Register OpSizeReg = MI.getOperand(2).getReg();
12059
12060 // Allocate the required MBBs and add to parent function.
12061 MachineBasicBlock *TpEntry = BB;
12062 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
12063 MachineBasicBlock *TpExit;
12064
12065 MF->push_back(TpLoopBody);
12066
12067 // If any instructions are present in the current block after
12068 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
12069 // move the instructions into the newly created exit block. If there are no
12070 // instructions add an explicit branch to the FallThrough block and then
12071 // split.
12072 //
12073 // The split is required for two reasons:
12074 // 1) A terminator (t2WhileLoopStart) will be placed at that site.
12075 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
12076 // need to be updated. splitAt() already handles this.
12077 TpExit = BB->splitAt(MI, false);
12078 if (TpExit == BB) {
12079 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
12080 "block containing memcpy/memset Pseudo");
12081 TpExit = BB->getFallThrough();
12082 BuildMI(BB, dl, TII->get(ARM::t2B))
12083 .addMBB(TpExit)
12085 TpExit = BB->splitAt(MI, false);
12086 }
12087
12088 // Add logic for iteration count
12089 Register TotalIterationsReg =
12090 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
12091
12092 // Add the vectorized (and predicated) load/store instructions
12093 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
12094 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
12095 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
12096
12097 // Required to avoid conflict with the MachineVerifier during testing.
12098    Properties.reset(MachineFunctionProperties::Property::NoPHIs);
12099
12100 // Connect the blocks
12101 TpEntry->addSuccessor(TpLoopBody);
12102 TpLoopBody->addSuccessor(TpLoopBody);
12103 TpLoopBody->addSuccessor(TpExit);
12104
12105 // Reorder for a more natural layout
12106 TpLoopBody->moveAfter(TpEntry);
12107 TpExit->moveAfter(TpLoopBody);
12108
12109 // Finally, remove the memcpy Pseudo Instruction
12110 MI.eraseFromParent();
12111
12112 // Return the exit block as it may contain other instructions requiring a
12113 // custom inserter
12114 return TpExit;
12115 }
12116
12117 // The Thumb2 pre-indexed stores have the same MI operands; they just
12118 // define them differently in the .td files from the isel patterns, so
12119 // they need pseudos.
12120 case ARM::t2STR_preidx:
12121 MI.setDesc(TII->get(ARM::t2STR_PRE));
12122 return BB;
12123 case ARM::t2STRB_preidx:
12124 MI.setDesc(TII->get(ARM::t2STRB_PRE));
12125 return BB;
12126 case ARM::t2STRH_preidx:
12127 MI.setDesc(TII->get(ARM::t2STRH_PRE));
12128 return BB;
12129
12130 case ARM::STRi_preidx:
12131 case ARM::STRBi_preidx: {
12132 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
12133 : ARM::STRB_PRE_IMM;
12134 // Decode the offset.
12135 unsigned Offset = MI.getOperand(4).getImm();
12136 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
12138 if (isSub)
12139 Offset = -Offset;
12140
12141 MachineMemOperand *MMO = *MI.memoperands_begin();
12142 BuildMI(*BB, MI, dl, TII->get(NewOpc))
12143 .add(MI.getOperand(0)) // Rn_wb
12144 .add(MI.getOperand(1)) // Rt
12145 .add(MI.getOperand(2)) // Rn
12146 .addImm(Offset) // offset (skip GPR==zero_reg)
12147 .add(MI.getOperand(5)) // pred
12148 .add(MI.getOperand(6))
12149 .addMemOperand(MMO);
12150 MI.eraseFromParent();
12151 return BB;
12152 }
12153 case ARM::STRr_preidx:
12154 case ARM::STRBr_preidx:
12155 case ARM::STRH_preidx: {
12156 unsigned NewOpc;
12157 switch (MI.getOpcode()) {
12158 default: llvm_unreachable("unexpected opcode!");
12159 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
12160 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
12161 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
12162 }
12163 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
12164 for (const MachineOperand &MO : MI.operands())
12165 MIB.add(MO);
12166 MI.eraseFromParent();
12167 return BB;
12168 }
12169
12170 case ARM::tMOVCCr_pseudo: {
12171 // To "insert" a SELECT_CC instruction, we actually have to insert the
12172 // diamond control-flow pattern. The incoming instruction knows the
12173 // destination vreg to set, the condition code register to branch on, the
12174 // true/false values to select between, and a branch opcode to use.
12175 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12177
12178 // thisMBB:
12179 // ...
12180 // TrueVal = ...
12181 // cmpTY ccX, r1, r2
12182 // bCC copy1MBB
12183 // fallthrough --> copy0MBB
12184 MachineBasicBlock *thisMBB = BB;
12185 MachineFunction *F = BB->getParent();
12186 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12187 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12188 F->insert(It, copy0MBB);
12189 F->insert(It, sinkMBB);
12190
12191 // Set the call frame size on entry to the new basic blocks.
12192 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12193 copy0MBB->setCallFrameSize(CallFrameSize);
12194 sinkMBB->setCallFrameSize(CallFrameSize);
12195
12196 // Check whether CPSR is live past the tMOVCCr_pseudo.
12197 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12198 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12199 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12200 copy0MBB->addLiveIn(ARM::CPSR);
12201 sinkMBB->addLiveIn(ARM::CPSR);
12202 }
12203
12204 // Transfer the remainder of BB and its successor edges to sinkMBB.
12205 sinkMBB->splice(sinkMBB->begin(), BB,
12206 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12207    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
12208
12209 BB->addSuccessor(copy0MBB);
12210 BB->addSuccessor(sinkMBB);
12211
12212 BuildMI(BB, dl, TII->get(ARM::tBcc))
12213 .addMBB(sinkMBB)
12214 .addImm(MI.getOperand(3).getImm())
12215 .addReg(MI.getOperand(4).getReg());
12216
12217 // copy0MBB:
12218 // %FalseValue = ...
12219 // # fallthrough to sinkMBB
12220 BB = copy0MBB;
12221
12222 // Update machine-CFG edges
12223 BB->addSuccessor(sinkMBB);
12224
12225 // sinkMBB:
12226 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12227 // ...
12228 BB = sinkMBB;
12229 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12230 .addReg(MI.getOperand(1).getReg())
12231 .addMBB(copy0MBB)
12232 .addReg(MI.getOperand(2).getReg())
12233 .addMBB(thisMBB);
12234
12235 MI.eraseFromParent(); // The pseudo instruction is gone now.
12236 return BB;
12237 }
12238
12239 case ARM::BCCi64:
12240 case ARM::BCCZi64: {
12241 // If there is an unconditional branch to the other successor, remove it.
12242 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12243
12244 // Compare both parts that make up the double comparison separately for
12245 // equality.
12246 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12247
12248 Register LHS1 = MI.getOperand(1).getReg();
12249 Register LHS2 = MI.getOperand(2).getReg();
12250 if (RHSisZero) {
12251 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12252 .addReg(LHS1)
12253 .addImm(0)
12255 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12256 .addReg(LHS2).addImm(0)
12257 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12258 } else {
12259 Register RHS1 = MI.getOperand(3).getReg();
12260 Register RHS2 = MI.getOperand(4).getReg();
12261 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12262 .addReg(LHS1)
12263 .addReg(RHS1)
12265 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12266 .addReg(LHS2).addReg(RHS2)
12267 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12268 }
12269
12270 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12271 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
12272 if (MI.getOperand(0).getImm() == ARMCC::NE)
12273 std::swap(destMBB, exitMBB);
12274
12275 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12276 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12277 if (isThumb2)
12278 BuildMI(BB, dl, TII->get(ARM::t2B))
12279 .addMBB(exitMBB)
12281 else
12282 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12283
12284 MI.eraseFromParent(); // The pseudo instruction is gone now.
12285 return BB;
12286 }
12287
12288 case ARM::Int_eh_sjlj_setjmp:
12289 case ARM::Int_eh_sjlj_setjmp_nofp:
12290 case ARM::tInt_eh_sjlj_setjmp:
12291 case ARM::t2Int_eh_sjlj_setjmp:
12292 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12293 return BB;
12294
12295 case ARM::Int_eh_sjlj_setup_dispatch:
12296 EmitSjLjDispatchBlock(MI, BB);
12297 return BB;
12298
12299 case ARM::ABS:
12300 case ARM::t2ABS: {
12301 // To insert an ABS instruction, we have to insert the
12302 // diamond control-flow pattern. The incoming instruction knows the
12303 // source vreg to test against 0, the destination vreg to set,
12304 // the condition code register to branch on, the
12305 // true/false values to select between, and a branch opcode to use.
12306 // It transforms
12307 // V1 = ABS V0
12308 // into
12309 // V2 = MOVS V0
12310 // BCC (branch to SinkBB if V0 >= 0)
12311 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0)
12312 // SinkBB: V1 = PHI(V2, V3)
12313 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12315 MachineFunction *Fn = BB->getParent();
12316 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
12317 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
12318 Fn->insert(BBI, RSBBB);
12319 Fn->insert(BBI, SinkBB);
12320
12321 Register ABSSrcReg = MI.getOperand(1).getReg();
12322 Register ABSDstReg = MI.getOperand(0).getReg();
12323 bool ABSSrcKill = MI.getOperand(1).isKill();
12324 bool isThumb2 = Subtarget->isThumb2();
12326 // In Thumb mode S must not be specified if source register is the SP or
12327 // PC and if destination register is the SP, so restrict register class
12328 Register NewRsbDstReg = MRI.createVirtualRegister(
12329 isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
12330
12331 // Transfer the remainder of BB and its successor edges to sinkMBB.
12332 SinkBB->splice(SinkBB->begin(), BB,
12333 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12334    SinkBB->transferSuccessorsAndUpdatePHIs(BB);
12335
12336 BB->addSuccessor(RSBBB);
12337 BB->addSuccessor(SinkBB);
12338
12339 // fall through to SinkMBB
12340 RSBBB->addSuccessor(SinkBB);
12341
12342 // insert a cmp at the end of BB
12343 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12344 .addReg(ABSSrcReg)
12345 .addImm(0)
12347
12348 // insert a bcc with opposite CC to ARMCC::MI at the end of BB
12349 BuildMI(BB, dl,
12350 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
12352
12353 // insert rsbri in RSBBB
12354 // Note: BCC and rsbri will be converted into predicated rsbmi
12355 // by if-conversion pass
12356 BuildMI(*RSBBB, RSBBB->begin(), dl,
12357 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
12358 .addReg(ABSSrcReg, ABSSrcKill ? RegState::Kill : 0)
12359 .addImm(0)
12361 .add(condCodeOp());
12362
12363 // insert PHI in SinkBB,
12364 // reuse ABSDstReg to not change uses of ABS instruction
12365 BuildMI(*SinkBB, SinkBB->begin(), dl,
12366 TII->get(ARM::PHI), ABSDstReg)
12367 .addReg(NewRsbDstReg).addMBB(RSBBB)
12368 .addReg(ABSSrcReg).addMBB(BB);
12369
12370 // remove ABS instruction
12371 MI.eraseFromParent();
12372
12373 // return last added BB
12374 return SinkBB;
12375 }
12376 case ARM::COPY_STRUCT_BYVAL_I32:
12377 ++NumLoopByVals;
12378 return EmitStructByval(MI, BB);
12379 case ARM::WIN__CHKSTK:
12380 return EmitLowered__chkstk(MI, BB);
12381 case ARM::WIN__DBZCHK:
12382 return EmitLowered__dbzchk(MI, BB);
12383 }
12384}
12385
12386/// Attaches vregs to MEMCPY that it will use as scratch registers
12387/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12388/// instead of as a custom inserter because we need the use list from the SDNode.
12389static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12390 MachineInstr &MI, const SDNode *Node) {
12391 bool isThumb1 = Subtarget->isThumb1Only();
12392
12393 DebugLoc DL = MI.getDebugLoc();
12394 MachineFunction *MF = MI.getParent()->getParent();
12395  MachineRegisterInfo &MRI = MF->getRegInfo();
12396 MachineInstrBuilder MIB(*MF, MI);
12397
12398 // If the new dst/src is unused mark it as dead.
12399 if (!Node->hasAnyUseOfValue(0)) {
12400 MI.getOperand(0).setIsDead(true);
12401 }
12402 if (!Node->hasAnyUseOfValue(1)) {
12403 MI.getOperand(1).setIsDead(true);
12404 }
12405
12406 // The MEMCPY both defines and kills the scratch registers.
12407 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12408 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12409 : &ARM::GPRRegClass);
12410    MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
12411 }
12412}
12413
12414void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
12415 SDNode *Node) const {
12416 if (MI.getOpcode() == ARM::MEMCPY) {
12417 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12418 return;
12419 }
12420
12421 const MCInstrDesc *MCID = &MI.getDesc();
12422 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
12423 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12424 // operand is still set to noreg. If needed, set the optional operand's
12425 // register to CPSR, and remove the redundant implicit def.
12426 //
12427 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12428
12429 // Rename pseudo opcodes.
12430 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12431 unsigned ccOutIdx;
12432 if (NewOpc) {
12433 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12434 MCID = &TII->get(NewOpc);
12435
12436 assert(MCID->getNumOperands() ==
12437 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12438 && "converted opcode should be the same except for cc_out"
12439 " (and, on Thumb1, pred)");
12440
12441 MI.setDesc(*MCID);
12442
12443 // Add the optional cc_out operand
12444 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12445
12446 // On Thumb1, move all input operands to the end, then add the predicate
12447 if (Subtarget->isThumb1Only()) {
12448 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12449 MI.addOperand(MI.getOperand(1));
12450 MI.removeOperand(1);
12451 }
12452
12453 // Restore the ties
12454 for (unsigned i = MI.getNumOperands(); i--;) {
12455 const MachineOperand& op = MI.getOperand(i);
12456 if (op.isReg() && op.isUse()) {
12457 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12458 if (DefIdx != -1)
12459 MI.tieOperands(DefIdx, i);
12460 }
12461 }
12462
12463      MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
12464 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
12465 ccOutIdx = 1;
12466 } else
12467 ccOutIdx = MCID->getNumOperands() - 1;
12468 } else
12469 ccOutIdx = MCID->getNumOperands() - 1;
12470
12471 // Any ARM instruction that sets the 's' bit should specify an optional
12472 // "cc_out" operand in the last operand position.
12473 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12474 assert(!NewOpc && "Optional cc_out operand required");
12475 return;
12476 }
12477 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12478 // since we already have an optional CPSR def.
12479 bool definesCPSR = false;
12480 bool deadCPSR = false;
12481 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12482 ++i) {
12483 const MachineOperand &MO = MI.getOperand(i);
12484 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12485 definesCPSR = true;
12486 if (MO.isDead())
12487 deadCPSR = true;
12488 MI.removeOperand(i);
12489 break;
12490 }
12491 }
12492 if (!definesCPSR) {
12493 assert(!NewOpc && "Optional cc_out operand required");
12494 return;
12495 }
12496 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12497 if (deadCPSR) {
12498 assert(!MI.getOperand(ccOutIdx).getReg() &&
12499 "expect uninitialized optional cc_out operand");
12500 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12501 if (!Subtarget->isThumb1Only())
12502 return;
12503 }
12504
12505 // If this instruction was defined with an optional CPSR def and its dag node
12506 // had a live implicit CPSR def, then activate the optional CPSR def.
12507 MachineOperand &MO = MI.getOperand(ccOutIdx);
12508 MO.setReg(ARM::CPSR);
12509 MO.setIsDef(true);
12510}
12511
12512//===----------------------------------------------------------------------===//
12513// ARM Optimization Hooks
12514//===----------------------------------------------------------------------===//
12515
12516// Helper function that checks if N is a null or all ones constant.
12517static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12518  return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
12519}
12520
12521// Return true if N is conditionally 0 or all ones.
12522// Detects these expressions where cc is an i1 value:
12523//
12524// (select cc 0, y) [AllOnes=0]
12525// (select cc y, 0) [AllOnes=0]
12526// (zext cc) [AllOnes=0]
12527// (sext cc) [AllOnes=0/1]
12528// (select cc -1, y) [AllOnes=1]
12529// (select cc y, -1) [AllOnes=1]
12530//
12531// Invert is set when N is the null/all ones constant when CC is false.
12532// OtherOp is set to the alternative value of N.
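// For example, with AllOnes == false and N == (select cc, 0, y) this returns
// true with Invert == false and OtherOp == y; for N == (select cc, y, 0) it
// returns true with Invert == true and OtherOp == y.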
12533static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
12534 SDValue &CC, bool &Invert,
12535 SDValue &OtherOp,
12536 SelectionDAG &DAG) {
12537 switch (N->getOpcode()) {
12538 default: return false;
12539 case ISD::SELECT: {
12540 CC = N->getOperand(0);
12541 SDValue N1 = N->getOperand(1);
12542 SDValue N2 = N->getOperand(2);
12543 if (isZeroOrAllOnes(N1, AllOnes)) {
12544 Invert = false;
12545 OtherOp = N2;
12546 return true;
12547 }
12548 if (isZeroOrAllOnes(N2, AllOnes)) {
12549 Invert = true;
12550 OtherOp = N1;
12551 return true;
12552 }
12553 return false;
12554 }
12555 case ISD::ZERO_EXTEND:
12556 // (zext cc) can never be the all ones value.
12557 if (AllOnes)
12558 return false;
12559 [[fallthrough]];
12560 case ISD::SIGN_EXTEND: {
12561 SDLoc dl(N);
12562 EVT VT = N->getValueType(0);
12563 CC = N->getOperand(0);
12564 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12565 return false;
12566 Invert = !AllOnes;
12567 if (AllOnes)
12568 // When looking for an AllOnes constant, N is an sext, and the 'other'
12569 // value is 0.
12570 OtherOp = DAG.getConstant(0, dl, VT);
12571 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12572 // When looking for a 0 constant, N can be zext or sext.
12573 OtherOp = DAG.getConstant(1, dl, VT);
12574 else
12575 OtherOp = DAG.getAllOnesConstant(dl, VT);
12576 return true;
12577 }
12578 }
12579}
12580
12581// Combine a constant select operand into its use:
12582//
12583// (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
12584// (sub x, (select cc, 0, c)) -> (select cc, x, (sub x, c))
12585// (and (select cc, -1, c), x) -> (select cc, x, (and x, c)) [AllOnes=1]
12586// (or (select cc, 0, c), x) -> (select cc, x, (or x, c))
12587// (xor (select cc, 0, c), x) -> (select cc, x, (xor x, c))
12588//
12589// The transform is rejected if the select doesn't have a constant operand that
12590// is null, or all ones when AllOnes is set.
12591//
12592// Also recognize sext/zext from i1:
12593//
12594// (add (zext cc), x) -> (select cc (add x, 1), x)
12595// (add (sext cc), x) -> (select cc (add x, -1), x)
12596//
12597// These transformations eventually create predicated instructions.
12598//
12599// @param N The node to transform.
12600// @param Slct The N operand that is a select.
12601// @param OtherOp The other N operand (x above).
12602// @param DCI Context.
12603// @param AllOnes Require the select constant to be all ones instead of null.
12604// @returns The new node, or SDValue() on failure.
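//
// For example, (add (select cc, 0, 4), x) becomes (select cc, x, (add x, 4)):
// when cc is true the select contributes the identity value 0, so the result
// is just x; when it is false the folded (add x, 4) is selected instead.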
12605static
12606SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
12607                            TargetLowering::DAGCombinerInfo &DCI,
12608 bool AllOnes = false) {
12609 SelectionDAG &DAG = DCI.DAG;
12610 EVT VT = N->getValueType(0);
12611 SDValue NonConstantVal;
12612 SDValue CCOp;
12613 bool SwapSelectOps;
12614 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12615 NonConstantVal, DAG))
12616 return SDValue();
12617
12618 // Slct is now known to be the desired identity constant when CC is true.
12619 SDValue TrueVal = OtherOp;
12620 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12621 OtherOp, NonConstantVal);
12622 // Unless SwapSelectOps says CC should be false.
12623 if (SwapSelectOps)
12624 std::swap(TrueVal, FalseVal);
12625
12626 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12627 CCOp, TrueVal, FalseVal);
12628}
12629
12630// Attempt combineSelectAndUse on each operand of a commutative operator N.
12631static
12632SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
12633                                       TargetLowering::DAGCombinerInfo &DCI) {
12634 SDValue N0 = N->getOperand(0);
12635 SDValue N1 = N->getOperand(1);
12636 if (N0.getNode()->hasOneUse())
12637 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12638 return Result;
12639 if (N1.getNode()->hasOneUse())
12640 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12641 return Result;
12642 return SDValue();
12643}
12644
12645static bool IsVUZPShuffleNode(SDNode *N) {
12646 // VUZP shuffle node.
12647 if (N->getOpcode() == ARMISD::VUZP)
12648 return true;
12649
12650 // "VUZP" on i32 is an alias for VTRN.
12651 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12652 return true;
12653
12654 return false;
12655}
12656
12657static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
12658                                 TargetLowering::DAGCombinerInfo &DCI,
12659 const ARMSubtarget *Subtarget) {
12660 // Look for ADD(VUZP.0, VUZP.1).
12661 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12662 N0 == N1)
12663 return SDValue();
12664
12665 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12666 if (!N->getValueType(0).is64BitVector())
12667 return SDValue();
12668
12669 // Generate vpadd.
12670 SelectionDAG &DAG = DCI.DAG;
12671 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12672 SDLoc dl(N);
12673 SDNode *Unzip = N0.getNode();
12674 EVT VT = N->getValueType(0);
12675
12676  SmallVector<SDValue, 8> Ops;
12677 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12678 TLI.getPointerTy(DAG.getDataLayout())));
12679 Ops.push_back(Unzip->getOperand(0));
12680 Ops.push_back(Unzip->getOperand(1));
12681
12682 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12683}
12684
12685static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
12686                                      TargetLowering::DAGCombinerInfo &DCI,
12687 const ARMSubtarget *Subtarget) {
12688 // Check for two extended operands.
12689 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12690 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12691 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12692 N1.getOpcode() == ISD::ZERO_EXTEND))
12693 return SDValue();
12694
12695 SDValue N00 = N0.getOperand(0);
12696 SDValue N10 = N1.getOperand(0);
12697
12698 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
12699 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12700 N00 == N10)
12701 return SDValue();
12702
12703 // We only recognize Q register paddl here; this can't be reached until
12704 // after type legalization.
12705 if (!N00.getValueType().is64BitVector() ||
12706      !N->getValueType(0).is128BitVector())
12707 return SDValue();
12708
12709 // Generate vpaddl.
12710 SelectionDAG &DAG = DCI.DAG;
12711 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12712 SDLoc dl(N);
12713 EVT VT = N->getValueType(0);
12714
12715  SmallVector<SDValue, 8> Ops;
12716 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12717 unsigned Opcode;
12718 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12719 Opcode = Intrinsic::arm_neon_vpaddls;
12720 else
12721 Opcode = Intrinsic::arm_neon_vpaddlu;
12722 Ops.push_back(DAG.getConstant(Opcode, dl,
12723 TLI.getPointerTy(DAG.getDataLayout())));
12724 EVT ElemTy = N00.getValueType().getVectorElementType();
12725 unsigned NumElts = VT.getVectorNumElements();
12726 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12727 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12728 N00.getOperand(0), N00.getOperand(1));
12729 Ops.push_back(Concat);
12730
12731 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12732}
12733
12734// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12735// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12736// much easier to match.
12737static SDValue
12738AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
12739                               TargetLowering::DAGCombinerInfo &DCI,
12740 const ARMSubtarget *Subtarget) {
12741 // Only perform the optimization after legalization, and only if NEON is
12742 // available. We also expect both operands to be BUILD_VECTORs.
12743 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12744 || N0.getOpcode() != ISD::BUILD_VECTOR
12745 || N1.getOpcode() != ISD::BUILD_VECTOR)
12746 return SDValue();
12747
12748 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12749 EVT VT = N->getValueType(0);
12750 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12751 return SDValue();
12752
12753 // Check that the vector operands are of the right form.
12754 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12755 // operands, where N is the size of the formed vector.
12756 // Each EXTRACT_VECTOR should have the same input vector and odd or even
12757 // index such that we have a pairwise add pattern.
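  // For example, N0 = (build_vector (extractelt v, 0), (extractelt v, 2), ...)
  // and N1 = (build_vector (extractelt v, 1), (extractelt v, 3), ...) together
  // describe a pairwise add over v, which maps onto vpaddl.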
12758
12759 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12760  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12761 return SDValue();
12762 SDValue Vec = N0->getOperand(0)->getOperand(0);
12763 SDNode *V = Vec.getNode();
12764 unsigned nextIndex = 0;
12765
12766 // For each operand of the ADD that is a BUILD_VECTOR,
12767 // check to see if each of its operands is an EXTRACT_VECTOR with
12768 // the same vector and an appropriate index.
12769 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12770    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
12771        N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12772
12773 SDValue ExtVec0 = N0->getOperand(i);
12774 SDValue ExtVec1 = N1->getOperand(i);
12775
12776 // First operand is the vector; verify it's the same.
12777 if (V != ExtVec0->getOperand(0).getNode() ||
12778 V != ExtVec1->getOperand(0).getNode())
12779 return SDValue();
12780
12781 // Second is the constant; verify it's correct.
12782 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
12783 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
12784
12785 // For the constant, we want to see all the even or all the odd.
12786 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12787 || C1->getZExtValue() != nextIndex+1)
12788 return SDValue();
12789
12790 // Increment index.
12791 nextIndex+=2;
12792 } else
12793 return SDValue();
12794 }
12795
12796 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12797 // we're using the entire input vector, otherwise there's a size/legality
12798 // mismatch somewhere.
12799 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12800      Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
12801 return SDValue();
12802
12803 // Create VPADDL node.
12804 SelectionDAG &DAG = DCI.DAG;
12805 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12806
12807 SDLoc dl(N);
12808
12809 // Build operand list.
12810  SmallVector<SDValue, 8> Ops;
12811 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12812 TLI.getPointerTy(DAG.getDataLayout())));
12813
12814 // Input is the vector.
12815 Ops.push_back(Vec);
12816
12817 // Get widened type and narrowed type.
12818 MVT widenType;
12819 unsigned numElem = VT.getVectorNumElements();
12820
12821 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12822 switch (inputLaneType.getSimpleVT().SimpleTy) {
12823 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12824 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12825 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12826 default:
12827 llvm_unreachable("Invalid vector element type for padd optimization.");
12828 }
12829
12830 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
12831 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12832 return DAG.getNode(ExtOp, dl, VT, tmp);
12833}
12834
12835static SDValue findMUL_LOHI(SDValue V) {
12836 if (V->getOpcode() == ISD::UMUL_LOHI ||
12837 V->getOpcode() == ISD::SMUL_LOHI)
12838 return V;
12839 return SDValue();
12840}
12841
12842static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12844 const ARMSubtarget *Subtarget) {
12845 if (!Subtarget->hasBaseDSP())
12846 return SDValue();
12847
12848 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12849 // accumulates the product into a 64-bit value. The 16-bit values will
12850 // be sign extended somehow or SRA'd into 32-bit values
12851 // (addc (adde (mul 16bit, 16bit), lo), hi)
12852 SDValue Mul = AddcNode->getOperand(0);
12853 SDValue Lo = AddcNode->getOperand(1);
12854 if (Mul.getOpcode() != ISD::MUL) {
12855 Lo = AddcNode->getOperand(0);
12856 Mul = AddcNode->getOperand(1);
12857 if (Mul.getOpcode() != ISD::MUL)
12858 return SDValue();
12859 }
12860
12861 SDValue SRA = AddeNode->getOperand(0);
12862 SDValue Hi = AddeNode->getOperand(1);
12863 if (SRA.getOpcode() != ISD::SRA) {
12864 SRA = AddeNode->getOperand(1);
12865 Hi = AddeNode->getOperand(0);
12866 if (SRA.getOpcode() != ISD::SRA)
12867 return SDValue();
12868 }
12869 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12870 if (Const->getZExtValue() != 31)
12871 return SDValue();
12872 } else
12873 return SDValue();
12874
12875 if (SRA.getOperand(0) != Mul)
12876 return SDValue();
12877
12878 SelectionDAG &DAG = DCI.DAG;
12879 SDLoc dl(AddcNode);
12880 unsigned Opcode = 0;
12881 SDValue Op0;
12882 SDValue Op1;
12883
12884 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12885 Opcode = ARMISD::SMLALBB;
12886 Op0 = Mul.getOperand(0);
12887 Op1 = Mul.getOperand(1);
12888 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12889 Opcode = ARMISD::SMLALBT;
12890 Op0 = Mul.getOperand(0);
12891 Op1 = Mul.getOperand(1).getOperand(0);
12892 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12893 Opcode = ARMISD::SMLALTB;
12894 Op0 = Mul.getOperand(0).getOperand(0);
12895 Op1 = Mul.getOperand(1);
12896 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12897 Opcode = ARMISD::SMLALTT;
12898 Op0 = Mul->getOperand(0).getOperand(0);
12899 Op1 = Mul->getOperand(1).getOperand(0);
12900 }
12901
12902 if (!Op0 || !Op1)
12903 return SDValue();
12904
12905 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12906 Op0, Op1, Lo, Hi);
12907 // Replace the ADD nodes' uses with the MLAL node's values.
12908 SDValue HiMLALResult(SMLAL.getNode(), 1);
12909 SDValue LoMLALResult(SMLAL.getNode(), 0);
12910
12911 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12912 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12913
12914 // Return original node to notify the driver to stop replacing.
12915 SDValue resNode(AddcNode, 0);
12916 return resNode;
12917}
12918
12919static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
12920                                     TargetLowering::DAGCombinerInfo &DCI,
12921 const ARMSubtarget *Subtarget) {
12922 // Look for multiply add opportunities.
12923 // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
12924 // each add node consumes a value from ISD::UMUL_LOHI and there is
12925 // a glue link from the first add to the second add.
12926 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12927 // a S/UMLAL instruction.
12928 // UMUL_LOHI
12929 // / :lo \ :hi
12930 // V \ [no multiline comment]
12931 // loAdd -> ADDC |
12932 // \ :carry /
12933 // V V
12934 // ADDE <- hiAdd
12935 //
12936 // In the special case where only the higher part of a signed result is used
12937 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12938 // a constant with the exact value of 0x80000000, we recognize we are dealing
12939 // with a "rounded multiply and add" (or subtract) and transform it into
12940 // either an ARMISD::SMMLAR or ARMISD::SMMLSR, respectively.
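//
// For illustration, a 64-bit accumulation such as 'acc += (u64)a * (u64)b' is
// legalized to UMUL_LOHI feeding an ADDC/ADDE pair, which this combine folds
// into a single UMLAL (or SMLAL for the signed case).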
12941
12942 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12943 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12944 "Expect an ADDE or SUBE");
12945
12946 assert(AddeSubeNode->getNumOperands() == 3 &&
12947 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12948 "ADDE node has the wrong inputs");
12949
12950 // Check that we are chained to the right ADDC or SUBC node.
12951 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12952 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12953 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12954 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12955 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12956 return SDValue();
12957
12958 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12959 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12960
12961 // Check if the two operands are from the same mul_lohi node.
12962 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12963 return SDValue();
12964
12965 assert(AddcSubcNode->getNumValues() == 2 &&
12966 AddcSubcNode->getValueType(0) == MVT::i32 &&
12967 "Expect ADDC with two result values. First: i32");
12968
12969 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12970 // may be an SMLAL which multiplies two 16-bit values.
12971 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12972 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12973 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12974 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12975 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12976 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12977
12978 // Check for the triangle shape.
12979 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12980 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12981
12982 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12983 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12984 return SDValue();
12985
12986 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12987 bool IsLeftOperandMUL = false;
12988 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12989 if (MULOp == SDValue())
12990 MULOp = findMUL_LOHI(AddeSubeOp1);
12991 else
12992 IsLeftOperandMUL = true;
12993 if (MULOp == SDValue())
12994 return SDValue();
12995
12996 // Figure out the right opcode.
12997 unsigned Opc = MULOp->getOpcode();
12998 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12999
13000 // Figure out the high and low input values to the MLAL node.
13001 SDValue *HiAddSub = nullptr;
13002 SDValue *LoMul = nullptr;
13003 SDValue *LowAddSub = nullptr;
13004
13005 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
13006 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
13007 return SDValue();
13008
13009 if (IsLeftOperandMUL)
13010 HiAddSub = &AddeSubeOp1;
13011 else
13012 HiAddSub = &AddeSubeOp0;
13013
13014 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
13015 // whose low result is fed to the ADDC/SUBC we are checking.
13016
13017 if (AddcSubcOp0 == MULOp.getValue(0)) {
13018 LoMul = &AddcSubcOp0;
13019 LowAddSub = &AddcSubcOp1;
13020 }
13021 if (AddcSubcOp1 == MULOp.getValue(0)) {
13022 LoMul = &AddcSubcOp1;
13023 LowAddSub = &AddcSubcOp0;
13024 }
13025
13026 if (!LoMul)
13027 return SDValue();
13028
13029 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
13030 // the replacement below will create a cycle.
13031 if (AddcSubcNode == HiAddSub->getNode() ||
13032 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
13033 return SDValue();
13034
13035 // Create the merged node.
13036 SelectionDAG &DAG = DCI.DAG;
13037
13038 // Start building operand list.
13039  SmallVector<SDValue, 8> Ops;
13040 Ops.push_back(LoMul->getOperand(0));
13041 Ops.push_back(LoMul->getOperand(1));
13042
13043 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
13044 // the case, we must be doing signed multiplication and only use the higher
13045 // part of the result of the MLAL; furthermore, the LowAddSub must be a constant
13046 // addition or subtraction with the value of 0x80000000.
13047 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
13048 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
13049 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
13050 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
13051 0x80000000) {
13052 Ops.push_back(*HiAddSub);
13053 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
13054 FinalOpc = ARMISD::SMMLSR;
13055 } else {
13056 FinalOpc = ARMISD::SMMLAR;
13057 }
13058 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
13059 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
13060
13061 return SDValue(AddeSubeNode, 0);
13062 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
13063 // SMMLS is generated during instruction selection and the rest of this
13064 // function can not handle the case where AddcSubcNode is a SUBC.
13065 return SDValue();
13066
13067 // Finish building the operand list for {U/S}MLAL
13068 Ops.push_back(*LowAddSub);
13069 Ops.push_back(*HiAddSub);
13070
13071 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
13072 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13073
13074 // Replace the ADDC/ADDE nodes' uses with the MLAL node's values.
13075 SDValue HiMLALResult(MLALNode.getNode(), 1);
13076 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
13077
13078 SDValue LoMLALResult(MLALNode.getNode(), 0);
13079 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
13080
13081 // Return original node to notify the driver to stop replacing.
13082 return SDValue(AddeSubeNode, 0);
13083}
13084
13085static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
13086 TargetLowering::DAGCombinerInfo &DCI,
13087 const ARMSubtarget *Subtarget) {
13088 // UMAAL is similar to UMLAL except that it adds two unsigned values.
13089 // While trying to combine for the other MLAL nodes, first search for the
13090 // chance to use UMAAL. Check if Addc uses a node which has already
13091 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
13092 // as the addend, and it's handled in PerformUMLALCombine.
13093
13094 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13095 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13096
13097 // Check that we have a glued ADDC node.
13098 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
13099 if (AddcNode->getOpcode() != ARMISD::ADDC)
13100 return SDValue();
13101
13102 // Find the converted UMAAL or quit if it doesn't exist.
13103 SDNode *UmlalNode = nullptr;
13104 SDValue AddHi;
13105 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
13106 UmlalNode = AddcNode->getOperand(0).getNode();
13107 AddHi = AddcNode->getOperand(1);
13108 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
13109 UmlalNode = AddcNode->getOperand(1).getNode();
13110 AddHi = AddcNode->getOperand(0);
13111 } else {
13112 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13113 }
13114
13115 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
13116 // the ADDC as well as Zero.
13117 if (!isNullConstant(UmlalNode->getOperand(3)))
13118 return SDValue();
13119
13120 if ((isNullConstant(AddeNode->getOperand(0)) &&
13121 AddeNode->getOperand(1).getNode() == UmlalNode) ||
13122 (AddeNode->getOperand(0).getNode() == UmlalNode &&
13123 isNullConstant(AddeNode->getOperand(1)))) {
13124 SelectionDAG &DAG = DCI.DAG;
13125 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
13126 UmlalNode->getOperand(2), AddHi };
13127 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
13128 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13129
13130 // Replace the ADDC/ADDE nodes' uses with the UMAAL node's values.
13131 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
13132 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
13133
13134 // Return original node to notify the driver to stop replacing.
13135 return SDValue(AddeNode, 0);
13136 }
13137 return SDValue();
13138}
13139
13140static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
13141 const ARMSubtarget *Subtarget) {
13142 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13143 return SDValue();
13144
13145 // Check that we have a pair of ADDC and ADDE as operands.
13146 // Both addends of the ADDE must be zero.
13147 SDNode* AddcNode = N->getOperand(2).getNode();
13148 SDNode* AddeNode = N->getOperand(3).getNode();
13149 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
13150 (AddeNode->getOpcode() == ARMISD::ADDE) &&
13151 isNullConstant(AddeNode->getOperand(0)) &&
13152 isNullConstant(AddeNode->getOperand(1)) &&
13153 (AddeNode->getOperand(2).getNode() == AddcNode))
13154 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
13155 DAG.getVTList(MVT::i32, MVT::i32),
13156 {N->getOperand(0), N->getOperand(1),
13157 AddcNode->getOperand(0), AddcNode->getOperand(1)});
13158 else
13159 return SDValue();
13160}
13161
13162static SDValue PerformAddcSubcCombine(SDNode *N,
13163 TargetLowering::DAGCombinerInfo &DCI,
13164 const ARMSubtarget *Subtarget) {
13165 SelectionDAG &DAG(DCI.DAG);
13166
13167 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
13168 // (SUBC (ADDE 0, 0, C), 1) -> C
13169 SDValue LHS = N->getOperand(0);
13170 SDValue RHS = N->getOperand(1);
13171 if (LHS->getOpcode() == ARMISD::ADDE &&
13172 isNullConstant(LHS->getOperand(0)) &&
13173 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
13174 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
13175 }
13176 }
13177
13178 if (Subtarget->isThumb1Only()) {
13179 SDValue RHS = N->getOperand(1);
13180 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
13181 int32_t imm = C->getSExtValue();
13182 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
13183 SDLoc DL(N);
13184 RHS = DAG.getConstant(-imm, DL, MVT::i32);
13185 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
13186 : ARMISD::ADDC;
13187 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
13188 }
13189 }
13190 }
13191
13192 return SDValue();
13193}
13194
13195static SDValue PerformAddeSubeCombine(SDNode *N,
13196 TargetLowering::DAGCombinerInfo &DCI,
13197 const ARMSubtarget *Subtarget) {
13198 if (Subtarget->isThumb1Only()) {
13199 SelectionDAG &DAG = DCI.DAG;
13200 SDValue RHS = N->getOperand(1);
13201 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
13202 int64_t imm = C->getSExtValue();
13203 if (imm < 0) {
13204 SDLoc DL(N);
13205
13206 // The with-carry-in form matches bitwise not instead of the negation.
13207 // Effectively, the inverse interpretation of the carry flag already
13208 // accounts for part of the negation.
13209 RHS = DAG.getConstant(~imm, DL, MVT::i32);
13210
13211 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
13212 : ARMISD::ADDE;
13213 return DAG.getNode(Opcode, DL, N->getVTList(),
13214 N->getOperand(0), RHS, N->getOperand(2));
13215 }
13216 }
13217 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
13218 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
13219 }
13220 return SDValue();
13221}
13222
13223static SDValue PerformSELECTCombine(SDNode *N,
13224 TargetLowering::DAGCombinerInfo &DCI,
13225 const ARMSubtarget *Subtarget) {
13226 if (!Subtarget->hasMVEIntegerOps())
13227 return SDValue();
13228
13229 SDLoc dl(N);
13230 SDValue SetCC;
13231 SDValue LHS;
13232 SDValue RHS;
13233 ISD::CondCode CC = ISD::SETCC_INVALID;
13234 SDValue TrueVal;
13235 SDValue FalseVal;
13236
13237 if (N->getOpcode() == ISD::SELECT &&
13238 N->getOperand(0)->getOpcode() == ISD::SETCC) {
13239 SetCC = N->getOperand(0);
13240 LHS = SetCC->getOperand(0);
13241 RHS = SetCC->getOperand(1);
13242 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
13243 TrueVal = N->getOperand(1);
13244 FalseVal = N->getOperand(2);
13245 } else if (N->getOpcode() == ISD::SELECT_CC) {
13246 LHS = N->getOperand(0);
13247 RHS = N->getOperand(1);
13248 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
13249 TrueVal = N->getOperand(2);
13250 FalseVal = N->getOperand(3);
13251 } else {
13252 return SDValue();
13253 }
13254
13255 unsigned int Opcode = 0;
13256 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
13257 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
13258 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
13259 Opcode = ARMISD::VMINVu;
13260 if (CC == ISD::SETUGT)
13261 std::swap(TrueVal, FalseVal);
13262 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
13263 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
13264 (CC == ISD::SETLT || CC == ISD::SETGT)) {
13265 Opcode = ARMISD::VMINVs;
13266 if (CC == ISD::SETGT)
13267 std::swap(TrueVal, FalseVal);
13268 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
13269 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13270 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13271 Opcode = ARMISD::VMAXVu;
13272 if (CC == ISD::SETULT)
13273 std::swap(TrueVal, FalseVal);
13274 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13275 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13276 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13277 Opcode = ARMISD::VMAXVs;
13278 if (CC == ISD::SETLT)
13279 std::swap(TrueVal, FalseVal);
13280 } else
13281 return SDValue();
13282
13283 // Normalise to the right hand side being the vector reduction
13284 switch (TrueVal->getOpcode()) {
13285 case ISD::VECREDUCE_UMIN:
13286 case ISD::VECREDUCE_SMIN:
13287 case ISD::VECREDUCE_UMAX:
13288 case ISD::VECREDUCE_SMAX:
13289 std::swap(LHS, RHS);
13290 std::swap(TrueVal, FalseVal);
13291 break;
13292 }
13293
13294 EVT VectorType = FalseVal->getOperand(0).getValueType();
13295
13296 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13297 VectorType != MVT::v4i32)
13298 return SDValue();
13299
13300 EVT VectorScalarType = VectorType.getVectorElementType();
13301
13302 // The values being selected must also be the ones being compared
13303 if (TrueVal != LHS || FalseVal != RHS)
13304 return SDValue();
13305
13306 EVT LeftType = LHS->getValueType(0);
13307 EVT RightType = RHS->getValueType(0);
13308
13309 // The types must match the reduced type too
13310 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13311 return SDValue();
13312
13313 // Legalise the scalar to an i32
13314 if (VectorScalarType != MVT::i32)
13315 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13316
13317 // Generate the reduction as an i32 for legalisation purposes
13318 auto Reduction =
13319 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13320
13321 // The result isn't actually an i32 so truncate it back to its original type
13322 if (VectorScalarType != MVT::i32)
13323 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13324
13325 return Reduction;
13326}
13327
13328// A special combine for the vqdmulh family of instructions. This is one of the
13329// potential set of patterns that could match this instruction. The base pattern
13330// you would expect to see is min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13331// This matches the variant min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13332// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15)) as
13333// the max is unnecessary.
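// For example, for i16 elements the pattern effectively being matched is
// smin(ashr(mul(sext(x), sext(y)), 15), 32767), which is what VQDMULH.S16
// computes: sat((2 * x * y) >> 16). E.g. x = y = 0x4000 gives 0x2000, while
// x = y = 0x8000 would give 32768 and saturates to 32767, matching the
// (1 << 15) - 1 clamp value checked for below.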
13334static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
13335 EVT VT = N->getValueType(0);
13336 SDValue Shft;
13337 ConstantSDNode *Clamp;
13338
13339 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13340 return SDValue();
13341
13342 if (N->getOpcode() == ISD::SMIN) {
13343 Shft = N->getOperand(0);
13344 Clamp = isConstOrConstSplat(N->getOperand(1));
13345 } else if (N->getOpcode() == ISD::VSELECT) {
13346 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13347 SDValue Cmp = N->getOperand(0);
13348 if (Cmp.getOpcode() != ISD::SETCC ||
13349 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13350 Cmp.getOperand(0) != N->getOperand(1) ||
13351 Cmp.getOperand(1) != N->getOperand(2))
13352 return SDValue();
13353 Shft = N->getOperand(1);
13354 Clamp = isConstOrConstSplat(N->getOperand(2));
13355 } else
13356 return SDValue();
13357
13358 if (!Clamp)
13359 return SDValue();
13360
13361 MVT ScalarType;
13362 int ShftAmt = 0;
13363 switch (Clamp->getSExtValue()) {
13364 case (1 << 7) - 1:
13365 ScalarType = MVT::i8;
13366 ShftAmt = 7;
13367 break;
13368 case (1 << 15) - 1:
13369 ScalarType = MVT::i16;
13370 ShftAmt = 15;
13371 break;
13372 case (1ULL << 31) - 1:
13373 ScalarType = MVT::i32;
13374 ShftAmt = 31;
13375 break;
13376 default:
13377 return SDValue();
13378 }
13379
13380 if (Shft.getOpcode() != ISD::SRA)
13381 return SDValue();
13382 ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
13383 if (!N1 || N1->getSExtValue() != ShftAmt)
13384 return SDValue();
13385
13386 SDValue Mul = Shft.getOperand(0);
13387 if (Mul.getOpcode() != ISD::MUL)
13388 return SDValue();
13389
13390 SDValue Ext0 = Mul.getOperand(0);
13391 SDValue Ext1 = Mul.getOperand(1);
13392 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13393 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13394 return SDValue();
13395 EVT VecVT = Ext0.getOperand(0).getValueType();
13396 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13397 return SDValue();
13398 if (Ext1.getOperand(0).getValueType() != VecVT ||
13399 VecVT.getScalarType() != ScalarType ||
13400 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13401 return SDValue();
13402
13403 SDLoc DL(Mul);
13404 unsigned LegalLanes = 128 / (ShftAmt + 1);
13405 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13406 // For types smaller than legal vectors extend to be legal and only use needed
13407 // lanes.
13408 if (VecVT.getSizeInBits() < 128) {
13409 EVT ExtVecVT =
13410 MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()),
13411 VecVT.getVectorNumElements());
13412 SDValue Inp0 =
13413 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13414 SDValue Inp1 =
13415 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13416 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13417 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13418 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13419 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13420 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13421 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13422 }
13423
13424 // For larger types, split into legal sized chunks.
13425 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13426 unsigned NumParts = VecVT.getSizeInBits() / 128;
13427 SmallVector<SDValue> Parts;
13428 for (unsigned I = 0; I < NumParts; ++I) {
13429 SDValue Inp0 =
13430 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13431 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13432 SDValue Inp1 =
13433 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13434 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13435 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13436 Parts.push_back(VQDMULH);
13437 }
13438 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13439 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13440}
13441
13442static SDValue PerformVSELECTCombine(SDNode *N,
13443 TargetLowering::DAGCombinerInfo &DCI,
13444 const ARMSubtarget *Subtarget) {
13445 if (!Subtarget->hasMVEIntegerOps())
13446 return SDValue();
13447
13448 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13449 return V;
13450
13451 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13452 //
13453 // We need to re-implement this optimization here as the implementation in the
13454 // Target-Independent DAGCombiner does not handle the kind of constant we make
13455 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13456 // good reason, allowing truncation there would break other targets).
13457 //
13458 // Currently, this is only done for MVE, as it's the only target that benefits
13459 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
13460 if (N->getOperand(0).getOpcode() != ISD::XOR)
13461 return SDValue();
13462 SDValue XOR = N->getOperand(0);
13463
13464 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13465 // It is important to check with truncation allowed as the BUILD_VECTORs we
13466 // generate in those situations will truncate their operands.
13467 ConstantSDNode *Const =
13468 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13469 /*AllowTruncation*/ true);
13470 if (!Const || !Const->isOne())
13471 return SDValue();
13472
13473 // Rewrite into vselect(cond, rhs, lhs).
13474 SDValue Cond = XOR->getOperand(0);
13475 SDValue LHS = N->getOperand(1);
13476 SDValue RHS = N->getOperand(2);
13477 EVT Type = N->getValueType(0);
13478 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13479}
13480
13481// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
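// For example, for v4i1 this turns
//   setcc(build_vector(0,1,2,3), splat(n), setult)
// into @llvm.arm.mve.vctp32(n), i.e. (roughly) a predicate whose first n
// lanes are active and whose remaining lanes are inactive.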
13482static SDValue PerformVSetCCToVCTPCombine(SDNode *N,
13483 TargetLowering::DAGCombinerInfo &DCI,
13484 const ARMSubtarget *Subtarget) {
13485 SDValue Op0 = N->getOperand(0);
13486 SDValue Op1 = N->getOperand(1);
13487 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13488 EVT VT = N->getValueType(0);
13489
13490 if (!Subtarget->hasMVEIntegerOps() ||
13491 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
13492 return SDValue();
13493
13494 if (CC == ISD::SETUGE) {
13495 std::swap(Op0, Op1);
13496 CC = ISD::SETULT;
13497 }
13498
13499 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13500 Op0.getOpcode() != ISD::BUILD_VECTOR)
13501 return SDValue();
13502
13503 // Check first operand is BuildVector of 0,1,2,...
13504 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13505 if (!Op0.getOperand(I).isUndef() &&
13506 !(isa<ConstantSDNode>(Op0.getOperand(I)) &&
13507 Op0.getConstantOperandVal(I) == I))
13508 return SDValue();
13509 }
13510
13511 // The second is a Splat of Op1S
13512 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13513 if (!Op1S)
13514 return SDValue();
13515
13516 unsigned Opc;
13517 switch (VT.getVectorNumElements()) {
13518 case 2:
13519 Opc = Intrinsic::arm_mve_vctp64;
13520 break;
13521 case 4:
13522 Opc = Intrinsic::arm_mve_vctp32;
13523 break;
13524 case 8:
13525 Opc = Intrinsic::arm_mve_vctp16;
13526 break;
13527 case 16:
13528 Opc = Intrinsic::arm_mve_vctp8;
13529 break;
13530 default:
13531 return SDValue();
13532 }
13533
13534 SDLoc DL(N);
13535 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13536 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13537 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13538}
13539
13540/// PerformADDECombine - Target-specific dag combine transform from
13541/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13542/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
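/// Broadly speaking, a 64-bit multiply-accumulate such as "a + (i64)b * (i64)c"
/// is legalized into an ISD::SMUL_LOHI/UMUL_LOHI whose two halves feed an
/// ADDC/ADDE pair; the combines below stitch that expansion back into a single
/// SMLAL/UMLAL (or UMAAL) node.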
13543static SDValue PerformADDECombine(SDNode *N,
13544 TargetLowering::DAGCombinerInfo &DCI,
13545 const ARMSubtarget *Subtarget) {
13546 // Only ARM and Thumb2 support UMLAL/SMLAL.
13547 if (Subtarget->isThumb1Only())
13548 return PerformAddeSubeCombine(N, DCI, Subtarget);
13549
13550 // Only perform the checks after legalize when the pattern is available.
13551 if (DCI.isBeforeLegalize()) return SDValue();
13552
13553 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13554}
13555
13556/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13557/// operands N0 and N1. This is a helper for PerformADDCombine that is
13558/// called with the default operands, and if that fails, with commuted
13559/// operands.
13560static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
13561 TargetLowering::DAGCombinerInfo &DCI,
13562 const ARMSubtarget *Subtarget){
13563 // Attempt to create vpadd for this add.
13564 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13565 return Result;
13566
13567 // Attempt to create vpaddl for this add.
13568 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13569 return Result;
13570 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13571 Subtarget))
13572 return Result;
13573
13574 // fold (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
13575 if (N0.getNode()->hasOneUse())
13576 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13577 return Result;
13578 return SDValue();
13579}
13580
13581static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
13582 EVT VT = N->getValueType(0);
13583 SDValue N0 = N->getOperand(0);
13584 SDValue N1 = N->getOperand(1);
13585 SDLoc dl(N);
13586
13587 auto IsVecReduce = [](SDValue Op) {
13588 switch (Op.getOpcode()) {
13589 case ISD::VECREDUCE_ADD:
13590 case ARMISD::VADDVs:
13591 case ARMISD::VADDVu:
13592 case ARMISD::VMLAVs:
13593 case ARMISD::VMLAVu:
13594 return true;
13595 }
13596 return false;
13597 };
13598
13599 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13600 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13601 // add(add(X, vecreduce(Y)), vecreduce(Z))
13602 // to make better use of vaddva style instructions.
13603 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13604 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13605 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13606 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13607 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13608 }
13609 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13610 // add(add(add(A, C), reduce(B)), reduce(D))
13611 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13612 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
13613 unsigned N0RedOp = 0;
13614 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13615 N0RedOp = 1;
13616 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13617 return SDValue();
13618 }
13619
13620 unsigned N1RedOp = 0;
13621 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13622 N1RedOp = 1;
13623 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13624 return SDValue();
13625
13626 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13627 N1.getOperand(1 - N1RedOp));
13628 SDValue Add1 =
13629 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13630 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13631 }
13632 return SDValue();
13633 };
13634 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13635 return R;
13636 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13637 return R;
13638
13639 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13640 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13641 // by ascending load offsets. This can help cores prefetch if the order of
13642 // loads is more predictable.
13643 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13644 // Check if two reductions are known to load data where one is before/after
13645 // another. Return negative if N0 loads data before N1, positive if N1 is
13646 // before N0, and 0 if nothing is known.
13647 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13648 // Look through to the first operand of a MUL, for the VMLA case.
13649 // Currently only looks at the first operand, in the hope they are equal.
13650 if (N0.getOpcode() == ISD::MUL)
13651 N0 = N0.getOperand(0);
13652 if (N1.getOpcode() == ISD::MUL)
13653 N1 = N1.getOperand(0);
13654
13655 // Return true if the two operands are loads to the same object and the
13656 // offset of the first is known to be less than the offset of the second.
13657 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13658 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13659 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13660 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13661 Load1->isIndexed())
13662 return 0;
13663
13664 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13665 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13666
13667 if (!BaseLocDecomp0.getBase() ||
13668 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13669 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13670 return 0;
13671 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13672 return -1;
13673 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13674 return 1;
13675 return 0;
13676 };
13677
13678 SDValue X;
13679 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13680 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
13681 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13682 N0.getOperand(1).getOperand(0));
13683 if (IsBefore < 0) {
13684 X = N0.getOperand(0);
13685 N0 = N0.getOperand(1);
13686 } else if (IsBefore > 0) {
13687 X = N0.getOperand(1);
13688 N0 = N0.getOperand(0);
13689 } else
13690 return SDValue();
13691 } else if (IsVecReduce(N0.getOperand(0))) {
13692 X = N0.getOperand(1);
13693 N0 = N0.getOperand(0);
13694 } else if (IsVecReduce(N0.getOperand(1))) {
13695 X = N0.getOperand(0);
13696 N0 = N0.getOperand(1);
13697 } else
13698 return SDValue();
13699 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13700 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13701 // Note this is backward to how you would expect. We create
13702 // add(reduce(load + 16), reduce(load + 0)) so that the
13703 // add(reduce(load+16), X) is combined into VADDVA(X, load+16), leaving
13704 // the X as VADDV(load + 0)
13705 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13706 } else
13707 return SDValue();
13708
13709 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13710 return SDValue();
13711
13712 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13713 return SDValue();
13714
13715 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13716 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13717 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13718 };
13719 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13720 return R;
13721 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13722 return R;
13723 return SDValue();
13724}
13725
13726static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
13727 const ARMSubtarget *Subtarget) {
13728 if (!Subtarget->hasMVEIntegerOps())
13729 return SDValue();
13730
13731 if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
13732 return R;
13733
13734 EVT VT = N->getValueType(0);
13735 SDValue N0 = N->getOperand(0);
13736 SDValue N1 = N->getOperand(1);
13737 SDLoc dl(N);
13738
13739 if (VT != MVT::i64)
13740 return SDValue();
13741
13742 // We are looking for an i64 add of a VADDLVx. Due to these being i64s, this
13743 // will look like:
13744 // t1: i32,i32 = ARMISD::VADDLVs x
13745 // t2: i64 = build_pair t1, t1:1
13746 // t3: i64 = add t2, y
13747 // Otherwise we try to push the add up above VADDLVAx, to potentially allow
13748 // the add to be simplified separately.
13749 // We also need to check for sext / zext and commutative adds.
13750 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13751 SDValue NB) {
13752 if (NB->getOpcode() != ISD::BUILD_PAIR)
13753 return SDValue();
13754 SDValue VecRed = NB->getOperand(0);
13755 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13756 VecRed.getResNo() != 0 ||
13757 NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
13758 return SDValue();
13759
13760 if (VecRed->getOpcode() == OpcodeA) {
13761 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
13762 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13763 VecRed.getOperand(0), VecRed.getOperand(1));
13764 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13765 }
13766
13768 std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);
13769
13770 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13771 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13772 Ops.push_back(VecRed->getOperand(I));
13773 SDValue Red =
13774 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13775 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13776 SDValue(Red.getNode(), 1));
13777 };
13778
13779 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13780 return M;
13781 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13782 return M;
13783 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13784 return M;
13785 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13786 return M;
13787 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13788 return M;
13789 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13790 return M;
13791 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13792 return M;
13793 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13794 return M;
13795 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13796 return M;
13797 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13798 return M;
13799 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13800 return M;
13801 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13802 return M;
13803 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13804 return M;
13805 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13806 return M;
13807 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13808 return M;
13809 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13810 return M;
13811 return SDValue();
13812}
13813
13814bool
13815ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
13816 CombineLevel Level) const {
13817 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13818 N->getOpcode() == ISD::SRL) &&
13819 "Expected shift op");
13820
13821 if (Level == BeforeLegalizeTypes)
13822 return true;
13823
13824 if (N->getOpcode() != ISD::SHL)
13825 return true;
13826
13827 if (Subtarget->isThumb1Only()) {
13828 // Avoid making expensive immediates by commuting shifts. (This logic
13829 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13830 // for free.)
13831 if (N->getOpcode() != ISD::SHL)
13832 return true;
13833 SDValue N1 = N->getOperand(0);
13834 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13835 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13836 return true;
13837 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
13838 if (Const->getAPIntValue().ult(256))
13839 return false;
13840 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13841 Const->getAPIntValue().sgt(-256))
13842 return false;
13843 }
13844 return true;
13845 }
13846
13847 // Turn off commute-with-shift transform after legalization, so it doesn't
13848 // conflict with PerformSHLSimplify. (We could try to detect when
13849 // PerformSHLSimplify would trigger more precisely, but it isn't
13850 // really necessary.)
13851 return false;
13852}
13853
13854bool ARMTargetLowering::isDesirableToCommuteXorWithShift(
13855 const SDNode *N) const {
13856 assert(N->getOpcode() == ISD::XOR &&
13857 (N->getOperand(0).getOpcode() == ISD::SHL ||
13858 N->getOperand(0).getOpcode() == ISD::SRL) &&
13859 "Expected XOR(SHIFT) pattern");
13860
13861 // Only commute if the entire NOT mask is a hidden shifted mask.
13862 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13863 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13864 if (XorC && ShiftC) {
13865 unsigned MaskIdx, MaskLen;
13866 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13867 unsigned ShiftAmt = ShiftC->getZExtValue();
13868 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
13869 if (N->getOperand(0).getOpcode() == ISD::SHL)
13870 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
13871 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13872 }
13873 }
13874
13875 return false;
13876}
13877
13878bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
13879 const SDNode *N, CombineLevel Level) const {
13880 assert(((N->getOpcode() == ISD::SHL &&
13881 N->getOperand(0).getOpcode() == ISD::SRL) ||
13882 (N->getOpcode() == ISD::SRL &&
13883 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13884 "Expected shift-shift mask");
13885
13886 if (!Subtarget->isThumb1Only())
13887 return true;
13888
13889 if (Level == BeforeLegalizeTypes)
13890 return true;
13891
13892 return false;
13893}
13894
13895bool ARMTargetLowering::shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
13896 EVT VT) const {
13897 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT);
13898}
13899
13900bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
13901 if (!Subtarget->hasNEON()) {
13902 if (Subtarget->isThumb1Only())
13903 return VT.getScalarSizeInBits() <= 32;
13904 return true;
13905 }
13906 return VT.isScalarInteger();
13907}
13908
13909bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
13910 EVT VT) const {
13911 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13912 return false;
13913
13914 switch (FPVT.getSimpleVT().SimpleTy) {
13915 case MVT::f16:
13916 return Subtarget->hasVFP2Base();
13917 case MVT::f32:
13918 return Subtarget->hasVFP2Base();
13919 case MVT::f64:
13920 return Subtarget->hasFP64();
13921 case MVT::v4f32:
13922 case MVT::v8f16:
13923 return Subtarget->hasMVEFloatOps();
13924 default:
13925 return false;
13926 }
13927}
13928
13929static SDValue PerformSHLSimplify(SDNode *N,
13930 TargetLowering::DAGCombinerInfo &DCI,
13931 const ARMSubtarget *ST) {
13932 // Allow the generic combiner to identify potential bswaps.
13933 if (DCI.isBeforeLegalize())
13934 return SDValue();
13935
13936 // DAG combiner will fold:
13937 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13938 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
13939 // Other code patterns that can also be modified have the following form:
13940 // b + ((a << 1) | 510)
13941 // b + ((a << 1) & 510)
13942 // b + ((a << 1) ^ 510)
13943 // b + ((a << 1) + 510)
13944
13945 // Many instructions can perform the shift for free, but it requires both
13946 // the operands to be registers. If c1 << c2 is too large, a mov immediate
13947 // instruction will be needed. So, unfold back to the original pattern if:
13948 // - c1 and c2 are small enough that they don't require mov imms.
13949 // - the user(s) of the node can perform an shl.
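// For instance (illustrative values): with c1 = 0x3FC and c2 = 2,
// add(shl(x, 2), 0x3FC) can be rewritten as shl(add(x, 0xFF), 2); 0xFF is a
// cheap immediate and, provided every user can fold the shl as a shifted
// operand (checked below), no extra mov imm is required.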
13950
13951 // No shifted operands for 16-bit instructions.
13952 if (ST->isThumb() && ST->isThumb1Only())
13953 return SDValue();
13954
13955 // Check that all the users could perform the shl themselves.
13956 for (auto *U : N->uses()) {
13957 switch(U->getOpcode()) {
13958 default:
13959 return SDValue();
13960 case ISD::SUB:
13961 case ISD::ADD:
13962 case ISD::AND:
13963 case ISD::OR:
13964 case ISD::XOR:
13965 case ISD::SETCC:
13966 case ARMISD::CMP:
13967 // Check that the user isn't already using a constant because there
13968 // aren't any instructions that support an immediate operand and a
13969 // shifted operand.
13970 if (isa<ConstantSDNode>(U->getOperand(0)) ||
13971 isa<ConstantSDNode>(U->getOperand(1)))
13972 return SDValue();
13973
13974 // Check that it's not already using a shift.
13975 if (U->getOperand(0).getOpcode() == ISD::SHL ||
13976 U->getOperand(1).getOpcode() == ISD::SHL)
13977 return SDValue();
13978 break;
13979 }
13980 }
13981
13982 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13983 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13984 return SDValue();
13985
13986 if (N->getOperand(0).getOpcode() != ISD::SHL)
13987 return SDValue();
13988
13989 SDValue SHL = N->getOperand(0);
13990
13991 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
13992 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
13993 if (!C1ShlC2 || !C2)
13994 return SDValue();
13995
13996 APInt C2Int = C2->getAPIntValue();
13997 APInt C1Int = C1ShlC2->getAPIntValue();
13998 unsigned C2Width = C2Int.getBitWidth();
13999 if (C2Int.uge(C2Width))
14000 return SDValue();
14001 uint64_t C2Value = C2Int.getZExtValue();
14002
14003 // Check that performing a lshr will not lose any information.
14004 APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
14005 if ((C1Int & Mask) != C1Int)
14006 return SDValue();
14007
14008 // Shift the first constant.
14009 C1Int.lshrInPlace(C2Int);
14010
14011 // The immediates are encoded as an 8-bit value that can be rotated.
14012 auto LargeImm = [](const APInt &Imm) {
14013 unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
14014 return Imm.getBitWidth() - Zeros > 8;
14015 };
14016
14017 if (LargeImm(C1Int) || LargeImm(C2Int))
14018 return SDValue();
14019
14020 SelectionDAG &DAG = DCI.DAG;
14021 SDLoc dl(N);
14022 SDValue X = SHL.getOperand(0);
14023 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
14024 DAG.getConstant(C1Int, dl, MVT::i32));
14025 // Shift left to compensate for the lshr of C1Int.
14026 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
14027
14028 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
14029 SHL.dump(); N->dump());
14030 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
14031 return Res;
14032}
14033
14034
14035/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
14036///
14037static SDValue PerformADDCombine(SDNode *N,
14038 TargetLowering::DAGCombinerInfo &DCI,
14039 const ARMSubtarget *Subtarget) {
14040 SDValue N0 = N->getOperand(0);
14041 SDValue N1 = N->getOperand(1);
14042
14043 // Only works one way, because it needs an immediate operand.
14044 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14045 return Result;
14046
14047 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
14048 return Result;
14049
14050 // First try with the default operand order.
14051 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
14052 return Result;
14053
14054 // If that didn't work, try again with the operands commuted.
14055 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
14056}
14057
14058// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
14059// providing -X is as cheap as X (currently, just a constant).
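// For example, with X = 1: sub(0, csinc(1, Y, CC)) is CC ? -1 : -(Y + 1),
// which is the same as csinv(-1, Y, CC) (CSINC picks X when CC holds and
// Y + 1 otherwise; CSINV picks the first operand or ~Y), so only the negated
// constant needs to be materialized.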
14060static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
14061 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
14062 return SDValue();
14063 SDValue CSINC = N->getOperand(1);
14064 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
14065 return SDValue();
14066
14067 ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));
14068 if (!X)
14069 return SDValue();
14070
14071 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
14072 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
14073 CSINC.getOperand(0)),
14074 CSINC.getOperand(1), CSINC.getOperand(2),
14075 CSINC.getOperand(3));
14076}
14077
14078/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
14079///
14080static SDValue PerformSUBCombine(SDNode *N,
14081 TargetLowering::DAGCombinerInfo &DCI,
14082 const ARMSubtarget *Subtarget) {
14083 SDValue N0 = N->getOperand(0);
14084 SDValue N1 = N->getOperand(1);
14085
14086 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub x, c))
14087 if (N1.getNode()->hasOneUse())
14088 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
14089 return Result;
14090
14091 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
14092 return R;
14093
14094 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
14095 return SDValue();
14096
14097 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
14098 // so that we can readily pattern match more mve instructions which can use
14099 // a scalar operand.
14100 SDValue VDup = N->getOperand(1);
14101 if (VDup->getOpcode() != ARMISD::VDUP)
14102 return SDValue();
14103
14104 SDValue VMov = N->getOperand(0);
14105 if (VMov->getOpcode() == ISD::BITCAST)
14106 VMov = VMov->getOperand(0);
14107
14108 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
14109 return SDValue();
14110
14111 SDLoc dl(N);
14112 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
14113 DCI.DAG.getConstant(0, dl, MVT::i32),
14114 VDup->getOperand(0));
14115 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
14116}
14117
14118/// PerformVMULCombine
14119/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
14120/// special multiplier accumulator forwarding.
14121/// vmul d3, d0, d2
14122/// vmla d3, d1, d2
14123/// is faster than
14124/// vadd d3, d0, d1
14125/// vmul d3, d3, d2
14126// However, for (A + B) * (A + B),
14127// vadd d2, d0, d1
14128// vmul d3, d0, d2
14129// vmla d3, d1, d2
14130// is slower than
14131// vadd d2, d0, d1
14132// vmul d3, d2, d2
14133static SDValue PerformVMULCombine(SDNode *N,
14134 TargetLowering::DAGCombinerInfo &DCI,
14135 const ARMSubtarget *Subtarget) {
14136 if (!Subtarget->hasVMLxForwarding())
14137 return SDValue();
14138
14139 SelectionDAG &DAG = DCI.DAG;
14140 SDValue N0 = N->getOperand(0);
14141 SDValue N1 = N->getOperand(1);
14142 unsigned Opcode = N0.getOpcode();
14143 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14144 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
14145 Opcode = N1.getOpcode();
14146 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14147 Opcode != ISD::FADD && Opcode != ISD::FSUB)
14148 return SDValue();
14149 std::swap(N0, N1);
14150 }
14151
14152 if (N0 == N1)
14153 return SDValue();
14154
14155 EVT VT = N->getValueType(0);
14156 SDLoc DL(N);
14157 SDValue N00 = N0->getOperand(0);
14158 SDValue N01 = N0->getOperand(1);
14159 return DAG.getNode(Opcode, DL, VT,
14160 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
14161 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
14162}
14163
14164static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
14165 const ARMSubtarget *Subtarget) {
14166 EVT VT = N->getValueType(0);
14167 if (VT != MVT::v2i64)
14168 return SDValue();
14169
14170 SDValue N0 = N->getOperand(0);
14171 SDValue N1 = N->getOperand(1);
14172
14173 auto IsSignExt = [&](SDValue Op) {
14174 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
14175 return SDValue();
14176 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
14177 if (VT.getScalarSizeInBits() == 32)
14178 return Op->getOperand(0);
14179 return SDValue();
14180 };
14181 auto IsZeroExt = [&](SDValue Op) {
14182 // Zero extends are a little more awkward. At the point we are matching
14183 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
14184 // That might be before or after a bitcast depending on how the and is
14185 // placed. Because this has to look through bitcasts, it is currently only
14186 // supported on LE.
14187 if (!Subtarget->isLittle())
14188 return SDValue();
14189
14190 SDValue And = Op;
14191 if (And->getOpcode() == ISD::BITCAST)
14192 And = And->getOperand(0);
14193 if (And->getOpcode() != ISD::AND)
14194 return SDValue();
14195 SDValue Mask = And->getOperand(1);
14196 if (Mask->getOpcode() == ISD::BITCAST)
14197 Mask = Mask->getOperand(0);
14198
14199 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
14200 Mask.getValueType() != MVT::v4i32)
14201 return SDValue();
14202 if (isAllOnesConstant(Mask->getOperand(0)) &&
14203 isNullConstant(Mask->getOperand(1)) &&
14204 isAllOnesConstant(Mask->getOperand(2)) &&
14205 isNullConstant(Mask->getOperand(3)))
14206 return And->getOperand(0);
14207 return SDValue();
14208 };
14209
14210 SDLoc dl(N);
14211 if (SDValue Op0 = IsSignExt(N0)) {
14212 if (SDValue Op1 = IsSignExt(N1)) {
14213 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14214 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14215 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
14216 }
14217 }
14218 if (SDValue Op0 = IsZeroExt(N0)) {
14219 if (SDValue Op1 = IsZeroExt(N1)) {
14220 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14221 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14222 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
14223 }
14224 }
14225
14226 return SDValue();
14227}
14228
14229static SDValue PerformMULCombine(SDNode *N,
14230 TargetLowering::DAGCombinerInfo &DCI,
14231 const ARMSubtarget *Subtarget) {
14232 SelectionDAG &DAG = DCI.DAG;
14233
14234 EVT VT = N->getValueType(0);
14235 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
14236 return PerformMVEVMULLCombine(N, DAG, Subtarget);
14237
14238 if (Subtarget->isThumb1Only())
14239 return SDValue();
14240
14241 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14242 return SDValue();
14243
14244 if (VT.is64BitVector() || VT.is128BitVector())
14245 return PerformVMULCombine(N, DCI, Subtarget);
14246 if (VT != MVT::i32)
14247 return SDValue();
14248
14249 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14250 if (!C)
14251 return SDValue();
14252
14253 int64_t MulAmt = C->getSExtValue();
14254 unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);
14255
14256 ShiftAmt = ShiftAmt & (32 - 1);
14257 SDValue V = N->getOperand(0);
14258 SDLoc DL(N);
14259
14260 SDValue Res;
14261 MulAmt >>= ShiftAmt;
14262
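// Worked example: MulAmt = 72 = 9 << 3, so ShiftAmt = 3 and the reduced
// MulAmt is 9 = 2^3 + 1. The code below builds add(shl(x, 3), x) and the
// final shl by ShiftAmt restores the factor of 8, giving x * 72 overall.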
14263 if (MulAmt >= 0) {
14264 if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
14265 // (mul x, 2^N + 1) => (add (shl x, N), x)
14266 Res = DAG.getNode(ISD::ADD, DL, VT,
14267 V,
14268 DAG.getNode(ISD::SHL, DL, VT,
14269 V,
14270 DAG.getConstant(Log2_32(MulAmt - 1), DL,
14271 MVT::i32)));
14272 } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
14273 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14274 Res = DAG.getNode(ISD::SUB, DL, VT,
14275 DAG.getNode(ISD::SHL, DL, VT,
14276 V,
14277 DAG.getConstant(Log2_32(MulAmt + 1), DL,
14278 MVT::i32)),
14279 V);
14280 } else
14281 return SDValue();
14282 } else {
14283 uint64_t MulAmtAbs = -MulAmt;
14284 if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
14285 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
14286 Res = DAG.getNode(ISD::SUB, DL, VT,
14287 V,
14288 DAG.getNode(ISD::SHL, DL, VT,
14289 V,
14290 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
14291 MVT::i32)));
14292 } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
14293 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14294 Res = DAG.getNode(ISD::ADD, DL, VT,
14295 V,
14296 DAG.getNode(ISD::SHL, DL, VT,
14297 V,
14298 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
14299 MVT::i32)));
14300 Res = DAG.getNode(ISD::SUB, DL, VT,
14301 DAG.getConstant(0, DL, MVT::i32), Res);
14302 } else
14303 return SDValue();
14304 }
14305
14306 if (ShiftAmt != 0)
14307 Res = DAG.getNode(ISD::SHL, DL, VT,
14308 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
14309
14310 // Do not add new nodes to DAG combiner worklist.
14311 DCI.CombineTo(N, Res, false);
14312 return SDValue();
14313}
14314
14315static SDValue CombineANDShift(SDNode *N,
14316 TargetLowering::DAGCombinerInfo &DCI,
14317 const ARMSubtarget *Subtarget) {
14318 // Allow DAGCombine to pattern-match before we touch the canonical form.
14319 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14320 return SDValue();
14321
14322 if (N->getValueType(0) != MVT::i32)
14323 return SDValue();
14324
14325 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14326 if (!N1C)
14327 return SDValue();
14328
14329 uint32_t C1 = (uint32_t)N1C->getZExtValue();
14330 // Don't transform uxtb/uxth.
14331 if (C1 == 255 || C1 == 65535)
14332 return SDValue();
14333
14334 SDNode *N0 = N->getOperand(0).getNode();
14335 if (!N0->hasOneUse())
14336 return SDValue();
14337
14338 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
14339 return SDValue();
14340
14341 bool LeftShift = N0->getOpcode() == ISD::SHL;
14342
14343 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
14344 if (!N01C)
14345 return SDValue();
14346
14347 uint32_t C2 = (uint32_t)N01C->getZExtValue();
14348 if (!C2 || C2 >= 32)
14349 return SDValue();
14350
14351 // Clear irrelevant bits in the mask.
14352 if (LeftShift)
14353 C1 &= (-1U << C2);
14354 else
14355 C1 &= (-1U >> C2);
14356
14357 SelectionDAG &DAG = DCI.DAG;
14358 SDLoc DL(N);
14359
14360 // We have a pattern of the form "(and (shl x, c2) c1)" or
14361 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
14362 // transform to a pair of shifts, to save materializing c1.
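// For example, (and (srl x, 2), 0x00FFFFFF) has C1 = 0x00FFFFFF (a mask with
// C3 = 8 leading zeros) and C2 = 2 < C3, so it becomes (srl (shl x, 6), 8)
// and no constant needs to be materialized for the mask.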
14363
14364 // First pattern: right shift, then mask off leading bits.
14365 // FIXME: Use demanded bits?
14366 if (!LeftShift && isMask_32(C1)) {
14367 uint32_t C3 = llvm::countl_zero(C1);
14368 if (C2 < C3) {
14369 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14370 DAG.getConstant(C3 - C2, DL, MVT::i32));
14371 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14372 DAG.getConstant(C3, DL, MVT::i32));
14373 }
14374 }
14375
14376 // First pattern, reversed: left shift, then mask off trailing bits.
14377 if (LeftShift && isMask_32(~C1)) {
14378 uint32_t C3 = llvm::countr_zero(C1);
14379 if (C2 < C3) {
14380 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14381 DAG.getConstant(C3 - C2, DL, MVT::i32));
14382 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14383 DAG.getConstant(C3, DL, MVT::i32));
14384 }
14385 }
14386
14387 // Second pattern: left shift, then mask off leading bits.
14388 // FIXME: Use demanded bits?
14389 if (LeftShift && isShiftedMask_32(C1)) {
14390 uint32_t Trailing = llvm::countr_zero(C1);
14391 uint32_t C3 = llvm::countl_zero(C1);
14392 if (Trailing == C2 && C2 + C3 < 32) {
14393 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14394 DAG.getConstant(C2 + C3, DL, MVT::i32));
14395 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14396 DAG.getConstant(C3, DL, MVT::i32));
14397 }
14398 }
14399
14400 // Second pattern, reversed: right shift, then mask off trailing bits.
14401 // FIXME: Handle other patterns of known/demanded bits.
14402 if (!LeftShift && isShiftedMask_32(C1)) {
14403 uint32_t Leading = llvm::countl_zero(C1);
14404 uint32_t C3 = llvm::countr_zero(C1);
14405 if (Leading == C2 && C2 + C3 < 32) {
14406 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14407 DAG.getConstant(C2 + C3, DL, MVT::i32));
14408 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14409 DAG.getConstant(C3, DL, MVT::i32));
14410 }
14411 }
14412
14413 // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
14414 // if "c1 >> c2" is a cheaper immediate than "c1"
14415 if (LeftShift &&
14416 HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {
14417
14418 SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
14419 DAG.getConstant(C1 >> C2, DL, MVT::i32));
14420 return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
14421 DAG.getConstant(C2, DL, MVT::i32));
14422 }
14423
14424 return SDValue();
14425}
14426
14427static SDValue PerformANDCombine(SDNode *N,
14428 TargetLowering::DAGCombinerInfo &DCI,
14429 const ARMSubtarget *Subtarget) {
14430 // Attempt to use immediate-form VBIC
14431 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14432 SDLoc dl(N);
14433 EVT VT = N->getValueType(0);
14434 SelectionDAG &DAG = DCI.DAG;
14435
14436 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14437 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14438 return SDValue();
14439
14440 APInt SplatBits, SplatUndef;
14441 unsigned SplatBitSize;
14442 bool HasAnyUndefs;
14443 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14444 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14445 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14446 SplatBitSize == 64) {
14447 EVT VbicVT;
14448 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
14449 SplatUndef.getZExtValue(), SplatBitSize,
14450 DAG, dl, VbicVT, VT, OtherModImm);
14451 if (Val.getNode()) {
14452 SDValue Input =
14453 DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
14454 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
14455 return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
14456 }
14457 }
14458 }
14459
14460 if (!Subtarget->isThumb1Only()) {
14461 // fold (and (select cc, -1, c), x) -> (select cc, x, (and x, c))
14462 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
14463 return Result;
14464
14465 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14466 return Result;
14467 }
14468
14469 if (Subtarget->isThumb1Only())
14470 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14471 return Result;
14472
14473 return SDValue();
14474}
14475
14476// Try combining OR nodes to SMULWB, SMULWT.
14477static SDValue PerformORCombineToSMULWBT(SDNode *OR,
14478 TargetLowering::DAGCombinerInfo &DCI,
14479 const ARMSubtarget *Subtarget) {
14480 if (!Subtarget->hasV6Ops() ||
14481 (Subtarget->isThumb() &&
14482 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
14483 return SDValue();
14484
14485 SDValue SRL = OR->getOperand(0);
14486 SDValue SHL = OR->getOperand(1);
14487
14488 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
14489 SRL = OR->getOperand(1);
14490 SHL = OR->getOperand(0);
14491 }
14492 if (!isSRL16(SRL) || !isSHL16(SHL))
14493 return SDValue();
14494
14495 // The first operands to the shifts need to be the two results from the
14496 // same smul_lohi node.
14497 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
14498 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
14499 return SDValue();
14500
14501 SDNode *SMULLOHI = SRL.getOperand(0).getNode();
14502 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
14503 SHL.getOperand(0) != SDValue(SMULLOHI, 1))
14504 return SDValue();
14505
14506 // Now we have:
14507 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
14508 // For SMULW[B|T], smul_lohi will take a 32-bit and a 16-bit argument.
14509 // For SMULWB the 16-bit value will be sign extended somehow.
14510 // For SMULWT only the SRA is required.
14511 // Check both sides of SMUL_LOHI
14512 SDValue OpS16 = SMULLOHI->getOperand(0);
14513 SDValue OpS32 = SMULLOHI->getOperand(1);
14514
14515 SelectionDAG &DAG = DCI.DAG;
14516 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
14517 OpS16 = OpS32;
14518 OpS32 = SMULLOHI->getOperand(0);
14519 }
14520
14521 SDLoc dl(OR);
14522 unsigned Opcode = 0;
14523 if (isS16(OpS16, DAG))
14524 Opcode = ARMISD::SMULWB;
14525 else if (isSRA16(OpS16)) {
14526 Opcode = ARMISD::SMULWT;
14527 OpS16 = OpS16->getOperand(0);
14528 }
14529 else
14530 return SDValue();
14531
14532 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
14533 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
14534 return SDValue(OR, 0);
14535}
14536
14537static SDValue PerformORCombineToBFI(SDNode *N,
14538 TargetLowering::DAGCombinerInfo &DCI,
14539 const ARMSubtarget *Subtarget) {
14540 // BFI is only available on V6T2+
14541 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14542 return SDValue();
14543
14544 EVT VT = N->getValueType(0);
14545 SDValue N0 = N->getOperand(0);
14546 SDValue N1 = N->getOperand(1);
14547 SelectionDAG &DAG = DCI.DAG;
14548 SDLoc DL(N);
14549 // 1) or (and A, mask), val => ARMbfi A, val, mask
14550 // iff (val & mask) == val
14551 //
14552 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14553 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14554 // && mask == ~mask2
14555 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14556 // && ~mask == mask2
14557 // (i.e., copy a bitfield value into another bitfield of the same width)
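// For example (case 1): or (and A, 0xFFFF00FF), 0x00002A00 becomes
// ARMbfi A, 0x2A, 0xFFFF00FF, i.e. the value 0x2A is inserted into bits
// [15:8] of A.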
14558
14559 if (VT != MVT::i32)
14560 return SDValue();
14561
14562 SDValue N00 = N0.getOperand(0);
14563
14564 // The value and the mask need to be constants so we can verify this is
14565 // actually a bitfield set. If the mask is 0xffff, we can do better
14566 // via a movt instruction, so don't use BFI in that case.
14567 SDValue MaskOp = N0.getOperand(1);
14568 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
14569 if (!MaskC)
14570 return SDValue();
14571 unsigned Mask = MaskC->getZExtValue();
14572 if (Mask == 0xffff)
14573 return SDValue();
14574 SDValue Res;
14575 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14576 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
14577 if (N1C) {
14578 unsigned Val = N1C->getZExtValue();
14579 if ((Val & ~Mask) != Val)
14580 return SDValue();
14581
14582 if (ARM::isBitFieldInvertedMask(Mask)) {
14583 Val >>= llvm::countr_zero(~Mask);
14584
14585 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14586 DAG.getConstant(Val, DL, MVT::i32),
14587 DAG.getConstant(Mask, DL, MVT::i32));
14588
14589 DCI.CombineTo(N, Res, false);
14590 // Return value from the original node to inform the combiner that N is
14591 // now dead.
14592 return SDValue(N, 0);
14593 }
14594 } else if (N1.getOpcode() == ISD::AND) {
14595 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14596 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14597 if (!N11C)
14598 return SDValue();
14599 unsigned Mask2 = N11C->getZExtValue();
14600
14601 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
14602 // as is to match.
14603 if (ARM::isBitFieldInvertedMask(Mask) &&
14604 (Mask == ~Mask2)) {
14605 // The pack halfword instruction works better for masks that fit it,
14606 // so use that when it's available.
14607 if (Subtarget->hasDSP() &&
14608 (Mask == 0xffff || Mask == 0xffff0000))
14609 return SDValue();
14610 // 2a
14611 unsigned amt = llvm::countr_zero(Mask2);
14612 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14613 DAG.getConstant(amt, DL, MVT::i32));
14614 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14615 DAG.getConstant(Mask, DL, MVT::i32));
14616 DCI.CombineTo(N, Res, false);
14617 // Return value from the original node to inform the combiner that N is
14618 // now dead.
14619 return SDValue(N, 0);
14620 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14621 (~Mask == Mask2)) {
14622 // The pack halfword instruction works better for masks that fit it,
14623 // so use that when it's available.
14624 if (Subtarget->hasDSP() &&
14625 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14626 return SDValue();
14627 // 2b
14628 unsigned lsb = llvm::countr_zero(Mask);
14629 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14630 DAG.getConstant(lsb, DL, MVT::i32));
14631 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14632 DAG.getConstant(Mask2, DL, MVT::i32));
14633 DCI.CombineTo(N, Res, false);
14634 // Return value from the original node to inform the combiner that N is
14635 // now dead.
14636 return SDValue(N, 0);
14637 }
14638 }
14639
14640 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14641 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14642 ARM::isBitFieldInvertedMask(~Mask)) {
14643 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14644 // where lsb(mask) == #shamt and masked bits of B are known zero.
14645 SDValue ShAmt = N00.getOperand(1);
14646 unsigned ShAmtC = ShAmt->getAsZExtVal();
14647 unsigned LSB = llvm::countr_zero(Mask);
14648 if (ShAmtC != LSB)
14649 return SDValue();
14650
14651 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14652 DAG.getConstant(~Mask, DL, MVT::i32));
14653
14654 DCI.CombineTo(N, Res, false);
14655 // Return value from the original node to inform the combiner that N is
14656 // now dead.
14657 return SDValue(N, 0);
14658 }
14659
14660 return SDValue();
14661}
14662
14663static bool isValidMVECond(unsigned CC, bool IsFloat) {
14664 switch (CC) {
14665 case ARMCC::EQ:
14666 case ARMCC::NE:
14667 case ARMCC::LE:
14668 case ARMCC::GT:
14669 case ARMCC::GE:
14670 case ARMCC::LT:
14671 return true;
14672 case ARMCC::HS:
14673 case ARMCC::HI:
14674 return !IsFloat;
14675 default:
14676 return false;
14677 };
14678}
14679
14680static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
14681 if (N->getOpcode() == ARMISD::VCMP)
14682 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14683 else if (N->getOpcode() == ARMISD::VCMPZ)
14684 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14685 else
14686 llvm_unreachable("Not a VCMP/VCMPZ!");
14687}
14688
14691 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14692}
14693
14695 const ARMSubtarget *Subtarget) {
14696 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
14697 // together with predicates
14698 EVT VT = N->getValueType(0);
14699 SDLoc DL(N);
14700 SDValue N0 = N->getOperand(0);
14701 SDValue N1 = N->getOperand(1);
14702
14703 auto IsFreelyInvertable = [&](SDValue V) {
14704 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14705 return CanInvertMVEVCMP(V);
14706 return false;
14707 };
14708
14709 // At least one operand must be freely invertable.
14710 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14711 return SDValue();
14712
14713 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14714 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14715 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14716 return DAG.getLogicalNOT(DL, And, VT);
14717}
14718
14719/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
14720static SDValue PerformORCombine(SDNode *N,
 14721 TargetLowering::DAGCombinerInfo &DCI,
 14722 const ARMSubtarget *Subtarget) {
14723 // Attempt to use immediate-form VORR
14724 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14725 SDLoc dl(N);
14726 EVT VT = N->getValueType(0);
14727 SelectionDAG &DAG = DCI.DAG;
14728
14729 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14730 return SDValue();
14731
14732 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14733 VT == MVT::v8i1 || VT == MVT::v16i1))
14734 return PerformORCombine_i1(N, DAG, Subtarget);
14735
14736 APInt SplatBits, SplatUndef;
14737 unsigned SplatBitSize;
14738 bool HasAnyUndefs;
14739 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14740 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14741 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14742 SplatBitSize == 64) {
14743 EVT VorrVT;
14744 SDValue Val =
14745 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14746 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14747 if (Val.getNode()) {
14748 SDValue Input =
14749 DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
14750 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14751 return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
14752 }
14753 }
14754 }
14755
14756 if (!Subtarget->isThumb1Only()) {
14757 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14758 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14759 return Result;
14760 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14761 return Result;
14762 }
14763
14764 SDValue N0 = N->getOperand(0);
14765 SDValue N1 = N->getOperand(1);
14766
14767 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
 14768 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
 14769 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
14770
14771 // The code below optimizes (or (and X, Y), Z).
14772 // The AND operand needs to have a single user to make these optimizations
14773 // profitable.
14774 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14775 return SDValue();
14776
14777 APInt SplatUndef;
14778 unsigned SplatBitSize;
14779 bool HasAnyUndefs;
14780
14781 APInt SplatBits0, SplatBits1;
14782 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
14783 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
 14784 // Ensure that the second operands of both ANDs are constants
14785 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14786 HasAnyUndefs) && !HasAnyUndefs) {
14787 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14788 HasAnyUndefs) && !HasAnyUndefs) {
 14789 // Ensure that the bit widths of the constants are the same and that
14790 // the splat arguments are logical inverses as per the pattern we
14791 // are trying to simplify.
14792 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14793 SplatBits0 == ~SplatBits1) {
14794 // Canonicalize the vector type to make instruction selection
14795 // simpler.
14796 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14797 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14798 N0->getOperand(1),
14799 N0->getOperand(0),
14800 N1->getOperand(0));
14801 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
14802 }
14803 }
14804 }
14805 }
14806
14807 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14808 // reasonable.
14809 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14810 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14811 return Res;
14812 }
14813
14814 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14815 return Result;
14816
14817 return SDValue();
14818}
14819
14820static SDValue PerformXORCombine(SDNode *N,
 14821 TargetLowering::DAGCombinerInfo &DCI,
 14822 const ARMSubtarget *Subtarget) {
14823 EVT VT = N->getValueType(0);
14824 SelectionDAG &DAG = DCI.DAG;
14825
14826 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14827 return SDValue();
14828
14829 if (!Subtarget->isThumb1Only()) {
14830 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14831 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14832 return Result;
14833
14834 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14835 return Result;
14836 }
14837
14838 if (Subtarget->hasMVEIntegerOps()) {
14839 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14840 SDValue N0 = N->getOperand(0);
14841 SDValue N1 = N->getOperand(1);
14842 const TargetLowering *TLI = Subtarget->getTargetLowering();
14843 if (TLI->isConstTrueVal(N1) &&
14844 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14845 if (CanInvertMVEVCMP(N0)) {
14846 SDLoc DL(N0);
 14847 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));
 14848
 14849 SmallVector<SDValue, 4> Ops;
14850 Ops.push_back(N0->getOperand(0));
14851 if (N0->getOpcode() == ARMISD::VCMP)
14852 Ops.push_back(N0->getOperand(1));
14853 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14854 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14855 }
14856 }
14857 }
14858
14859 return SDValue();
14860}
14861
14862// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14863// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14864// their position in "to" (Rd).
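// Note that ARMISD::BFI's third operand is the *inverse* of the inserted-bit mask:
// set bits mark destination bits that are preserved, clear bits mark the positions
// that receive bits from the "from" value.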
14865static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14866 assert(N->getOpcode() == ARMISD::BFI);
14867
14868 SDValue From = N->getOperand(1);
14869 ToMask = ~N->getConstantOperandAPInt(2);
14870 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14871
14872 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14873 // #C in the base of the SHR.
14874 if (From->getOpcode() == ISD::SRL &&
14875 isa<ConstantSDNode>(From->getOperand(1))) {
14876 APInt Shift = From->getConstantOperandAPInt(1);
14877 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14878 FromMask <<= Shift.getLimitedValue(31);
14879 From = From->getOperand(0);
14880 }
14881
14882 return From;
14883}
14884
14885// If A and B each contain one contiguous run of set bits, does A | B form the
14886// concatenation of A and B (with A's bits sitting immediately above B's)?
14887// Neither A nor B may be zero.
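// For example, A = 0b1100 and B = 0b0011 concatenate properly (A's lowest set bit
// sits directly above B's highest set bit), whereas A = 0b1100 and B = 0b0001 do not.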
14888static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14889 unsigned LastActiveBitInA = A.countr_zero();
14890 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14891 return LastActiveBitInA - 1 == FirstActiveBitInB;
14892}
14893
14894static SDValue FindBFIToCombineWith(SDNode *N) {
 14895 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14896 APInt ToMask, FromMask;
14897 SDValue From = ParseBFI(N, ToMask, FromMask);
14898 SDValue To = N->getOperand(0);
14899
14900 SDValue V = To;
14901 if (V.getOpcode() != ARMISD::BFI)
14902 return SDValue();
14903
14904 APInt NewToMask, NewFromMask;
14905 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
14906 if (NewFrom != From)
14907 return SDValue();
14908
14909 // Do the written bits conflict with any we've seen so far?
14910 if ((NewToMask & ToMask).getBoolValue())
14911 // Conflicting bits.
14912 return SDValue();
14913
14914 // Are the new bits contiguous when combined with the old bits?
14915 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14916 BitsProperlyConcatenate(FromMask, NewFromMask))
14917 return V;
14918 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14919 BitsProperlyConcatenate(NewFromMask, FromMask))
14920 return V;
14921
14922 return SDValue();
14923}
14924
14925static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
 14926 SDValue N0 = N->getOperand(0);
14927 SDValue N1 = N->getOperand(1);
14928
14929 if (N1.getOpcode() == ISD::AND) {
14930 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14931 // the bits being cleared by the AND are not demanded by the BFI.
14932 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14933 if (!N11C)
14934 return SDValue();
14935 unsigned InvMask = N->getConstantOperandVal(2);
14936 unsigned LSB = llvm::countr_zero(~InvMask);
14937 unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
14938 assert(Width <
14939 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14940 "undefined behavior");
14941 unsigned Mask = (1u << Width) - 1;
14942 unsigned Mask2 = N11C->getZExtValue();
14943 if ((Mask & (~Mask2)) == 0)
14944 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
14945 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
14946 return SDValue();
14947 }
14948
14949 // Look for another BFI to combine with.
14950 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
14951 // We've found a BFI.
14952 APInt ToMask1, FromMask1;
14953 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
14954
14955 APInt ToMask2, FromMask2;
14956 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
14957 assert(From1 == From2);
14958 (void)From2;
14959
14960 // Create a new BFI, combining the two together.
14961 APInt NewFromMask = FromMask1 | FromMask2;
14962 APInt NewToMask = ToMask1 | ToMask2;
14963
14964 EVT VT = N->getValueType(0);
14965 SDLoc dl(N);
14966
14967 if (NewFromMask[0] == 0)
14968 From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
14969 DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
14970 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
14971 DAG.getConstant(~NewToMask, dl, VT));
14972 }
14973
14974 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
 14975 // that lower bit insertions are performed first, provided that M1 and M2
 14976 // do not overlap. This can allow multiple BFI instructions to be combined
14977 // together by the other folds above.
14978 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
14979 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
14980 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
14981
14982 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
14983 ToMask1.countl_zero() < ToMask2.countl_zero())
14984 return SDValue();
14985
14986 EVT VT = N->getValueType(0);
14987 SDLoc dl(N);
14988 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
14989 N->getOperand(1), N->getOperand(2));
14990 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
14991 N0.getOperand(2));
14992 }
14993
14994 return SDValue();
14995}
14996
14997// Check that N is CMPZ(CSINC(0, 0, CC, X)),
14998// or CMPZ(CMOV(1, 0, CC, $cpsr, X))
14999// returning X if valid.
15000static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
 15001 if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
15002 return SDValue();
15003 SDValue CSInc = Cmp->getOperand(0);
15004
15005 // Ignore any `And 1` nodes that may not yet have been removed. We are
15006 // looking for a value that produces 1/0, so these have no effect on the
15007 // code.
15008 while (CSInc.getOpcode() == ISD::AND &&
15009 isa<ConstantSDNode>(CSInc.getOperand(1)) &&
15010 CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
15011 CSInc = CSInc.getOperand(0);
15012
15013 if (CSInc.getOpcode() == ARMISD::CSINC &&
15014 isNullConstant(CSInc.getOperand(0)) &&
 15015 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
 15016 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
15017 return CSInc.getOperand(3);
15018 }
15019 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
 15020 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
 15021 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
15022 return CSInc.getOperand(4);
15023 }
15024 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
 15025 isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
 15026 CC = ARMCC::getOppositeCondition(
 15027 (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
15028 return CSInc.getOperand(4);
15029 }
15030 return SDValue();
15031}
15032
15033static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
 15034 // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
15035 // t92: glue = ARMISD::CMPZ t74, 0
15036 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
15037 // t96: glue = ARMISD::CMPZ t93, 0
 15038 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
 15039 ARMCC::CondCodes Cond;
15040 if (SDValue C = IsCMPZCSINC(N, Cond))
15041 if (Cond == ARMCC::EQ)
15042 return C;
15043 return SDValue();
15044}
15045
15046static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) {
 15047 // Fold away an unnecessary CMPZ/CSINC
 15048 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
 15049 // if C1==EQ -> CSXYZ A, B, C2, D
 15050 // if C1==NE -> CSXYZ A, B, NOT(C2), D
 15051 ARMCC::CondCodes Cond;
15052 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
15053 if (N->getConstantOperandVal(2) == ARMCC::EQ)
15054 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15055 N->getOperand(1),
15056 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
15057 if (N->getConstantOperandVal(2) == ARMCC::NE)
15058 return DAG.getNode(
15059 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
 15060 N->getOperand(1),
 15061 DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C);
15062 }
15063 return SDValue();
15064}
15065
15066/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
15067/// ARMISD::VMOVRRD.
15068static SDValue PerformVMOVRRDCombine(SDNode *N,
 15069 TargetLowering::DAGCombinerInfo &DCI,
 15070 const ARMSubtarget *Subtarget) {
15071 // vmovrrd(vmovdrr x, y) -> x,y
15072 SDValue InDouble = N->getOperand(0);
15073 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
15074 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
15075
15076 // vmovrrd(load f64) -> (load i32), (load i32)
15077 SDNode *InNode = InDouble.getNode();
15078 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
15079 InNode->getValueType(0) == MVT::f64 &&
15080 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
15081 !cast<LoadSDNode>(InNode)->isVolatile()) {
15082 // TODO: Should this be done for non-FrameIndex operands?
15083 LoadSDNode *LD = cast<LoadSDNode>(InNode);
15084
15085 SelectionDAG &DAG = DCI.DAG;
15086 SDLoc DL(LD);
15087 SDValue BasePtr = LD->getBasePtr();
15088 SDValue NewLD1 =
15089 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
15090 LD->getAlign(), LD->getMemOperand()->getFlags());
15091
15092 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
15093 DAG.getConstant(4, DL, MVT::i32));
15094
15095 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
15096 LD->getPointerInfo().getWithOffset(4),
15097 commonAlignment(LD->getAlign(), 4),
15098 LD->getMemOperand()->getFlags());
15099
15100 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
15101 if (DCI.DAG.getDataLayout().isBigEndian())
15102 std::swap (NewLD1, NewLD2);
15103 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
15104 return Result;
15105 }
15106
15107 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
15108 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
15109 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15110 isa<ConstantSDNode>(InDouble.getOperand(1))) {
15111 SDValue BV = InDouble.getOperand(0);
15112 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
15113 // change lane order under big endian.
15114 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
15115 while (
 15116 (BV.getOpcode() == ISD::BITCAST ||
 15117 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
15118 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
15119 BVSwap = BV.getOpcode() == ISD::BITCAST;
15120 BV = BV.getOperand(0);
15121 }
15122 if (BV.getValueType() != MVT::v4i32)
15123 return SDValue();
15124
15125 // Handle buildvectors, pulling out the correct lane depending on
15126 // endianness.
15127 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
15128 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
15129 SDValue Op0 = BV.getOperand(Offset);
15130 SDValue Op1 = BV.getOperand(Offset + 1);
15131 if (!Subtarget->isLittle() && BVSwap)
15132 std::swap(Op0, Op1);
15133
15134 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15135 }
15136
15137 // A chain of insert_vectors, grabbing the correct value of the chain of
15138 // inserts.
15139 SDValue Op0, Op1;
15140 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
15141 if (isa<ConstantSDNode>(BV.getOperand(2))) {
15142 if (BV.getConstantOperandVal(2) == Offset)
15143 Op0 = BV.getOperand(1);
15144 if (BV.getConstantOperandVal(2) == Offset + 1)
15145 Op1 = BV.getOperand(1);
15146 }
15147 BV = BV.getOperand(0);
15148 }
15149 if (!Subtarget->isLittle() && BVSwap)
15150 std::swap(Op0, Op1);
15151 if (Op0 && Op1)
15152 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15153 }
15154
15155 return SDValue();
15156}
15157
15158/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
15159/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
15160static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
 15161 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
15162 SDValue Op0 = N->getOperand(0);
15163 SDValue Op1 = N->getOperand(1);
15164 if (Op0.getOpcode() == ISD::BITCAST)
15165 Op0 = Op0.getOperand(0);
15166 if (Op1.getOpcode() == ISD::BITCAST)
15167 Op1 = Op1.getOperand(0);
15168 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
15169 Op0.getNode() == Op1.getNode() &&
15170 Op0.getResNo() == 0 && Op1.getResNo() == 1)
15171 return DAG.getNode(ISD::BITCAST, SDLoc(N),
15172 N->getValueType(0), Op0.getOperand(0));
15173 return SDValue();
15174}
15175
15176static SDValue PerformVMOVhrCombine(SDNode *N,
 15177 TargetLowering::DAGCombinerInfo &DCI) {
 15178 SDValue Op0 = N->getOperand(0);
15179
15180 // VMOVhr (VMOVrh (X)) -> X
15181 if (Op0->getOpcode() == ARMISD::VMOVrh)
15182 return Op0->getOperand(0);
15183
15184 // FullFP16: half values are passed in S-registers, and we don't
 15185 // need any of the bitcasts and moves:
15186 //
15187 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
15188 // t5: i32 = bitcast t2
15189 // t18: f16 = ARMISD::VMOVhr t5
15190 // =>
15191 // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
15192 if (Op0->getOpcode() == ISD::BITCAST) {
15193 SDValue Copy = Op0->getOperand(0);
15194 if (Copy.getValueType() == MVT::f32 &&
15195 Copy->getOpcode() == ISD::CopyFromReg) {
15196 bool HasGlue = Copy->getNumOperands() == 3;
15197 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
15198 HasGlue ? Copy->getOperand(2) : SDValue()};
15199 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
15200 SDValue NewCopy =
 15201 DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(Copy),
 15202 DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
15203 ArrayRef(Ops, HasGlue ? 3 : 2));
15204
15205 // Update Users, Chains, and Potential Glue.
15206 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
15207 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
15208 if (HasGlue)
15209 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
15210 NewCopy.getValue(2));
15211
15212 return NewCopy;
15213 }
15214 }
15215
15216 // fold (VMOVhr (load x)) -> (load (f16*)x)
15217 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
15218 if (LN0->hasOneUse() && LN0->isUnindexed() &&
15219 LN0->getMemoryVT() == MVT::i16) {
15220 SDValue Load =
15221 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
15222 LN0->getBasePtr(), LN0->getMemOperand());
15223 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15224 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
15225 return Load;
15226 }
15227 }
15228
15229 // Only the bottom 16 bits of the source register are used.
15230 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15231 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15232 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
15233 return SDValue(N, 0);
15234
15235 return SDValue();
15236}
15237
15238static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) {
 15239 SDValue N0 = N->getOperand(0);
15240 EVT VT = N->getValueType(0);
15241
15242 // fold (VMOVrh (fpconst x)) -> const x
15243 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
15244 APFloat V = C->getValueAPF();
15245 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
15246 }
15247
15248 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15249 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
15250 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15251
15252 SDValue Load =
15253 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15254 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
15255 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15256 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
15257 return Load;
15258 }
15259
15260 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
15261 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15262 isa<ConstantSDNode>(N0->getOperand(1)))
15263 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
15264 N0->getOperand(1));
15265
15266 return SDValue();
15267}
15268
15269/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15270/// are normal, non-volatile loads. If so, it is profitable to bitcast an
15271/// i64 vector to have f64 elements, since the value can then be loaded
15272/// directly into a VFP register.
15273static bool hasNormalLoadOperand(SDNode *N) {
 15274 unsigned NumElts = N->getValueType(0).getVectorNumElements();
15275 for (unsigned i = 0; i < NumElts; ++i) {
15276 SDNode *Elt = N->getOperand(i).getNode();
15277 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
15278 return true;
15279 }
15280 return false;
15281}
15282
15283/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15284/// ISD::BUILD_VECTOR.
15285static SDValue PerformBUILD_VECTORCombine(SDNode *N,
 15286 TargetLowering::DAGCombinerInfo &DCI,
 15287 const ARMSubtarget *Subtarget) {
15288 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15289 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15290 // into a pair of GPRs, which is fine when the value is used as a scalar,
15291 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15292 SelectionDAG &DAG = DCI.DAG;
15293 if (N->getNumOperands() == 2)
15294 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15295 return RV;
15296
15297 // Load i64 elements as f64 values so that type legalization does not split
15298 // them up into i32 values.
15299 EVT VT = N->getValueType(0);
15300 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15301 return SDValue();
 15302 SDLoc dl(N);
 15303 SmallVector<SDValue, 8> Ops;
15304 unsigned NumElts = VT.getVectorNumElements();
15305 for (unsigned i = 0; i < NumElts; ++i) {
15306 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15307 Ops.push_back(V);
15308 // Make the DAGCombiner fold the bitcast.
15309 DCI.AddToWorklist(V.getNode());
15310 }
15311 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15312 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
15313 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
15314}
15315
15316/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
15317static SDValue
15318PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
 15319 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15320 // At that time, we may have inserted bitcasts from integer to float.
15321 // If these bitcasts have survived DAGCombine, change the lowering of this
15322 // BUILD_VECTOR in something more vector friendly, i.e., that does not
15323 // force to use floating point types.
15324
15325 // Make sure we can change the type of the vector.
15326 // This is possible iff:
 15327 // 1. The vector is only used in a bitcast to an integer type. I.e.,
15328 // 1.1. Vector is used only once.
15329 // 1.2. Use is a bit convert to an integer type.
15330 // 2. The size of its operands are 32-bits (64-bits are not legal).
15331 EVT VT = N->getValueType(0);
15332 EVT EltVT = VT.getVectorElementType();
15333
15334 // Check 1.1. and 2.
15335 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15336 return SDValue();
15337
15338 // By construction, the input type must be float.
15339 assert(EltVT == MVT::f32 && "Unexpected type!");
15340
15341 // Check 1.2.
15342 SDNode *Use = *N->use_begin();
15343 if (Use->getOpcode() != ISD::BITCAST ||
15344 Use->getValueType(0).isFloatingPoint())
15345 return SDValue();
15346
15347 // Check profitability.
15348 // Model is, if more than half of the relevant operands are bitcast from
15349 // i32, turn the build_vector into a sequence of insert_vector_elt.
15350 // Relevant operands are everything that is not statically
15351 // (i.e., at compile time) bitcasted.
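// For example, in a v4f32 build_vector where three operands are (bitcast i32 -> f32)
// values and the fourth is a constant, 3 of 3 relevant operands are bitcasts, so the
// build_vector passes the profitability check and is rebuilt below with i32
// insert_vector_elt nodes.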
15352 unsigned NumOfBitCastedElts = 0;
15353 unsigned NumElts = VT.getVectorNumElements();
15354 unsigned NumOfRelevantElts = NumElts;
15355 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15356 SDValue Elt = N->getOperand(Idx);
15357 if (Elt->getOpcode() == ISD::BITCAST) {
15358 // Assume only bit cast to i32 will go away.
15359 if (Elt->getOperand(0).getValueType() == MVT::i32)
15360 ++NumOfBitCastedElts;
15361 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
15362 // Constants are statically casted, thus do not count them as
15363 // relevant operands.
15364 --NumOfRelevantElts;
15365 }
15366
15367 // Check if more than half of the elements require a non-free bitcast.
15368 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15369 return SDValue();
15370
15371 SelectionDAG &DAG = DCI.DAG;
15372 // Create the new vector type.
15373 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15374 // Check if the type is legal.
15375 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15376 if (!TLI.isTypeLegal(VecVT))
15377 return SDValue();
15378
15379 // Combine:
15380 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15381 // => BITCAST INSERT_VECTOR_ELT
15382 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15383 // (BITCAST EN), N.
15384 SDValue Vec = DAG.getUNDEF(VecVT);
15385 SDLoc dl(N);
15386 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15387 SDValue V = N->getOperand(Idx);
15388 if (V.isUndef())
15389 continue;
15390 if (V.getOpcode() == ISD::BITCAST &&
15391 V->getOperand(0).getValueType() == MVT::i32)
15392 // Fold obvious case.
15393 V = V.getOperand(0);
15394 else {
15395 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15396 // Make the DAGCombiner fold the bitcasts.
15397 DCI.AddToWorklist(V.getNode());
15398 }
15399 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15400 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
15401 }
15402 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
15403 // Make the DAGCombiner fold the bitcasts.
15404 DCI.AddToWorklist(Vec.getNode());
15405 return Vec;
15406}
15407
15408static SDValue
15409PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
 15410 EVT VT = N->getValueType(0);
15411 SDValue Op = N->getOperand(0);
15412 SDLoc dl(N);
15413
15414 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15415 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15416 // If the valuetypes are the same, we can remove the cast entirely.
15417 if (Op->getOperand(0).getValueType() == VT)
15418 return Op->getOperand(0);
15419 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15420 }
15421
15422 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15423 // more VPNOT which might get folded as else predicates.
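// The constant 65535 below is the all-ones value of the 16-bit MVE predicate (VPR.P0),
// so the xor acts as a logical NOT across the predicate lanes.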
15424 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15425 SDValue X =
15426 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
 15427 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
 15428 DCI.DAG.getConstant(65535, dl, MVT::i32));
15429 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
15430 }
15431
15432 // Only the bottom 16 bits of the source register are used.
15433 if (Op.getValueType() == MVT::i32) {
15434 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15435 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15436 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15437 return SDValue(N, 0);
15438 }
15439 return SDValue();
15440}
15441
15442static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG,
 15443 const ARMSubtarget *ST) {
15444 EVT VT = N->getValueType(0);
15445 SDValue Op = N->getOperand(0);
15446 SDLoc dl(N);
15447
15448 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
15449 if (ST->isLittle())
15450 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
15451
15452 // VECTOR_REG_CAST undef -> undef
15453 if (Op.isUndef())
15454 return DAG.getUNDEF(VT);
15455
15456 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
15457 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
15458 // If the valuetypes are the same, we can remove the cast entirely.
15459 if (Op->getOperand(0).getValueType() == VT)
15460 return Op->getOperand(0);
15461 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
15462 }
15463
15464 return SDValue();
15465}
15466
15467static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG,
 15468 const ARMSubtarget *Subtarget) {
15469 if (!Subtarget->hasMVEIntegerOps())
15470 return SDValue();
15471
15472 EVT VT = N->getValueType(0);
15473 SDValue Op0 = N->getOperand(0);
15474 SDValue Op1 = N->getOperand(1);
15475 ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
15476 SDLoc dl(N);
15477
15478 // vcmp X, 0, cc -> vcmpz X, cc
15479 if (isZeroVector(Op1))
15480 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
15481
15482 unsigned SwappedCond = getSwappedCondition(Cond);
15483 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
15484 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
15485 if (isZeroVector(Op0))
15486 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15487 DAG.getConstant(SwappedCond, dl, MVT::i32));
15488 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
15489 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15490 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15491 DAG.getConstant(SwappedCond, dl, MVT::i32));
15492 }
15493
15494 return SDValue();
15495}
15496
15497/// PerformInsertEltCombine - Target-specific dag combine xforms for
15498/// ISD::INSERT_VECTOR_ELT.
15499static SDValue PerformInsertEltCombine(SDNode *N,
 15500 TargetLowering::DAGCombinerInfo &DCI) {
 15501 // Bitcast an i64 load inserted into a vector to f64.
15502 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15503 EVT VT = N->getValueType(0);
15504 SDNode *Elt = N->getOperand(1).getNode();
15505 if (VT.getVectorElementType() != MVT::i64 ||
15506 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15507 return SDValue();
15508
15509 SelectionDAG &DAG = DCI.DAG;
15510 SDLoc dl(N);
 15511 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
 15512 VT.getVectorNumElements());
15513 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
15514 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15515 // Make the DAGCombiner fold the bitcasts.
15516 DCI.AddToWorklist(Vec.getNode());
15517 DCI.AddToWorklist(V.getNode());
15518 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
15519 Vec, V, N->getOperand(2));
15520 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
15521}
15522
15523// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15524// directly or bitcast to an integer if the original is a float vector.
15525// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15526// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
15527static SDValue
15528PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
 15529 EVT VT = N->getValueType(0);
15530 SDLoc dl(N);
15531
15532 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15533 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15534 return SDValue();
15535
15536 SDValue Ext = SDValue(N, 0);
15537 if (Ext.getOpcode() == ISD::BITCAST &&
15538 Ext.getOperand(0).getValueType() == MVT::f32)
15539 Ext = Ext.getOperand(0);
15540 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15541 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
15542 Ext.getConstantOperandVal(1) % 2 != 0)
15543 return SDValue();
15544 if (Ext->use_size() == 1 &&
15545 (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP ||
15546 Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP))
15547 return SDValue();
15548
15549 SDValue Op0 = Ext.getOperand(0);
15550 EVT VecVT = Op0.getValueType();
15551 unsigned ResNo = Op0.getResNo();
15552 unsigned Lane = Ext.getConstantOperandVal(1);
15553 if (VecVT.getVectorNumElements() != 4)
15554 return SDValue();
15555
15556 // Find another extract, of Lane + 1
15557 auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) {
15558 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15559 isa<ConstantSDNode>(V->getOperand(1)) &&
15560 V->getConstantOperandVal(1) == Lane + 1 &&
15561 V->getOperand(0).getResNo() == ResNo;
15562 });
15563 if (OtherIt == Op0->uses().end())
15564 return SDValue();
15565
 15566 // For float extracts, we need to be converting to an i32 for both vector
15567 // lanes.
15568 SDValue OtherExt(*OtherIt, 0);
15569 if (OtherExt.getValueType() != MVT::i32) {
15570 if (OtherExt->use_size() != 1 ||
15571 OtherExt->use_begin()->getOpcode() != ISD::BITCAST ||
15572 OtherExt->use_begin()->getValueType(0) != MVT::i32)
15573 return SDValue();
15574 OtherExt = SDValue(*OtherExt->use_begin(), 0);
15575 }
15576
15577 // Convert the type to a f64 and extract with a VMOVRRD.
15578 SDValue F64 = DCI.DAG.getNode(
15579 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15580 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15581 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15582 SDValue VMOVRRD =
15583 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15584
15585 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
15586 return VMOVRRD;
15587}
15588
15589static SDValue PerformExtractEltCombine(SDNode *N,
 15590 TargetLowering::DAGCombinerInfo &DCI,
 15591 const ARMSubtarget *ST) {
15592 SDValue Op0 = N->getOperand(0);
15593 EVT VT = N->getValueType(0);
15594 SDLoc dl(N);
15595
15596 // extract (vdup x) -> x
15597 if (Op0->getOpcode() == ARMISD::VDUP) {
15598 SDValue X = Op0->getOperand(0);
15599 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15600 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
15601 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15602 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
15603 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15604 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
15605
15606 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15607 X = X->getOperand(0);
15608 if (X.getValueType() == VT)
15609 return X;
15610 }
15611
15612 // extract ARM_BUILD_VECTOR -> x
15613 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15614 isa<ConstantSDNode>(N->getOperand(1)) &&
15615 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
15616 return Op0.getOperand(N->getConstantOperandVal(1));
15617 }
15618
15619 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15620 if (Op0.getValueType() == MVT::v4i32 &&
15621 isa<ConstantSDNode>(N->getOperand(1)) &&
15622 Op0.getOpcode() == ISD::BITCAST &&
15624 Op0.getOperand(0).getValueType() == MVT::v2f64) {
15625 SDValue BV = Op0.getOperand(0);
15626 unsigned Offset = N->getConstantOperandVal(1);
15627 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
15628 if (MOV.getOpcode() == ARMISD::VMOVDRR)
15629 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15630 }
15631
15632 // extract x, n; extract x, n+1 -> VMOVRRD x
15633 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15634 return R;
15635
15636 // extract (MVETrunc(x)) -> extract x
15637 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
15638 unsigned Idx = N->getConstantOperandVal(1);
 15639 unsigned Vec =
 15640 Idx / Op0->getOperand(0).getValueType().getVectorNumElements();
 15641 unsigned SubIdx =
 15642 Idx % Op0->getOperand(0).getValueType().getVectorNumElements();
15643 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15644 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15645 }
15646
15647 return SDValue();
15648}
15649
15650static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) {
 15651 SDValue Op = N->getOperand(0);
15652 EVT VT = N->getValueType(0);
15653
15654 // sext_inreg(VGETLANEu) -> VGETLANEs
15655 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15656 cast<VTSDNode>(N->getOperand(1))->getVT() ==
15657 Op.getOperand(0).getValueType().getScalarType())
15658 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15659 Op.getOperand(1));
15660
15661 return SDValue();
15662}
15663
15664static SDValue
15665PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
 15666 SDValue Vec = N->getOperand(0);
15667 SDValue SubVec = N->getOperand(1);
15668 uint64_t IdxVal = N->getConstantOperandVal(2);
15669 EVT VecVT = Vec.getValueType();
15670 EVT SubVT = SubVec.getValueType();
15671
15672 // Only do this for legal fixed vector types.
15673 if (!VecVT.isFixedLengthVector() ||
 15674 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
 15675 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
15676 return SDValue();
15677
15678 // Ignore widening patterns.
15679 if (IdxVal == 0 && Vec.isUndef())
15680 return SDValue();
15681
15682 // Subvector must be half the width and an "aligned" insertion.
15683 unsigned NumSubElts = SubVT.getVectorNumElements();
15684 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15685 (IdxVal != 0 && IdxVal != NumSubElts))
15686 return SDValue();
15687
15688 // Fold insert_subvector -> concat_vectors
15689 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15690 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15691 SDLoc DL(N);
15692 SDValue Lo, Hi;
15693 if (IdxVal == 0) {
15694 Lo = SubVec;
15695 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15696 DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15697 } else {
15698 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15699 DCI.DAG.getVectorIdxConstant(0, DL));
15700 Hi = SubVec;
15701 }
15702 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15703}
15704
15705// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
15706static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,
 15707 SelectionDAG &DAG) {
15708 SDValue Trunc = N->getOperand(0);
15709 EVT VT = Trunc.getValueType();
15710 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15711 return SDValue();
15712
15713 SDLoc DL(Trunc);
15714 if (isVMOVNTruncMask(N->getMask(), VT, false))
15715 return DAG.getNode(
15716 ARMISD::VMOVN, DL, VT,
15717 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15718 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15719 DAG.getConstant(1, DL, MVT::i32));
15720 else if (isVMOVNTruncMask(N->getMask(), VT, true))
15721 return DAG.getNode(
15722 ARMISD::VMOVN, DL, VT,
15723 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15724 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15725 DAG.getConstant(1, DL, MVT::i32));
15726 return SDValue();
15727}
15728
15729/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15730/// ISD::VECTOR_SHUFFLE.
15731static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
 15732 if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG))
15733 return R;
15734
15735 // The LLVM shufflevector instruction does not require the shuffle mask
15736 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15737 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15738 // operands do not match the mask length, they are extended by concatenating
15739 // them with undef vectors. That is probably the right thing for other
15740 // targets, but for NEON it is better to concatenate two double-register
15741 // size vector operands into a single quad-register size vector. Do that
15742 // transformation here:
15743 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15744 // shuffle(concat(v1, v2), undef)
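// The mask is remapped below: indices into the low half of the second source are
// redirected to the upper half of the new single concat; all other out-of-range
// indices become undef (-1).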
15745 SDValue Op0 = N->getOperand(0);
15746 SDValue Op1 = N->getOperand(1);
15747 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15748 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15749 Op0.getNumOperands() != 2 ||
15750 Op1.getNumOperands() != 2)
15751 return SDValue();
15752 SDValue Concat0Op1 = Op0.getOperand(1);
15753 SDValue Concat1Op1 = Op1.getOperand(1);
15754 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15755 return SDValue();
15756 // Skip the transformation if any of the types are illegal.
15757 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15758 EVT VT = N->getValueType(0);
15759 if (!TLI.isTypeLegal(VT) ||
15760 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15761 !TLI.isTypeLegal(Concat1Op1.getValueType()))
15762 return SDValue();
15763
15764 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15765 Op0.getOperand(0), Op1.getOperand(0));
15766 // Translate the shuffle mask.
15767 SmallVector<int, 16> NewMask;
15768 unsigned NumElts = VT.getVectorNumElements();
15769 unsigned HalfElts = NumElts/2;
15770 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
15771 for (unsigned n = 0; n < NumElts; ++n) {
15772 int MaskElt = SVN->getMaskElt(n);
15773 int NewElt = -1;
15774 if (MaskElt < (int)HalfElts)
15775 NewElt = MaskElt;
15776 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15777 NewElt = HalfElts + MaskElt - NumElts;
15778 NewMask.push_back(NewElt);
15779 }
15780 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15781 DAG.getUNDEF(VT), NewMask);
15782}
15783
15784/// Load/store instruction that can be merged with a base address
15785/// update
15786struct BaseUpdateTarget {
 15787 SDNode *N;
 15788 bool isIntrinsic;
 15789 bool isStore;
 15790 unsigned AddrOpIdx;
 15791};
 15792
15793struct BaseUpdateUser {
 15794 /// Instruction that updates a pointer
 15795 SDNode *N;
 15796 /// Pointer increment operand
 15797 SDValue Inc;
 15798 /// Pointer increment value if it is a constant, or 0 otherwise
15799 unsigned ConstInc;
15800};
15801
15802static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
 15803 struct BaseUpdateUser &User,
 15804 bool SimpleConstIncOnly,
 15805 TargetLowering::DAGCombinerInfo &DCI) {
15806 SelectionDAG &DAG = DCI.DAG;
15807 SDNode *N = Target.N;
15808 MemSDNode *MemN = cast<MemSDNode>(N);
15809 SDLoc dl(N);
15810
15811 // Find the new opcode for the updating load/store.
15812 bool isLoadOp = true;
15813 bool isLaneOp = false;
15814 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15815 // as an operand.
15816 bool hasAlignment = true;
15817 unsigned NewOpc = 0;
15818 unsigned NumVecs = 0;
15819 if (Target.isIntrinsic) {
15820 unsigned IntNo = N->getConstantOperandVal(1);
15821 switch (IntNo) {
15822 default:
15823 llvm_unreachable("unexpected intrinsic for Neon base update");
15824 case Intrinsic::arm_neon_vld1:
15825 NewOpc = ARMISD::VLD1_UPD;
15826 NumVecs = 1;
15827 break;
15828 case Intrinsic::arm_neon_vld2:
15829 NewOpc = ARMISD::VLD2_UPD;
15830 NumVecs = 2;
15831 break;
15832 case Intrinsic::arm_neon_vld3:
15833 NewOpc = ARMISD::VLD3_UPD;
15834 NumVecs = 3;
15835 break;
15836 case Intrinsic::arm_neon_vld4:
15837 NewOpc = ARMISD::VLD4_UPD;
15838 NumVecs = 4;
15839 break;
15840 case Intrinsic::arm_neon_vld1x2:
15841 NewOpc = ARMISD::VLD1x2_UPD;
15842 NumVecs = 2;
15843 hasAlignment = false;
15844 break;
15845 case Intrinsic::arm_neon_vld1x3:
15846 NewOpc = ARMISD::VLD1x3_UPD;
15847 NumVecs = 3;
15848 hasAlignment = false;
15849 break;
15850 case Intrinsic::arm_neon_vld1x4:
15851 NewOpc = ARMISD::VLD1x4_UPD;
15852 NumVecs = 4;
15853 hasAlignment = false;
15854 break;
15855 case Intrinsic::arm_neon_vld2dup:
15856 NewOpc = ARMISD::VLD2DUP_UPD;
15857 NumVecs = 2;
15858 break;
15859 case Intrinsic::arm_neon_vld3dup:
15860 NewOpc = ARMISD::VLD3DUP_UPD;
15861 NumVecs = 3;
15862 break;
15863 case Intrinsic::arm_neon_vld4dup:
15864 NewOpc = ARMISD::VLD4DUP_UPD;
15865 NumVecs = 4;
15866 break;
15867 case Intrinsic::arm_neon_vld2lane:
15868 NewOpc = ARMISD::VLD2LN_UPD;
15869 NumVecs = 2;
15870 isLaneOp = true;
15871 break;
15872 case Intrinsic::arm_neon_vld3lane:
15873 NewOpc = ARMISD::VLD3LN_UPD;
15874 NumVecs = 3;
15875 isLaneOp = true;
15876 break;
15877 case Intrinsic::arm_neon_vld4lane:
15878 NewOpc = ARMISD::VLD4LN_UPD;
15879 NumVecs = 4;
15880 isLaneOp = true;
15881 break;
15882 case Intrinsic::arm_neon_vst1:
15883 NewOpc = ARMISD::VST1_UPD;
15884 NumVecs = 1;
15885 isLoadOp = false;
15886 break;
15887 case Intrinsic::arm_neon_vst2:
15888 NewOpc = ARMISD::VST2_UPD;
15889 NumVecs = 2;
15890 isLoadOp = false;
15891 break;
15892 case Intrinsic::arm_neon_vst3:
15893 NewOpc = ARMISD::VST3_UPD;
15894 NumVecs = 3;
15895 isLoadOp = false;
15896 break;
15897 case Intrinsic::arm_neon_vst4:
15898 NewOpc = ARMISD::VST4_UPD;
15899 NumVecs = 4;
15900 isLoadOp = false;
15901 break;
15902 case Intrinsic::arm_neon_vst2lane:
15903 NewOpc = ARMISD::VST2LN_UPD;
15904 NumVecs = 2;
15905 isLoadOp = false;
15906 isLaneOp = true;
15907 break;
15908 case Intrinsic::arm_neon_vst3lane:
15909 NewOpc = ARMISD::VST3LN_UPD;
15910 NumVecs = 3;
15911 isLoadOp = false;
15912 isLaneOp = true;
15913 break;
15914 case Intrinsic::arm_neon_vst4lane:
15915 NewOpc = ARMISD::VST4LN_UPD;
15916 NumVecs = 4;
15917 isLoadOp = false;
15918 isLaneOp = true;
15919 break;
15920 case Intrinsic::arm_neon_vst1x2:
15921 NewOpc = ARMISD::VST1x2_UPD;
15922 NumVecs = 2;
15923 isLoadOp = false;
15924 hasAlignment = false;
15925 break;
15926 case Intrinsic::arm_neon_vst1x3:
15927 NewOpc = ARMISD::VST1x3_UPD;
15928 NumVecs = 3;
15929 isLoadOp = false;
15930 hasAlignment = false;
15931 break;
15932 case Intrinsic::arm_neon_vst1x4:
15933 NewOpc = ARMISD::VST1x4_UPD;
15934 NumVecs = 4;
15935 isLoadOp = false;
15936 hasAlignment = false;
15937 break;
15938 }
15939 } else {
15940 isLaneOp = true;
15941 switch (N->getOpcode()) {
15942 default:
15943 llvm_unreachable("unexpected opcode for Neon base update");
15944 case ARMISD::VLD1DUP:
15945 NewOpc = ARMISD::VLD1DUP_UPD;
15946 NumVecs = 1;
15947 break;
15948 case ARMISD::VLD2DUP:
15949 NewOpc = ARMISD::VLD2DUP_UPD;
15950 NumVecs = 2;
15951 break;
15952 case ARMISD::VLD3DUP:
15953 NewOpc = ARMISD::VLD3DUP_UPD;
15954 NumVecs = 3;
15955 break;
15956 case ARMISD::VLD4DUP:
15957 NewOpc = ARMISD::VLD4DUP_UPD;
15958 NumVecs = 4;
15959 break;
15960 case ISD::LOAD:
15961 NewOpc = ARMISD::VLD1_UPD;
15962 NumVecs = 1;
15963 isLaneOp = false;
15964 break;
15965 case ISD::STORE:
15966 NewOpc = ARMISD::VST1_UPD;
15967 NumVecs = 1;
15968 isLaneOp = false;
15969 isLoadOp = false;
15970 break;
15971 }
15972 }
15973
15974 // Find the size of memory referenced by the load/store.
15975 EVT VecTy;
15976 if (isLoadOp) {
15977 VecTy = N->getValueType(0);
15978 } else if (Target.isIntrinsic) {
15979 VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
15980 } else {
15981 assert(Target.isStore &&
15982 "Node has to be a load, a store, or an intrinsic!");
15983 VecTy = N->getOperand(1).getValueType();
15984 }
15985
15986 bool isVLDDUPOp =
15987 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
15988 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
15989
15990 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15991 if (isLaneOp || isVLDDUPOp)
15992 NumBytes /= VecTy.getVectorNumElements();
15993
15994 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
15995 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
15996 // separate instructions that make it harder to use a non-constant update.
15997 return false;
15998 }
15999
16000 if (SimpleConstIncOnly && User.ConstInc != NumBytes)
16001 return false;
16002
16003 // OK, we found an ADD we can fold into the base update.
16004 // Now, create a _UPD node, taking care of not breaking alignment.
16005
16006 EVT AlignedVecTy = VecTy;
16007 Align Alignment = MemN->getAlign();
16008
16009 // If this is a less-than-standard-aligned load/store, change the type to
16010 // match the standard alignment.
16011 // The alignment is overlooked when selecting _UPD variants; and it's
16012 // easier to introduce bitcasts here than fix that.
16013 // There are 3 ways to get to this base-update combine:
16014 // - intrinsics: they are assumed to be properly aligned (to the standard
16015 // alignment of the memory type), so we don't need to do anything.
16016 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
16017 // intrinsics, so, likewise, there's nothing to do.
16018 // - generic load/store instructions: the alignment is specified as an
16019 // explicit operand, rather than implicitly as the standard alignment
 16020 // of the memory type (like the intrinsics). We need to change the
16021 // memory type to match the explicit alignment. That way, we don't
16022 // generate non-standard-aligned ARMISD::VLDx nodes.
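// For example, a 2-byte-aligned v4i32 store is re-typed below as v8i16 so that the
// resulting VST1_UPD node's element size matches the explicit alignment.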
16023 if (isa<LSBaseSDNode>(N)) {
16024 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
16025 MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
16026 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
16027 assert(!isLaneOp && "Unexpected generic load/store lane.");
16028 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
16029 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
16030 }
16031 // Don't set an explicit alignment on regular load/stores that we want
16032 // to transform to VLD/VST 1_UPD nodes.
16033 // This matches the behavior of regular load/stores, which only get an
16034 // explicit alignment if the MMO alignment is larger than the standard
16035 // alignment of the memory type.
16036 // Intrinsics, however, always get an explicit alignment, set to the
16037 // alignment of the MMO.
16038 Alignment = Align(1);
16039 }
16040
16041 // Create the new updating load/store node.
16042 // First, create an SDVTList for the new updating node's results.
16043 EVT Tys[6];
16044 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16045 unsigned n;
16046 for (n = 0; n < NumResultVecs; ++n)
16047 Tys[n] = AlignedVecTy;
16048 Tys[n++] = MVT::i32;
16049 Tys[n] = MVT::Other;
16050 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16051
 16052 // Then, gather the new node's operands.
 16053 SmallVector<SDValue, 8> Ops;
16054 Ops.push_back(N->getOperand(0)); // incoming chain
16055 Ops.push_back(N->getOperand(Target.AddrOpIdx));
16056 Ops.push_back(User.Inc);
16057
16058 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
16059 // Try to match the intrinsic's signature
16060 Ops.push_back(StN->getValue());
16061 } else {
16062 // Loads (and of course intrinsics) match the intrinsics' signature,
16063 // so just add all but the alignment operand.
16064 unsigned LastOperand =
16065 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
16066 for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
16067 Ops.push_back(N->getOperand(i));
16068 }
16069
16070 // For all node types, the alignment operand is always the last one.
16071 Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
16072
16073 // If this is a non-standard-aligned STORE, the penultimate operand is the
16074 // stored value. Bitcast it to the aligned type.
16075 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
16076 SDValue &StVal = Ops[Ops.size() - 2];
16077 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
16078 }
16079
16080 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
16081 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
16082 MemN->getMemOperand());
16083
16084 // Update the uses.
16085 SmallVector<SDValue, 5> NewResults;
16086 for (unsigned i = 0; i < NumResultVecs; ++i)
16087 NewResults.push_back(SDValue(UpdN.getNode(), i));
16088
 16089 // If this is a non-standard-aligned LOAD, the first result is the loaded
16090 // value. Bitcast it to the expected result type.
16091 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
16092 SDValue &LdVal = NewResults[0];
16093 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
16094 }
16095
16096 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16097 DCI.CombineTo(N, NewResults);
16098 DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
16099
16100 return true;
16101}
16102
16103// If (opcode ptr inc) is an ADD-like instruction, return the
16104// increment value. Otherwise return 0.
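// The ISD::OR case below relies on DAG.haveNoCommonBitsSet: when the pointer and the
// increment share no set bits, (or ptr, inc) computes the same address as
// (add ptr, inc).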
16105static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
16106 SDValue Inc, const SelectionDAG &DAG) {
16107 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
16108 if (!CInc)
16109 return 0;
16110
16111 switch (Opcode) {
16112 case ARMISD::VLD1_UPD:
16113 case ISD::ADD:
16114 return CInc->getZExtValue();
16115 case ISD::OR: {
16116 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
16117 // (OR ptr inc) is the same as (ADD ptr inc)
16118 return CInc->getZExtValue();
16119 }
16120 return 0;
16121 }
16122 default:
16123 return 0;
16124 }
16125}
16126
16127static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
 16128 switch (N->getOpcode()) {
16129 case ISD::ADD:
16130 case ISD::OR: {
16131 if (isa<ConstantSDNode>(N->getOperand(1))) {
16132 *Ptr = N->getOperand(0);
16133 *CInc = N->getOperand(1);
16134 return true;
16135 }
16136 return false;
16137 }
16138 case ARMISD::VLD1_UPD: {
16139 if (isa<ConstantSDNode>(N->getOperand(2))) {
16140 *Ptr = N->getOperand(1);
16141 *CInc = N->getOperand(2);
16142 return true;
16143 }
16144 return false;
16145 }
16146 default:
16147 return false;
16148 }
16149}
16150
16151static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
 16152 // Check that the add is independent of the load/store.
 16153 // Otherwise, folding it would create a cycle. Search through Addr
 16154 // as well, since the User may not be a direct user of Addr and
 16155 // may only share a base pointer.
 16156 SmallPtrSet<const SDNode *, 32> Visited;
 16157 SmallVector<const SDNode *, 16> Worklist;
16158 Worklist.push_back(N);
16159 Worklist.push_back(User);
16160 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
16161 SDNode::hasPredecessorHelper(User, Visited, Worklist))
16162 return false;
16163 return true;
16164}
16165
16166/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
16167/// NEON load/store intrinsics, and generic vector load/stores, to merge
16168/// base address updates.
16169/// For generic load/stores, the memory type is assumed to be a vector.
16170/// The caller is assumed to have checked legality.
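/// In practice this turns, for example, a vld1 followed by "add r0, r0, #16" into the
/// post-incremented form "vld1.32 {d0, d1}, [r0]!", or into the register-writeback
/// form "[r0], r1" when the increment is not the access size.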
16171static SDValue CombineBaseUpdate(SDNode *N,
 16172 TargetLowering::DAGCombinerInfo &DCI) {
 16173 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
16174 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
16175 const bool isStore = N->getOpcode() == ISD::STORE;
16176 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
16177 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
16178
16179 SDValue Addr = N->getOperand(AddrOpIdx);
 16180
 16181 SmallVector<BaseUpdateUser, 8> BaseUpdates;
16182
16183 // Search for a use of the address operand that is an increment.
16184 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
16185 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
16186 SDNode *User = *UI;
16187 if (UI.getUse().getResNo() != Addr.getResNo() ||
16188 User->getNumOperands() != 2)
16189 continue;
16190
16191 SDValue Inc = User->getOperand(UI.getOperandNo() == 1 ? 0 : 1);
16192 unsigned ConstInc =
16193 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
16194
16195 if (ConstInc || User->getOpcode() == ISD::ADD)
16196 BaseUpdates.push_back({User, Inc, ConstInc});
16197 }
16198
16199 // If the address is a constant pointer increment itself, find
16200 // another constant increment that has the same base operand
16201 SDValue Base;
16202 SDValue CInc;
16203 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
16204 unsigned Offset =
16205 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
16206 for (SDNode::use_iterator UI = Base->use_begin(), UE = Base->use_end();
16207 UI != UE; ++UI) {
16208
16209 SDNode *User = *UI;
16210 if (UI.getUse().getResNo() != Base.getResNo() || User == Addr.getNode() ||
16211 User->getNumOperands() != 2)
16212 continue;
16213
16214 SDValue UserInc = User->getOperand(UI.getOperandNo() == 0 ? 1 : 0);
16215 unsigned UserOffset =
16216 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
16217
16218 if (!UserOffset || UserOffset <= Offset)
16219 continue;
16220
16221 unsigned NewConstInc = UserOffset - Offset;
16222 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16223 BaseUpdates.push_back({User, NewInc, NewConstInc});
16224 }
16225 }
16226
16227 // Try to fold the load/store with an update that matches memory
16228 // access size. This should work well for sequential loads.
16229 //
16230 // Filter out invalid updates as well.
16231 unsigned NumValidUpd = BaseUpdates.size();
16232 for (unsigned I = 0; I < NumValidUpd;) {
16233 BaseUpdateUser &User = BaseUpdates[I];
16234 if (!isValidBaseUpdate(N, User.N)) {
16235 --NumValidUpd;
16236 std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]);
16237 continue;
16238 }
16239
16240 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16241 return SDValue();
16242 ++I;
16243 }
16244 BaseUpdates.resize(NumValidUpd);
16245
16246 // Try to fold with other users. Non-constant updates are considered
16247 // first, and constant updates are sorted to not break a sequence of
16248 // strided accesses (if there is any).
16249 std::stable_sort(BaseUpdates.begin(), BaseUpdates.end(),
16250 [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16251 return LHS.ConstInc < RHS.ConstInc;
16252 });
16253 for (BaseUpdateUser &User : BaseUpdates) {
16254 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16255 return SDValue();
16256 }
16257 return SDValue();
16258}
16259
16260static SDValue PerformVLDCombine(SDNode *N,
 16261 TargetLowering::DAGCombinerInfo &DCI) {
 16262 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16263 return SDValue();
16264
16265 return CombineBaseUpdate(N, DCI);
16266}
16267
16268static SDValue PerformMVEVLDCombine(SDNode *N,
 16269 TargetLowering::DAGCombinerInfo &DCI) {
 16270 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16271 return SDValue();
16272
16273 SelectionDAG &DAG = DCI.DAG;
16274 SDValue Addr = N->getOperand(2);
16275 MemSDNode *MemN = cast<MemSDNode>(N);
16276 SDLoc dl(N);
16277
16278 // For the stores, where there are multiple intrinsics we only actually want
 16279 // to post-inc the last of them.
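// The trailing constant operand of these intrinsics is the stage index, so only the
// final stage (1 of 2 for vst2q, 3 of 4 for vst4q) is considered for post-increment.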
16280 unsigned IntNo = N->getConstantOperandVal(1);
16281 if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16282 return SDValue();
16283 if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16284 return SDValue();
16285
16286 // Search for a use of the address operand that is an increment.
16287 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
16288 UE = Addr.getNode()->use_end();
16289 UI != UE; ++UI) {
16290 SDNode *User = *UI;
16291 if (User->getOpcode() != ISD::ADD ||
16292 UI.getUse().getResNo() != Addr.getResNo())
16293 continue;
16294
16295 // Check that the add is independent of the load/store. Otherwise, folding
16296 // it would create a cycle. We can avoid searching through Addr as it's a
16297 // predecessor to both.
16298 SmallPtrSet<const SDNode *, 32> Visited;
16299 SmallVector<const SDNode *, 16> Worklist;
16300 Visited.insert(Addr.getNode());
16301 Worklist.push_back(N);
16302 Worklist.push_back(User);
16303 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
16304 SDNode::hasPredecessorHelper(User, Visited, Worklist))
16305 continue;
16306
16307 // Find the new opcode for the updating load/store.
16308 bool isLoadOp = true;
16309 unsigned NewOpc = 0;
16310 unsigned NumVecs = 0;
16311 switch (IntNo) {
16312 default:
16313 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16314 case Intrinsic::arm_mve_vld2q:
16315 NewOpc = ARMISD::VLD2_UPD;
16316 NumVecs = 2;
16317 break;
16318 case Intrinsic::arm_mve_vld4q:
16319 NewOpc = ARMISD::VLD4_UPD;
16320 NumVecs = 4;
16321 break;
16322 case Intrinsic::arm_mve_vst2q:
16323 NewOpc = ARMISD::VST2_UPD;
16324 NumVecs = 2;
16325 isLoadOp = false;
16326 break;
16327 case Intrinsic::arm_mve_vst4q:
16328 NewOpc = ARMISD::VST4_UPD;
16329 NumVecs = 4;
16330 isLoadOp = false;
16331 break;
16332 }
16333
16334 // Find the size of memory referenced by the load/store.
16335 EVT VecTy;
16336 if (isLoadOp) {
16337 VecTy = N->getValueType(0);
16338 } else {
16339 VecTy = N->getOperand(3).getValueType();
16340 }
16341
16342 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16343
16344 // If the increment is a constant, it must match the memory ref size.
16345 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
16346 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
16347 if (!CInc || CInc->getZExtValue() != NumBytes)
16348 continue;
16349
16350 // Create the new updating load/store node.
16351 // First, create an SDVTList for the new updating node's results.
16352 EVT Tys[6];
16353 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16354 unsigned n;
16355 for (n = 0; n < NumResultVecs; ++n)
16356 Tys[n] = VecTy;
16357 Tys[n++] = MVT::i32;
16358 Tys[n] = MVT::Other;
16359 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16360
16361 // Then, gather the new node's operands.
16362 SmallVector<SDValue, 8> Ops;
16363 Ops.push_back(N->getOperand(0)); // incoming chain
16364 Ops.push_back(N->getOperand(2)); // ptr
16365 Ops.push_back(Inc);
16366
16367 for (unsigned i = 3; i < N->getNumOperands(); ++i)
16368 Ops.push_back(N->getOperand(i));
16369
16370 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
16371 MemN->getMemOperand());
16372
16373 // Update the uses.
16374 SmallVector<SDValue, 5> NewResults;
16375 for (unsigned i = 0; i < NumResultVecs; ++i)
16376 NewResults.push_back(SDValue(UpdN.getNode(), i));
16377
16378 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16379 DCI.CombineTo(N, NewResults);
16380 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
16381
16382 break;
16383 }
16384
16385 return SDValue();
16386}
16387
16388/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16389/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16390/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16391/// return true.
16392static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
16393 SelectionDAG &DAG = DCI.DAG;
16394 EVT VT = N->getValueType(0);
16395 // vldN-dup instructions only support 64-bit vectors for N > 1.
16396 if (!VT.is64BitVector())
16397 return false;
16398
16399 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
16400 SDNode *VLD = N->getOperand(0).getNode();
16401 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16402 return false;
16403 unsigned NumVecs = 0;
16404 unsigned NewOpc = 0;
16405 unsigned IntNo = VLD->getConstantOperandVal(1);
16406 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16407 NumVecs = 2;
16408 NewOpc = ARMISD::VLD2DUP;
16409 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16410 NumVecs = 3;
16411 NewOpc = ARMISD::VLD3DUP;
16412 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16413 NumVecs = 4;
16414 NewOpc = ARMISD::VLD4DUP;
16415 } else {
16416 return false;
16417 }
16418
16419 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16420 // numbers match the load.
16421 unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16422 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
16423 UI != UE; ++UI) {
16424 // Ignore uses of the chain result.
16425 if (UI.getUse().getResNo() == NumVecs)
16426 continue;
16427 SDNode *User = *UI;
16428 if (User->getOpcode() != ARMISD::VDUPLANE ||
16429 VLDLaneNo != User->getConstantOperandVal(1))
16430 return false;
16431 }
16432
16433 // Create the vldN-dup node.
16434 EVT Tys[5];
16435 unsigned n;
16436 for (n = 0; n < NumVecs; ++n)
16437 Tys[n] = VT;
16438 Tys[n] = MVT::Other;
16439 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
16440 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
16441 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
16442 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16443 Ops, VLDMemInt->getMemoryVT(),
16444 VLDMemInt->getMemOperand());
16445
16446 // Update the uses.
16447 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
16448 UI != UE; ++UI) {
16449 unsigned ResNo = UI.getUse().getResNo();
16450 // Ignore uses of the chain result.
16451 if (ResNo == NumVecs)
16452 continue;
16453 SDNode *User = *UI;
16454 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
16455 }
16456
16457 // Now the vldN-lane intrinsic is dead except for its chain result.
16458 // Update uses of the chain.
16459 std::vector<SDValue> VLDDupResults;
16460 for (unsigned n = 0; n < NumVecs; ++n)
16461 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16462 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16463 DCI.CombineTo(VLD, VLDDupResults);
16464
16465 return true;
16466}
16467
16468/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16469/// ARMISD::VDUPLANE.
16470static SDValue PerformVDUPLANECombine(SDNode *N,
16471 TargetLowering::DAGCombinerInfo &DCI,
16472 const ARMSubtarget *Subtarget) {
16473 SDValue Op = N->getOperand(0);
16474 EVT VT = N->getValueType(0);
16475
16476 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16477 if (Subtarget->hasMVEIntegerOps()) {
16478 EVT ExtractVT = VT.getVectorElementType();
16479 // We need to ensure we are creating a legal type.
16480 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16481 ExtractVT = MVT::i32;
16482 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
16483 N->getOperand(0), N->getOperand(1));
16484 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
16485 }
16486
16487 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16488 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
16489 if (CombineVLDDUP(N, DCI))
16490 return SDValue(N, 0);
16491
16492 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16493 // redundant. Ignore bit_converts for now; element sizes are checked below.
16494 while (Op.getOpcode() == ISD::BITCAST)
16495 Op = Op.getOperand(0);
16496 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16497 return SDValue();
16498
16499 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16500 unsigned EltSize = Op.getScalarValueSizeInBits();
16501 // The canonical VMOV for a zero vector uses a 32-bit element size.
16502 unsigned Imm = Op.getConstantOperandVal(0);
16503 unsigned EltBits;
16504 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
16505 EltSize = 8;
16506 if (EltSize > VT.getScalarSizeInBits())
16507 return SDValue();
16508
16509 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
16510}
16511
16512/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
16513static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
16514 const ARMSubtarget *Subtarget) {
16515 SDValue Op = N->getOperand(0);
16516 SDLoc dl(N);
16517
16518 if (Subtarget->hasMVEIntegerOps()) {
16519 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16520 // need to come from a GPR.
16521 if (Op.getValueType() == MVT::f32)
16522 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16523 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
16524 else if (Op.getValueType() == MVT::f16)
16525 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16526 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16527 }
16528
16529 if (!Subtarget->hasNEON())
16530 return SDValue();
16531
16532 // Match VDUP(LOAD) -> VLD1DUP.
16533 // We match this pattern here rather than waiting for isel because the
16534 // transform is only legal for unindexed loads.
16535 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
16536 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16537 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
16538 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16539 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16540 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16541 SDValue VLDDup =
16542 DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
16543 LD->getMemoryVT(), LD->getMemOperand());
16544 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
16545 return VLDDup;
16546 }
16547
16548 return SDValue();
16549}
16550
16551static SDValue PerformLOADCombine(SDNode *N,
16552 TargetLowering::DAGCombinerInfo &DCI,
16553 const ARMSubtarget *Subtarget) {
16554 EVT VT = N->getValueType(0);
16555
16556 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16557 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16558 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16559 return CombineBaseUpdate(N, DCI);
16560
16561 return SDValue();
16562}
16563
16564// Optimize trunc store (of multiple scalars) to shuffle and store. First,
16565// pack all of the elements in one place. Next, store to memory in fewer
16566// chunks.
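// Illustrative example (assumed types, not from the source): a truncating
// store of <8 x i16> to <8 x i8> is rewritten as a shuffle that packs the
// eight truncated bytes into the low half of a <16 x i8> register, which is
// then written out as two i32-sized stores.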
16567static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
16568 SelectionDAG &DAG) {
16569 SDValue StVal = St->getValue();
16570 EVT VT = StVal.getValueType();
16571 if (!St->isTruncatingStore() || !VT.isVector())
16572 return SDValue();
16573 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16574 EVT StVT = St->getMemoryVT();
16575 unsigned NumElems = VT.getVectorNumElements();
16576 assert(StVT != VT && "Cannot truncate to the same type");
16577 unsigned FromEltSz = VT.getScalarSizeInBits();
16578 unsigned ToEltSz = StVT.getScalarSizeInBits();
16579
16580 // The From and To sizes and the element count must be powers of two.
16581 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
16582 return SDValue();
16583
16584 // We are going to use the original vector elt for storing.
16585 // Accumulated smaller vector elements must be a multiple of the store size.
16586 if (0 != (NumElems * FromEltSz) % ToEltSz)
16587 return SDValue();
16588
16589 unsigned SizeRatio = FromEltSz / ToEltSz;
16590 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16591
16592 // Create a type on which we perform the shuffle.
16593 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
16594 NumElems * SizeRatio);
16595 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16596
16597 SDLoc DL(St);
16598 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
16599 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16600 for (unsigned i = 0; i < NumElems; ++i)
16601 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16602 : i * SizeRatio;
16603
16604 // Can't shuffle using an illegal type.
16605 if (!TLI.isTypeLegal(WideVecVT))
16606 return SDValue();
16607
16608 SDValue Shuff = DAG.getVectorShuffle(
16609 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
16610 // At this point all of the data is stored at the bottom of the
16611 // register. We now need to save it to mem.
16612
16613 // Find the largest store unit
16614 MVT StoreType = MVT::i8;
16615 for (MVT Tp : MVT::integer_valuetypes()) {
16616 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16617 StoreType = Tp;
16618 }
16619 // Didn't find a legal store type.
16620 if (!TLI.isTypeLegal(StoreType))
16621 return SDValue();
16622
16623 // Bitcast the original vector into a vector of store-size units
16624 EVT StoreVecVT =
16625 EVT::getVectorVT(*DAG.getContext(), StoreType,
16626 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16627 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16628 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
16629 SmallVector<SDValue, 8> Chains;
16630 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
16631 TLI.getPointerTy(DAG.getDataLayout()));
16632 SDValue BasePtr = St->getBasePtr();
16633
16634 // Perform one or more big stores into memory.
16635 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16636 for (unsigned I = 0; I < E; I++) {
16637 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
16638 ShuffWide, DAG.getIntPtrConstant(I, DL));
16639 SDValue Ch =
16640 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
16641 St->getAlign(), St->getMemOperand()->getFlags());
16642 BasePtr =
16643 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
16644 Chains.push_back(Ch);
16645 }
16646 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16647}
16648
16649// Try taking a single vector store from an fpround (which would otherwise turn
16650// into an expensive buildvector) and splitting it into a series of narrowing
16651// stores.
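// Illustrative example (assumed types, not from the source): a store of
// (fp_round <8 x float> to <8 x half>) is emitted as two ARMISD::VCVTN
// conversions of <4 x float> slices, each written with a narrowing
// v4i32 -> v4i16 truncating store at the matching offset.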
16652static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
16653 SelectionDAG &DAG) {
16654 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16655 return SDValue();
16656 SDValue Trunc = St->getValue();
16657 if (Trunc->getOpcode() != ISD::FP_ROUND)
16658 return SDValue();
16659 EVT FromVT = Trunc->getOperand(0).getValueType();
16660 EVT ToVT = Trunc.getValueType();
16661 if (!ToVT.isVector())
16662 return SDValue();
16663 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
16664 EVT ToEltVT = ToVT.getVectorElementType();
16665 EVT FromEltVT = FromVT.getVectorElementType();
16666
16667 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16668 return SDValue();
16669
16670 unsigned NumElements = 4;
16671 if (FromVT.getVectorNumElements() % NumElements != 0)
16672 return SDValue();
16673
16674 // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
16675 // use the VMOVN over splitting the store. We are looking for patterns of:
16676 // !rev: 0 N 1 N+1 2 N+2 ...
16677 // rev: N 0 N+1 1 N+2 2 ...
16678 // The shuffle may either be a single source (in which case N = NumElts/2) or
16679 // two inputs extended with concat to the same size (in which case N =
16680 // NumElts).
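// For instance (illustrative, assuming a single-source v8f16 shuffle):
// NumElts is halved to 4 and the !rev form accepts masks starting
// <0, 4, 1, 5, ...>, interleaving the low and high halves of the input.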
16681 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16682 ArrayRef<int> M = SVN->getMask();
16683 unsigned NumElts = ToVT.getVectorNumElements();
16684 if (SVN->getOperand(1).isUndef())
16685 NumElts /= 2;
16686
16687 unsigned Off0 = Rev ? NumElts : 0;
16688 unsigned Off1 = Rev ? 0 : NumElts;
16689
16690 for (unsigned I = 0; I < NumElts; I += 2) {
16691 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16692 return false;
16693 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16694 return false;
16695 }
16696
16697 return true;
16698 };
16699
16700 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
16701 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16702 return SDValue();
16703
16704 LLVMContext &C = *DAG.getContext();
16705 SDLoc DL(St);
16706 // Details about the old store
16707 SDValue Ch = St->getChain();
16708 SDValue BasePtr = St->getBasePtr();
16709 Align Alignment = St->getOriginalAlign();
16710 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16711 AAMDNodes AAInfo = St->getAAInfo();
16712
16713 // We split the store into slices of NumElements. fp16 trunc stores are vcvt
16714 // and then stored as truncating integer stores.
16715 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
16716 EVT NewToVT = EVT::getVectorVT(
16717 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
16718
16719 SmallVector<SDValue, 4> Stores;
16720 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16721 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16722 SDValue NewPtr =
16723 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16724
16725 SDValue Extract =
16726 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16727 DAG.getConstant(i * NumElements, DL, MVT::i32));
16728
16729 SDValue FPTrunc =
16730 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16731 Extract, DAG.getConstant(0, DL, MVT::i32));
16732 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16733
16734 SDValue Store = DAG.getTruncStore(
16735 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16736 NewToVT, Alignment, MMOFlags, AAInfo);
16737 Stores.push_back(Store);
16738 }
16739 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16740}
16741
16742// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16743// into an expensive buildvector) and splitting it into a series of narrowing
16744// stores.
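// Illustrative example (assumed types, not from the source): storing an
// ARMISD::MVETRUNC of two <4 x i32> operands as <8 x i16> becomes two
// v4i32 -> v4i16 truncating stores, the second at an 8-byte offset.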
16745static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
16746 SelectionDAG &DAG) {
16747 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16748 return SDValue();
16749 SDValue Trunc = St->getValue();
16750 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16751 return SDValue();
16752 EVT FromVT = Trunc->getOperand(0).getValueType();
16753 EVT ToVT = Trunc.getValueType();
16754
16755 LLVMContext &C = *DAG.getContext();
16756 SDLoc DL(St);
16757 // Details about the old store
16758 SDValue Ch = St->getChain();
16759 SDValue BasePtr = St->getBasePtr();
16760 Align Alignment = St->getOriginalAlign();
16761 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16762 AAMDNodes AAInfo = St->getAAInfo();
16763
16764 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
16765 FromVT.getVectorNumElements());
16766
16767 SmallVector<SDValue, 4> Stores;
16768 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
16769 unsigned NewOffset =
16770 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16771 SDValue NewPtr =
16772 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16773
16774 SDValue Extract = Trunc.getOperand(i);
16775 SDValue Store = DAG.getTruncStore(
16776 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16777 NewToVT, Alignment, MMOFlags, AAInfo);
16778 Stores.push_back(Store);
16779 }
16780 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16781}
16782
16783// Given a floating point store from an extracted vector, with an integer
16784// VGETLANE that already exists, store the existing VGETLANEu directly. This can
16785// help reduce fp register pressure, doesn't require the fp extract and allows
16786// use of more integer post-inc stores not available with vstr.
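// Illustrative shape (assumed example, not from the source):
//   %i = ARMISD::VGETLANEu %vec, %lane   ; integer lane read already in the DAG
//   %f = extractelement %vec, %lane      ; f16 extract feeding the store
//   store %f, %ptr
// becomes a truncating i16 store of %i, so the value never has to pass
// through an FP register just to be stored.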
16787static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
16788 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16789 return SDValue();
16790 SDValue Extract = St->getValue();
16791 EVT VT = Extract.getValueType();
16792 // For now only uses f16. This may be useful for f32 too, but that will
16793 // be bitcast(extract), not the VGETLANEu we currently check here.
16794 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16795 return SDValue();
16796
16797 SDNode *GetLane =
16798 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16799 {Extract.getOperand(0), Extract.getOperand(1)});
16800 if (!GetLane)
16801 return SDValue();
16802
16803 LLVMContext &C = *DAG.getContext();
16804 SDLoc DL(St);
16805 // Create a new integer store to replace the existing floating point version.
16806 SDValue Ch = St->getChain();
16807 SDValue BasePtr = St->getBasePtr();
16808 Align Alignment = St->getOriginalAlign();
16809 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16810 AAMDNodes AAInfo = St->getAAInfo();
16811 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
16812 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
16813 St->getPointerInfo(), NewToVT, Alignment,
16814 MMOFlags, AAInfo);
16815
16816 return Store;
16817}
16818
16819/// PerformSTORECombine - Target-specific dag combine xforms for
16820/// ISD::STORE.
16821static SDValue PerformSTORECombine(SDNode *N,
16822 TargetLowering::DAGCombinerInfo &DCI,
16823 const ARMSubtarget *Subtarget) {
16824 StoreSDNode *St = cast<StoreSDNode>(N);
16825 if (St->isVolatile())
16826 return SDValue();
16827 SDValue StVal = St->getValue();
16828 EVT VT = StVal.getValueType();
16829
16830 if (Subtarget->hasNEON())
16831 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16832 return Store;
16833
16834 if (Subtarget->hasMVEFloatOps())
16835 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16836 return NewToken;
16837
16838 if (Subtarget->hasMVEIntegerOps()) {
16839 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16840 return NewChain;
16841 if (SDValue NewToken =
16842 PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))
16843 return NewToken;
16844 }
16845
16846 if (!ISD::isNormalStore(St))
16847 return SDValue();
16848
16849 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16850 // ARM stores of arguments in the same cache line.
16851 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16852 StVal.getNode()->hasOneUse()) {
16853 SelectionDAG &DAG = DCI.DAG;
16854 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16855 SDLoc DL(St);
16856 SDValue BasePtr = St->getBasePtr();
16857 SDValue NewST1 = DAG.getStore(
16858 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
16859 BasePtr, St->getPointerInfo(), St->getOriginalAlign(),
16860 St->getMemOperand()->getFlags());
16861
16862 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16863 DAG.getConstant(4, DL, MVT::i32));
16864 return DAG.getStore(NewST1.getValue(0), DL,
16865 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
16866 OffsetPtr, St->getPointerInfo().getWithOffset(4),
16867 St->getOriginalAlign(),
16868 St->getMemOperand()->getFlags());
16869 }
16870
16871 if (StVal.getValueType() == MVT::i64 &&
16872 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
16873
16874 // Bitcast an i64 store extracted from a vector to f64.
16875 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16876 SelectionDAG &DAG = DCI.DAG;
16877 SDLoc dl(StVal);
16878 SDValue IntVec = StVal.getOperand(0);
16879 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16880 IntVec.getValueType().getVectorNumElements());
16881 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16882 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16883 Vec, StVal.getOperand(1));
16884 dl = SDLoc(N);
16885 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16886 // Make the DAGCombiner fold the bitcasts.
16887 DCI.AddToWorklist(Vec.getNode());
16888 DCI.AddToWorklist(ExtElt.getNode());
16889 DCI.AddToWorklist(V.getNode());
16890 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16891 St->getPointerInfo(), St->getAlign(),
16892 St->getMemOperand()->getFlags(), St->getAAInfo());
16893 }
16894
16895 // If this is a legal vector store, try to combine it into a VST1_UPD.
16896 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16897 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16898 return CombineBaseUpdate(N, DCI);
16899
16900 return SDValue();
16901}
16902
16903/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16904/// can replace combinations of VMUL and VCVT (floating-point to integer)
16905/// when the VMUL has a constant operand that is a power of 2.
16906///
16907/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16908/// vmul.f32 d16, d17, d16
16909/// vcvt.s32.f32 d16, d16
16910/// becomes:
16911/// vcvt.s32.f32 d16, d16, #3
16912static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
16913 const ARMSubtarget *Subtarget) {
16914 if (!Subtarget->hasNEON())
16915 return SDValue();
16916
16917 SDValue Op = N->getOperand(0);
16918 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16919 Op.getOpcode() != ISD::FMUL)
16920 return SDValue();
16921
16922 SDValue ConstVec = Op->getOperand(1);
16923 if (!isa<BuildVectorSDNode>(ConstVec))
16924 return SDValue();
16925
16926 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16927 uint32_t FloatBits = FloatTy.getSizeInBits();
16928 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
16929 uint32_t IntBits = IntTy.getSizeInBits();
16930 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16931 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16932 // These instructions only exist converting from f32 to i32. We can handle
16933 // smaller integers by generating an extra truncate, but larger ones would
16934 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16935 // these instructions only support v2i32/v4i32 types.
16936 return SDValue();
16937 }
16938
16939 BitVector UndefElements;
16940 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
16941 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
16942 if (C == -1 || C == 0 || C > 32)
16943 return SDValue();
16944
16945 SDLoc dl(N);
16946 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16947 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16948 Intrinsic::arm_neon_vcvtfp2fxu;
16949 SDValue FixConv = DAG.getNode(
16950 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16951 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16952 DAG.getConstant(C, dl, MVT::i32));
16953
16954 if (IntBits < FloatBits)
16955 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
16956
16957 return FixConv;
16958}
16959
16960static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG,
16961 const ARMSubtarget *Subtarget) {
16962 if (!Subtarget->hasMVEFloatOps())
16963 return SDValue();
16964
16965 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
16966 // The second form can be more easily turned into a predicated vadd, and
16967 // possibly combined into a fma to become a predicated vfma.
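// Illustrative example (assumed operands, not from the source): with a
// predicate %p and x, y of type <4 x float>,
//   fadd x, (vselect %p, y, splat(-0.0))
// becomes
//   vselect %p, (fadd x, y), x
// so inactive lanes simply pass x through unchanged.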
16968 SDValue Op0 = N->getOperand(0);
16969 SDValue Op1 = N->getOperand(1);
16970 EVT VT = N->getValueType(0);
16971 SDLoc DL(N);
16972
16973 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
16974 // which these VMOV's represent.
16975 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
16976 if (Op.getOpcode() != ISD::BITCAST ||
16977 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
16978 return false;
16979 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
16980 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
16981 return true;
16982 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
16983 return true;
16984 return false;
16985 };
16986
16987 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
16988 std::swap(Op0, Op1);
16989
16990 if (Op1.getOpcode() != ISD::VSELECT)
16991 return SDValue();
16992
16993 SDNodeFlags FaddFlags = N->getFlags();
16994 bool NSZ = FaddFlags.hasNoSignedZeros();
16995 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
16996 return SDValue();
16997
16998 SDValue FAdd =
16999 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
17000 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
17001}
17002
17003static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG) {
17004 SDValue LHS = N->getOperand(0);
17005 SDValue RHS = N->getOperand(1);
17006 EVT VT = N->getValueType(0);
17007 SDLoc DL(N);
17008
17009 if (!N->getFlags().hasAllowReassociation())
17010 return SDValue();
17011
17012 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
17013 auto ReassocComplex = [&](SDValue A, SDValue B) {
17014 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
17015 return SDValue();
17016 unsigned Opc = A.getConstantOperandVal(0);
17017 if (Opc != Intrinsic::arm_mve_vcmlaq)
17018 return SDValue();
17019 SDValue VCMLA = DAG.getNode(
17020 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
17021 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
17022 A.getOperand(3), A.getOperand(4));
17023 VCMLA->setFlags(A->getFlags());
17024 return VCMLA;
17025 };
17026 if (SDValue R = ReassocComplex(LHS, RHS))
17027 return R;
17028 if (SDValue R = ReassocComplex(RHS, LHS))
17029 return R;
17030
17031 return SDValue();
17032}
17033
17034static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
17035 const ARMSubtarget *Subtarget) {
17036 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
17037 return S;
17038 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
17039 return S;
17040 return SDValue();
17041}
17042
17043/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
17044/// can replace combinations of VCVT (integer to floating-point) and VMUL
17045/// when the VMUL has a constant operand that is a power of 2.
17046///
17047/// Example (assume d17 = <float 0.125, float 0.125>):
17048/// vcvt.f32.s32 d16, d16
17049/// vmul.f32 d16, d16, d17
17050/// becomes:
17051/// vcvt.f32.s32 d16, d16, #3
17052static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG,
17053 const ARMSubtarget *Subtarget) {
17054 if (!Subtarget->hasNEON())
17055 return SDValue();
17056
17057 SDValue Op = N->getOperand(0);
17058 unsigned OpOpcode = Op.getNode()->getOpcode();
17059 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
17060 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
17061 return SDValue();
17062
17063 SDValue ConstVec = N->getOperand(1);
17064 if (!isa<BuildVectorSDNode>(ConstVec))
17065 return SDValue();
17066
17067 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17068 uint32_t FloatBits = FloatTy.getSizeInBits();
17069 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17070 uint32_t IntBits = IntTy.getSizeInBits();
17071 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17072 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
17073 // These instructions only exist converting from i32 to f32. We can handle
17074 // smaller integers by generating an extra extend, but larger ones would
17075 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
17076 // these instructions only support v2i32/v4i32 types.
17077 return SDValue();
17078 }
17079
17080 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
17081 APFloat Recip(0.0f);
17082 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
17083 return SDValue();
17084
17085 bool IsExact;
17086 APSInt IntVal(33);
17087 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
17088 APFloat::opOK ||
17089 !IsExact)
17090 return SDValue();
17091
17092 int32_t C = IntVal.exactLogBase2();
17093 if (C == -1 || C == 0 || C > 32)
17094 return SDValue();
17095
17096 SDLoc DL(N);
17097 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
17098 SDValue ConvInput = Op.getOperand(0);
17099 if (IntBits < FloatBits)
17100 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17101 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
17102
17103 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
17104 : Intrinsic::arm_neon_vcvtfxu2fp;
17105 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17106 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17107 DAG.getConstant(C, DL, MVT::i32));
17108}
17109
17110static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
17111 const ARMSubtarget *ST) {
17112 if (!ST->hasMVEIntegerOps())
17113 return SDValue();
17114
17115 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
17116 EVT ResVT = N->getValueType(0);
17117 SDValue N0 = N->getOperand(0);
17118 SDLoc dl(N);
17119
17120 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
17121 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
17122 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
17123 N0.getValueType() == MVT::v16i8)) {
17124 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
17125 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
17126 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
17127 }
17128
17129 // We are looking for something that will have illegal types if left alone,
17130 // but that we can convert to a single instruction under MVE. For example
17131 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
17132 // or
17133 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
17134
17135 // The legal cases are:
17136 // VADDV u/s 8/16/32
17137 // VMLAV u/s 8/16/32
17138 // VADDLV u/s 32
17139 // VMLALV u/s 16/32
17140
17141 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
17142 // extend it and use v4i32 instead.
17143 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
17144 EVT AVT = A.getValueType();
17145 return any_of(ExtTypes, [&](MVT Ty) {
17146 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
17147 AVT.bitsLE(Ty);
17148 });
17149 };
17150 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
17151 EVT AVT = A.getValueType();
17152 if (!AVT.is128BitVector())
17153 A = DAG.getNode(ExtendCode, dl,
17154 AVT.changeVectorElementType(MVT::getIntegerVT(
17155 128 / AVT.getVectorMinNumElements())),
17156 A);
17157 return A;
17158 };
17159 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
17160 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
17161 return SDValue();
17162 SDValue A = N0->getOperand(0);
17163 if (ExtTypeMatches(A, ExtTypes))
17164 return ExtendIfNeeded(A, ExtendCode);
17165 return SDValue();
17166 };
17167 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
17168 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
17169 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17170 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17171 return SDValue();
17172 Mask = N0->getOperand(0);
17173 SDValue Ext = N0->getOperand(1);
17174 if (Ext->getOpcode() != ExtendCode)
17175 return SDValue();
17176 SDValue A = Ext->getOperand(0);
17177 if (ExtTypeMatches(A, ExtTypes))
17178 return ExtendIfNeeded(A, ExtendCode);
17179 return SDValue();
17180 };
17181 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17182 SDValue &A, SDValue &B) {
17183 // For a vmla we are trying to match a larger pattern:
17184 // ExtA = sext/zext A
17185 // ExtB = sext/zext B
17186 // Mul = mul ExtA, ExtB
17187 // vecreduce.add Mul
17188 // There might also be an extra extend between the mul and the addreduce, so
17189 // long as the bitwidth is high enough to make them equivalent (for example
17190 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
17191 if (ResVT != RetTy)
17192 return false;
17193 SDValue Mul = N0;
17194 if (Mul->getOpcode() == ExtendCode &&
17195 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17196 ResVT.getScalarSizeInBits())
17197 Mul = Mul->getOperand(0);
17198 if (Mul->getOpcode() != ISD::MUL)
17199 return false;
17200 SDValue ExtA = Mul->getOperand(0);
17201 SDValue ExtB = Mul->getOperand(1);
17202 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17203 return false;
17204 A = ExtA->getOperand(0);
17205 B = ExtB->getOperand(0);
17206 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17207 A = ExtendIfNeeded(A, ExtendCode);
17208 B = ExtendIfNeeded(B, ExtendCode);
17209 return true;
17210 }
17211 return false;
17212 };
17213 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17214 SDValue &A, SDValue &B, SDValue &Mask) {
17215 // Same as the pattern above with a select for the zero predicated lanes
17216 // ExtA = sext/zext A
17217 // ExtB = sext/zext B
17218 // Mul = mul ExtA, ExtB
17219 // N0 = select Mask, Mul, 0
17220 // vecreduce.add N0
17221 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17222 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17223 return false;
17224 Mask = N0->getOperand(0);
17225 SDValue Mul = N0->getOperand(1);
17226 if (Mul->getOpcode() == ExtendCode &&
17227 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17228 ResVT.getScalarSizeInBits())
17229 Mul = Mul->getOperand(0);
17230 if (Mul->getOpcode() != ISD::MUL)
17231 return false;
17232 SDValue ExtA = Mul->getOperand(0);
17233 SDValue ExtB = Mul->getOperand(1);
17234 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17235 return false;
17236 A = ExtA->getOperand(0);
17237 B = ExtB->getOperand(0);
17238 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17239 A = ExtendIfNeeded(A, ExtendCode);
17240 B = ExtendIfNeeded(B, ExtendCode);
17241 return true;
17242 }
17243 return false;
17244 };
17245 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17246 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17247 // reductions. The operands are extended with MVEEXT, but as they are
17248 // reductions the lane orders do not matter. MVEEXT may be combined with
17249 // loads to produce two extending loads, or else they will be expanded to
17250 // VREV/VMOVL.
17251 EVT VT = Ops[0].getValueType();
17252 if (VT == MVT::v16i8) {
17253 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17254 "Unexpected illegal long reduction opcode");
17255 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17256
17257 SDValue Ext0 =
17258 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17259 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17260 SDValue Ext1 =
17261 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17262 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17263
17264 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17265 Ext0, Ext1);
17266 SDValue MLA1 =
17267 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17268 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17269 Ext0.getValue(1), Ext1.getValue(1));
17270 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17271 }
17272 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17273 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17274 SDValue(Node.getNode(), 1));
17275 };
17276
17277 SDValue A, B;
17278 SDValue Mask;
17279 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17280 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17281 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17282 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17283 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17284 A, B))
17285 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17286 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17287 A, B))
17288 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17289 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17290 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17291 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17292 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17293 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17294 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17295
17296 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17297 Mask))
17298 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17299 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17300 Mask))
17301 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17302 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17303 Mask))
17304 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17305 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17306 Mask))
17307 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17308 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17309 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17310 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17311 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17312 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17313 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17314
17315 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17316 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17317 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17318 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17319 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17320 return Create64bitNode(ARMISD::VADDLVs, {A});
17321 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17322 return Create64bitNode(ARMISD::VADDLVu, {A});
17323 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17324 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17325 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17326 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17327 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17328 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17329
17330 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17331 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17332 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17333 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17334 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17335 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17336 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17337 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17338 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17339 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17340 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17341 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17342 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17343 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17344
17345 // Some complications. We can get a case where the two inputs of the mul are
17346 // the same, in which case the output sext will have been helpfully converted
17347 // to a zext. Turn it back.
17348 SDValue Op = N0;
17349 if (Op->getOpcode() == ISD::VSELECT)
17350 Op = Op->getOperand(1);
17351 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17352 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17353 SDValue Mul = Op->getOperand(0);
17354 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17355 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
17356 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
17357 if (Op != N0)
17358 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17359 N0->getOperand(0), Ext, N0->getOperand(2));
17360 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17361 }
17362 }
17363
17364 return SDValue();
17365}
17366
17367// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17368// the lanes are used. Due to the reduction being commutative the shuffle can be
17369// removed.
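// Illustrative example (not from the source): vaddv(shuffle %a, undef, <3,2,1,0>)
// adds exactly the same four lanes as vaddv(%a), so the shuffle is dropped and
// the reduction is rebuilt directly on %a.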
17370static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG) {
17371 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17372 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17373 if (!Shuf || !Shuf->getOperand(1).isUndef())
17374 return SDValue();
17375
17376 // Check all elements are used once in the mask.
17377 ArrayRef<int> Mask = Shuf->getMask();
17378 APInt SetElts(Mask.size(), 0);
17379 for (int E : Mask) {
17380 if (E < 0 || E >= (int)Mask.size())
17381 return SDValue();
17382 SetElts.setBit(E);
17383 }
17384 if (!SetElts.isAllOnes())
17385 return SDValue();
17386
17387 if (N->getNumOperands() != VecOp + 1) {
17388 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17389 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17390 return SDValue();
17391 }
17392
17393 SmallVector<SDValue> Ops;
17394 for (SDValue Op : N->ops()) {
17395 if (Op.getValueType().isVector())
17396 Ops.push_back(Op.getOperand(0));
17397 else
17398 Ops.push_back(Op);
17399 }
17400 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17401}
17402
17403static SDValue PerformVMOVNCombine(SDNode *N,
17404 TargetLowering::DAGCombinerInfo &DCI) {
17405 SDValue Op0 = N->getOperand(0);
17406 SDValue Op1 = N->getOperand(1);
17407 unsigned IsTop = N->getConstantOperandVal(2);
17408
17409 // VMOVNT a undef -> a
17410 // VMOVNB a undef -> a
17411 // VMOVNB undef a -> a
17412 if (Op1->isUndef())
17413 return Op0;
17414 if (Op0->isUndef() && !IsTop)
17415 return Op1;
17416
17417 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17418 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
17419 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17420 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17421 Op1->getConstantOperandVal(2) == 0)
17422 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17423 Op0, Op1->getOperand(1), N->getOperand(2));
17424
17425 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17426 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17427 // into the top or bottom lanes.
17428 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17429 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17430 APInt Op0DemandedElts =
17431 IsTop ? Op1DemandedElts
17432 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17433
17434 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17435 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17436 return SDValue(N, 0);
17437 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17438 return SDValue(N, 0);
17439
17440 return SDValue();
17441}
17442
17443static SDValue PerformVQMOVNCombine(SDNode *N,
17444 TargetLowering::DAGCombinerInfo &DCI) {
17445 SDValue Op0 = N->getOperand(0);
17446 unsigned IsTop = N->getConstantOperandVal(2);
17447
17448 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17449 APInt Op0DemandedElts =
17450 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17451 : APInt::getHighBitsSet(2, 1));
17452
17453 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17454 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17455 return SDValue(N, 0);
17456 return SDValue();
17457}
17458
17459static SDValue PerformVQDMULHCombine(SDNode *N,
17460 TargetLowering::DAGCombinerInfo &DCI) {
17461 EVT VT = N->getValueType(0);
17462 SDValue LHS = N->getOperand(0);
17463 SDValue RHS = N->getOperand(1);
17464
17465 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17466 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17467 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
17468 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17469 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17470 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17471 SDLoc DL(N);
17472 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17473 LHS.getOperand(0), RHS.getOperand(0));
17474 SDValue UndefV = LHS.getOperand(1);
17475 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17476 }
17477 return SDValue();
17478}
17479
17480static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
17481 SDLoc DL(N);
17482 SDValue Op0 = N->getOperand(0);
17483 SDValue Op1 = N->getOperand(1);
17484
17485 // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
17486 // uses of the intrinsics.
17487 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17488 int ShiftAmt = C->getSExtValue();
17489 if (ShiftAmt == 0) {
17490 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17491 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17492 return SDValue();
17493 }
17494
17495 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17496 unsigned NewOpcode =
17497 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17498 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17499 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17500 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17501 return NewShift;
17502 }
17503 }
17504
17505 return SDValue();
17506}
17507
17508/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
17509SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
17510 DAGCombinerInfo &DCI) const {
17511 SelectionDAG &DAG = DCI.DAG;
17512 unsigned IntNo = N->getConstantOperandVal(0);
17513 switch (IntNo) {
17514 default:
17515 // Don't do anything for most intrinsics.
17516 break;
17517
17518 // Vector shifts: check for immediate versions and lower them.
17519 // Note: This is done during DAG combining instead of DAG legalizing because
17520 // the build_vectors for 64-bit vector element shift counts are generally
17521 // not legal, and it is hard to see their values after they get legalized to
17522 // loads from a constant pool.
17523 case Intrinsic::arm_neon_vshifts:
17524 case Intrinsic::arm_neon_vshiftu:
17525 case Intrinsic::arm_neon_vrshifts:
17526 case Intrinsic::arm_neon_vrshiftu:
17527 case Intrinsic::arm_neon_vrshiftn:
17528 case Intrinsic::arm_neon_vqshifts:
17529 case Intrinsic::arm_neon_vqshiftu:
17530 case Intrinsic::arm_neon_vqshiftsu:
17531 case Intrinsic::arm_neon_vqshiftns:
17532 case Intrinsic::arm_neon_vqshiftnu:
17533 case Intrinsic::arm_neon_vqshiftnsu:
17534 case Intrinsic::arm_neon_vqrshiftns:
17535 case Intrinsic::arm_neon_vqrshiftnu:
17536 case Intrinsic::arm_neon_vqrshiftnsu: {
17537 EVT VT = N->getOperand(1).getValueType();
17538 int64_t Cnt;
17539 unsigned VShiftOpc = 0;
17540
17541 switch (IntNo) {
17542 case Intrinsic::arm_neon_vshifts:
17543 case Intrinsic::arm_neon_vshiftu:
17544 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17545 VShiftOpc = ARMISD::VSHLIMM;
17546 break;
17547 }
17548 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17549 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17550 : ARMISD::VSHRuIMM);
17551 break;
17552 }
17553 return SDValue();
17554
17555 case Intrinsic::arm_neon_vrshifts:
17556 case Intrinsic::arm_neon_vrshiftu:
17557 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17558 break;
17559 return SDValue();
17560
17561 case Intrinsic::arm_neon_vqshifts:
17562 case Intrinsic::arm_neon_vqshiftu:
17563 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17564 break;
17565 return SDValue();
17566
17567 case Intrinsic::arm_neon_vqshiftsu:
17568 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17569 break;
17570 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17571
17572 case Intrinsic::arm_neon_vrshiftn:
17573 case Intrinsic::arm_neon_vqshiftns:
17574 case Intrinsic::arm_neon_vqshiftnu:
17575 case Intrinsic::arm_neon_vqshiftnsu:
17576 case Intrinsic::arm_neon_vqrshiftns:
17577 case Intrinsic::arm_neon_vqrshiftnu:
17578 case Intrinsic::arm_neon_vqrshiftnsu:
17579 // Narrowing shifts require an immediate right shift.
17580 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17581 break;
17582 llvm_unreachable("invalid shift count for narrowing vector shift "
17583 "intrinsic");
17584
17585 default:
17586 llvm_unreachable("unhandled vector shift");
17587 }
17588
17589 switch (IntNo) {
17590 case Intrinsic::arm_neon_vshifts:
17591 case Intrinsic::arm_neon_vshiftu:
17592 // Opcode already set above.
17593 break;
17594 case Intrinsic::arm_neon_vrshifts:
17595 VShiftOpc = ARMISD::VRSHRsIMM;
17596 break;
17597 case Intrinsic::arm_neon_vrshiftu:
17598 VShiftOpc = ARMISD::VRSHRuIMM;
17599 break;
17600 case Intrinsic::arm_neon_vrshiftn:
17601 VShiftOpc = ARMISD::VRSHRNIMM;
17602 break;
17603 case Intrinsic::arm_neon_vqshifts:
17604 VShiftOpc = ARMISD::VQSHLsIMM;
17605 break;
17606 case Intrinsic::arm_neon_vqshiftu:
17607 VShiftOpc = ARMISD::VQSHLuIMM;
17608 break;
17609 case Intrinsic::arm_neon_vqshiftsu:
17610 VShiftOpc = ARMISD::VQSHLsuIMM;
17611 break;
17612 case Intrinsic::arm_neon_vqshiftns:
17613 VShiftOpc = ARMISD::VQSHRNsIMM;
17614 break;
17615 case Intrinsic::arm_neon_vqshiftnu:
17616 VShiftOpc = ARMISD::VQSHRNuIMM;
17617 break;
17618 case Intrinsic::arm_neon_vqshiftnsu:
17619 VShiftOpc = ARMISD::VQSHRNsuIMM;
17620 break;
17621 case Intrinsic::arm_neon_vqrshiftns:
17622 VShiftOpc = ARMISD::VQRSHRNsIMM;
17623 break;
17624 case Intrinsic::arm_neon_vqrshiftnu:
17625 VShiftOpc = ARMISD::VQRSHRNuIMM;
17626 break;
17627 case Intrinsic::arm_neon_vqrshiftnsu:
17628 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17629 break;
17630 }
17631
17632 SDLoc dl(N);
17633 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17634 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17635 }
17636
17637 case Intrinsic::arm_neon_vshiftins: {
17638 EVT VT = N->getOperand(1).getValueType();
17639 int64_t Cnt;
17640 unsigned VShiftOpc = 0;
17641
17642 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17643 VShiftOpc = ARMISD::VSLIIMM;
17644 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17645 VShiftOpc = ARMISD::VSRIIMM;
17646 else {
17647 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17648 }
17649
17650 SDLoc dl(N);
17651 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17652 N->getOperand(1), N->getOperand(2),
17653 DAG.getConstant(Cnt, dl, MVT::i32));
17654 }
17655
17656 case Intrinsic::arm_neon_vqrshifts:
17657 case Intrinsic::arm_neon_vqrshiftu:
17658 // No immediate versions of these to check for.
17659 break;
17660
17661 case Intrinsic::arm_mve_vqdmlah:
17662 case Intrinsic::arm_mve_vqdmlash:
17663 case Intrinsic::arm_mve_vqrdmlah:
17664 case Intrinsic::arm_mve_vqrdmlash:
17665 case Intrinsic::arm_mve_vmla_n_predicated:
17666 case Intrinsic::arm_mve_vmlas_n_predicated:
17667 case Intrinsic::arm_mve_vqdmlah_predicated:
17668 case Intrinsic::arm_mve_vqdmlash_predicated:
17669 case Intrinsic::arm_mve_vqrdmlah_predicated:
17670 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17671 // These intrinsics all take an i32 scalar operand which is narrowed to the
17672 // size of a single lane of the vector type they return. So we don't need
17673 // any bits of that operand above that point, which allows us to eliminate
17674 // uxth/sxth.
17675 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17676 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17677 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17678 return SDValue();
17679 break;
17680 }
17681
17682 case Intrinsic::arm_mve_minv:
17683 case Intrinsic::arm_mve_maxv:
17684 case Intrinsic::arm_mve_minav:
17685 case Intrinsic::arm_mve_maxav:
17686 case Intrinsic::arm_mve_minv_predicated:
17687 case Intrinsic::arm_mve_maxv_predicated:
17688 case Intrinsic::arm_mve_minav_predicated:
17689 case Intrinsic::arm_mve_maxav_predicated: {
17690 // These intrinsics all take an i32 scalar operand which is narrowed to the
17691 // size of a single lane of the vector type they take as the other input.
17692 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17693 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17694 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17695 return SDValue();
17696 break;
17697 }
17698
17699 case Intrinsic::arm_mve_addv: {
17700 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17701 // which allows PerformADDVecReduce to turn it into VADDLV when possible.
17702 bool Unsigned = N->getConstantOperandVal(2);
17703 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17704 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17705 }
17706
17707 case Intrinsic::arm_mve_addlv:
17708 case Intrinsic::arm_mve_addlv_predicated: {
17709 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17710 // which recombines the two outputs into an i64
17711 bool Unsigned = N->getConstantOperandVal(2);
17712 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17713 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
17714 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
17715
17716 SmallVector<SDValue, 4> Ops;
17717 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17718 if (i != 2) // skip the unsigned flag
17719 Ops.push_back(N->getOperand(i));
17720
17721 SDLoc dl(N);
17722 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17723 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17724 val.getValue(1));
17725 }
17726 }
17727
17728 return SDValue();
17729}
17730
17731/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17732/// lowers them. As with the vector shift intrinsics, this is done during DAG
17733/// combining instead of DAG legalizing because the build_vectors for 64-bit
17734/// vector element shift counts are generally not legal, and it is hard to see
17735/// their values after they get legalized to loads from a constant pool.
17736static SDValue PerformShiftCombine(SDNode *N,
17737 TargetLowering::DAGCombinerInfo &DCI,
17738 const ARMSubtarget *ST) {
17739 SelectionDAG &DAG = DCI.DAG;
17740 EVT VT = N->getValueType(0);
17741
17742 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17743 N->getOperand(0)->getOpcode() == ISD::AND &&
17744 N->getOperand(0)->hasOneUse()) {
17745 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17746 return SDValue();
17747 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17748 // usually show up because instcombine prefers to canonicalize it to
17749 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17750 // out of GEP lowering in some cases.
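// Worked example (illustrative values): with AndMask = 0x3ff and ShiftAmt = 2,
// MaskedBits = countl_zero(0x3ff) = 22, so the pair is rewritten as
// (srl (shl x, 22), 20), keeping only the ten masked bits shifted left by 2.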
17751 SDValue N0 = N->getOperand(0);
17752 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17753 if (!ShiftAmtNode)
17754 return SDValue();
17755 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17756 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17757 if (!AndMaskNode)
17758 return SDValue();
17759 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17760 // Don't transform uxtb/uxth.
17761 if (AndMask == 255 || AndMask == 65535)
17762 return SDValue();
17763 if (isMask_32(AndMask)) {
17764 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17765 if (MaskedBits > ShiftAmt) {
17766 SDLoc DL(N);
17767 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17768 DAG.getConstant(MaskedBits, DL, MVT::i32));
17769 return DAG.getNode(
17770 ISD::SRL, DL, MVT::i32, SHL,
17771 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17772 }
17773 }
17774 }
17775
17776 // Nothing to be done for scalar shifts.
17777 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17778 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17779 return SDValue();
17780 if (ST->hasMVEIntegerOps())
17781 return SDValue();
17782
17783 int64_t Cnt;
17784
17785 switch (N->getOpcode()) {
17786 default: llvm_unreachable("unexpected shift opcode");
17787
17788 case ISD::SHL:
17789 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17790 SDLoc dl(N);
17791 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17792 DAG.getConstant(Cnt, dl, MVT::i32));
17793 }
17794 break;
17795
17796 case ISD::SRA:
17797 case ISD::SRL:
17798 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17799 unsigned VShiftOpc =
17800 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17801 SDLoc dl(N);
17802 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17803 DAG.getConstant(Cnt, dl, MVT::i32));
17804 }
17805 }
17806 return SDValue();
17807}
17808
17809 // Look for a sign/zero/fp extend of a larger-than-legal load. This can be
17810// split into multiple extending loads, which are simpler to deal with than an
17811// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17812// to convert the type to an f32.
17814 SDValue N0 = N->getOperand(0);
17815 if (N0.getOpcode() != ISD::LOAD)
17816 return SDValue();
17817 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
17818 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17819 LD->getExtensionType() != ISD::NON_EXTLOAD)
17820 return SDValue();
17821 EVT FromVT = LD->getValueType(0);
17822 EVT ToVT = N->getValueType(0);
17823 if (!ToVT.isVector())
17824 return SDValue();
17826 EVT ToEltVT = ToVT.getVectorElementType();
17827 EVT FromEltVT = FromVT.getVectorElementType();
17828
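// Only i8->i32 and f16->f32 extends are split here, in chunks of 4 lanes; an
// integer extend whose source vector already fits in one chunk is left alone.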
17829 unsigned NumElements = 0;
17830 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17831 NumElements = 4;
17832 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17833 NumElements = 4;
17834 if (NumElements == 0 ||
17835 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17836 FromVT.getVectorNumElements() % NumElements != 0 ||
17837 !isPowerOf2_32(NumElements))
17838 return SDValue();
17839
17840 LLVMContext &C = *DAG.getContext();
17841 SDLoc DL(LD);
17842 // Details about the old load
17843 SDValue Ch = LD->getChain();
17844 SDValue BasePtr = LD->getBasePtr();
17845 Align Alignment = LD->getOriginalAlign();
17846 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17847 AAMDNodes AAInfo = LD->getAAInfo();
17848
17849 ISD::LoadExtType NewExtType =
17850 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17851 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17852 EVT NewFromVT = EVT::getVectorVT(
17853 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17854 EVT NewToVT = EVT::getVectorVT(
17855 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17856
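// Emit one extending load per chunk, each reading NumElements lanes from the
// original load at successive offsets.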
17859 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17860 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17861 SDValue NewPtr =
17862 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17863
17864 SDValue NewLoad =
17865 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17866 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17867 Alignment, MMOFlags, AAInfo);
17868 Loads.push_back(NewLoad);
17869 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17870 }
17871
17872 // Float truncs need to be extended with VCVTLs into their floating point types.
17873 if (FromEltVT == MVT::f16) {
17875
17876 for (unsigned i = 0; i < Loads.size(); i++) {
17877 SDValue LoadBC =
17878 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17879 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17880 DAG.getConstant(0, DL, MVT::i32));
17881 Extends.push_back(FPExt);
17882 }
17883
17884 Loads = Extends;
17885 }
17886
17887 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17888 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17889 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
17890}
17891
17892/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
17893/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
17895 const ARMSubtarget *ST) {
17896 SDValue N0 = N->getOperand(0);
17897
17898 // Check for sign- and zero-extensions of vector extract operations of 8- and
17899 // 16-bit vector elements. NEON and MVE support these directly. They are
17900 // handled during DAG combining because type legalization will promote them
17901 // to 32-bit types and it is messy to recognize the operations after that.
17902 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
17904 SDValue Vec = N0.getOperand(0);
17905 SDValue Lane = N0.getOperand(1);
17906 EVT VT = N->getValueType(0);
17907 EVT EltVT = N0.getValueType();
17908 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17909
17910 if (VT == MVT::i32 &&
17911 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
17912 TLI.isTypeLegal(Vec.getValueType()) &&
17913 isa<ConstantSDNode>(Lane)) {
17914
17915 unsigned Opc = 0;
17916 switch (N->getOpcode()) {
17917 default: llvm_unreachable("unexpected opcode");
17918 case ISD::SIGN_EXTEND:
17919 Opc = ARMISD::VGETLANEs;
17920 break;
17921 case ISD::ZERO_EXTEND:
17922 case ISD::ANY_EXTEND:
17923 Opc = ARMISD::VGETLANEu;
17924 break;
17925 }
17926 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
17927 }
17928 }
17929
17930 if (ST->hasMVEIntegerOps())
17931 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17932 return NewLoad;
17933
17934 return SDValue();
17935}
17936
17938 const ARMSubtarget *ST) {
17939 if (ST->hasMVEFloatOps())
17940 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17941 return NewLoad;
17942
17943 return SDValue();
17944}
17945
17946// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
17947// constant bounds.
17949 const ARMSubtarget *Subtarget) {
17950 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
17951 !Subtarget->isThumb2())
17952 return SDValue();
17953
17954 EVT VT = Op.getValueType();
17955 SDValue Op0 = Op.getOperand(0);
17956
17957 if (VT != MVT::i32 ||
17958 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
17959 !isa<ConstantSDNode>(Op.getOperand(1)) ||
17960 !isa<ConstantSDNode>(Op0.getOperand(1)))
17961 return SDValue();
17962
17963 SDValue Min = Op;
17964 SDValue Max = Op0;
17965 SDValue Input = Op0.getOperand(0);
17966 if (Min.getOpcode() == ISD::SMAX)
17967 std::swap(Min, Max);
17968
17969 APInt MinC = Min.getConstantOperandAPInt(1);
17970 APInt MaxC = Max.getConstantOperandAPInt(1);
17971
17972 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
17973 !(MinC + 1).isPowerOf2())
17974 return SDValue();
17975
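// Bounds of the form [-2^k, 2^k-1] (MinC == ~MaxC) become a signed saturate,
// and bounds of the form [0, 2^k-1] (MaxC == 0) become an unsigned saturate.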
17976 SDLoc DL(Op);
17977 if (MinC == ~MaxC)
17978 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
17979 DAG.getConstant(MinC.countr_one(), DL, VT));
17980 if (MaxC == 0)
17981 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
17982 DAG.getConstant(MinC.countr_one(), DL, VT));
17983
17984 return SDValue();
17985}
17986
17987/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
17988/// saturates.
17990 const ARMSubtarget *ST) {
17991 EVT VT = N->getValueType(0);
17992 SDValue N0 = N->getOperand(0);
17993
17994 if (VT == MVT::i32)
17995 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
17996
17997 if (!ST->hasMVEIntegerOps())
17998 return SDValue();
17999
18000 if (SDValue V = PerformVQDMULHCombine(N, DAG))
18001 return V;
18002
18003 if (VT != MVT::v4i32 && VT != MVT::v8i16)
18004 return SDValue();
18005
18006 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
18007 // Check one is a smin and the other is a smax
18008 if (Min->getOpcode() != ISD::SMIN)
18009 std::swap(Min, Max);
18010 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
18011 return false;
18012
18013 APInt SaturateC;
18014 if (VT == MVT::v4i32)
18015 SaturateC = APInt(32, (1 << 15) - 1, true);
18016 else //if (VT == MVT::v8i16)
18017 SaturateC = APInt(16, (1 << 7) - 1, true);
18018
18019 APInt MinC, MaxC;
18020 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18021 MinC != SaturateC)
18022 return false;
18023 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
18024 MaxC != ~SaturateC)
18025 return false;
18026 return true;
18027 };
18028
18029 if (IsSignedSaturate(N, N0.getNode())) {
18030 SDLoc DL(N);
18031 MVT ExtVT, HalfVT;
18032 if (VT == MVT::v4i32) {
18033 HalfVT = MVT::v8i16;
18034 ExtVT = MVT::v4i16;
18035 } else { // if (VT == MVT::v8i16)
18036 HalfVT = MVT::v16i8;
18037 ExtVT = MVT::v8i8;
18038 }
18039
18040 // Create a VQMOVNB with undef top lanes, then sign-extend into the top
18041 // half. That extend will hopefully be removed if only the bottom bits are
18042 // demanded (through a truncating store, for example).
18043 SDValue VQMOVN =
18044 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
18045 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
18046 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18047 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
18048 DAG.getValueType(ExtVT));
18049 }
18050
18051 auto IsUnsignedSaturate = [&](SDNode *Min) {
18052 // For unsigned, we just need to check for <= 0xffff
18053 if (Min->getOpcode() != ISD::UMIN)
18054 return false;
18055
18056 APInt SaturateC;
18057 if (VT == MVT::v4i32)
18058 SaturateC = APInt(32, (1 << 16) - 1, true);
18059 else //if (VT == MVT::v8i16)
18060 SaturateC = APInt(16, (1 << 8) - 1, true);
18061
18062 APInt MinC;
18063 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18064 MinC != SaturateC)
18065 return false;
18066 return true;
18067 };
18068
18069 if (IsUnsignedSaturate(N)) {
18070 SDLoc DL(N);
18071 MVT HalfVT;
18072 unsigned ExtConst;
18073 if (VT == MVT::v4i32) {
18074 HalfVT = MVT::v8i16;
18075 ExtConst = 0x0000FFFF;
18076 } else { //if (VT == MVT::v8i16)
18077 HalfVT = MVT::v16i8;
18078 ExtConst = 0x00FF;
18079 }
18080
18081 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
18082 // an AND. That extend will hopefully be removed if only the bottom bits are
18083 // demanded (through a truncating store, for example).
18084 SDValue VQMOVN =
18085 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
18086 DAG.getConstant(0, DL, MVT::i32));
18087 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18088 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
18089 DAG.getConstant(ExtConst, DL, VT));
18090 }
18091
18092 return SDValue();
18093}
18094
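// Helper: if V is a constant power of two, return a pointer to its APInt
// value; otherwise return nullptr.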
18096 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
18097 if (!C)
18098 return nullptr;
18099 const APInt *CV = &C->getAPIntValue();
18100 return CV->isPowerOf2() ? CV : nullptr;
18101}
18102
18104 // If we have a CMOV, OR and AND combination such as:
18105 // if (x & CN)
18106 // y |= CM;
18107 //
18108 // And:
18109 // * CN is a single bit;
18110 // * All bits covered by CM are known zero in y
18111 //
18112 // Then we can convert this into a sequence of BFI instructions. This will
18113 // always be a win if CM is a single bit, will always be no worse than the
18114 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
18115 // three bits (due to the extra IT instruction).
18116
18117 SDValue Op0 = CMOV->getOperand(0);
18118 SDValue Op1 = CMOV->getOperand(1);
18119 auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
18120 SDValue CmpZ = CMOV->getOperand(4);
18121
18122 // The compare must be against zero.
18123 if (!isNullConstant(CmpZ->getOperand(1)))
18124 return SDValue();
18125
18126 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
18127 SDValue And = CmpZ->getOperand(0);
18128 if (And->getOpcode() != ISD::AND)
18129 return SDValue();
18130 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
18131 if (!AndC)
18132 return SDValue();
18133 SDValue X = And->getOperand(0);
18134
18135 if (CC == ARMCC::EQ) {
18136 // We're performing an "equal to zero" compare. Swap the operands so we
18137 // canonicalize on a "not equal to zero" compare.
18138 std::swap(Op0, Op1);
18139 } else {
18140 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
18141 }
18142
18143 if (Op1->getOpcode() != ISD::OR)
18144 return SDValue();
18145
18146 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
18147 if (!OrC)
18148 return SDValue();
18149 SDValue Y = Op1->getOperand(0);
18150
18151 if (Op0 != Y)
18152 return SDValue();
18153
18154 // Now, is it profitable to continue?
18155 APInt OrCI = OrC->getAPIntValue();
18156 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
18157 if (OrCI.popcount() > Heuristic)
18158 return SDValue();
18159
18160 // Lastly, can we determine that the bits defined by OrCI
18161 // are zero in Y?
18162 KnownBits Known = DAG.computeKnownBits(Y);
18163 if ((OrCI & Known.Zero) != OrCI)
18164 return SDValue();
18165
18166 // OK, we can do the combine.
18167 SDValue V = Y;
18168 SDLoc dl(X);
18169 EVT VT = X.getValueType();
18170 unsigned BitInX = AndC->logBase2();
18171
18172 if (BitInX != 0) {
18173 // We must shift X first.
18174 X = DAG.getNode(ISD::SRL, dl, VT, X,
18175 DAG.getConstant(BitInX, dl, VT));
18176 }
18177
18178 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
18179 BitInY < NumActiveBits; ++BitInY) {
18180 if (OrCI[BitInY] == 0)
18181 continue;
18182 APInt Mask(VT.getSizeInBits(), 0);
18183 Mask.setBit(BitInY);
18184 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
18185 // Confusingly, the operand is an *inverted* mask.
18186 DAG.getConstant(~Mask, dl, VT));
18187 }
18188
18189 return V;
18190}
18191
18192// Given N, the value controlling the conditional branch, search for the loop
18193// intrinsic, returning it, along with how the value is used. We need to handle
18194// patterns such as the following:
18195// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
18196// (brcond (setcc (loop.decrement), 0, eq), exit)
18197// (brcond (setcc (loop.decrement), 0, ne), header)
18199 bool &Negate) {
18200 switch (N->getOpcode()) {
18201 default:
18202 break;
18203 case ISD::XOR: {
18204 if (!isa<ConstantSDNode>(N.getOperand(1)))
18205 return SDValue();
18206 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
18207 return SDValue();
18208 Negate = !Negate;
18209 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
18210 }
18211 case ISD::SETCC: {
18212 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
18213 if (!Const)
18214 return SDValue();
18215 if (Const->isZero())
18216 Imm = 0;
18217 else if (Const->isOne())
18218 Imm = 1;
18219 else
18220 return SDValue();
18221 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
18222 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
18223 }
18225 unsigned IntOp = N.getConstantOperandVal(1);
18226 if (IntOp != Intrinsic::test_start_loop_iterations &&
18227 IntOp != Intrinsic::loop_decrement_reg)
18228 return SDValue();
18229 return N;
18230 }
18231 }
18232 return SDValue();
18233}
18234
18237 const ARMSubtarget *ST) {
18238
18239 // The hwloop intrinsics that we're interested in are used for control flow,
18240 // either for entering or exiting the loop:
18241 // - test.start.loop.iterations will test whether its operand is zero. If it
18242 // is zero, the following branch should not enter the loop.
18243 // - loop.decrement.reg also tests whether its operand is zero. If it is
18244 // zero, the following branch should not branch back to the beginning of
18245 // the loop.
18246 // So here, we need to check how the brcond is using the result of each
18247 // of the intrinsics to ensure that we're branching to the right place at the
18248 // right time.
18249
18251 SDValue Cond;
18252 int Imm = 1;
18253 bool Negate = false;
18254 SDValue Chain = N->getOperand(0);
18255 SDValue Dest;
18256
18257 if (N->getOpcode() == ISD::BRCOND) {
18258 CC = ISD::SETEQ;
18259 Cond = N->getOperand(1);
18260 Dest = N->getOperand(2);
18261 } else {
18262 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18263 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
18264 Cond = N->getOperand(2);
18265 Dest = N->getOperand(4);
18266 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
18267 if (!Const->isOne() && !Const->isZero())
18268 return SDValue();
18269 Imm = Const->getZExtValue();
18270 } else
18271 return SDValue();
18272 }
18273
18274 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
18275 if (!Int)
18276 return SDValue();
18277
18278 if (Negate)
18279 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18280
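// From the condition code and the constant the intrinsic result is compared
// against, work out whether the branch is taken when the counter is zero.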
18281 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18282 return (CC == ISD::SETEQ && Imm == 0) ||
18283 (CC == ISD::SETNE && Imm == 1) ||
18284 (CC == ISD::SETLT && Imm == 1) ||
18285 (CC == ISD::SETULT && Imm == 1);
18286 };
18287
18288 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18289 return (CC == ISD::SETEQ && Imm == 1) ||
18290 (CC == ISD::SETNE && Imm == 0) ||
18291 (CC == ISD::SETGT && Imm == 0) ||
18292 (CC == ISD::SETUGT && Imm == 0) ||
18293 (CC == ISD::SETGE && Imm == 1) ||
18294 (CC == ISD::SETUGE && Imm == 1);
18295 };
18296
18297 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18298 "unsupported condition");
18299
18300 SDLoc dl(Int);
18301 SelectionDAG &DAG = DCI.DAG;
18302 SDValue Elements = Int.getOperand(2);
18303 unsigned IntOp = Int->getConstantOperandVal(1);
18304 assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
18305 && "expected single br user");
18306 SDNode *Br = *N->use_begin();
18307 SDValue OtherTarget = Br->getOperand(1);
18308
18309 // Update the unconditional branch to branch to the given Dest.
18310 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18311 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
18312 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18313 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
18314 };
18315
18316 if (IntOp == Intrinsic::test_start_loop_iterations) {
18317 SDValue Res;
18318 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18319 // We expect this 'instruction' to branch when the counter is zero.
18320 if (IsTrueIfZero(CC, Imm)) {
18321 SDValue Ops[] = {Chain, Setup, Dest};
18322 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18323 } else {
18324 // The logic is the reverse of what we need for WLS, so find the other
18325 // basic block target: the target of the following br.
18326 UpdateUncondBr(Br, Dest, DAG);
18327
18328 SDValue Ops[] = {Chain, Setup, OtherTarget};
18329 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18330 }
18331 // Update LR count to the new value
18332 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
18333 // Update chain
18334 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
18335 return Res;
18336 } else {
18337 SDValue Size =
18338 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18339 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
18340 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18341 DAG.getVTList(MVT::i32, MVT::Other), Args);
18342 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
18343
18344 // We expect this instruction to branch when the count is not zero.
18345 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18346
18347 // Update the unconditional branch to target the loop preheader if we've
18348 // found the condition has been reversed.
18349 if (Target == OtherTarget)
18350 UpdateUncondBr(Br, Dest, DAG);
18351
18352 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18353 SDValue(LoopDec.getNode(), 1), Chain);
18354
18355 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18356 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18357 }
18358 return SDValue();
18359}
18360
18361/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
18362SDValue
18364 SDValue Cmp = N->getOperand(4);
18365 if (Cmp.getOpcode() != ARMISD::CMPZ)
18366 // Only looking at NE cases.
18367 return SDValue();
18368
18369 EVT VT = N->getValueType(0);
18370 SDLoc dl(N);
18371 SDValue LHS = Cmp.getOperand(0);
18372 SDValue RHS = Cmp.getOperand(1);
18373 SDValue Chain = N->getOperand(0);
18374 SDValue BB = N->getOperand(1);
18375 SDValue ARMcc = N->getOperand(2);
18377
18378 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
18379 // -> (brcond Chain BB CC CPSR Cmp)
18380 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18381 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
18382 LHS->getOperand(0)->hasOneUse() &&
18383 isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
18384 isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
18385 isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
18386 return DAG.getNode(
18387 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
18388 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
18389 }
18390
18391 return SDValue();
18392}
18393
18394/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
18395SDValue
18397 SDValue Cmp = N->getOperand(4);
18398 if (Cmp.getOpcode() != ARMISD::CMPZ)
18399 // Only looking at EQ and NE cases.
18400 return SDValue();
18401
18402 EVT VT = N->getValueType(0);
18403 SDLoc dl(N);
18404 SDValue LHS = Cmp.getOperand(0);
18405 SDValue RHS = Cmp.getOperand(1);
18406 SDValue FalseVal = N->getOperand(0);
18407 SDValue TrueVal = N->getOperand(1);
18408 SDValue ARMcc = N->getOperand(2);
18410
18411 // BFI is only available on V6T2+.
18412 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
18414 if (R)
18415 return R;
18416 }
18417
18418 // Simplify
18419 // mov r1, r0
18420 // cmp r1, x
18421 // mov r0, y
18422 // moveq r0, x
18423 // to
18424 // cmp r0, x
18425 // movne r0, y
18426 //
18427 // mov r1, r0
18428 // cmp r1, x
18429 // mov r0, x
18430 // movne r0, y
18431 // to
18432 // cmp r0, x
18433 // movne r0, y
18434 /// FIXME: Turn this into a target neutral optimization?
18435 SDValue Res;
18436 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18437 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
18438 N->getOperand(3), Cmp);
18439 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
18440 SDValue ARMcc;
18441 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
18442 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
18443 N->getOperand(3), NewCmp);
18444 }
18445
18446 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
18447 // -> (cmov F T CC CPSR Cmp)
18448 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18449 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18451 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
18452 LHS->getOperand(2), LHS->getOperand(3),
18453 LHS->getOperand(4));
18454 }
18455
18456 if (!VT.isInteger())
18457 return SDValue();
18458
18459 // Fold away an unnecessary CMPZ/CMOV
18460 // CMOV A, B, C1, $cpsr, (CMPZ (CMOV 1, 0, C2, D), 0) ->
18461 // if C1==EQ -> CMOV A, B, C2, $cpsr, D
18462 // if C1==NE -> CMOV A, B, NOT(C2), $cpsr, D
18463 if (N->getConstantOperandVal(2) == ARMCC::EQ ||
18464 N->getConstantOperandVal(2) == ARMCC::NE) {
18466 if (SDValue C = IsCMPZCSINC(N->getOperand(4).getNode(), Cond)) {
18467 if (N->getConstantOperandVal(2) == ARMCC::NE)
18469 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18470 N->getOperand(1),
18471 DAG.getTargetConstant(Cond, SDLoc(N), MVT::i32),
18472 N->getOperand(3), C);
18473 }
18474 }
18475
18476 // Materialize a boolean comparison for integers so we can avoid branching.
18477 if (isNullConstant(FalseVal)) {
18478 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
18479 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
18480 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
18481 // right 5 bits will make that 32 be 1, otherwise it will be 0.
18482 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
18483 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18484 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18485 DAG.getConstant(5, dl, MVT::i32));
18486 } else {
18487 // CMOV 0, 1, ==, (CMPZ x, y) ->
18488 // (UADDO_CARRY (SUB x, y), t:0, t:1)
18489 // where t = (USUBO_CARRY 0, (SUB x, y), 0)
18490 //
18491 // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
18492 // x != y. In other words, a carry C == 1 when x == y, C == 0
18493 // otherwise.
18494 // The final UADDO_CARRY computes
18495 // x - y + (0 - (x - y)) + C == C
18496 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18497 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18498 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
18499 // ISD::USUBO_CARRY returns a borrow, but we actually want the carry
18500 // here.
18501 SDValue Carry =
18502 DAG.getNode(ISD::SUB, dl, MVT::i32,
18503 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18504 Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
18505 }
18506 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
18507 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
18508 // This seems pointless but will allow us to combine it further below.
18509 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18510 SDValue Sub =
18511 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18512 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
18513 Sub.getValue(1), SDValue());
18514 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
18515 N->getOperand(3), CPSRGlue.getValue(1));
18516 FalseVal = Sub;
18517 }
18518 } else if (isNullConstant(TrueVal)) {
18519 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
18520 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
18521 // This seems pointless but will allow us to combine it further below.
18522 // Note that we change == to != as this is the dual of the case above.
18523 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18524 SDValue Sub =
18525 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18526 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
18527 Sub.getValue(1), SDValue());
18528 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18529 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18530 N->getOperand(3), CPSRGlue.getValue(1));
18531 FalseVal = Sub;
18532 }
18533 }
18534
18535 // On Thumb1, the DAG above may be further combined if z is a power of 2
18536 // (z == 2 ^ K).
18537 // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
18538 // t1 = (USUBO (SUB x, y), 1)
18539 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18540 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18541 //
18542 // This also handles the special case of comparing against zero; it's
18543 // essentially the same pattern, except there's no SUBC:
18544 // CMOV x, z, !=, (CMPZ x, 0) ->
18545 // t1 = (USUBO x, 1)
18546 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18547 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18548 const APInt *TrueConst;
18549 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18550 ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
18551 FalseVal.getOperand(1) == RHS) ||
18552 (FalseVal == LHS && isNullConstant(RHS))) &&
18553 (TrueConst = isPowerOf2Constant(TrueVal))) {
18554 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18555 unsigned ShiftAmount = TrueConst->logBase2();
18556 if (ShiftAmount)
18557 TrueVal = DAG.getConstant(1, dl, VT);
18558 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
18559 Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
18560 Subc.getValue(1));
18561
18562 if (ShiftAmount)
18563 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18564 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18565 }
18566
18567 if (Res.getNode()) {
18568 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
18569 // Capture demanded bits information that would be otherwise lost.
18570 if (Known.Zero == 0xfffffffe)
18571 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18572 DAG.getValueType(MVT::i1));
18573 else if (Known.Zero == 0xffffff00)
18574 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18575 DAG.getValueType(MVT::i8));
18576 else if (Known.Zero == 0xffff0000)
18577 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18578 DAG.getValueType(MVT::i16));
18579 }
18580
18581 return Res;
18582}
18583
18586 const ARMSubtarget *ST) {
18587 SelectionDAG &DAG = DCI.DAG;
18588 SDValue Src = N->getOperand(0);
18589 EVT DstVT = N->getValueType(0);
18590
18591 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18592 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18593 EVT SrcVT = Src.getValueType();
18594 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18595 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
18596 }
18597
18598 // We may have a bitcast of something that has already had this bitcast
18599 // combine performed on it, so skip past any VECTOR_REG_CASTs.
18600 while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)
18601 Src = Src.getOperand(0);
18602
18603 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18604 // would be generated is at least the width of the element type.
18605 EVT SrcVT = Src.getValueType();
18606 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18607 Src.getOpcode() == ARMISD::VMVNIMM ||
18608 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18609 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18610 DAG.getDataLayout().isBigEndian())
18611 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
18612
18613 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18614 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18615 return R;
18616
18617 return SDValue();
18618}
18619
18620// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
18621// node into stack operations after legalizeOps.
18624 SelectionDAG &DAG = DCI.DAG;
18625 EVT VT = N->getValueType(0);
18626 SDLoc DL(N);
18627
18628 // MVETrunc(Undef, Undef) -> Undef
18629 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
18630 return DAG.getUNDEF(VT);
18631
18632 // MVETrunc(MVETrunc(a, b), MVETrunc(c, d)) -> MVETrunc(a, b, c, d)
18633 if (N->getNumOperands() == 2 &&
18634 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
18635 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
18636 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
18637 N->getOperand(0).getOperand(1),
18638 N->getOperand(1).getOperand(0),
18639 N->getOperand(1).getOperand(1));
18640
18641 // MVETrunc(shuffle, shuffle) -> VMOVN
18642 if (N->getNumOperands() == 2 &&
18643 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18644 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18645 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
18646 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
18647
18648 if (S0->getOperand(0) == S1->getOperand(0) &&
18649 S0->getOperand(1) == S1->getOperand(1)) {
18650 // Construct complete shuffle mask
18651 SmallVector<int, 8> Mask(S0->getMask());
18652 Mask.append(S1->getMask().begin(), S1->getMask().end());
18653
18654 if (isVMOVNTruncMask(Mask, VT, false))
18655 return DAG.getNode(
18656 ARMISD::VMOVN, DL, VT,
18657 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18658 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18659 DAG.getConstant(1, DL, MVT::i32));
18660 if (isVMOVNTruncMask(Mask, VT, true))
18661 return DAG.getNode(
18662 ARMISD::VMOVN, DL, VT,
18663 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18664 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18665 DAG.getConstant(1, DL, MVT::i32));
18666 }
18667 }
18668
18669 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18670 // truncate to a buildvector to allow the generic optimisations to kick in.
18671 if (all_of(N->ops(), [](SDValue Op) {
18672 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18673 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18674 (Op.getOpcode() == ISD::BITCAST &&
18675 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
18676 })) {
18677 SmallVector<SDValue, 8> Extracts;
18678 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18679 SDValue O = N->getOperand(Op);
18680 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18681 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18682 DAG.getConstant(i, DL, MVT::i32));
18683 Extracts.push_back(Ext);
18684 }
18685 }
18686 return DAG.getBuildVector(VT, DL, Extracts);
18687 }
18688
18689 // If we are late in the legalization process and nothing has optimised
18690 // the trunc to anything better, lower it to a stack store and reload,
18691 // performing the truncation whilst keeping the lanes in the correct order:
18692 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
18693 if (!DCI.isAfterLegalizeDAG())
18694 return SDValue();
18695
18696 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18697 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18698 int NumIns = N->getNumOperands();
18699 assert((NumIns == 2 || NumIns == 4) &&
18700 "Expected 2 or 4 inputs to an MVETrunc");
18701 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18702 if (N->getNumOperands() == 4)
18703 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
18704
18705 SmallVector<SDValue> Chains;
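// Truncating-store each input into its own slice of the 16-byte stack slot,
// so the reload below picks the lanes up in the final (truncated) order.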
18706 for (int I = 0; I < NumIns; I++) {
18707 SDValue Ptr = DAG.getNode(
18708 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18709 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
18711 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
18712 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
18713 Ptr, MPI, StoreVT, Align(4));
18714 Chains.push_back(Ch);
18715 }
18716
18717 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18718 MachinePointerInfo MPI =
18720 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
18721}
18722
18723 // Take an MVEEXT(load x) and split that into (extload x, extload x+8)
18725 SelectionDAG &DAG) {
18726 SDValue N0 = N->getOperand(0);
18727 LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());
18728 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18729 return SDValue();
18730
18731 EVT FromVT = LD->getMemoryVT();
18732 EVT ToVT = N->getValueType(0);
18733 if (!ToVT.isVector())
18734 return SDValue();
18735 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18736 EVT ToEltVT = ToVT.getVectorElementType();
18737 EVT FromEltVT = FromVT.getVectorElementType();
18738
18739 unsigned NumElements = 0;
18740 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18741 NumElements = 4;
18742 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18743 NumElements = 8;
18744 assert(NumElements != 0);
18745
18746 ISD::LoadExtType NewExtType =
18747 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18748 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18749 LD->getExtensionType() != ISD::EXTLOAD &&
18750 LD->getExtensionType() != NewExtType)
18751 return SDValue();
18752
18753 LLVMContext &C = *DAG.getContext();
18754 SDLoc DL(LD);
18755 // Details about the old load
18756 SDValue Ch = LD->getChain();
18757 SDValue BasePtr = LD->getBasePtr();
18758 Align Alignment = LD->getOriginalAlign();
18759 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18760 AAMDNodes AAInfo = LD->getAAInfo();
18761
18762 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
18763 EVT NewFromVT = EVT::getVectorVT(
18764 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
18765 EVT NewToVT = EVT::getVectorVT(
18766 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
18767
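// Split the wide load into NumElements-lane sign/zero-extending loads at
// successive offsets.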
18770 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
18771 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18772 SDValue NewPtr =
18773 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
18774
18775 SDValue NewLoad =
18776 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
18777 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
18778 Alignment, MMOFlags, AAInfo);
18779 Loads.push_back(NewLoad);
18780 Chains.push_back(SDValue(NewLoad.getNode(), 1));
18781 }
18782
18783 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18784 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18785 return DAG.getMergeValues(Loads, DL);
18786}
18787
18788 // Perform combines for MVEEXT. If it has not been optimized to anything better
18789// before lowering, it gets converted to stack store and extloads performing the
18790// extend whilst still keeping the same lane ordering.
18793 SelectionDAG &DAG = DCI.DAG;
18794 EVT VT = N->getValueType(0);
18795 SDLoc DL(N);
18796 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18797 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18798
18799 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18800 *DAG.getContext());
18801 auto Extend = [&](SDValue V) {
18802 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
18803 return N->getOpcode() == ARMISD::MVESEXT
18804 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
18805 DAG.getValueType(ExtVT))
18806 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
18807 };
18808
18809 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
18810 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
18811 SDValue Ext = Extend(N->getOperand(0));
18812 return DAG.getMergeValues({Ext, Ext}, DL);
18813 }
18814
18815 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18816 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
18817 ArrayRef<int> Mask = SVN->getMask();
18818 assert(Mask.size() == 2 * VT.getVectorNumElements());
18819 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18820 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18821 SDValue Op0 = SVN->getOperand(0);
18822 SDValue Op1 = SVN->getOperand(1);
18823
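// CheckInregMask returns true if the mask entries starting at Start select
// lanes Offset, Offset+2, Offset+4, ... of the concatenated shuffle inputs
// (undef entries are allowed).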
18824 auto CheckInregMask = [&](int Start, int Offset) {
18825 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18826 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18827 return false;
18828 return true;
18829 };
18830 SDValue V0 = SDValue(N, 0);
18831 SDValue V1 = SDValue(N, 1);
18832 if (CheckInregMask(0, 0))
18833 V0 = Extend(Op0);
18834 else if (CheckInregMask(0, 1))
18835 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18836 else if (CheckInregMask(0, Mask.size()))
18837 V0 = Extend(Op1);
18838 else if (CheckInregMask(0, Mask.size() + 1))
18839 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18840
18841 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
18842 V1 = Extend(Op1);
18843 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
18844 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18845 else if (CheckInregMask(VT.getVectorNumElements(), 0))
18846 V1 = Extend(Op0);
18847 else if (CheckInregMask(VT.getVectorNumElements(), 1))
18848 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18849
18850 if (V0.getNode() != N || V1.getNode() != N)
18851 return DAG.getMergeValues({V0, V1}, DL);
18852 }
18853
18854 // MVEEXT(load) -> extload, extload
18855 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
18857 return L;
18858
18859 if (!DCI.isAfterLegalizeDAG())
18860 return SDValue();
18861
18862 // Lower to a stack store and reload:
18863 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
18864 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18865 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18866 int NumOuts = N->getNumValues();
18867 assert((NumOuts == 2 || NumOuts == 4) &&
18868 "Expected 2 or 4 outputs to an MVEEXT");
18869 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18870 *DAG.getContext());
18871 if (N->getNumOperands() == 4)
18872 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
18873
18874 MachinePointerInfo MPI =
18876 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
18877 StackPtr, MPI, Align(4));
18878
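// Reload the stored vector as NumOuts extending loads, one per 16/NumOuts
// bytes of the stack slot, each producing a full-width result vector.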
18880 for (int I = 0; I < NumOuts; I++) {
18881 SDValue Ptr = DAG.getNode(
18882 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18883 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
18885 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
18886 SDValue Load = DAG.getExtLoad(
18887 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
18888 VT, Chain, Ptr, MPI, LoadVT, Align(4));
18889 Loads.push_back(Load);
18890 }
18891
18892 return DAG.getMergeValues(Loads, DL);
18893}
18894
18896 DAGCombinerInfo &DCI) const {
18897 switch (N->getOpcode()) {
18898 default: break;
18899 case ISD::SELECT_CC:
18900 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
18901 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
18902 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
18903 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
18904 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
18905 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
18906 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
18907 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
18908 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
18909 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
18910 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
18911 case ISD::BRCOND:
18912 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
18913 case ARMISD::ADDC:
18914 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
18915 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
18916 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
18917 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
18918 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
18919 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
18920 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
18921 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
18922 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
18925 return PerformExtractEltCombine(N, DCI, Subtarget);
18929 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
18930 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
18931 case ISD::FP_TO_SINT:
18932 case ISD::FP_TO_UINT:
18933 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
18934 case ISD::FADD:
18935 return PerformFADDCombine(N, DCI.DAG, Subtarget);
18936 case ISD::FMUL:
18937 return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
18939 return PerformIntrinsicCombine(N, DCI);
18940 case ISD::SHL:
18941 case ISD::SRA:
18942 case ISD::SRL:
18943 return PerformShiftCombine(N, DCI, Subtarget);
18944 case ISD::SIGN_EXTEND:
18945 case ISD::ZERO_EXTEND:
18946 case ISD::ANY_EXTEND:
18947 return PerformExtendCombine(N, DCI.DAG, Subtarget);
18948 case ISD::FP_EXTEND:
18949 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
18950 case ISD::SMIN:
18951 case ISD::UMIN:
18952 case ISD::SMAX:
18953 case ISD::UMAX:
18954 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
18955 case ARMISD::CMOV:
18956 return PerformCMOVCombine(N, DCI.DAG);
18957 case ARMISD::BRCOND:
18958 return PerformBRCONDCombine(N, DCI.DAG);
18959 case ARMISD::CMPZ:
18960 return PerformCMPZCombine(N, DCI.DAG);
18961 case ARMISD::CSINC:
18962 case ARMISD::CSINV:
18963 case ARMISD::CSNEG:
18964 return PerformCSETCombine(N, DCI.DAG);
18965 case ISD::LOAD:
18966 return PerformLOADCombine(N, DCI, Subtarget);
18967 case ARMISD::VLD1DUP:
18968 case ARMISD::VLD2DUP:
18969 case ARMISD::VLD3DUP:
18970 case ARMISD::VLD4DUP:
18971 return PerformVLDCombine(N, DCI);
18973 return PerformARMBUILD_VECTORCombine(N, DCI);
18974 case ISD::BITCAST:
18975 return PerformBITCASTCombine(N, DCI, Subtarget);
18977 return PerformPREDICATE_CASTCombine(N, DCI);
18979 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
18980 case ARMISD::MVETRUNC:
18981 return PerformMVETruncCombine(N, DCI);
18982 case ARMISD::MVESEXT:
18983 case ARMISD::MVEZEXT:
18984 return PerformMVEExtCombine(N, DCI);
18985 case ARMISD::VCMP:
18986 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
18987 case ISD::VECREDUCE_ADD:
18988 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
18989 case ARMISD::VADDVs:
18990 case ARMISD::VADDVu:
18991 case ARMISD::VADDLVs:
18992 case ARMISD::VADDLVu:
18993 case ARMISD::VADDLVAs:
18994 case ARMISD::VADDLVAu:
18995 case ARMISD::VMLAVs:
18996 case ARMISD::VMLAVu:
18997 case ARMISD::VMLALVs:
18998 case ARMISD::VMLALVu:
18999 case ARMISD::VMLALVAs:
19000 case ARMISD::VMLALVAu:
19001 return PerformReduceShuffleCombine(N, DCI.DAG);
19002 case ARMISD::VMOVN:
19003 return PerformVMOVNCombine(N, DCI);
19004 case ARMISD::VQMOVNs:
19005 case ARMISD::VQMOVNu:
19006 return PerformVQMOVNCombine(N, DCI);
19007 case ARMISD::VQDMULH:
19008 return PerformVQDMULHCombine(N, DCI);
19009 case ARMISD::ASRL:
19010 case ARMISD::LSRL:
19011 case ARMISD::LSLL:
19012 return PerformLongShiftCombine(N, DCI.DAG);
19013 case ARMISD::SMULWB: {
19014 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19015 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19016 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19017 return SDValue();
19018 break;
19019 }
19020 case ARMISD::SMULWT: {
19021 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19022 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19023 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19024 return SDValue();
19025 break;
19026 }
19027 case ARMISD::SMLALBB:
19028 case ARMISD::QADD16b:
19029 case ARMISD::QSUB16b:
19030 case ARMISD::UQADD16b:
19031 case ARMISD::UQSUB16b: {
19032 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19033 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19034 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19035 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19036 return SDValue();
19037 break;
19038 }
19039 case ARMISD::SMLALBT: {
19040 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
19041 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19042 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
19043 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19044 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
19045 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
19046 return SDValue();
19047 break;
19048 }
19049 case ARMISD::SMLALTB: {
19050 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
19051 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19052 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
19053 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19054 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
19055 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
19056 return SDValue();
19057 break;
19058 }
19059 case ARMISD::SMLALTT: {
19060 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19061 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19062 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19063 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19064 return SDValue();
19065 break;
19066 }
19067 case ARMISD::QADD8b:
19068 case ARMISD::QSUB8b:
19069 case ARMISD::UQADD8b:
19070 case ARMISD::UQSUB8b: {
19071 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19072 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
19073 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19074 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19075 return SDValue();
19076 break;
19077 }
19080 switch (N->getConstantOperandVal(1)) {
19081 case Intrinsic::arm_neon_vld1:
19082 case Intrinsic::arm_neon_vld1x2:
19083 case Intrinsic::arm_neon_vld1x3:
19084 case Intrinsic::arm_neon_vld1x4:
19085 case Intrinsic::arm_neon_vld2:
19086 case Intrinsic::arm_neon_vld3:
19087 case Intrinsic::arm_neon_vld4:
19088 case Intrinsic::arm_neon_vld2lane:
19089 case Intrinsic::arm_neon_vld3lane:
19090 case Intrinsic::arm_neon_vld4lane:
19091 case Intrinsic::arm_neon_vld2dup:
19092 case Intrinsic::arm_neon_vld3dup:
19093 case Intrinsic::arm_neon_vld4dup:
19094 case Intrinsic::arm_neon_vst1:
19095 case Intrinsic::arm_neon_vst1x2:
19096 case Intrinsic::arm_neon_vst1x3:
19097 case Intrinsic::arm_neon_vst1x4:
19098 case Intrinsic::arm_neon_vst2:
19099 case Intrinsic::arm_neon_vst3:
19100 case Intrinsic::arm_neon_vst4:
19101 case Intrinsic::arm_neon_vst2lane:
19102 case Intrinsic::arm_neon_vst3lane:
19103 case Intrinsic::arm_neon_vst4lane:
19104 return PerformVLDCombine(N, DCI);
19105 case Intrinsic::arm_mve_vld2q:
19106 case Intrinsic::arm_mve_vld4q:
19107 case Intrinsic::arm_mve_vst2q:
19108 case Intrinsic::arm_mve_vst4q:
19109 return PerformMVEVLDCombine(N, DCI);
19110 default: break;
19111 }
19112 break;
19113 }
19114 return SDValue();
19115}
19116
19118 EVT VT) const {
19119 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
19120}
19121
19123 Align Alignment,
19125 unsigned *Fast) const {
19126 // Depends on what it gets converted into if the type is weird.
19127 if (!VT.isSimple())
19128 return false;
19129
19130 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
19131 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
19132 auto Ty = VT.getSimpleVT().SimpleTy;
19133
19134 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
19135 // Unaligned access can use (for example) LDRB, LDRH, LDR
19136 if (AllowsUnaligned) {
19137 if (Fast)
19138 *Fast = Subtarget->hasV7Ops();
19139 return true;
19140 }
19141 }
19142
19143 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
19144 // For any little-endian targets with NEON, we can support unaligned ld/st
19145 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
19146 // A big-endian target may also explicitly support unaligned accesses.
19147 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
19148 if (Fast)
19149 *Fast = 1;
19150 return true;
19151 }
19152 }
19153
19154 if (!Subtarget->hasMVEIntegerOps())
19155 return false;
19156
19157 // These are for predicates
19158 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
19159 Ty == MVT::v2i1)) {
19160 if (Fast)
19161 *Fast = 1;
19162 return true;
19163 }
19164
19165 // These are for truncated stores/narrowing loads. They are fine so long as
19166 // the alignment is at least the size of the item being loaded
19167 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
19168 Alignment >= VT.getScalarSizeInBits() / 8) {
19169 if (Fast)
19170 *Fast = true;
19171 return true;
19172 }
19173
19174 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
19175 // VSTRW.U32 all store the vector register in exactly the same format, and
19176 // differ only in the range of their immediate offset field and the required
19177 // alignment. So there is always a store that can be used, regardless of
19178 // actual type.
19179 //
19180 // For big endian, that is not the case. But we can still emit a (VSTRB.U8;
19181 // VREV64.8) pair and get the same effect. This will likely be better than
19182 // aligning the vector through the stack.
19183 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
19184 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
19185 Ty == MVT::v2f64) {
19186 if (Fast)
19187 *Fast = 1;
19188 return true;
19189 }
19190
19191 return false;
19192}
19193
19194
19196 const MemOp &Op, const AttributeList &FuncAttributes) const {
19197 // See if we can use NEON instructions for this...
19198 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
19199 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
19200 unsigned Fast;
19201 if (Op.size() >= 16 &&
19202 (Op.isAligned(Align(16)) ||
19203 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
19205 Fast))) {
19206 return MVT::v2f64;
19207 } else if (Op.size() >= 8 &&
19208 (Op.isAligned(Align(8)) ||
19210 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
19211 Fast))) {
19212 return MVT::f64;
19213 }
19214 }
19215
19216 // Let the target-independent logic figure it out.
19217 return MVT::Other;
19218}
19219
19220// 64-bit integers are split into their high and low parts and held in two
19221// different registers, so the trunc is free since the low register can just
19222// be used.
19223bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19224 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19225 return false;
19226 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19227 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19228 return (SrcBits == 64 && DestBits == 32);
19229}
19230
19232 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19233 !DstVT.isInteger())
19234 return false;
19235 unsigned SrcBits = SrcVT.getSizeInBits();
19236 unsigned DestBits = DstVT.getSizeInBits();
19237 return (SrcBits == 64 && DestBits == 32);
19238}
19239
19241 if (Val.getOpcode() != ISD::LOAD)
19242 return false;
19243
19244 EVT VT1 = Val.getValueType();
19245 if (!VT1.isSimple() || !VT1.isInteger() ||
19246 !VT2.isSimple() || !VT2.isInteger())
19247 return false;
19248
19249 switch (VT1.getSimpleVT().SimpleTy) {
19250 default: break;
19251 case MVT::i1:
19252 case MVT::i8:
19253 case MVT::i16:
19254 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19255 return true;
19256 }
19257
19258 return false;
19259}
19260
19262 if (!VT.isSimple())
19263 return false;
19264
19265 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19266 // negate values directly (fneg is free). So, we don't want to let the DAG
19267 // combiner rewrite fneg into xors and some other instructions. For f16 and
19268 // FullFP16 argument passing, some bitcast nodes may be introduced,
19269 // triggering this DAG combine rewrite, so we are avoiding that with this.
19270 switch (VT.getSimpleVT().SimpleTy) {
19271 default: break;
19272 case MVT::f16:
19273 return Subtarget->hasFullFP16();
19274 }
19275
19276 return false;
19277}
19278
19279/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
19280/// of the vector elements.
19281static bool areExtractExts(Value *Ext1, Value *Ext2) {
19282 auto areExtDoubled = [](Instruction *Ext) {
19283 return Ext->getType()->getScalarSizeInBits() ==
19284 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
19285 };
19286
19287 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
19288 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
19289 !areExtDoubled(cast<Instruction>(Ext1)) ||
19290 !areExtDoubled(cast<Instruction>(Ext2)))
19291 return false;
19292
19293 return true;
19294}
19295
19296/// Check if sinking \p I's operands to I's basic block is profitable, because
19297/// the operands can be folded into a target instruction, e.g.
19298/// sext/zext can be folded into vsubl.
19300 SmallVectorImpl<Use *> &Ops) const {
19301 if (!I->getType()->isVectorTy())
19302 return false;
19303
19304 if (Subtarget->hasNEON()) {
19305 switch (I->getOpcode()) {
19306 case Instruction::Sub:
19307 case Instruction::Add: {
19308 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
19309 return false;
19310 Ops.push_back(&I->getOperandUse(0));
19311 Ops.push_back(&I->getOperandUse(1));
19312 return true;
19313 }
19314 default:
19315 return false;
19316 }
19317 }
19318
19319 if (!Subtarget->hasMVEIntegerOps())
19320 return false;
19321
19322 auto IsFMSMul = [&](Instruction *I) {
19323 if (!I->hasOneUse())
19324 return false;
19325 auto *Sub = cast<Instruction>(*I->users().begin());
19326 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
19327 };
19328 auto IsFMS = [&](Instruction *I) {
19329 if (match(I->getOperand(0), m_FNeg(m_Value())) ||
19330 match(I->getOperand(1), m_FNeg(m_Value())))
19331 return true;
19332 return false;
19333 };
19334
19335 auto IsSinker = [&](Instruction *I, int Operand) {
19336 switch (I->getOpcode()) {
19337 case Instruction::Add:
19338 case Instruction::Mul:
19339 case Instruction::FAdd:
19340 case Instruction::ICmp:
19341 case Instruction::FCmp:
19342 return true;
19343 case Instruction::FMul:
19344 return !IsFMSMul(I);
19345 case Instruction::Sub:
19346 case Instruction::FSub:
19347 case Instruction::Shl:
19348 case Instruction::LShr:
19349 case Instruction::AShr:
19350 return Operand == 1;
19351 case Instruction::Call:
19352 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
19353 switch (II->getIntrinsicID()) {
19354 case Intrinsic::fma:
19355 return !IsFMS(I);
19356 case Intrinsic::sadd_sat:
19357 case Intrinsic::uadd_sat:
19358 case Intrinsic::arm_mve_add_predicated:
19359 case Intrinsic::arm_mve_mul_predicated:
19360 case Intrinsic::arm_mve_qadd_predicated:
19361 case Intrinsic::arm_mve_vhadd:
19362 case Intrinsic::arm_mve_hadd_predicated:
19363 case Intrinsic::arm_mve_vqdmull:
19364 case Intrinsic::arm_mve_vqdmull_predicated:
19365 case Intrinsic::arm_mve_vqdmulh:
19366 case Intrinsic::arm_mve_qdmulh_predicated:
19367 case Intrinsic::arm_mve_vqrdmulh:
19368 case Intrinsic::arm_mve_qrdmulh_predicated:
19369 case Intrinsic::arm_mve_fma_predicated:
19370 return true;
19371 case Intrinsic::ssub_sat:
19372 case Intrinsic::usub_sat:
19373 case Intrinsic::arm_mve_sub_predicated:
19374 case Intrinsic::arm_mve_qsub_predicated:
19375 case Intrinsic::arm_mve_hsub_predicated:
19376 case Intrinsic::arm_mve_vhsub:
19377 return Operand == 1;
19378 default:
19379 return false;
19380 }
19381 }
19382 return false;
19383 default:
19384 return false;
19385 }
19386 };
19387
19388 for (auto OpIdx : enumerate(I->operands())) {
19389 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
19390 // Make sure we are not already sinking this operand
19391 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
19392 continue;
19393
19394 Instruction *Shuffle = Op;
19395 if (Shuffle->getOpcode() == Instruction::BitCast)
19396 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
19397 // We are looking for a splat that can be sunk.
19398 if (!Shuffle ||
19399 !match(Shuffle, m_Shuffle(
19400 m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
19401 m_Undef(), m_ZeroMask())))
19402 continue;
19403 if (!IsSinker(I, OpIdx.index()))
19404 continue;
19405
19406 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
19407 // and vector registers
19408 for (Use &U : Op->uses()) {
19409 Instruction *Insn = cast<Instruction>(U.getUser());
19410 if (!IsSinker(Insn, U.getOperandNo()))
19411 return false;
19412 }
19413
19414 Ops.push_back(&Shuffle->getOperandUse(0));
19415 if (Shuffle != Op)
19416 Ops.push_back(&Op->getOperandUse(0));
19417 Ops.push_back(&OpIdx.value());
19418 }
19419 return true;
19420}
19421
19422 Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
19423 if (!Subtarget->hasMVEIntegerOps())
19424 return nullptr;
19425 Type *SVIType = SVI->getType();
19426 Type *ScalarType = SVIType->getScalarType();
19427
19428 if (ScalarType->isFloatTy())
19429 return Type::getInt32Ty(SVIType->getContext());
19430 if (ScalarType->isHalfTy())
19431 return Type::getInt16Ty(SVIType->getContext());
19432 return nullptr;
19433}
19434
19435 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
19436 EVT VT = ExtVal.getValueType();
19437
19438 if (!isTypeLegal(VT))
19439 return false;
19440
19441 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19442 if (Ld->isExpandingLoad())
19443 return false;
19444 }
19445
19446 if (Subtarget->hasMVEIntegerOps())
19447 return true;
19448
19449 // Don't create a loadext if we can fold the extension into a wide/long
19450 // instruction.
19451 // If there's more than one user instruction, the loadext is desirable no
19452 // matter what. There can be two uses by the same instruction.
19453 if (ExtVal->use_empty() ||
19454 !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
19455 return true;
19456
19457 SDNode *U = *ExtVal->use_begin();
19458 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19459 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19460 return false;
19461
19462 return true;
19463}
19464
19465 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
19466 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19467 return false;
19468
19469 if (!isTypeLegal(EVT::getEVT(Ty1)))
19470 return false;
19471
19472 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19473
19474 // Assuming the caller doesn't have a zeroext or signext return parameter,
19475 // truncation all the way down to i1 is valid.
19476 return true;
19477}
19478
19479/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19480/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19481/// expanded to FMAs when this method returns true, otherwise fmuladd is
19482/// expanded to fmul + fadd.
19483///
19484/// ARM supports both fused and unfused multiply-add operations; we already
19485/// lower a pair of fmul and fadd to the latter so it's not clear that there
19486/// would be a gain or that the gain would be worthwhile enough to risk
19487/// correctness bugs.
19488///
19489/// For MVE, we set this to true as it helps simplify the need for some
19490/// patterns (and we don't have the non-fused floating point instruction).
19491bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19492 EVT VT) const {
19493 if (!VT.isSimple())
19494 return false;
19495
19496 switch (VT.getSimpleVT().SimpleTy) {
19497 case MVT::v4f32:
19498 case MVT::v8f16:
19499 return Subtarget->hasMVEFloatOps();
19500 case MVT::f16:
19501 return Subtarget->useFPVFMx16();
19502 case MVT::f32:
19503 return Subtarget->useFPVFMx();
19504 case MVT::f64:
19505 return Subtarget->useFPVFMx64();
19506 default:
19507 break;
19508 }
19509
19510 return false;
19511}
19512
19513static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19514 if (V < 0)
19515 return false;
19516
19517 unsigned Scale = 1;
19518 switch (VT.getSimpleVT().SimpleTy) {
19519 case MVT::i1:
19520 case MVT::i8:
19521 // Scale == 1;
19522 break;
19523 case MVT::i16:
19524 // Scale == 2;
19525 Scale = 2;
19526 break;
19527 default:
19528 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19529 // Scale == 4;
19530 Scale = 4;
19531 break;
19532 }
19533
19534 if ((V & (Scale - 1)) != 0)
19535 return false;
19536 return isUInt<5>(V / Scale);
19537}
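
// Illustrative sketch (not part of the upstream file; the helper name below
// is invented): the Thumb1 rule above amounts to "the offset must be a
// multiple of the access size and, after dividing by it, fit in an unsigned
// 5-bit field".
static bool exampleIsLegalT1Offset(long long V, unsigned AccessBytes) {
  if (V < 0)
    return false;                 // Thumb1 offsets are unsigned.
  if (V % AccessBytes != 0)
    return false;                 // Offset must be scale-aligned.
  return (V / AccessBytes) < 32;  // imm5 field holds 0..31 units.
}
// For example, exampleIsLegalT1Offset(124, 4) holds (LDR with #124), while
// exampleIsLegalT1Offset(128, 4) does not.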
19538
19539static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19540 const ARMSubtarget *Subtarget) {
19541 if (!VT.isInteger() && !VT.isFloatingPoint())
19542 return false;
19543 if (VT.isVector() && Subtarget->hasNEON())
19544 return false;
19545 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19546 !Subtarget->hasMVEFloatOps())
19547 return false;
19548
19549 bool IsNeg = false;
19550 if (V < 0) {
19551 IsNeg = true;
19552 V = -V;
19553 }
19554
19555 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19556
19557 // MVE: size * imm7
19558 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19559 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19560 case MVT::i32:
19561 case MVT::f32:
19562 return isShiftedUInt<7,2>(V);
19563 case MVT::i16:
19564 case MVT::f16:
19565 return isShiftedUInt<7,1>(V);
19566 case MVT::i8:
19567 return isUInt<7>(V);
19568 default:
19569 return false;
19570 }
19571 }
19572
19573 // half VLDR: 2 * imm8
19574 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19575 return isShiftedUInt<8, 1>(V);
19576 // VLDR and LDRD: 4 * imm8
19577 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19578 return isShiftedUInt<8, 2>(V);
19579
19580 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19581 // + imm12 or - imm8
19582 if (IsNeg)
19583 return isUInt<8>(V);
19584 return isUInt<12>(V);
19585 }
19586
19587 return false;
19588}
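
// Illustrative sketch (not part of the upstream file; helper name invented):
// for MVE vector accesses the check above is "offset = element size * imm7",
// i.e. a 7-bit element count in either direction, which is what the
// isShiftedUInt<7, N> calls express.
static bool exampleIsLegalMVEOffset(long long V, unsigned ElemBytes) {
  if (V < 0)
    V = -V;                        // Sign is handled separately above.
  if (V % ElemBytes != 0)
    return false;                  // Must be a whole number of elements.
  return (V / ElemBytes) < 128;    // imm7 field holds 0..127 elements.
}
// e.g. exampleIsLegalMVEOffset(508, 4) holds (127 * 4 bytes), 512 does not.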
19589
19590/// isLegalAddressImmediate - Return true if the integer value can be used
19591/// as the offset of the target addressing mode for load / store of the
19592/// given type.
19593static bool isLegalAddressImmediate(int64_t V, EVT VT,
19594 const ARMSubtarget *Subtarget) {
19595 if (V == 0)
19596 return true;
19597
19598 if (!VT.isSimple())
19599 return false;
19600
19601 if (Subtarget->isThumb1Only())
19602 return isLegalT1AddressImmediate(V, VT);
19603 else if (Subtarget->isThumb2())
19604 return isLegalT2AddressImmediate(V, VT, Subtarget);
19605
19606 // ARM mode.
19607 if (V < 0)
19608 V = - V;
19609 switch (VT.getSimpleVT().SimpleTy) {
19610 default: return false;
19611 case MVT::i1:
19612 case MVT::i8:
19613 case MVT::i32:
19614 // +- imm12
19615 return isUInt<12>(V);
19616 case MVT::i16:
19617 // +- imm8
19618 return isUInt<8>(V);
19619 case MVT::f32:
19620 case MVT::f64:
19621 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19622 return false;
19623 return isShiftedUInt<8, 2>(V);
19624 }
19625}
19626
19627 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
19628 EVT VT) const {
19629 int Scale = AM.Scale;
19630 if (Scale < 0)
19631 return false;
19632
19633 switch (VT.getSimpleVT().SimpleTy) {
19634 default: return false;
19635 case MVT::i1:
19636 case MVT::i8:
19637 case MVT::i16:
19638 case MVT::i32:
19639 if (Scale == 1)
19640 return true;
19641 // r + r << imm
19642 Scale = Scale & ~1;
19643 return Scale == 2 || Scale == 4 || Scale == 8;
19644 case MVT::i64:
19645 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19646 // version in Thumb mode.
19647 // r + r
19648 if (Scale == 1)
19649 return true;
19650 // r * 2 (this can be lowered to r + r).
19651 if (!AM.HasBaseReg && Scale == 2)
19652 return true;
19653 return false;
19654 case MVT::isVoid:
19655 // Note, we allow "void" uses (basically, uses that aren't loads or
19656 // stores), because arm allows folding a scale into many arithmetic
19657 // operations. This should be made more precise and revisited later.
19658
19659 // Allow r << imm, but the imm has to be a multiple of two.
19660 if (Scale & 1) return false;
19661 return isPowerOf2_32(Scale);
19662 }
19663}
19664
19665 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
19666 EVT VT) const {
19667 const int Scale = AM.Scale;
19668
19669 // Negative scales are not supported in Thumb1.
19670 if (Scale < 0)
19671 return false;
19672
19673 // Thumb1 addressing modes do not support register scaling excepting the
19674 // following cases:
19675 // 1. Scale == 1 means no scaling.
19676 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19677 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19678}
19679
19680/// isLegalAddressingMode - Return true if the addressing mode represented
19681/// by AM is legal for this target, for a load/store of the specified type.
19682 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
19683 const AddrMode &AM, Type *Ty,
19684 unsigned AS, Instruction *I) const {
19685 EVT VT = getValueType(DL, Ty, true);
19686 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19687 return false;
19688
19689 // Can never fold addr of global into load/store.
19690 if (AM.BaseGV)
19691 return false;
19692
19693 switch (AM.Scale) {
19694 case 0: // no scale reg, must be "r+i" or "r", or "i".
19695 break;
19696 default:
19697 // ARM doesn't support any R+R*scale+imm addr modes.
19698 if (AM.BaseOffs)
19699 return false;
19700
19701 if (!VT.isSimple())
19702 return false;
19703
19704 if (Subtarget->isThumb1Only())
19705 return isLegalT1ScaledAddressingMode(AM, VT);
19706
19707 if (Subtarget->isThumb2())
19708 return isLegalT2ScaledAddressingMode(AM, VT);
19709
19710 int Scale = AM.Scale;
19711 switch (VT.getSimpleVT().SimpleTy) {
19712 default: return false;
19713 case MVT::i1:
19714 case MVT::i8:
19715 case MVT::i32:
19716 if (Scale < 0) Scale = -Scale;
19717 if (Scale == 1)
19718 return true;
19719 // r + r << imm
19720 return isPowerOf2_32(Scale & ~1);
19721 case MVT::i16:
19722 case MVT::i64:
19723 // r +/- r
19724 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19725 return true;
19726 // r * 2 (this can be lowered to r + r).
19727 if (!AM.HasBaseReg && Scale == 2)
19728 return true;
19729 return false;
19730
19731 case MVT::isVoid:
19732 // Note, we allow "void" uses (basically, uses that aren't loads or
19733 // stores), because arm allows folding a scale into many arithmetic
19734 // operations. This should be made more precise and revisited later.
19735
19736 // Allow r << imm, but the imm has to be a multiple of two.
19737 if (Scale & 1) return false;
19738 return isPowerOf2_32(Scale);
19739 }
19740 }
19741 return true;
19742}
19743
19744/// isLegalICmpImmediate - Return true if the specified immediate is legal
19745/// icmp immediate, that is the target has icmp instructions which can compare
19746/// a register against the immediate without having to materialize the
19747/// immediate into a register.
19748 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
19749 // Thumb2 and ARM modes can use cmn for negative immediates.
19750 if (!Subtarget->isThumb())
19751 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19752 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19753 if (Subtarget->isThumb2())
19754 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19755 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19756 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19757 return Imm >= 0 && Imm <= 255;
19758}
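
// Illustrative sketch (not part of the upstream file; helper name invented):
// getSOImmVal succeeds when the value is an 8-bit constant rotated right by
// an even amount, the classic A32 modified-immediate encoding. Written out
// directly from that definition:
static bool exampleIsA32ModifiedImm(unsigned Imm) {
  for (unsigned Rot = 0; Rot < 32; Rot += 2) {
    // Rotating left by Rot undoes a right-rotation by Rot; Rot == 0 must be
    // special-cased to avoid an undefined 32-bit shift.
    unsigned Rotated = Rot ? ((Imm << Rot) | (Imm >> (32 - Rot))) : Imm;
    if (Rotated <= 0xFF)
      return true;
  }
  return false;
}
// With this, the non-Thumb case above is roughly
// exampleIsA32ModifiedImm(Imm) || exampleIsA32ModifiedImm(0u - Imm), the
// second half reflecting that CMN can compare against the negated value.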
19759
19760/// isLegalAddImmediate - Return true if the specified immediate is a legal add
19761/// *or sub* immediate, that is the target has add or sub instructions which can
19762/// add a register with the immediate without having to materialize the
19763/// immediate into a register.
19764 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
19765 // Same encoding for add/sub, just flip the sign.
19766 int64_t AbsImm = std::abs(Imm);
19767 if (!Subtarget->isThumb())
19768 return ARM_AM::getSOImmVal(AbsImm) != -1;
19769 if (Subtarget->isThumb2())
19770 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19771 // Thumb1 only has 8-bit unsigned immediate.
19772 return AbsImm >= 0 && AbsImm <= 255;
19773}
19774
19775// Return false to prevent folding
19776// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19777// if the folding leads to worse code.
19778 bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode,
19779 SDValue ConstNode) const {
19780 // Let the DAGCombiner decide for vector types and large types.
19781 const EVT VT = AddNode.getValueType();
19782 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19783 return true;
19784
19785 // It is worse if c0 is legal add immediate, while c1*c0 is not
19786 // and has to be composed by at least two instructions.
19787 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19788 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19789 const int64_t C0 = C0Node->getSExtValue();
19790 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
19791 if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue()))
19792 return true;
19793 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19794 return false;
19795
19796 // Default to true and let the DAGCombiner decide.
19797 return true;
19798}
19799
19800 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
19801 bool isSEXTLoad, SDValue &Base,
19802 SDValue &Offset, bool &isInc,
19803 SelectionDAG &DAG) {
19804 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19805 return false;
19806
19807 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19808 // AddressingMode 3
19809 Base = Ptr->getOperand(0);
19810 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19811 int RHSC = (int)RHS->getZExtValue();
19812 if (RHSC < 0 && RHSC > -256) {
19813 assert(Ptr->getOpcode() == ISD::ADD);
19814 isInc = false;
19815 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19816 return true;
19817 }
19818 }
19819 isInc = (Ptr->getOpcode() == ISD::ADD);
19820 Offset = Ptr->getOperand(1);
19821 return true;
19822 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19823 // AddressingMode 2
19824 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19825 int RHSC = (int)RHS->getZExtValue();
19826 if (RHSC < 0 && RHSC > -0x1000) {
19827 assert(Ptr->getOpcode() == ISD::ADD);
19828 isInc = false;
19829 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19830 Base = Ptr->getOperand(0);
19831 return true;
19832 }
19833 }
19834
19835 if (Ptr->getOpcode() == ISD::ADD) {
19836 isInc = true;
19837 ARM_AM::ShiftOpc ShOpcVal=
19838 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
19839 if (ShOpcVal != ARM_AM::no_shift) {
19840 Base = Ptr->getOperand(1);
19841 Offset = Ptr->getOperand(0);
19842 } else {
19843 Base = Ptr->getOperand(0);
19844 Offset = Ptr->getOperand(1);
19845 }
19846 return true;
19847 }
19848
19849 isInc = (Ptr->getOpcode() == ISD::ADD);
19850 Base = Ptr->getOperand(0);
19851 Offset = Ptr->getOperand(1);
19852 return true;
19853 }
19854
19855 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19856 return false;
19857}
19858
19859 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
19860 bool isSEXTLoad, SDValue &Base,
19861 SDValue &Offset, bool &isInc,
19862 SelectionDAG &DAG) {
19863 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19864 return false;
19865
19866 Base = Ptr->getOperand(0);
19867 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19868 int RHSC = (int)RHS->getZExtValue();
19869 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19870 assert(Ptr->getOpcode() == ISD::ADD);
19871 isInc = false;
19872 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19873 return true;
19874 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19875 isInc = Ptr->getOpcode() == ISD::ADD;
19876 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19877 return true;
19878 }
19879 }
19880
19881 return false;
19882}
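
// Illustrative note (not part of the upstream file; helper name invented):
// the two range checks above accept any non-zero constant whose magnitude
// fits in 8 bits, the Thumb2 pre/post-indexed immediate. Stated on its own:
static bool exampleIsT2IndexedOffset(int C) {
  return C != 0 && C > -0x100 && C < 0x100; // -255..255, zero excluded
}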
19883
19884static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19885 bool isSEXTLoad, bool IsMasked, bool isLE,
19886 SDValue &Base, SDValue &Offset,
19887 bool &isInc, SelectionDAG &DAG) {
19888 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19889 return false;
19890 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19891 return false;
19892
19893 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19894 // as opposed to a vldrw.32). This can allow extra addressing modes or
19895 // alignments for what is otherwise an equivalent instruction.
19896 bool CanChangeType = isLE && !IsMasked;
19897
19898 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
19899 int RHSC = (int)RHS->getZExtValue();
19900
19901 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19902 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19903 assert(Ptr->getOpcode() == ISD::ADD);
19904 isInc = false;
19905 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19906 return true;
19907 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19908 isInc = Ptr->getOpcode() == ISD::ADD;
19909 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19910 return true;
19911 }
19912 return false;
19913 };
19914
19915 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19916 // (in BE/masked) type.
19917 Base = Ptr->getOperand(0);
19918 if (VT == MVT::v4i16) {
19919 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19920 return true;
19921 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19922 if (IsInRange(RHSC, 0x80, 1))
19923 return true;
19924 } else if (Alignment >= 4 &&
19925 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19926 IsInRange(RHSC, 0x80, 4))
19927 return true;
19928 else if (Alignment >= 2 &&
19929 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19930 IsInRange(RHSC, 0x80, 2))
19931 return true;
19932 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19933 return true;
19934 return false;
19935}
19936
19937/// getPreIndexedAddressParts - returns true by value, base pointer and
19938/// offset pointer and addressing mode by reference if the node's address
19939/// can be legally represented as pre-indexed load / store address.
19940 bool
19941 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
19942 SDValue &Offset,
19943 ISD::MemIndexedMode &AM,
19944 SelectionDAG &DAG) const {
19945 if (Subtarget->isThumb1Only())
19946 return false;
19947
19948 EVT VT;
19949 SDValue Ptr;
19950 Align Alignment;
19951 bool isSEXTLoad = false;
19952 bool IsMasked = false;
19953 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19954 Ptr = LD->getBasePtr();
19955 VT = LD->getMemoryVT();
19956 Alignment = LD->getAlign();
19957 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19958 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19959 Ptr = ST->getBasePtr();
19960 VT = ST->getMemoryVT();
19961 Alignment = ST->getAlign();
19962 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19963 Ptr = LD->getBasePtr();
19964 VT = LD->getMemoryVT();
19965 Alignment = LD->getAlign();
19966 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19967 IsMasked = true;
19968 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19969 Ptr = ST->getBasePtr();
19970 VT = ST->getMemoryVT();
19971 Alignment = ST->getAlign();
19972 IsMasked = true;
19973 } else
19974 return false;
19975
19976 bool isInc;
19977 bool isLegal = false;
19978 if (VT.isVector())
19979 isLegal = Subtarget->hasMVEIntegerOps() &&
19980 getMVEIndexedAddressParts(
19981 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19982 Subtarget->isLittle(), Base, Offset, isInc, DAG);
19983 else {
19984 if (Subtarget->isThumb2())
19985 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19986 Offset, isInc, DAG);
19987 else
19988 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19989 Offset, isInc, DAG);
19990 }
19991 if (!isLegal)
19992 return false;
19993
19994 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
19995 return true;
19996}
19997
19998/// getPostIndexedAddressParts - returns true by value, base pointer and
19999/// offset pointer and addressing mode by reference if this node can be
20000/// combined with a load / store to form a post-indexed load / store.
20001 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
20002 SDValue &Base,
20003 SDValue &Offset,
20004 ISD::MemIndexedMode &AM,
20005 SelectionDAG &DAG) const {
20006 EVT VT;
20007 SDValue Ptr;
20008 Align Alignment;
20009 bool isSEXTLoad = false, isNonExt;
20010 bool IsMasked = false;
20011 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
20012 VT = LD->getMemoryVT();
20013 Ptr = LD->getBasePtr();
20014 Alignment = LD->getAlign();
20015 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
20016 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
20017 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
20018 VT = ST->getMemoryVT();
20019 Ptr = ST->getBasePtr();
20020 Alignment = ST->getAlign();
20021 isNonExt = !ST->isTruncatingStore();
20022 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
20023 VT = LD->getMemoryVT();
20024 Ptr = LD->getBasePtr();
20025 Alignment = LD->getAlign();
20026 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
20027 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
20028 IsMasked = true;
20029 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
20030 VT = ST->getMemoryVT();
20031 Ptr = ST->getBasePtr();
20032 Alignment = ST->getAlign();
20033 isNonExt = !ST->isTruncatingStore();
20034 IsMasked = true;
20035 } else
20036 return false;
20037
20038 if (Subtarget->isThumb1Only()) {
20039 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
20040 // must be non-extending/truncating, i32, with an offset of 4.
20041 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
20042 if (Op->getOpcode() != ISD::ADD || !isNonExt)
20043 return false;
20044 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
20045 if (!RHS || RHS->getZExtValue() != 4)
20046 return false;
20047 if (Alignment < Align(4))
20048 return false;
20049
20050 Offset = Op->getOperand(1);
20051 Base = Op->getOperand(0);
20052 AM = ISD::POST_INC;
20053 return true;
20054 }
20055
20056 bool isInc;
20057 bool isLegal = false;
20058 if (VT.isVector())
20059 isLegal = Subtarget->hasMVEIntegerOps() &&
20060 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
20061 Subtarget->isLittle(), Base, Offset,
20062 isInc, DAG);
20063 else {
20064 if (Subtarget->isThumb2())
20065 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
20066 isInc, DAG);
20067 else
20068 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
20069 isInc, DAG);
20070 }
20071 if (!isLegal)
20072 return false;
20073
20074 if (Ptr != Base) {
20075 // Swap base ptr and offset to catch more post-index load / store when
20076 // it's legal. In Thumb2 mode, offset must be an immediate.
20077 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
20078 !Subtarget->isThumb2())
20079 std::swap(Base, Offset);
20080
20081 // Post-indexed load / store update the base pointer.
20082 if (Ptr != Base)
20083 return false;
20084 }
20085
20086 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
20087 return true;
20088}
20089
20090 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
20091 KnownBits &Known,
20092 const APInt &DemandedElts,
20093 const SelectionDAG &DAG,
20094 unsigned Depth) const {
20095 unsigned BitWidth = Known.getBitWidth();
20096 Known.resetAll();
20097 switch (Op.getOpcode()) {
20098 default: break;
20099 case ARMISD::ADDC:
20100 case ARMISD::ADDE:
20101 case ARMISD::SUBC:
20102 case ARMISD::SUBE:
20103 // Special cases when we convert a carry to a boolean.
20104 if (Op.getResNo() == 0) {
20105 SDValue LHS = Op.getOperand(0);
20106 SDValue RHS = Op.getOperand(1);
20107 // (ADDE 0, 0, C) will give us a single bit.
20108 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
20109 isNullConstant(RHS)) {
20110 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
20111 return;
20112 }
20113 }
20114 break;
20115 case ARMISD::CMOV: {
20116 // Bits are known zero/one if known on the LHS and RHS.
20117 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
20118 if (Known.isUnknown())
20119 return;
20120
20121 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
20122 Known = Known.intersectWith(KnownRHS);
20123 return;
20124 }
20125 case ISD::INTRINSIC_W_CHAIN: {
20126 Intrinsic::ID IntID =
20127 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
20128 switch (IntID) {
20129 default: return;
20130 case Intrinsic::arm_ldaex:
20131 case Intrinsic::arm_ldrex: {
20132 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
20133 unsigned MemBits = VT.getScalarSizeInBits();
20134 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
20135 return;
20136 }
20137 }
20138 }
20139 case ARMISD::BFI: {
20140 // Conservatively, we can recurse down the first operand
20141 // and just mask out all affected bits.
20142 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20143
20144 // The operand to BFI is already a mask suitable for removing the bits it
20145 // sets.
20146 const APInt &Mask = Op.getConstantOperandAPInt(2);
20147 Known.Zero &= Mask;
20148 Known.One &= Mask;
20149 return;
20150 }
20151 case ARMISD::VGETLANEs:
20152 case ARMISD::VGETLANEu: {
20153 const SDValue &SrcSV = Op.getOperand(0);
20154 EVT VecVT = SrcSV.getValueType();
20155 assert(VecVT.isVector() && "VGETLANE expected a vector type");
20156 const unsigned NumSrcElts = VecVT.getVectorNumElements();
20157 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
20158 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
20159 "VGETLANE index out of bounds");
20160 unsigned Idx = Pos->getZExtValue();
20161 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
20162 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
20163
20164 EVT VT = Op.getValueType();
20165 const unsigned DstSz = VT.getScalarSizeInBits();
20166 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
20167 (void)SrcSz;
20168 assert(SrcSz == Known.getBitWidth());
20169 assert(DstSz > SrcSz);
20170 if (Op.getOpcode() == ARMISD::VGETLANEs)
20171 Known = Known.sext(DstSz);
20172 else {
20173 Known = Known.zext(DstSz);
20174 }
20175 assert(DstSz == Known.getBitWidth());
20176 break;
20177 }
20178 case ARMISD::VMOVrh: {
20179 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20180 assert(KnownOp.getBitWidth() == 16);
20181 Known = KnownOp.zext(32);
20182 break;
20183 }
20184 case ARMISD::CSINC:
20185 case ARMISD::CSINV:
20186 case ARMISD::CSNEG: {
20187 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20188 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
20189
20190 // The result is either:
20191 // CSINC: KnownOp0 or KnownOp1 + 1
20192 // CSINV: KnownOp0 or ~KnownOp1
20193 // CSNEG: KnownOp0 or KnownOp1 * -1
20194 if (Op.getOpcode() == ARMISD::CSINC)
20195 KnownOp1 = KnownBits::computeForAddSub(
20196 /*Add=*/true, /*NSW=*/false, /*NUW=*/false, KnownOp1,
20197 KnownBits::makeConstant(APInt(32, 1)));
20198 else if (Op.getOpcode() == ARMISD::CSINV)
20199 std::swap(KnownOp1.Zero, KnownOp1.One);
20200 else if (Op.getOpcode() == ARMISD::CSNEG)
20201 KnownOp1 = KnownBits::mul(
20202 KnownOp1, KnownBits::makeConstant(APInt(32, -1)));
20203
20204 Known = KnownOp0.intersectWith(KnownOp1);
20205 break;
20206 }
20207 }
20208}
20209
20210 bool ARMTargetLowering::targetShrinkDemandedConstant(
20211 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
20212 TargetLoweringOpt &TLO) const {
20213 // Delay optimization, so we don't have to deal with illegal types, or block
20214 // optimizations.
20215 if (!TLO.LegalOps)
20216 return false;
20217
20218 // Only optimize AND for now.
20219 if (Op.getOpcode() != ISD::AND)
20220 return false;
20221
20222 EVT VT = Op.getValueType();
20223
20224 // Ignore vectors.
20225 if (VT.isVector())
20226 return false;
20227
20228 assert(VT == MVT::i32 && "Unexpected integer type");
20229
20230 // Make sure the RHS really is a constant.
20231 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20232 if (!C)
20233 return false;
20234
20235 unsigned Mask = C->getZExtValue();
20236
20237 unsigned Demanded = DemandedBits.getZExtValue();
20238 unsigned ShrunkMask = Mask & Demanded;
20239 unsigned ExpandedMask = Mask | ~Demanded;
20240
20241 // If the mask is all zeros, let the target-independent code replace the
20242 // result with zero.
20243 if (ShrunkMask == 0)
20244 return false;
20245
20246 // If the mask is all ones, erase the AND. (Currently, the target-independent
20247 // code won't do this, so we have to do it explicitly to avoid an infinite
20248 // loop in obscure cases.)
20249 if (ExpandedMask == ~0U)
20250 return TLO.CombineTo(Op, Op.getOperand(0));
20251
20252 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
20253 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
20254 };
20255 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
20256 if (NewMask == Mask)
20257 return true;
20258 SDLoc DL(Op);
20259 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
20260 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
20261 return TLO.CombineTo(Op, NewOp);
20262 };
20263
20264 // Prefer uxtb mask.
20265 if (IsLegalMask(0xFF))
20266 return UseMask(0xFF);
20267
20268 // Prefer uxth mask.
20269 if (IsLegalMask(0xFFFF))
20270 return UseMask(0xFFFF);
20271
20272 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
20273 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20274 if (ShrunkMask < 256)
20275 return UseMask(ShrunkMask);
20276
20277 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
20278 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20279 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
20280 return UseMask(ExpandedMask);
20281
20282 // Potential improvements:
20283 //
20284 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
20285 // We could try to prefer Thumb1 immediates which can be lowered to a
20286 // two-instruction sequence.
20287 // We could try to recognize more legal ARM/Thumb2 immediates here.
20288
20289 return false;
20290}
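
// Illustrative sketch (not part of the upstream file; helper name invented):
// given the demanded bits, the code above looks for a replacement AND mask
// that is cheap to materialize, preferring uxtb (0xFF), then uxth (0xFFFF),
// then a small positive or small negative immediate. The same preference
// order as a standalone function, returning 0 when nothing cheaper applies:
static unsigned examplePickCheapMask(unsigned Mask, unsigned Demanded) {
  unsigned Shrunk = Mask & Demanded;    // bits the new mask must keep set
  unsigned Expanded = Mask | ~Demanded; // bits the new mask may have set
  auto Fits = [&](unsigned Cand) {
    return (Shrunk & Cand) == Shrunk && (~Expanded & Cand) == 0;
  };
  if (Fits(0xFF))
    return 0xFF;                        // uxtb
  if (Fits(0xFFFF))
    return 0xFFFF;                      // uxth
  if (Shrunk < 256)
    return Shrunk;                      // movs+ands / 8-bit immediate
  if ((int)Expanded <= -2 && (int)Expanded >= -256)
    return Expanded;                    // movs+bics / inverted 8-bit immediate
  return 0;                             // keep the original mask
}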
20291
20292 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
20293 SDValue Op, const APInt &OriginalDemandedBits,
20294 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
20295 unsigned Depth) const {
20296 unsigned Opc = Op.getOpcode();
20297
20298 switch (Opc) {
20299 case ARMISD::ASRL:
20300 case ARMISD::LSRL: {
20301 // If this is result 0 and the other result is unused, see if the demand
20302 // bits allow us to shrink this long shift into a standard small shift in
20303 // the opposite direction.
20304 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
20305 isa<ConstantSDNode>(Op->getOperand(2))) {
20306 unsigned ShAmt = Op->getConstantOperandVal(2);
20307 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
20308 << (32 - ShAmt)))
20309 return TLO.CombineTo(
20310 Op, TLO.DAG.getNode(
20311 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
20312 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
20313 }
20314 break;
20315 }
20316 case ARMISD::VBICIMM: {
20317 SDValue Op0 = Op.getOperand(0);
20318 unsigned ModImm = Op.getConstantOperandVal(1);
20319 unsigned EltBits = 0;
20320 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
20321 if ((OriginalDemandedBits & Mask) == 0)
20322 return TLO.CombineTo(Op, Op0);
20323 }
20324 }
20325
20326 return TargetLowering::SimplifyDemandedBitsForTargetNode(
20327 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
20328}
20329
20330//===----------------------------------------------------------------------===//
20331// ARM Inline Assembly Support
20332//===----------------------------------------------------------------------===//
20333
20334 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
20335 // Looking for "rev" which is V6+.
20336 if (!Subtarget->hasV6Ops())
20337 return false;
20338
20339 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
20340 StringRef AsmStr = IA->getAsmString();
20341 SmallVector<StringRef, 4> AsmPieces;
20342 SplitString(AsmStr, AsmPieces, ";\n");
20343
20344 switch (AsmPieces.size()) {
20345 default: return false;
20346 case 1:
20347 AsmStr = AsmPieces[0];
20348 AsmPieces.clear();
20349 SplitString(AsmStr, AsmPieces, " \t,");
20350
20351 // rev $0, $1
20352 if (AsmPieces.size() == 3 &&
20353 AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
20354 IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
20355 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
20356 if (Ty && Ty->getBitWidth() == 32)
20357 return IntrinsicLowering::LowerToByteSwap(CI);
20358 }
20359 break;
20360 }
20361
20362 return false;
20363}
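
// Illustrative usage (not part of the upstream file; the source-level code
// below is an assumption about what front ends emit): the pattern matched
// above corresponds to inline assembly such as
//
//   unsigned r;
//   __asm__("rev %0, %1" : "=l"(r) : "l"(x));
//
// which, on a V6+ subtarget, is replaced with a call to llvm.bswap.i32 so
// later passes can treat it like any other byte swap.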
20364
20365const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
20366 // At this point, we have to lower this constraint to something else, so we
20367 // lower it to an "r" or "w". However, by doing this we will force the result
20368 // to be in register, while the X constraint is much more permissive.
20369 //
20370 // Although we are correct (we are free to emit anything, without
20371 // constraints), we might break use cases that would expect us to be more
20372 // efficient and emit something else.
20373 if (!Subtarget->hasVFP2Base())
20374 return "r";
20375 if (ConstraintVT.isFloatingPoint())
20376 return "w";
20377 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20378 (ConstraintVT.getSizeInBits() == 64 ||
20379 ConstraintVT.getSizeInBits() == 128))
20380 return "w";
20381
20382 return "r";
20383}
20384
20385/// getConstraintType - Given a constraint letter, return the type of
20386/// constraint it is for this target.
20387 ARMTargetLowering::ConstraintType
20388 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
20389 unsigned S = Constraint.size();
20390 if (S == 1) {
20391 switch (Constraint[0]) {
20392 default: break;
20393 case 'l': return C_RegisterClass;
20394 case 'w': return C_RegisterClass;
20395 case 'h': return C_RegisterClass;
20396 case 'x': return C_RegisterClass;
20397 case 't': return C_RegisterClass;
20398 case 'j': return C_Immediate; // Constant for movw.
20399 // An address with a single base register. Due to the way we
20400 // currently handle addresses it is the same as an 'r' memory constraint.
20401 case 'Q': return C_Memory;
20402 }
20403 } else if (S == 2) {
20404 switch (Constraint[0]) {
20405 default: break;
20406 case 'T': return C_RegisterClass;
20407 // All 'U+' constraints are addresses.
20408 case 'U': return C_Memory;
20409 }
20410 }
20411 return TargetLowering::getConstraintType(Constraint);
20412}
20413
20414/// Examine constraint type and operand type and determine a weight value.
20415/// This object must already have been set up with the operand type
20416/// and the current alternative constraint selected.
20417 TargetLowering::ConstraintWeight
20418 ARMTargetLowering::getSingleConstraintMatchWeight(
20419 AsmOperandInfo &info, const char *constraint) const {
20420 ConstraintWeight weight = CW_Invalid;
20421 Value *CallOperandVal = info.CallOperandVal;
20422 // If we don't have a value, we can't do a match,
20423 // but allow it at the lowest weight.
20424 if (!CallOperandVal)
20425 return CW_Default;
20426 Type *type = CallOperandVal->getType();
20427 // Look at the constraint type.
20428 switch (*constraint) {
20429 default:
20430 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
20431 break;
20432 case 'l':
20433 if (type->isIntegerTy()) {
20434 if (Subtarget->isThumb())
20435 weight = CW_SpecificReg;
20436 else
20437 weight = CW_Register;
20438 }
20439 break;
20440 case 'w':
20441 if (type->isFloatingPointTy())
20442 weight = CW_Register;
20443 break;
20444 }
20445 return weight;
20446}
20447
20448using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20449
20449
20450 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
20451 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20452 switch (Constraint.size()) {
20453 case 1:
20454 // GCC ARM Constraint Letters
20455 switch (Constraint[0]) {
20456 case 'l': // Low regs or general regs.
20457 if (Subtarget->isThumb())
20458 return RCPair(0U, &ARM::tGPRRegClass);
20459 return RCPair(0U, &ARM::GPRRegClass);
20460 case 'h': // High regs or no regs.
20461 if (Subtarget->isThumb())
20462 return RCPair(0U, &ARM::hGPRRegClass);
20463 break;
20464 case 'r':
20465 if (Subtarget->isThumb1Only())
20466 return RCPair(0U, &ARM::tGPRRegClass);
20467 return RCPair(0U, &ARM::GPRRegClass);
20468 case 'w':
20469 if (VT == MVT::Other)
20470 break;
20471 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20472 return RCPair(0U, &ARM::SPRRegClass);
20473 if (VT.getSizeInBits() == 64)
20474 return RCPair(0U, &ARM::DPRRegClass);
20475 if (VT.getSizeInBits() == 128)
20476 return RCPair(0U, &ARM::QPRRegClass);
20477 break;
20478 case 'x':
20479 if (VT == MVT::Other)
20480 break;
20481 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20482 return RCPair(0U, &ARM::SPR_8RegClass);
20483 if (VT.getSizeInBits() == 64)
20484 return RCPair(0U, &ARM::DPR_8RegClass);
20485 if (VT.getSizeInBits() == 128)
20486 return RCPair(0U, &ARM::QPR_8RegClass);
20487 break;
20488 case 't':
20489 if (VT == MVT::Other)
20490 break;
20491 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20492 return RCPair(0U, &ARM::SPRRegClass);
20493 if (VT.getSizeInBits() == 64)
20494 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20495 if (VT.getSizeInBits() == 128)
20496 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20497 break;
20498 }
20499 break;
20500
20501 case 2:
20502 if (Constraint[0] == 'T') {
20503 switch (Constraint[1]) {
20504 default:
20505 break;
20506 case 'e':
20507 return RCPair(0U, &ARM::tGPREvenRegClass);
20508 case 'o':
20509 return RCPair(0U, &ARM::tGPROddRegClass);
20510 }
20511 }
20512 break;
20513
20514 default:
20515 break;
20516 }
20517
20518 if (StringRef("{cc}").equals_insensitive(Constraint))
20519 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20520
20521 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20522}
20523
20524/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20525/// vector. If it is invalid, don't add anything to Ops.
20526 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
20527 StringRef Constraint,
20528 std::vector<SDValue> &Ops,
20529 SelectionDAG &DAG) const {
20530 SDValue Result;
20531
20532 // Currently only support length 1 constraints.
20533 if (Constraint.size() != 1)
20534 return;
20535
20536 char ConstraintLetter = Constraint[0];
20537 switch (ConstraintLetter) {
20538 default: break;
20539 case 'j':
20540 case 'I': case 'J': case 'K': case 'L':
20541 case 'M': case 'N': case 'O':
20542 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
20543 if (!C)
20544 return;
20545
20546 int64_t CVal64 = C->getSExtValue();
20547 int CVal = (int) CVal64;
20548 // None of these constraints allow values larger than 32 bits. Check
20549 // that the value fits in an int.
20550 if (CVal != CVal64)
20551 return;
20552
20553 switch (ConstraintLetter) {
20554 case 'j':
20555 // Constant suitable for movw, must be between 0 and
20556 // 65535.
20557 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20558 if (CVal >= 0 && CVal <= 65535)
20559 break;
20560 return;
20561 case 'I':
20562 if (Subtarget->isThumb1Only()) {
20563 // This must be a constant between 0 and 255, for ADD
20564 // immediates.
20565 if (CVal >= 0 && CVal <= 255)
20566 break;
20567 } else if (Subtarget->isThumb2()) {
20568 // A constant that can be used as an immediate value in a
20569 // data-processing instruction.
20570 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20571 break;
20572 } else {
20573 // A constant that can be used as an immediate value in a
20574 // data-processing instruction.
20575 if (ARM_AM::getSOImmVal(CVal) != -1)
20576 break;
20577 }
20578 return;
20579
20580 case 'J':
20581 if (Subtarget->isThumb1Only()) {
20582 // This must be a constant between -255 and -1, for negated ADD
20583 // immediates. This can be used in GCC with an "n" modifier that
20584 // prints the negated value, for use with SUB instructions. It is
20585 // not useful otherwise but is implemented for compatibility.
20586 if (CVal >= -255 && CVal <= -1)
20587 break;
20588 } else {
20589 // This must be a constant between -4095 and 4095. It is not clear
20590 // what this constraint is intended for. Implemented for
20591 // compatibility with GCC.
20592 if (CVal >= -4095 && CVal <= 4095)
20593 break;
20594 }
20595 return;
20596
20597 case 'K':
20598 if (Subtarget->isThumb1Only()) {
20599 // A 32-bit value where only one byte has a nonzero value. Exclude
20600 // zero to match GCC. This constraint is used by GCC internally for
20601 // constants that can be loaded with a move/shift combination.
20602 // It is not useful otherwise but is implemented for compatibility.
20603 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20604 break;
20605 } else if (Subtarget->isThumb2()) {
20606 // A constant whose bitwise inverse can be used as an immediate
20607 // value in a data-processing instruction. This can be used in GCC
20608 // with a "B" modifier that prints the inverted value, for use with
20609 // BIC and MVN instructions. It is not useful otherwise but is
20610 // implemented for compatibility.
20611 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20612 break;
20613 } else {
20614 // A constant whose bitwise inverse can be used as an immediate
20615 // value in a data-processing instruction. This can be used in GCC
20616 // with a "B" modifier that prints the inverted value, for use with
20617 // BIC and MVN instructions. It is not useful otherwise but is
20618 // implemented for compatibility.
20619 if (ARM_AM::getSOImmVal(~CVal) != -1)
20620 break;
20621 }
20622 return;
20623
20624 case 'L':
20625 if (Subtarget->isThumb1Only()) {
20626 // This must be a constant between -7 and 7,
20627 // for 3-operand ADD/SUB immediate instructions.
20628 if (CVal >= -7 && CVal < 7)
20629 break;
20630 } else if (Subtarget->isThumb2()) {
20631 // A constant whose negation can be used as an immediate value in a
20632 // data-processing instruction. This can be used in GCC with an "n"
20633 // modifier that prints the negated value, for use with SUB
20634 // instructions. It is not useful otherwise but is implemented for
20635 // compatibility.
20636 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20637 break;
20638 } else {
20639 // A constant whose negation can be used as an immediate value in a
20640 // data-processing instruction. This can be used in GCC with an "n"
20641 // modifier that prints the negated value, for use with SUB
20642 // instructions. It is not useful otherwise but is implemented for
20643 // compatibility.
20644 if (ARM_AM::getSOImmVal(-CVal) != -1)
20645 break;
20646 }
20647 return;
20648
20649 case 'M':
20650 if (Subtarget->isThumb1Only()) {
20651 // This must be a multiple of 4 between 0 and 1020, for
20652 // ADD sp + immediate.
20653 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20654 break;
20655 } else {
20656 // A power of two or a constant between 0 and 32. This is used in
20657 // GCC for the shift amount on shifted register operands, but it is
20658 // useful in general for any shift amounts.
20659 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20660 break;
20661 }
20662 return;
20663
20664 case 'N':
20665 if (Subtarget->isThumb1Only()) {
20666 // This must be a constant between 0 and 31, for shift amounts.
20667 if (CVal >= 0 && CVal <= 31)
20668 break;
20669 }
20670 return;
20671
20672 case 'O':
20673 if (Subtarget->isThumb1Only()) {
20674 // This must be a multiple of 4 between -508 and 508, for
20675 // ADD/SUB sp = sp + immediate.
20676 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20677 break;
20678 }
20679 return;
20680 }
20681 Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
20682 break;
20683 }
20684
20685 if (Result.getNode()) {
20686 Ops.push_back(Result);
20687 return;
20688 }
20689 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20690}
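
// Illustrative usage (not part of the upstream file; constants chosen only
// as examples): these letters appear as inline-assembly operand constraints.
// In ARM state, 'I' accepts exactly the modified immediates handled above,
// so something like
//
//   __asm__("add %0, %1, %2" : "=r"(r) : "r"(a), "I"(0xFF0));  // 255 << 4
//
// is accepted, while "I"(0x101) is not encodable, falls through here without
// producing an operand, and is then reported as an invalid constraint.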
20691
20692 static RTLIB::Libcall getDivRemLibcall(
20693 const SDNode *N, MVT::SimpleValueType SVT) {
20694 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20695 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20696 "Unhandled Opcode in getDivRemLibcall");
20697 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20698 N->getOpcode() == ISD::SREM;
20699 RTLIB::Libcall LC;
20700 switch (SVT) {
20701 default: llvm_unreachable("Unexpected request for libcall!");
20702 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20703 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20704 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20705 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20706 }
20707 return LC;
20708}
20709
20710 static TargetLowering::ArgListTy getDivRemArgList(
20711 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20712 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20713 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20714 "Unhandled Opcode in getDivRemArgList");
20715 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20716 N->getOpcode() == ISD::SREM;
20717 TargetLowering::ArgListTy Args;
20718 TargetLowering::ArgListEntry Entry;
20719 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20720 EVT ArgVT = N->getOperand(i).getValueType();
20721 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20722 Entry.Node = N->getOperand(i);
20723 Entry.Ty = ArgTy;
20724 Entry.IsSExt = isSigned;
20725 Entry.IsZExt = !isSigned;
20726 Args.push_back(Entry);
20727 }
20728 if (Subtarget->isTargetWindows() && Args.size() >= 2)
20729 std::swap(Args[0], Args[1]);
20730 return Args;
20731}
20732
20733SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20734 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20735 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20736 Subtarget->isTargetWindows()) &&
20737 "Register-based DivRem lowering only");
20738 unsigned Opcode = Op->getOpcode();
20739 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20740 "Invalid opcode for Div/Rem lowering");
20741 bool isSigned = (Opcode == ISD::SDIVREM);
20742 EVT VT = Op->getValueType(0);
20743 SDLoc dl(Op);
20744
20745 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20746 SmallVector<SDValue> Result;
20747 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20748 SDValue Res0 =
20749 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20750 SDValue Res1 =
20751 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20752 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20753 {Res0, Res1});
20754 }
20755 }
20756
20757 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
20758
20759 // If the target has hardware divide, use divide + multiply + subtract:
20760 // div = a / b
20761 // rem = a - b * div
20762 // return {div, rem}
20763 // This should be lowered into UDIV/SDIV + MLS later on.
20764 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20765 : Subtarget->hasDivideInARMMode();
20766 if (hasDivide && Op->getValueType(0).isSimple() &&
20767 Op->getSimpleValueType(0) == MVT::i32) {
20768 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20769 const SDValue Dividend = Op->getOperand(0);
20770 const SDValue Divisor = Op->getOperand(1);
20771 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20772 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20773 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
20774
20775 SDValue Values[2] = {Div, Rem};
20776 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20777 }
20778
20779 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20780 VT.getSimpleVT().SimpleTy);
20781 SDValue InChain = DAG.getEntryNode();
20782
20783 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
20784 DAG.getContext(),
20785 Subtarget);
20786
20787 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
20788 getPointerTy(DAG.getDataLayout()));
20789
20790 Type *RetTy = StructType::get(Ty, Ty);
20791
20792 if (Subtarget->isTargetWindows())
20793 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
20794
20795 TargetLowering::CallLoweringInfo CLI(DAG);
20796 CLI.setDebugLoc(dl).setChain(InChain)
20797 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
20798 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
20799
20800 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20801 return CallInfo.first;
20802}
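
// Illustrative sketch (not part of the upstream file; helper name invented):
// the hardware-divide path above relies on the identity rem = a - (a / b) * b,
// which is exactly what the UDIV/SDIV + MLS sequence computes.
static void exampleDivRem(int A, int B, int &Div, int &Rem) {
  Div = A / B;        // SDIV
  Rem = A - Div * B;  // MLS: multiply the quotient by B, subtract from A
}
// e.g. exampleDivRem(7, 3, d, r) leaves d == 2 and r == 1.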
20803
20804// Lowers REM using divmod helpers
20805// see RTABI section 4.2/4.3
20806SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20807 EVT VT = N->getValueType(0);
20808
20809 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20810 SmallVector<SDValue> Result;
20811 if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20812 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20813 Result[0], Result[1]);
20814 }
20815
20816 // Build return types (div and rem)
20817 std::vector<Type*> RetTyParams;
20818 Type *RetTyElement;
20819
20820 switch (VT.getSimpleVT().SimpleTy) {
20821 default: llvm_unreachable("Unexpected request for libcall!");
20822 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20823 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20824 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20825 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20826 }
20827
20828 RetTyParams.push_back(RetTyElement);
20829 RetTyParams.push_back(RetTyElement);
20830 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20831 Type *RetTy = StructType::get(*DAG.getContext(), ret);
20832
20833 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20834 SimpleTy);
20835 SDValue InChain = DAG.getEntryNode();
20836 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
20837 Subtarget);
20838 bool isSigned = N->getOpcode() == ISD::SREM;
20839 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
20840 getPointerTy(DAG.getDataLayout()));
20841
20842 if (Subtarget->isTargetWindows())
20843 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20844
20845 // Lower call
20846 CallLoweringInfo CLI(DAG);
20847 CLI.setChain(InChain)
20848 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
20849 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
20850 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20851
20852 // Return second (rem) result operand (first contains div)
20853 SDNode *ResNode = CallResult.first.getNode();
20854 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20855 return ResNode->getOperand(1);
20856}
20857
20858SDValue
20859ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20860 assert(Subtarget->isTargetWindows() && "unsupported target platform");
20861 SDLoc DL(Op);
20862
20863 // Get the inputs.
20864 SDValue Chain = Op.getOperand(0);
20865 SDValue Size = Op.getOperand(1);
20866
20867 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
20868 "no-stack-arg-probe")) {
20869 MaybeAlign Align =
20870 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
20871 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20872 Chain = SP.getValue(1);
20873 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
20874 if (Align)
20875 SP =
20876 DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20877 DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32));
20878 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20879 SDValue Ops[2] = { SP, Chain };
20880 return DAG.getMergeValues(Ops, DL);
20881 }
20882
20883 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20884 DAG.getConstant(2, DL, MVT::i32));
20885
20886 SDValue Glue;
20887 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20888 Glue = Chain.getValue(1);
20889
20890 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20891 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
20892
20893 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20894 Chain = NewSP.getValue(1);
20895
20896 SDValue Ops[2] = { NewSP, Chain };
20897 return DAG.getMergeValues(Ops, DL);
20898}
20899
20900SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20901 bool IsStrict = Op->isStrictFPOpcode();
20902 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20903 const unsigned DstSz = Op.getValueType().getSizeInBits();
20904 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
20905 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
20906 "Unexpected type for custom-lowering FP_EXTEND");
20907
20908 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20909 "With both FP DP and 16, any FP conversion is legal!");
20910
20911 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
20912 "With FP16, 16 to 32 conversion is legal!");
20913
20914 // Converting from 32 -> 64 is valid if we have FP64.
20915 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
20916 // FIXME: Remove this when we have strict fp instruction selection patterns
20917 if (IsStrict) {
20918 SDLoc Loc(Op);
20919 SDValue Result = DAG.getNode(ISD::FP_EXTEND,
20920 Loc, Op.getValueType(), SrcVal);
20921 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
20922 }
20923 return Op;
20924 }
20925
20926 // Either we are converting from 16 -> 64, without FP16 and/or
20927 // FP.double-precision or without Armv8-fp. So we must do it in two
20928 // steps.
20929 // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
20930 // without FP16. So we must do a function call.
20931 SDLoc Loc(Op);
20932 RTLIB::Libcall LC;
20933 MakeLibCallOptions CallOptions;
20934 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20935 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
20936 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
20937 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
20938 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
20939 if (Supported) {
20940 if (IsStrict) {
20941 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
20942 {DstVT, MVT::Other}, {Chain, SrcVal});
20943 Chain = SrcVal.getValue(1);
20944 } else {
20945 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
20946 }
20947 } else {
20948 LC = RTLIB::getFPEXT(SrcVT, DstVT);
20949 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20950 "Unexpected type for custom-lowering FP_EXTEND");
20951 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20952 Loc, Chain);
20953 }
20954 }
20955
20956 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
20957}
20958
20959SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20960 bool IsStrict = Op->isStrictFPOpcode();
20961
20962 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20963 EVT SrcVT = SrcVal.getValueType();
20964 EVT DstVT = Op.getValueType();
20965 const unsigned DstSz = Op.getValueType().getSizeInBits();
20966 const unsigned SrcSz = SrcVT.getSizeInBits();
20967 (void)DstSz;
20968 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
20969 "Unexpected type for custom-lowering FP_ROUND");
20970
20971 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20972 "With both FP DP and 16, any FP conversion is legal!");
20973
20974 SDLoc Loc(Op);
20975
20976 // Instruction from 32 -> 16 if hasFP16 is valid
20977 if (SrcSz == 32 && Subtarget->hasFP16())
20978 return Op;
20979
20980 // Lib call from 32 -> 16 / 64 -> [32, 16]
20981 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
20982 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20983 "Unexpected type for custom-lowering FP_ROUND");
20984 MakeLibCallOptions CallOptions;
20985 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20986 SDValue Result;
20987 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20988 Loc, Chain);
20989 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
20990}
20991
20992bool
20993ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
20994 // The ARM target isn't yet aware of offsets.
20995 return false;
20996}
20997
20999 if (v == 0xffffffff)
21000 return false;
21001
21002 // there can be 1's on either or both "outsides", all the "inside"
21003 // bits must be 0's
21004 return isShiftedMask_32(~v);
21005}
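// Worked example (illustrative): v = 0xffff00ff has ones on both "outsides"
// and zeros in the middle, and ~v = 0x0000ff00 is a shifted mask, so this
// returns true. v = 0x00ff00ff is rejected because ~v = 0xff00ff00 is not a
// single contiguous run of ones.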
21006
21007/// isFPImmLegal - Returns true if the target can instruction select the
21008/// specified FP immediate natively. If false, the legalizer will
21009/// materialize the FP immediate as a load from a constant pool.
21010bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
21011 bool ForCodeSize) const {
21012 if (!Subtarget->hasVFP3Base())
21013 return false;
21014 if (VT == MVT::f16 && Subtarget->hasFullFP16())
21015 return ARM_AM::getFP16Imm(Imm) != -1;
21016 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
21017 ARM_AM::getFP32FP16Imm(Imm) != -1)
21018 return true;
21019 if (VT == MVT::f32)
21020 return ARM_AM::getFP32Imm(Imm) != -1;
21021 if (VT == MVT::f64 && Subtarget->hasFP64())
21022 return ARM_AM::getFP64Imm(Imm) != -1;
21023 return false;
21024}
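// Illustrative examples (assuming a VFPv3 subtarget with FP64): +1.0, -2.0 and
// 0.5 fit the 8-bit VMOV floating-point immediate encoding, so they are legal
// FP immediates for f32 and f64; +0.0 does not fit the encoding and is
// materialized from the constant pool instead.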
21025
21026/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
21027/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
21028/// specified in the intrinsic calls.
21029bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
21030 const CallInst &I,
21031 MachineFunction &MF,
21032 unsigned Intrinsic) const {
21033 switch (Intrinsic) {
21034 case Intrinsic::arm_neon_vld1:
21035 case Intrinsic::arm_neon_vld2:
21036 case Intrinsic::arm_neon_vld3:
21037 case Intrinsic::arm_neon_vld4:
21038 case Intrinsic::arm_neon_vld2lane:
21039 case Intrinsic::arm_neon_vld3lane:
21040 case Intrinsic::arm_neon_vld4lane:
21041 case Intrinsic::arm_neon_vld2dup:
21042 case Intrinsic::arm_neon_vld3dup:
21043 case Intrinsic::arm_neon_vld4dup: {
21044 Info.opc = ISD::INTRINSIC_W_CHAIN;
21045 // Conservatively set memVT to the entire set of vectors loaded.
21046 auto &DL = I.getDataLayout();
21047 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
21048 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21049 Info.ptrVal = I.getArgOperand(0);
21050 Info.offset = 0;
21051 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
21052 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
21053 // volatile loads with NEON intrinsics not supported
21054 Info.flags = MachineMemOperand::MOLoad;
21055 return true;
21056 }
21057 case Intrinsic::arm_neon_vld1x2:
21058 case Intrinsic::arm_neon_vld1x3:
21059 case Intrinsic::arm_neon_vld1x4: {
21060 Info.opc = ISD::INTRINSIC_W_CHAIN;
21061 // Conservatively set memVT to the entire set of vectors loaded.
21062 auto &DL = I.getDataLayout();
21063 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
21064 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21065 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
21066 Info.offset = 0;
21067 Info.align.reset();
21068 // volatile loads with NEON intrinsics not supported
21069 Info.flags = MachineMemOperand::MOLoad;
21070 return true;
21071 }
21072 case Intrinsic::arm_neon_vst1:
21073 case Intrinsic::arm_neon_vst2:
21074 case Intrinsic::arm_neon_vst3:
21075 case Intrinsic::arm_neon_vst4:
21076 case Intrinsic::arm_neon_vst2lane:
21077 case Intrinsic::arm_neon_vst3lane:
21078 case Intrinsic::arm_neon_vst4lane: {
21079 Info.opc = ISD::INTRINSIC_VOID;
21080 // Conservatively set memVT to the entire set of vectors stored.
21081 auto &DL = I.getDataLayout();
21082 unsigned NumElts = 0;
21083 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
21084 Type *ArgTy = I.getArgOperand(ArgI)->getType();
21085 if (!ArgTy->isVectorTy())
21086 break;
21087 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
21088 }
21089 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21090 Info.ptrVal = I.getArgOperand(0);
21091 Info.offset = 0;
21092 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
21093 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
21094 // volatile stores with NEON intrinsics not supported
21095 Info.flags = MachineMemOperand::MOStore;
21096 return true;
21097 }
21098 case Intrinsic::arm_neon_vst1x2:
21099 case Intrinsic::arm_neon_vst1x3:
21100 case Intrinsic::arm_neon_vst1x4: {
21101 Info.opc = ISD::INTRINSIC_VOID;
21102 // Conservatively set memVT to the entire set of vectors stored.
21103 auto &DL = I.getDataLayout();
21104 unsigned NumElts = 0;
21105 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
21106 Type *ArgTy = I.getArgOperand(ArgI)->getType();
21107 if (!ArgTy->isVectorTy())
21108 break;
21109 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
21110 }
21111 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21112 Info.ptrVal = I.getArgOperand(0);
21113 Info.offset = 0;
21114 Info.align.reset();
21115 // volatile stores with NEON intrinsics not supported
21116 Info.flags = MachineMemOperand::MOStore;
21117 return true;
21118 }
21119 case Intrinsic::arm_mve_vld2q:
21120 case Intrinsic::arm_mve_vld4q: {
21121 Info.opc = ISD::INTRINSIC_W_CHAIN;
21122 // Conservatively set memVT to the entire set of vectors loaded.
21123 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
21124 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
21125 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21126 Info.ptrVal = I.getArgOperand(0);
21127 Info.offset = 0;
21128 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21129 // volatile loads with MVE intrinsics not supported
21130 Info.flags = MachineMemOperand::MOLoad;
21131 return true;
21132 }
21133 case Intrinsic::arm_mve_vst2q:
21134 case Intrinsic::arm_mve_vst4q: {
21135 Info.opc = ISD::INTRINSIC_VOID;
21136 // Conservatively set memVT to the entire set of vectors stored.
21137 Type *VecTy = I.getArgOperand(1)->getType();
21138 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
21139 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21140 Info.ptrVal = I.getArgOperand(0);
21141 Info.offset = 0;
21142 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21143 // volatile stores with MVE intrinsics not supported
21144 Info.flags = MachineMemOperand::MOStore;
21145 return true;
21146 }
21147 case Intrinsic::arm_mve_vldr_gather_base:
21148 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
21149 Info.opc = ISD::INTRINSIC_W_CHAIN;
21150 Info.ptrVal = nullptr;
21151 Info.memVT = MVT::getVT(I.getType());
21152 Info.align = Align(1);
21153 Info.flags |= MachineMemOperand::MOLoad;
21154 return true;
21155 }
21156 case Intrinsic::arm_mve_vldr_gather_base_wb:
21157 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
21158 Info.opc = ISD::INTRINSIC_W_CHAIN;
21159 Info.ptrVal = nullptr;
21160 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
21161 Info.align = Align(1);
21162 Info.flags |= MachineMemOperand::MOLoad;
21163 return true;
21164 }
21165 case Intrinsic::arm_mve_vldr_gather_offset:
21166 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
21167 Info.opc = ISD::INTRINSIC_W_CHAIN;
21168 Info.ptrVal = nullptr;
21169 MVT DataVT = MVT::getVT(I.getType());
21170 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
21171 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21172 DataVT.getVectorNumElements());
21173 Info.align = Align(1);
21174 Info.flags |= MachineMemOperand::MOLoad;
21175 return true;
21176 }
21177 case Intrinsic::arm_mve_vstr_scatter_base:
21178 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
21179 Info.opc = ISD::INTRINSIC_VOID;
21180 Info.ptrVal = nullptr;
21181 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21182 Info.align = Align(1);
21183 Info.flags |= MachineMemOperand::MOStore;
21184 return true;
21185 }
21186 case Intrinsic::arm_mve_vstr_scatter_base_wb:
21187 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
21188 Info.opc = ISD::INTRINSIC_W_CHAIN;
21189 Info.ptrVal = nullptr;
21190 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21191 Info.align = Align(1);
21192 Info.flags |= MachineMemOperand::MOStore;
21193 return true;
21194 }
21195 case Intrinsic::arm_mve_vstr_scatter_offset:
21196 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
21197 Info.opc = ISD::INTRINSIC_VOID;
21198 Info.ptrVal = nullptr;
21199 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
21200 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
21201 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21202 DataVT.getVectorNumElements());
21203 Info.align = Align(1);
21204 Info.flags |= MachineMemOperand::MOStore;
21205 return true;
21206 }
21207 case Intrinsic::arm_ldaex:
21208 case Intrinsic::arm_ldrex: {
21209 auto &DL = I.getDataLayout();
21210 Type *ValTy = I.getParamElementType(0);
21211 Info.opc = ISD::INTRINSIC_W_CHAIN;
21212 Info.memVT = MVT::getVT(ValTy);
21213 Info.ptrVal = I.getArgOperand(0);
21214 Info.offset = 0;
21215 Info.align = DL.getABITypeAlign(ValTy);
21216 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
21217 return true;
21218 }
21219 case Intrinsic::arm_stlex:
21220 case Intrinsic::arm_strex: {
21221 auto &DL = I.getDataLayout();
21222 Type *ValTy = I.getParamElementType(1);
21223 Info.opc = ISD::INTRINSIC_W_CHAIN;
21224 Info.memVT = MVT::getVT(ValTy);
21225 Info.ptrVal = I.getArgOperand(1);
21226 Info.offset = 0;
21227 Info.align = DL.getABITypeAlign(ValTy);
21228 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
21229 return true;
21230 }
21231 case Intrinsic::arm_stlexd:
21232 case Intrinsic::arm_strexd:
21233 Info.opc = ISD::INTRINSIC_W_CHAIN;
21234 Info.memVT = MVT::i64;
21235 Info.ptrVal = I.getArgOperand(2);
21236 Info.offset = 0;
21237 Info.align = Align(8);
21238 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
21239 return true;
21240
21241 case Intrinsic::arm_ldaexd:
21242 case Intrinsic::arm_ldrexd:
21243 Info.opc = ISD::INTRINSIC_W_CHAIN;
21244 Info.memVT = MVT::i64;
21245 Info.ptrVal = I.getArgOperand(0);
21246 Info.offset = 0;
21247 Info.align = Align(8);
21248 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
21249 return true;
21250
21251 default:
21252 break;
21253 }
21254
21255 return false;
21256}
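// Illustrative example (hypothetical IR, not from the original source): for a
// call such as
//   %vld = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0(ptr %p, i32 8)
// the vld2 case above reports one conservative 256-bit access: memVT becomes
// v4i64 (256 bits / 64), ptrVal is %p, and the alignment is taken from the
// trailing i32 8 argument, so the MachineMemOperand covers both result vectors.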
21257
21258/// Returns true if it is beneficial to convert a load of a constant
21259/// to just the constant itself.
21260bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
21261 Type *Ty) const {
21262 assert(Ty->isIntegerTy());
21263
21264 unsigned Bits = Ty->getPrimitiveSizeInBits();
21265 if (Bits == 0 || Bits > 32)
21266 return false;
21267 return true;
21268}
21269
21270bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
21271 unsigned Index) const {
21272 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
21273 return false;
21274
21275 return (Index == 0 || Index == ResVT.getVectorNumElements());
21276}
21277
21278Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
21279 ARM_MB::MemBOpt Domain) const {
21280 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21281
21282 // First, if the target has no DMB, see what fallback we can use.
21283 if (!Subtarget->hasDataBarrier()) {
21284 // Some ARMv6 cpus can support data barriers with an mcr instruction.
21285 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
21286 // here.
21287 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
21288 Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
21289 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
21290 Builder.getInt32(0), Builder.getInt32(7),
21291 Builder.getInt32(10), Builder.getInt32(5)};
21292 return Builder.CreateCall(MCR, args);
21293 } else {
21294 // Instead of using barriers, atomic accesses on these subtargets use
21295 // libcalls.
21296 llvm_unreachable("makeDMB on a target so old that it has no barriers");
21297 }
21298 } else {
21299 Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
21300 // Only a full system barrier exists in the M-class architectures.
21301 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
21302 Constant *CDomain = Builder.getInt32(Domain);
21303 return Builder.CreateCall(DMB, CDomain);
21304 }
21305}
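// Illustrative example: on an ARMv7-A core, makeDMB(Builder, ARM_MB::ISH)
// emits a call to @llvm.arm.dmb(i32 11), i.e. "dmb ish"; on an M-class core
// the domain is forced to SY (15), giving "dmb sy".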
21306
21307// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
21308Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
21309 Instruction *Inst,
21310 AtomicOrdering Ord) const {
21311 switch (Ord) {
21312 case AtomicOrdering::NotAtomic:
21313 case AtomicOrdering::Unordered:
21314 llvm_unreachable("Invalid fence: unordered/non-atomic");
21315 case AtomicOrdering::Monotonic:
21316 case AtomicOrdering::Acquire:
21317 return nullptr; // Nothing to do
21318 case AtomicOrdering::SequentiallyConsistent:
21319 if (!Inst->hasAtomicStore())
21320 return nullptr; // Nothing to do
21321 [[fallthrough]];
21322 case AtomicOrdering::Release:
21323 case AtomicOrdering::AcquireRelease:
21324 if (Subtarget->preferISHSTBarriers())
21325 return makeDMB(Builder, ARM_MB::ISHST);
21326 // FIXME: add a comment with a link to documentation justifying this.
21327 else
21328 return makeDMB(Builder, ARM_MB::ISH);
21329 }
21330 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
21331}
21332
21333Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
21334 Instruction *Inst,
21335 AtomicOrdering Ord) const {
21336 switch (Ord) {
21337 case AtomicOrdering::NotAtomic:
21338 case AtomicOrdering::Unordered:
21339 llvm_unreachable("Invalid fence: unordered/not-atomic");
21340 case AtomicOrdering::Monotonic:
21341 case AtomicOrdering::Release:
21342 return nullptr; // Nothing to do
21343 case AtomicOrdering::Acquire:
21344 case AtomicOrdering::AcquireRelease:
21345 case AtomicOrdering::SequentiallyConsistent:
21346 return makeDMB(Builder, ARM_MB::ISH);
21347 }
21348 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
21349}
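// Putting the two hooks together (illustrative): with barrier-based lowering,
// a release store becomes "dmb ish; str" (or "dmb ishst" on cores that prefer
// store-only barriers), an acquire load becomes "ldr; dmb ish", and a seq_cst
// store gets barriers on both sides, matching the C++11 mappings referenced
// above. Monotonic accesses get no fences at all.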
21350
21351// Loads and stores less than 64-bits are already atomic; ones above that
21352// are doomed anyway, so defer to the default libcall and blame the OS when
21353// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21354// anything for those.
21355TargetLoweringBase::AtomicExpansionKind
21356ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
21357 bool has64BitAtomicStore;
21358 if (Subtarget->isMClass())
21359 has64BitAtomicStore = false;
21360 else if (Subtarget->isThumb())
21361 has64BitAtomicStore = Subtarget->hasV7Ops();
21362 else
21363 has64BitAtomicStore = Subtarget->hasV6Ops();
21364
21365 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21366 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21367 : AtomicExpansionKind::None;
21368}
21369
21370// Loads and stores less than 64-bits are already atomic; ones above that
21371// are doomed anyway, so defer to the default libcall and blame the OS when
21372// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21373// anything for those.
21374// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21375// guarantee, see DDI0406C ARM architecture reference manual,
21376// sections A8.8.72-74 LDRD)
21377TargetLoweringBase::AtomicExpansionKind
21378ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
21379 bool has64BitAtomicLoad;
21380 if (Subtarget->isMClass())
21381 has64BitAtomicLoad = false;
21382 else if (Subtarget->isThumb())
21383 has64BitAtomicLoad = Subtarget->hasV7Ops();
21384 else
21385 has64BitAtomicLoad = Subtarget->hasV6Ops();
21386
21387 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21388 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21389 : AtomicExpansionKind::None;
21390}
21391
21392// For the real atomic operations, we have ldrex/strex up to 32 bits,
21393// and up to 64 bits on the non-M profiles
21394TargetLowering::AtomicExpansionKind
21395ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
21396 if (AI->isFloatingPointOperation())
21397 return AtomicExpansionKind::CmpXChg;
21398
21399 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21400 bool hasAtomicRMW;
21401 if (Subtarget->isMClass())
21402 hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21403 else if (Subtarget->isThumb())
21404 hasAtomicRMW = Subtarget->hasV7Ops();
21405 else
21406 hasAtomicRMW = Subtarget->hasV6Ops();
21407 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21408 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21409 // implement atomicrmw without spilling. If the target address is also on
21410 // the stack and close enough to the spill slot, this can lead to a
21411 // situation where the monitor always gets cleared and the atomic operation
21412 // can never succeed. So at -O0 lower this operation to a CAS loop.
21413 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21414 return AtomicExpansionKind::CmpXChg;
21415 return AtomicExpansionKind::LLSC;
21416 }
21417 return AtomicExpansionKind::None;
21418}
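// Illustrative example: a 32-bit "atomicrmw add" on an Armv8-M Mainline core
// (e.g. Cortex-M33) expands to an ldrex/strex loop (LLSC) at -O1 and above,
// but to a cmpxchg at -O0 for the fast-regalloc reason described above; a
// 64-bit atomicrmw on any M-class core returns AtomicExpansionKind::None and
// ends up as a libcall.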
21419
21420// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21421// bits, and up to 64 bits on the non-M profiles.
21422TargetLowering::AtomicExpansionKind
21423ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
21424 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21425 // implement cmpxchg without spilling. If the address being exchanged is also
21426 // on the stack and close enough to the spill slot, this can lead to a
21427 // situation where the monitor always gets cleared and the atomic operation
21428 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21429 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
21430 bool HasAtomicCmpXchg;
21431 if (Subtarget->isMClass())
21432 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21433 else if (Subtarget->isThumb())
21434 HasAtomicCmpXchg = Subtarget->hasV7Ops();
21435 else
21436 HasAtomicCmpXchg = Subtarget->hasV6Ops();
21437 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21438 HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21439 return AtomicExpansionKind::LLSC;
21440 return AtomicExpansionKind::None;
21441}
21442
21443bool ARMTargetLowering::shouldInsertFencesForAtomic(
21444 const Instruction *I) const {
21445 return InsertFencesForAtomic;
21446}
21447
21448bool ARMTargetLowering::useLoadStackGuardNode() const {
21449 // ROPI/RWPI are not supported currently.
21450 return !Subtarget->isROPI() && !Subtarget->isRWPI();
21451}
21452
21453void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
21454 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
21455 return TargetLowering::insertSSPDeclarations(M);
21456
21457 // MSVC CRT has a global variable holding security cookie.
21458 M.getOrInsertGlobal("__security_cookie",
21459 PointerType::getUnqual(M.getContext()));
21460
21461 // MSVC CRT has a function to validate security cookie.
21462 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
21463 "__security_check_cookie", Type::getVoidTy(M.getContext()),
21464 PointerType::getUnqual(M.getContext()));
21465 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21466 F->addParamAttr(0, Attribute::AttrKind::InReg);
21467}
21468
21469Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
21470 // MSVC CRT has a global variable holding security cookie.
21471 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
21472 return M.getGlobalVariable("__security_cookie");
21473 return TargetLowering::getSDagStackGuard(M);
21474}
21475
21476Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
21477 // MSVC CRT has a function to validate security cookie.
21478 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
21479 return M.getFunction("__security_check_cookie");
21480 return TargetLowering::getSSPStackGuardCheck(M);
21481}
21482
21483bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
21484 unsigned &Cost) const {
21485 // If we do not have NEON, vector types are not natively supported.
21486 if (!Subtarget->hasNEON())
21487 return false;
21488
21489 // Floating-point values and vector values map to the same register file.
21490 // Therefore, although we could do a store + extract of a vector type, it is
21491 // better to leave it as a float, since we have more freedom in the
21492 // addressing modes for those.
21493 if (VectorTy->isFPOrFPVectorTy())
21494 return false;
21495
21496 // If the index is unknown at compile time, this is very expensive to lower
21497 // and it is not possible to combine the store with the extract.
21498 if (!isa<ConstantInt>(Idx))
21499 return false;
21500
21501 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21502 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21503 // We can do a store + vector extract on any vector that fits perfectly in a D
21504 // or Q register.
21505 if (BitWidth == 64 || BitWidth == 128) {
21506 Cost = 0;
21507 return true;
21508 }
21509 return false;
21510}
21511
21512bool ARMTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
21513 return Subtarget->hasV6T2Ops();
21514}
21515
21516bool ARMTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
21517 return Subtarget->hasV6T2Ops();
21518}
21519
21520bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial(
21521 const Instruction &AndI) const {
21522 if (!Subtarget->hasV7Ops())
21523 return false;
21524
21525 // Sink the `and` instruction only if the mask would fit into a modified
21526 // immediate operand.
21527 ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
21528 if (!Mask || Mask->getValue().getBitWidth() > 32u)
21529 return false;
21530 auto MaskVal = unsigned(Mask->getValue().getZExtValue());
21531 return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21532 : ARM_AM::getSOImmVal(MaskVal)) != -1;
21533}
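// Illustrative example: a mask of 0xff0 is representable as an ARM/Thumb-2
// modified immediate (0xff rotated into place), so sinking the 'and' next to
// its compare-with-zero user is considered beneficial; a mask such as 0x12345
// needs more than 8 significant bits and the hook returns false.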
21534
21535TargetLowering::ShiftLegalizationStrategy
21536ARMTargetLowering::preferredShiftLegalizationStrategy(
21537 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21538 if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
21539 return ShiftLegalizationStrategy::LowerToLibcall;
21540 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
21541 ExpansionFactor);
21542}
21543
21544Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
21545 Value *Addr,
21546 AtomicOrdering Ord) const {
21547 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21548 bool IsAcquire = isAcquireOrStronger(Ord);
21549
21550 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21551 // intrinsic must return {i32, i32} and we have to recombine them into a
21552 // single i64 here.
21553 if (ValueTy->getPrimitiveSizeInBits() == 64) {
21554 Intrinsic::ID Int =
21555 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21556 Function *Ldrex = Intrinsic::getDeclaration(M, Int);
21557
21558 Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
21559
21560 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21561 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
21562 if (!Subtarget->isLittle())
21563 std::swap (Lo, Hi);
21564 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21565 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21566 return Builder.CreateOr(
21567 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21568 }
21569
21570 Type *Tys[] = { Addr->getType() };
21571 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21572 Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);
21573 CallInst *CI = Builder.CreateCall(Ldrex, Addr);
21574
21575 CI->addParamAttr(
21576 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21577 return Builder.CreateTruncOrBitCast(CI, ValueTy);
21578}
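// Illustrative IR produced by the 64-bit path above for an acquire load
// (value names are examples only):
//   %lohi = call { i32, i32 } @llvm.arm.ldaexd(ptr %addr)
//   %lo = extractvalue { i32, i32 } %lohi, 0
//   %hi = extractvalue { i32, i32 } %lohi, 1
//   %lo64 = zext i32 %lo to i64
//   %hi64 = zext i32 %hi to i64
//   %shl = shl i64 %hi64, 32
//   %val64 = or i64 %lo64, %shl
// On big-endian subtargets %lo and %hi are swapped first.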
21579
21580void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
21581 IRBuilderBase &Builder) const {
21582 if (!Subtarget->hasV7Ops())
21583 return;
21584 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21585 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
21586}
21587
21588Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
21589 Value *Val, Value *Addr,
21590 AtomicOrdering Ord) const {
21591 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21592 bool IsRelease = isReleaseOrStronger(Ord);
21593
21594 // Since the intrinsics must have legal type, the i64 intrinsics take two
21595 // parameters: "i32, i32". We must marshal Val into the appropriate form
21596 // before the call.
21597 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
21598 Intrinsic::ID Int =
21599 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21600 Function *Strex = Intrinsic::getDeclaration(M, Int);
21601 Type *Int32Ty = Type::getInt32Ty(M->getContext());
21602
21603 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21604 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
21605 if (!Subtarget->isLittle())
21606 std::swap(Lo, Hi);
21607 return Builder.CreateCall(Strex, {Lo, Hi, Addr});
21608 }
21609
21610 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21611 Type *Tys[] = { Addr->getType() };
21612 Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
21613
21614 CallInst *CI = Builder.CreateCall(
21615 Strex, {Builder.CreateZExtOrBitCast(
21616 Val, Strex->getFunctionType()->getParamType(0)),
21617 Addr});
21618 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21619 Val->getType()));
21620 return CI;
21621}
21622
21623
21624bool ARMTargetLowering::alignLoopsWithOptSize() const {
21625 return Subtarget->isMClass();
21626}
21627
21628/// A helper function for determining the number of interleaved accesses we
21629/// will generate when lowering accesses of the given type.
21630unsigned
21631ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
21632 const DataLayout &DL) const {
21633 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21634}
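// Illustrative example: a <16 x i32> vector is 512 bits, so it is counted as
// (512 + 127) / 128 = 4 interleaved accesses; a 64-bit <8 x i8> vector rounds
// up to a single access.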
21635
21636bool ARMTargetLowering::isLegalInterleavedAccessType(
21637 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21638 const DataLayout &DL) const {
21639
21640 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21641 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21642
21643 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21644 return false;
21645
21646 // Ensure the vector doesn't have f16 elements. Even though we could do an
21647 // i16 vldN, we can't hold the f16 vectors and will end up converting via
21648 // f32.
21649 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21650 return false;
21651 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21652 return false;
21653
21654 // Ensure the number of vector elements is greater than 1.
21655 if (VecTy->getNumElements() < 2)
21656 return false;
21657
21658 // Ensure the element type is legal.
21659 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21660 return false;
21661 // And that the alignment is high enough under MVE.
21662 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21663 return false;
21664
21665 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21666 // 128 will be split into multiple interleaved accesses.
21667 if (Subtarget->hasNEON() && VecSize == 64)
21668 return true;
21669 return VecSize % 128 == 0;
21670}
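// Illustrative examples: with NEON, Factor = 2 on <8 x i16> (128 bits) is legal
// and maps to a single vld2/vst2, while <4 x half> is rejected by the f16
// restriction above; with MVE, Factor = 3 is never supported and an
// under-aligned <4 x i32> (alignment below 4 bytes) is rejected as well.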
21671
21672unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
21673 if (Subtarget->hasNEON())
21674 return 4;
21675 if (Subtarget->hasMVEIntegerOps())
21676 return MVEMaxSupportedInterleaveFactor;
21677 return TargetLoweringBase::getMaxSupportedInterleaveFactor();
21678}
21679
21680/// Lower an interleaved load into a vldN intrinsic.
21681///
21682/// E.g. Lower an interleaved load (Factor = 2):
21683/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21684/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21685/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21686///
21687/// Into:
21688/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21689/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
21690/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
21691bool ARMTargetLowering::lowerInterleavedLoad(
21692 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
21693 ArrayRef<unsigned> Indices, unsigned Factor) const {
21694 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21695 "Invalid interleave factor");
21696 assert(!Shuffles.empty() && "Empty shufflevector input");
21697 assert(Shuffles.size() == Indices.size() &&
21698 "Unmatched number of shufflevectors and indices");
21699
21700 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21701 Type *EltTy = VecTy->getElementType();
21702
21703 const DataLayout &DL = LI->getDataLayout();
21704 Align Alignment = LI->getAlign();
21705
21706 // Skip if we do not have NEON and skip illegal vector types. We can
21707 // "legalize" wide vector types into multiple interleaved accesses as long as
21708 // the vector types are divisible by 128.
21709 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21710 return false;
21711
21712 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21713
21714 // A pointer vector can not be the return type of the ldN intrinsics. Need to
21715 // load integer vectors first and then convert to pointer vectors.
21716 if (EltTy->isPointerTy())
21717 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21718
21719 IRBuilder<> Builder(LI);
21720
21721 // The base address of the load.
21722 Value *BaseAddr = LI->getPointerOperand();
21723
21724 if (NumLoads > 1) {
21725 // If we're going to generate more than one load, reset the sub-vector type
21726 // to something legal.
21727 VecTy = FixedVectorType::get(VecTy->getElementType(),
21728 VecTy->getNumElements() / NumLoads);
21729 }
21730
21731 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21732
21733 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21734 if (Subtarget->hasNEON()) {
21735 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21736 Type *Tys[] = {VecTy, PtrTy};
21737 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21738 Intrinsic::arm_neon_vld3,
21739 Intrinsic::arm_neon_vld4};
21740 Function *VldnFunc =
21741 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
21742
21743 SmallVector<Value *, 2> Ops;
21744 Ops.push_back(BaseAddr);
21745 Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21746
21747 return Builder.CreateCall(VldnFunc, Ops, "vldN");
21748 } else {
21749 assert((Factor == 2 || Factor == 4) &&
21750 "expected interleave factor of 2 or 4 for MVE");
21751 Intrinsic::ID LoadInts =
21752 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21753 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21754 Type *Tys[] = {VecTy, PtrTy};
21755 Function *VldnFunc =
21756 Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);
21757
21758 SmallVector<Value *, 2> Ops;
21759 Ops.push_back(BaseAddr);
21760 return Builder.CreateCall(VldnFunc, Ops, "vldN");
21761 }
21762 };
21763
21764 // Holds sub-vectors extracted from the load intrinsic return values. The
21765 // sub-vectors are associated with the shufflevector instructions they will
21766 // replace.
21767 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
21768
21769 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21770 // If we're generating more than one load, compute the base address of
21771 // subsequent loads as an offset from the previous.
21772 if (LoadCount > 0)
21773 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21774 VecTy->getNumElements() * Factor);
21775
21776 CallInst *VldN = createLoadIntrinsic(BaseAddr);
21777
21778 // Replace uses of each shufflevector with the corresponding vector loaded
21779 // by ldN.
21780 for (unsigned i = 0; i < Shuffles.size(); i++) {
21781 ShuffleVectorInst *SV = Shuffles[i];
21782 unsigned Index = Indices[i];
21783
21784 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21785
21786 // Convert the integer vector to pointer vector if the element is pointer.
21787 if (EltTy->isPointerTy())
21788 SubVec = Builder.CreateIntToPtr(
21789 SubVec,
21790 FixedVectorType::get(SV->getType()->getElementType(), VecTy));
21791
21792 SubVecs[SV].push_back(SubVec);
21793 }
21794 }
21795
21796 // Replace uses of the shufflevector instructions with the sub-vectors
21797 // returned by the load intrinsic. If a shufflevector instruction is
21798 // associated with more than one sub-vector, those sub-vectors will be
21799 // concatenated into a single wide vector.
21800 for (ShuffleVectorInst *SVI : Shuffles) {
21801 auto &SubVec = SubVecs[SVI];
21802 auto *WideVec =
21803 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21804 SVI->replaceAllUsesWith(WideVec);
21805 }
21806
21807 return true;
21808}
21809
21810/// Lower an interleaved store into a vstN intrinsic.
21811///
21812/// E.g. Lower an interleaved store (Factor = 3):
21813/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21814/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21815/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21816///
21817/// Into:
21818/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21819/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21820/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21821/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21822///
21823/// Note that the new shufflevectors will be removed and we'll only generate one
21824/// vst3 instruction in CodeGen.
21825///
21826/// Example for a more general valid mask (Factor 3). Lower:
21827/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21828/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21829/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21830///
21831/// Into:
21832/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21833/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21834/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21835/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21836bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
21837 ShuffleVectorInst *SVI,
21838 unsigned Factor) const {
21839 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21840 "Invalid interleave factor");
21841
21842 auto *VecTy = cast<FixedVectorType>(SVI->getType());
21843 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21844
21845 unsigned LaneLen = VecTy->getNumElements() / Factor;
21846 Type *EltTy = VecTy->getElementType();
21847 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
21848
21849 const DataLayout &DL = SI->getDataLayout();
21850 Align Alignment = SI->getAlign();
21851
21852 // Skip if we do not have NEON and skip illegal vector types. We can
21853 // "legalize" wide vector types into multiple interleaved accesses as long as
21854 // the vector types are divisible by 128.
21855 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
21856 return false;
21857
21858 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
21859
21860 Value *Op0 = SVI->getOperand(0);
21861 Value *Op1 = SVI->getOperand(1);
21862 IRBuilder<> Builder(SI);
21863
21864 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21865 // vectors to integer vectors.
21866 if (EltTy->isPointerTy()) {
21867 Type *IntTy = DL.getIntPtrType(EltTy);
21868
21869 // Convert to the corresponding integer vector.
21870 auto *IntVecTy =
21871 FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
21872 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
21873 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
21874
21875 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
21876 }
21877
21878 // The base address of the store.
21879 Value *BaseAddr = SI->getPointerOperand();
21880
21881 if (NumStores > 1) {
21882 // If we're going to generate more than one store, reset the lane length
21883 // and sub-vector type to something legal.
21884 LaneLen /= NumStores;
21885 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
21886 }
21887
21888 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
21889
21890 auto Mask = SVI->getShuffleMask();
21891
21892 auto createStoreIntrinsic = [&](Value *BaseAddr,
21893 SmallVectorImpl<Value *> &Shuffles) {
21894 if (Subtarget->hasNEON()) {
21895 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
21896 Intrinsic::arm_neon_vst3,
21897 Intrinsic::arm_neon_vst4};
21898 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21899 Type *Tys[] = {PtrTy, SubVecTy};
21900
21901 Function *VstNFunc = Intrinsic::getDeclaration(
21902 SI->getModule(), StoreInts[Factor - 2], Tys);
21903
21904 SmallVector<Value *, 6> Ops;
21905 Ops.push_back(BaseAddr);
21906 append_range(Ops, Shuffles);
21907 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
21908 Builder.CreateCall(VstNFunc, Ops);
21909 } else {
21910 assert((Factor == 2 || Factor == 4) &&
21911 "expected interleave factor of 2 or 4 for MVE");
21912 Intrinsic::ID StoreInts =
21913 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
21914 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21915 Type *Tys[] = {PtrTy, SubVecTy};
21916 Function *VstNFunc =
21917 Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);
21918
21919 SmallVector<Value *, 6> Ops;
21920 Ops.push_back(BaseAddr);
21921 append_range(Ops, Shuffles);
21922 for (unsigned F = 0; F < Factor; F++) {
21923 Ops.push_back(Builder.getInt32(F));
21924 Builder.CreateCall(VstNFunc, Ops);
21925 Ops.pop_back();
21926 }
21927 }
21928 };
21929
21930 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
21931 // If we're generating more than one store, we compute the base address of
21932 // subsequent stores as an offset from the previous.
21933 if (StoreCount > 0)
21934 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
21935 BaseAddr, LaneLen * Factor);
21936
21937 SmallVector<Value *, 4> Shuffles;
21938
21939 // Split the shufflevector operands into sub vectors for the new vstN call.
21940 for (unsigned i = 0; i < Factor; i++) {
21941 unsigned IdxI = StoreCount * LaneLen * Factor + i;
21942 if (Mask[IdxI] >= 0) {
21943 Shuffles.push_back(Builder.CreateShuffleVector(
21944 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
21945 } else {
21946 unsigned StartMask = 0;
21947 for (unsigned j = 1; j < LaneLen; j++) {
21948 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
21949 if (Mask[IdxJ * Factor + IdxI] >= 0) {
21950 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
21951 break;
21952 }
21953 }
21954 // Note: If all elements in a chunk are undefs, StartMask=0!
21955 // Note: Filling undef gaps with random elements is ok, since
21956 // those elements were being written anyway (with undefs).
21957 // In the case of all undefs we're defaulting to using elems from 0
21958 // Note: StartMask cannot be negative, it's checked in
21959 // isReInterleaveMask
21960 Shuffles.push_back(Builder.CreateShuffleVector(
21961 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
21962 }
21963 }
21964
21965 createStoreIntrinsic(BaseAddr, Shuffles);
21966 }
21967 return true;
21968}
21969
21970enum HABaseType {
21971 HA_UNKNOWN = 0,
21972 HA_FLOAT,
21973 HA_DOUBLE,
21974 HA_VECT64,
21975 HA_VECT128
21976};
21977
21978static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
21979 uint64_t &Members) {
21980 if (auto *ST = dyn_cast<StructType>(Ty)) {
21981 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
21982 uint64_t SubMembers = 0;
21983 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
21984 return false;
21985 Members += SubMembers;
21986 }
21987 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
21988 uint64_t SubMembers = 0;
21989 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
21990 return false;
21991 Members += SubMembers * AT->getNumElements();
21992 } else if (Ty->isFloatTy()) {
21993 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
21994 return false;
21995 Members = 1;
21996 Base = HA_FLOAT;
21997 } else if (Ty->isDoubleTy()) {
21998 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
21999 return false;
22000 Members = 1;
22001 Base = HA_DOUBLE;
22002 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
22003 Members = 1;
22004 switch (Base) {
22005 case HA_FLOAT:
22006 case HA_DOUBLE:
22007 return false;
22008 case HA_VECT64:
22009 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
22010 case HA_VECT128:
22011 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
22012 case HA_UNKNOWN:
22013 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
22014 case 64:
22015 Base = HA_VECT64;
22016 return true;
22017 case 128:
22018 Base = HA_VECT128;
22019 return true;
22020 default:
22021 return false;
22022 }
22023 }
22024 }
22025
22026 return (Members > 0 && Members <= 4);
22027}
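// Illustrative examples: struct { float x, y, z; } is a homogeneous aggregate
// with Base = HA_FLOAT and Members = 3; float[2][2] counts as four float
// members; struct { float f; double d; } mixes base types and is rejected, as
// is any aggregate with more than four members.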
22028
22029/// Return the correct alignment for the current calling convention.
22030Align ARMTargetLowering::getABIAlignmentForCallingConv(
22031 Type *ArgTy, const DataLayout &DL) const {
22032 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
22033 if (!ArgTy->isVectorTy())
22034 return ABITypeAlign;
22035
22036 // Avoid over-aligning vector parameters. It would require realigning the
22037 // stack and waste space for no real benefit.
22038 return std::min(ABITypeAlign, DL.getStackAlignment());
22039}
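// Illustrative example (assuming the usual AAPCS data layout with an 8-byte
// stack alignment): a <4 x i32> argument has a 16-byte ABI type alignment, but
// it is clamped to 8 bytes here so the caller never has to realign the stack
// just to pass it.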
22040
22041/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
22042/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
22043/// passing according to AAPCS rules.
22044bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
22045 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
22046 const DataLayout &DL) const {
22047 if (getEffectiveCallingConv(CallConv, isVarArg) !=
22048 CallingConv::ARM_AAPCS_VFP)
22049 return false;
22050 HABaseType Base = HA_UNKNOWN;
22051
22052 uint64_t Members = 0;
22053 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
22054 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
22055
22056 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
22057 return IsHA || IsIntArray;
22058}
22059
22060Register ARMTargetLowering::getExceptionPointerRegister(
22061 const Constant *PersonalityFn) const {
22062 // Platforms which do not use SjLj EH may return values in these registers
22063 // via the personality function.
22064 return Subtarget->useSjLjEH() ? Register() : ARM::R0;
22065}
22066
22067Register ARMTargetLowering::getExceptionSelectorRegister(
22068 const Constant *PersonalityFn) const {
22069 // Platforms which do not use SjLj EH may return values in these registers
22070 // via the personality function.
22071 return Subtarget->useSjLjEH() ? Register() : ARM::R1;
22072}
22073
22074void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
22075 // Update IsSplitCSR in ARMFunctionInfo.
22076 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
22077 AFI->setIsSplitCSR(true);
22078}
22079
22080void ARMTargetLowering::insertCopiesSplitCSR(
22081 MachineBasicBlock *Entry,
22082 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
22083 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
22084 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
22085 if (!IStart)
22086 return;
22087
22088 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
22089 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
22090 MachineBasicBlock::iterator MBBI = Entry->begin();
22091 for (const MCPhysReg *I = IStart; *I; ++I) {
22092 const TargetRegisterClass *RC = nullptr;
22093 if (ARM::GPRRegClass.contains(*I))
22094 RC = &ARM::GPRRegClass;
22095 else if (ARM::DPRRegClass.contains(*I))
22096 RC = &ARM::DPRRegClass;
22097 else
22098 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
22099
22100 Register NewVR = MRI->createVirtualRegister(RC);
22101 // Create copy from CSR to a virtual register.
22102 // FIXME: this currently does not emit CFI pseudo-instructions, it works
22103 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
22104 // nounwind. If we want to generalize this later, we may need to emit
22105 // CFI pseudo-instructions.
22106 assert(Entry->getParent()->getFunction().hasFnAttribute(
22107 Attribute::NoUnwind) &&
22108 "Function should be nounwind in insertCopiesSplitCSR!");
22109 Entry->addLiveIn(*I);
22110 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
22111 .addReg(*I);
22112
22113 // Insert the copy-back instructions right before the terminator.
22114 for (auto *Exit : Exits)
22115 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
22116 TII->get(TargetOpcode::COPY), *I)
22117 .addReg(NewVR);
22118 }
22119}
22120
22121void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
22122 MF.getFrameInfo().computeMaxCallFrameSize(MF);
22123 TargetLoweringBase::finalizeLowering(MF);
22124}
22125
22126bool ARMTargetLowering::isComplexDeinterleavingSupported() const {
22127 return Subtarget->hasMVEIntegerOps();
22128}
22129
22130bool ARMTargetLowering::isComplexDeinterleavingOperationSupported(
22131 ComplexDeinterleavingOperation Operation, Type *Ty) const {
22132 auto *VTy = dyn_cast<FixedVectorType>(Ty);
22133 if (!VTy)
22134 return false;
22135
22136 auto *ScalarTy = VTy->getScalarType();
22137 unsigned NumElements = VTy->getNumElements();
22138
22139 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
22140 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
22141 return false;
22142
22143 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
22144 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
22145 return Subtarget->hasMVEFloatOps();
22146
22147 if (Operation != ComplexDeinterleavingOperation::CAdd)
22148 return false;
22149
22150 return Subtarget->hasMVEIntegerOps() &&
22151 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
22152 ScalarTy->isIntegerTy(32));
22153}
22154
22155Value *ARMTargetLowering::createComplexDeinterleavingIR(
22156 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
22157 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
22158 Value *Accumulator) const {
22159
22160 FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());
22161
22162 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
22163
22164 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
22165
22166 if (TyWidth > 128) {
22167 int Stride = Ty->getNumElements() / 2;
22168 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
22169 auto SplitSeqVec = llvm::to_vector(SplitSeq);
22170 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
22171 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
22172
22173 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
22174 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
22175 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
22176 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
22177 Value *LowerSplitAcc = nullptr;
22178 Value *UpperSplitAcc = nullptr;
22179
22180 if (Accumulator) {
22181 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
22182 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
22183 }
22184
22185 auto *LowerSplitInt = createComplexDeinterleavingIR(
22186 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
22187 auto *UpperSplitInt = createComplexDeinterleavingIR(
22188 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
22189
22190 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
22191 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
22192 }
22193
22194 auto *IntTy = Type::getInt32Ty(B.getContext());
22195
22196 ConstantInt *ConstRotation = nullptr;
22197 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
22198 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
22199
22200 if (Accumulator)
22201 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
22202 {ConstRotation, Accumulator, InputB, InputA});
22203 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
22204 {ConstRotation, InputB, InputA});
22205 }
22206
22207 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
22208 // 1 means the value is not halved.
22209 auto *ConstHalving = ConstantInt::get(IntTy, 1);
22210
22211 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
22212 ConstRotation = ConstantInt::get(IntTy, 0);
22213 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
22214 ConstRotation = ConstantInt::get(IntTy, 1);
22215
22216 if (!ConstRotation)
22217 return nullptr; // Invalid rotation for arm_mve_vcaddq
22218
22219 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
22220 {ConstHalving, ConstRotation, InputA, InputB});
22221 }
22222
22223 return nullptr;
22224}
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG)
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment store operation with given size.
static bool isVMOVNMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, NEON load/store intrinsics,...
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMOVRRDCombine - Target-specific dag combine xforms for ARMISD::VMOVRRD.
static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain)
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMULCombine Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the special multi...
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformORCombine - Target-specific dag combine xforms for ISD::OR.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG)
static unsigned SelectPairHalf(unsigned Elements, ArrayRef< int > Mask, unsigned Index)
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment load operation with given size.
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG)
static bool isValidBaseUpdate(SDNode *N, SDNode *User)
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl)
static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op)
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
std::pair< unsigned, const TargetRegisterClass * > RCPair
static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes=false)
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,...
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type)
isVMOVModifiedImm - Check if the specified splat value corresponds to a valid vector constant for a N...
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG)
BC is a bitcast that is about to be turned into a VMOVDRR.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl)
static unsigned isNEONTwoResultShuffleMask(ArrayRef< int > ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF)
Check if ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), and return the corresponding AR...
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B)
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, struct BaseUpdateUser &User, bool SimpleConstIncOnly, TargetLowering::DAGCombinerInfo &DCI)
static bool allUsersAreInFunction(const Value *V, const Function *F)
Return true if all users of V are within function F, looking through ConstantExprs.
static bool isSingletonVEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG)
PerformVMOVDRRCombine - Target-specific dag combine xforms for ARMISD::VMOVDRR.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK)
static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
isLegalAddressImmediate - Return true if the integer value can be used as the offset of the target ad...
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isLegalT1AddressImmediate(int64_t V, EVT VT)
static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDECombine - Target-specific dag combine transform from ARMISD::ADDC, ARMISD::ADDE,...
static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
This file defines a TargetTransformInfo::Concept conforming object specific to the ARM target machine.
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static std::optional< bool > isBigEndian(const SmallDenseMap< int64_t, int64_t, 8 > &MemOffset2Idx, int64_t LowestIdx)
Given a map from byte offsets in memory to indices in a load/store, determine if that map corresponds...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
uint64_t Addr
std::string Name
uint64_t Size
static bool isSigned(unsigned int Opcode)
#define Check(C,...)
#define op(i)
#define im(i)
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
#define RegName(no)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
unsigned const TargetRegisterInfo * TRI
Module.h This file contains the declarations for the Module class.
uint64_t High
uint64_t IntrinsicInst * II
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
This file describes how to lower LLVM code to machine code.
static constexpr int Concat[]
Value * RHS
Value * LHS
bool getExactInverse(APFloat *inv) const
Definition: APFloat.h:1387
APInt bitcastToAPInt() const
Definition: APFloat.h:1254
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition: APFloat.h:1229
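For context, a minimal sketch (not code from this file; the helper name is hypothetical) of how the APFloat queries above are typically used:
#include "llvm/ADT/APFloat.h"
static bool isExactlyInvertible(const llvm::APFloat &F) {
  llvm::APFloat Inv(F.getSemantics());
  // getExactInverse fills Inv and returns true only when 1/F is exactly
  // representable in the same floating-point semantics.
  return F.getExactInverse(&Inv);
}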
Class for arbitrary precision integers.
Definition: APInt.h:77
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:213
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1499
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1628
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1471
APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:906
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1309
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1180
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:350
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1447
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1090
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1597
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition: APInt.h:1556
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:620
unsigned logBase2() const
Definition: APInt.h:1718
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition: APInt.h:454
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1236
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:419
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:285
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:275
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:218
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1521
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:837
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:830
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1614
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1200
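A brief, self-contained sketch (illustrative only, not taken from ARMISelLowering.cpp) exercising several of the APInt helpers listed above:
#include "llvm/ADT/APInt.h"
void apintBitTricks() {
  using llvm::APInt;
  APInt Low = APInt::getLowBitsSet(32, 8);            // 0x000000FF
  APInt High = APInt::getHighBitsSet(32, 8);          // 0xFF000000
  APInt Splat = APInt::getSplat(32, APInt(8, 0x5A));  // 0x5A5A5A5A
  unsigned Ones = Low.popcount();                     // 8 bits set
  unsigned TrailingZeros = High.countr_zero();        // 24
  bool Pow2 = APInt(32, 64).isPowerOf2();             // true; logBase2() == 6
  (void)Ones; (void)TrailingZeros; (void)Pow2; (void)Splat;
}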
An arbitrary precision integer that knows its signedness.
Definition: APSInt.h:23
virtual const ARMBaseRegisterInfo & getRegisterInfo() const =0
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setPromotedConstpoolIncrease(int Sz)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
void setVarArgsFrameIndex(int Index)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
bool isTargetMachO() const
Definition: ARMSubtarget.h:312
bool useMovt() const
bool isTargetAEABI() const
Definition: ARMSubtarget.h:321
bool hasARMOps() const
Definition: ARMSubtarget.h:265
bool supportsTailCall() const
Definition: ARMSubtarget.h:399
const Triple & getTargetTriple() const
Definition: ARMSubtarget.h:298
bool hasVFP4Base() const
Definition: ARMSubtarget.h:273
const ARMBaseInstrInfo * getInstrInfo() const override
Definition: ARMSubtarget.h:196
bool isThumb1Only() const
Definition: ARMSubtarget.h:364
bool useFPVFMx() const
Definition: ARMSubtarget.h:282
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:274
bool isThumb2() const
Definition: ARMSubtarget.h:365
bool isTargetWindows() const
Definition: ARMSubtarget.h:308
bool isGVIndirectSymbol(const GlobalValue *GV) const
True if the GV will be accessed via an indirect symbol.
bool hasBaseDSP() const
Definition: ARMSubtarget.h:288
const ARMTargetLowering * getTargetLowering() const override
Definition: ARMSubtarget.h:200
bool useSjLjEH() const
Definition: ARMSubtarget.h:287
bool isTargetDarwin() const
Definition: ARMSubtarget.h:300
const ARMBaseRegisterInfo * getRegisterInfo() const override
Definition: ARMSubtarget.h:208
bool hasVFP2Base() const
Definition: ARMSubtarget.h:271
bool isTargetAndroid() const
Definition: ARMSubtarget.h:350
bool isROPI() const
bool isTargetCOFF() const
Definition: ARMSubtarget.h:310
bool isTargetGNUAEABI() const
Definition: ARMSubtarget.h:326
bool hasVFP3Base() const
Definition: ARMSubtarget.h:272
bool isAPCS_ABI() const
bool useFPVFMx64() const
Definition: ARMSubtarget.h:286
bool isTargetWatchOS() const
Definition: ARMSubtarget.h:302
bool hasMinSize() const
Definition: ARMSubtarget.h:363
bool isTargetIOS() const
Definition: ARMSubtarget.h:301
bool useNEONForSinglePrecisionFP() const
Definition: ARMSubtarget.h:267
const InstrItineraryData * getInstrItineraryData() const override
getInstrItins - Return the instruction itineraries based on subtarget selection.
Definition: ARMSubtarget.h:433
bool isTargetWatchABI() const
Definition: ARMSubtarget.h:303
bool hasAnyDataBarrier() const
Definition: ARMSubtarget.h:276
bool isTargetDriverKit() const
Definition: ARMSubtarget.h:304
bool isAAPCS_ABI() const
bool isRWPI() const
bool isLittle() const
Definition: ARMSubtarget.h:407
bool allowsUnalignedMem() const
Definition: ARMSubtarget.h:401
bool isTargetMuslAEABI() const
Definition: ARMSubtarget.h:331
bool isTargetLinux() const
Definition: ARMSubtarget.h:305
bool useFPVFMx16() const
Definition: ARMSubtarget.h:285
bool isMClass() const
Definition: ARMSubtarget.h:366
unsigned getPrefLoopLogAlignment() const
Definition: ARMSubtarget.h:486
bool isTargetHardFloat() const
bool useMulOps() const
Definition: ARMSubtarget.h:280
bool isTargetELF() const
Definition: ARMSubtarget.h:311
Align getDualLoadStoreAlignment() const
Definition: ARMSubtarget.h:443
bool isReadOnly(const GlobalValue *GV) const
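These predicates are how the lowering code specializes per core. A hypothetical example (the helper name and the particular combination of checks are assumptions, not code from this file):
#include "ARMSubtarget.h"
static bool looksLikeMVETarget(const llvm::ARMSubtarget &ST) {
  // Purely illustrative combination of the feature queries listed above;
  // the real checks in ARMISelLowering.cpp are considerably more detailed.
  return ST.isMClass() && ST.isThumb2() && ST.isLittle();
}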
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode represented by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: 'sub y, (xor x, -1)' and 'add (add x, 1), y'. The variant with two add's is IR...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a vstN intrinsic.
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy, Idx).
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a vldN intrinsic.
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
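To illustrate the shape of these hooks, here is a schematic of a LowerOperation-style dispatch (a simplified, assumed sketch; it is not the switch that actually lives in this file):
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/ErrorHandling.h"
static llvm::SDValue sketchLowerOperation(llvm::SDValue Op, llvm::SelectionDAG &DAG) {
  switch (Op.getOpcode()) {
  case llvm::ISD::CTTZ:
  case llvm::ISD::CTTZ_ZERO_UNDEF:
    // A real target would forward to a helper such as the LowerCTTZ
    // entry listed earlier on this page.
    return llvm::SDValue();
  default:
    llvm_unreachable("unexpected opcode marked as Custom");
  }
}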
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:494
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:695
bool isFloatingPointOperation() const
Definition: Instructions.h:863
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:94
static BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:209
The address of a basic block.
Definition: Constants.h:890
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
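A hedged example of the usual isConstantSplat call pattern (hypothetical helper; the names are assumptions):
#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/Support/Casting.h"
static bool getSplatBits(llvm::SDValue Op, llvm::APInt &SplatBits) {
  auto *BVN = llvm::dyn_cast<llvm::BuildVectorSDNode>(Op.getNode());
  if (!BVN)
    return false;
  llvm::APInt SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  // isConstantSplat reports the smallest element size that splats the vector.
  return BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs) &&
         SplatBitSize <= 64;
}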
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
void rewindByValRegsInfo()
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void addInRegsParamInfo(unsigned RegBegin, unsigned RegEnd)
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
unsigned getValNo() const
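CCState and CCValAssign are normally driven by a loop of the following shape (a sketch assuming CC_ARM_AAPCS from ARMCallingConv.h as the assignment function; the helper name is hypothetical):
#include "ARMCallingConv.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/LLVMContext.h"
static void sketchAnalyzeFormals(llvm::CallingConv::ID CC, bool IsVarArg,
                                 llvm::MachineFunction &MF, llvm::LLVMContext &Ctx,
                                 const llvm::SmallVectorImpl<llvm::ISD::InputArg> &Ins) {
  llvm::SmallVector<llvm::CCValAssign, 16> ArgLocs;
  llvm::CCState CCInfo(CC, IsVarArg, MF, ArgLocs, Ctx);
  CCInfo.AnalyzeFormalArguments(Ins, llvm::CC_ARM_AAPCS); // assignment fn assumed
  for (const llvm::CCValAssign &VA : ArgLocs) {
    if (VA.isRegLoc())
      (void)VA.getLocReg();        // this formal arrives in a register
    else if (VA.isMemLoc())
      (void)VA.getLocMemOffset();  // this formal arrives on the stack
  }
}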
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getCalledOperand() const
Definition: InstrTypes.h:1458
AttributeList getAttributes() const
Return the parameter attributes for this call.
Definition: InstrTypes.h:1542
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1594
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition: Constants.h:706
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:269
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition: Constant.h:41
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:238
bool isBigEndian() const
Definition: DataLayout.h:239
Align getStackAlignment() const
Definition: DataLayout.h:271
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
StringRef getPrivateGlobalPrefix() const
Definition: DataLayout.h:332
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
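A small, assumed helper showing the DataLayout size and alignment queries above in use (not from this file):
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
static uint64_t arrayBytes(const llvm::DataLayout &DL, llvm::Type *ElemTy,
                           unsigned NumElems) {
  // getTypeAllocSize already accounts for padding between successive
  // elements, so the total allocation is simply size * count.
  uint64_t ElemSize = DL.getTypeAllocSize(ElemTy).getFixedValue();
  llvm::Align Pref = DL.getPrefTypeAlign(ElemTy); // preferred alignment, per above
  (void)Pref;
  return ElemSize * NumElems;
}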
A debug info location.
Definition: DebugLoc.h:33
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Diagnostic information for unsupported feature in backend.
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:207
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:274
arg_iterator arg_begin()
Definition: Function.h:831
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition: Function.h:679
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition: Function.h:225
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:690
const GlobalValue * getGlobal() const
bool isDSOLocal() const
Definition: GlobalValue.h:305
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:529
bool hasDLLImportStorageClass() const
Definition: GlobalValue.h:278
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
Definition: GlobalValue.h:631
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
TargetInstrInfo overrides.
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:92
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2135
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1877
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2514
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2120
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1435
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:172
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:484
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1414
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2019
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2492
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2115
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2005
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1495
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:567
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2410
Value * CreateTruncOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2151
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2664
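To make the IRBuilder calls above concrete, a hypothetical helper that packs two i16 values into an i32 (an assumed example, not code from this file):
#include "llvm/IR/IRBuilder.h"
static llvm::Value *packHalves(llvm::IRBuilderBase &B, llvm::Value *Lo,
                               llvm::Value *Hi) {
  llvm::Type *I32 = B.getInt32Ty();
  llvm::Value *LoExt = B.CreateZExt(Lo, I32);            // widen the low half
  llvm::Value *HiExt = B.CreateZExt(Hi, I32);            // widen the high half
  llvm::Value *HiShifted = B.CreateShl(HiExt, B.getInt32(16));
  return B.CreateOr(LoExt, HiShifted);                   // combine into one i32
}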
std::optional< unsigned > getOperandCycle(unsigned ItinClassIndx, unsigned OperandIdx) const
Return the cycle for the given class and operand.
bool isEmpty() const
Returns true if there are no itineraries.
bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:66
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:74
Class to represent integer types.
Definition: DerivedTypes.h:40
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:173
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:258
Value * getPointerOperand()
Definition: Instructions.h:252
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:208
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:600
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:248
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
Definition: MCInstrDesc.h:219
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:230
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
void moveAfter(MachineBasicBlock *NewBefore)
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
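These are the MachineBasicBlock operations a custom inserter leans on when it splits control flow. The following is a schematic of that surgery (a hypothetical helper; real versions also emit branch instructions and fix up register liveness):
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
static llvm::MachineBasicBlock *splitAfter(llvm::MachineInstr &MI,
                                           llvm::MachineBasicBlock *BB) {
  llvm::MachineFunction *MF = BB->getParent();
  llvm::MachineBasicBlock *ContMBB =
      MF->CreateMachineBasicBlock(BB->getBasicBlock());
  MF->insert(std::next(BB->getIterator()), ContMBB);
  // Move everything after MI into the continuation block, then hand BB's
  // successors over to it and make it BB's (only) successor.
  ContMBB->splice(ContMBB->begin(), BB,
                  std::next(llvm::MachineBasicBlock::iterator(MI)), BB->end());
  ContMBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(ContMBB);
  return ContMBB;
}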
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
Properties which a MachineFunction may have at a given point in time.
MachineFunctionProperties & reset(Property P)
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
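A short, hedged sketch of how these MachineInstrBuilder helpers are chained when a backend emits a single machine instruction (the opcode, registers, and helper name are placeholders, not this file's code):

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

static void emitBinaryLikeInst(llvm::MachineBasicBlock &MBB,
                               llvm::MachineBasicBlock::iterator InsertPt,
                               const llvm::DebugLoc &DL,
                               const llvm::TargetInstrInfo &TII,
                               unsigned Opcode, // placeholder opcode
                               llvm::Register Dst, llvm::Register Src,
                               int64_t Imm) {
  // BuildMI creates the instruction; each add*() call appends one operand:
  // a register def, a register use, and an immediate.
  llvm::BuildMI(MBB, InsertPt, DL, TII.get(Opcode))
      .addDef(Dst)
      .addUse(Src)
      .addImm(Imm);
}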
MachineBasicBlock iterator that automatically skips over MIs that are inside bundles (i....
Representation of each machine instruction.
Definition: MachineInstr.h:69
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
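As the note above says, these Flags values form a bitmask and may be or'd together before being handed to MachineFunction::getMachineMemOperand (shown earlier in this listing). A tiny, assumed example:

#include "llvm/CodeGen/MachineMemOperand.h"

// Flags for an invariant, non-temporal load.
static llvm::MachineMemOperand::Flags invariantLoadFlags() {
  return llvm::MachineMemOperand::MOLoad |
         llvm::MachineMemOperand::MOInvariant |
         llvm::MachineMemOperand::MONonTemporal;
}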
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
Definition: Pass.cpp:130
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if this node is an UNDEF node.
void setFlags(SDNodeFlags NewFlags)
static use_iterator use_end()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:736
SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:488
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:492
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:746
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:842
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:486
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:673
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:487
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:787
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:690
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:481
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:813
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:499
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:753
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:568
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
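A hedged sketch of the SelectionDAG builder calls listed above, combining getNOT, getConstant, and getNode to form (and (xor X, -1), Mask); the helper name and mask are illustrative, not code from this file:

#include "llvm/CodeGen/SelectionDAG.h"

static llvm::SDValue buildMaskedNot(llvm::SelectionDAG &DAG,
                                    const llvm::SDLoc &DL, llvm::EVT VT,
                                    llvm::SDValue X, uint64_t MaskVal) {
  using namespace llvm;
  // getNOT expands to (xor X, all-ones) itself; getNode then builds the AND.
  SDValue NotX = DAG.getNOT(DL, X, VT);
  SDValue Mask = DAG.getConstant(MaskVal, DL, VT);
  return DAG.getNode(ISD::AND, DL, VT, NotX, Mask);
}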
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:289
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
const unsigned char * bytes_end() const
Definition: StringRef.h:118
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
const unsigned char * bytes_begin() const
Definition: StringRef.h:115
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
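A small, assumed example of the StringSwitch idiom above, in the shape it usually takes when classifying inline-asm constraint strings (the cases and enum are made up for illustration):

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

enum class ConstraintKind { Register, Memory, Unknown };

static ConstraintKind classifyConstraint(llvm::StringRef Constraint) {
  // Each Case is tried in order; Default supplies the fallback value.
  return llvm::StringSwitch<ConstraintKind>(Constraint)
      .Case("r", ConstraintKind::Register)
      .Case("w", ConstraintKind::Register)
      .Case("m", ConstraintKind::Memory)
      .Default(ConstraintKind::Unknown);
}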
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC)
Override the default CondCode to be used to test the result of the comparison libcall against zero.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC)
Set the CallingConv that should be used for the specified libcall.
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
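The TargetLoweringBase setters above are normally called from a target's TargetLowering constructor. A schematic, assumed fragment (a hypothetical ExampleTargetLowering, not the ARM backend's real configuration) showing how they fit together:

#include "llvm/CodeGen/TargetLowering.h"

namespace {
// Hypothetical subclass used only to illustrate the protected setters.
class ExampleTargetLowering : public llvm::TargetLowering {
public:
  ExampleTargetLowering(const llvm::TargetMachine &TM,
                        const llvm::TargetRegisterClass *GPRClass,
                        const llvm::TargetRegisterInfo *TRI)
      : llvm::TargetLowering(TM) {
    using namespace llvm;
    // Make i32 a legal type living in GPRClass, then derive register props.
    addRegisterClass(MVT::i32, GPRClass);
    computeRegisterProperties(TRI);

    // Expand 64-bit signed division; custom-lower the 32-bit form.
    setOperationAction(ISD::SDIV, MVT::i64, Expand);
    setOperationAction(ISD::SDIV, MVT::i32, Custom);

    // Booleans are represented as 0/1 in scalar registers.
    setBooleanContents(ZeroOrOneBooleanContent);
    setMinFunctionAlignment(Align(4));
  }
};
} // namespace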
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const
bool isConstTrueVal(SDValue N) const
Return if the N is a constant or constant vector equal to the true value from getBooleanContents().
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition: Triple.h:399
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:667
bool isOSVersionLT(unsigned Major, unsigned Minor=0, unsigned Micro=0) const
Helper function for doing comparisons against version numbers included in the target triple.
Definition: Triple.h:500
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:634
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:252
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
Type * getArrayElementType() const
Definition: Type.h:404
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
void dump() const
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
static IntegerType * getInt16Ty(LLVMContext &C)
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:216
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:182
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
Type * getElementType() const
Definition: DerivedTypes.h:436
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition: ARMBaseInfo.h:48
@ SECREL
Section Relative (Windows TLS).
@ SBREL
Static Base Relative.
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
Definition: ARMBaseInfo.h:242
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
Definition: ARMBaseInfo.h:288
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
Definition: ARMBaseInfo.h:270
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
Definition: ARMBaseInfo.h:275
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
Definition: ARMBaseInfo.h:266
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
Definition: ARMBaseInfo.h:263
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into a shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting an 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
const unsigned FPStatusBits
const unsigned FPReservedBits
const unsigned RoundingBitsPos
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:811
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
Definition: CallingConv.h:107
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition: CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
Definition: CallingConv.h:111
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
Definition: CallingConv.h:114
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:764
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1147
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1143
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:737
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:484
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1019
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1391
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition: ISDOpcodes.h:153
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition: ISDOpcodes.h:505
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1290
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:567
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:728
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1176
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1292
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1262
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1293
@ RESET_FPENV
Set floating-point environment to default state.
Definition: ISDOpcodes.h:1023
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1052
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1042
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:797
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:491
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition: ISDOpcodes.h:157
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:804
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:551
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1376
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:702
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1254
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
Definition: ISDOpcodes.h:1046
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1390
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:485
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:927
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1288
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:917
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1289
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:954
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1431
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ FrameIndex
Definition: ISDOpcodes.h:80
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:899
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:788
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:670
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:628
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1068
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1373
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:736
@ WRITE_REGISTER
Definition: ISDOpcodes.h:125
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1242
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1377
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1009
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:772
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:944
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1098
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1291
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1077
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:741
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1258
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1392
@ RegisterMask
Definition: ISDOpcodes.h:75
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1172
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1385
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:894
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:659
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1037
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1014
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:719
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:608
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1286
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:581
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:124
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:543
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:794
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1232
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:870
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:756
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1269
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1294
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:986
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1062
@ ConstantPool
Definition: ISDOpcodes.h:82
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:812
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:682
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:902
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:750
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1393
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1284
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:451
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:473
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:450
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1005
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1285
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:850
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1203
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:478
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:694
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1229
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:665
@ VECREDUCE_FMUL
Definition: ISDOpcodes.h:1374
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:532
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1283
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:959
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:883
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:112
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:869
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition: ISDOpcodes.h:147
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:800
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1167
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1091
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:777
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:501
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:523
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer operands.
Definition: ISDOpcodes.h:1587
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1503
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1554
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1534
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1505
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
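The ISD load/store predicates above are what DAG combines use to tell plain loads from extending or indexed ones. A minimal sketch, not taken from this file (the helper name is hypothetical):

#include "llvm/CodeGen/SelectionDAGNodes.h"

// Returns true when N is a load that extends its result (SEXTLOAD, ZEXTLOAD
// or EXTLOAD) rather than a plain, unindexed, non-extending load.
static bool isExtendingLoad(const llvm::SDNode *N) {
  if (llvm::ISD::isNormalLoad(N))
    return false;
  return llvm::ISD::isSEXTLoad(N) || llvm::ISD::isZEXTLoad(N) ||
         llvm::ISD::isEXTLoad(N);
}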
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1484
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
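The PatternMatch helpers above operate on IR values before any lowering happens. A minimal sketch using only the matchers listed here (the function name is hypothetical):

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

// Recognise "insertelement undef, (zext|sext X), 0" and hand back X.
static bool matchExtInsertedAtLaneZero(llvm::Value *V, llvm::Value *&Scalar) {
  using namespace llvm::PatternMatch;
  return match(V, m_InsertElt(m_Undef(), m_ZExtOrSExt(m_Value(Scalar)),
                              m_ZeroInt()));
}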
Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
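The RTLIB getters above map a (source, destination) type pair to the runtime-library routine a target must call when no native instruction exists. A minimal sketch of the usual lookup; the helper name is hypothetical, and the include path is an assumption since it differs between LLVM trees (llvm/CodeGen/RuntimeLibcalls.h in older ones, llvm/IR/RuntimeLibcalls.h in newer ones):

#include <cassert>
#include "llvm/CodeGen/RuntimeLibcalls.h" // or llvm/IR/RuntimeLibcalls.h
#include "llvm/CodeGen/ValueTypes.h"

// Pick the libcall for an FP-to-integer conversion of the given types.
static llvm::RTLIB::Libcall getFPToIntLibcall(llvm::EVT OpVT, llvm::EVT RetVT,
                                              bool IsSigned) {
  llvm::RTLIB::Libcall LC = IsSigned ? llvm::RTLIB::getFPTOSINT(OpVT, RetVT)
                                     : llvm::RTLIB::getFPTOUINT(OpVT, RetVT);
  assert(LC != llvm::RTLIB::UNKNOWN_LIBCALL && "unsupported FP conversion");
  return LC;
}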
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition: LLVMContext.h:54
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double e
Definition: MathExtras.h:31
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
@ Length
Definition: DWP.cpp:480
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:239
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2400
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition: bit.h:307
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit with the remainder zero (32 bit version).
Definition: MathExtras.h:251
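The bit-math helpers above back most of the immediate-encoding checks in this file. A short, self-contained illustration (the values are chosen only for the example):

#include <cassert>
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"

static void bitMathExamples() {
  assert(llvm::isMask_32(0x00FFu));       // eight contiguous ones from bit 0
  assert(!llvm::isMask_32(0x00F1u));      // a hole breaks the mask
  assert(llvm::countr_one(0x00FFu) == 8); // trailing-one count
  assert(llvm::isUIntN(8, 255) && !llvm::isUIntN(8, 256)); // fits in 8 bits?
}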
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2067
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition: STLExtras.h:1541
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit version).
Definition: MathExtras.h:263
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
bool isReleaseOrStronger(AtomicOrdering AO)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition: bit.h:281
bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vector type.
Definition: SmallVector.h:1312
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ MVEVMVNModImm
AtomicOrdering
Atomic ordering for LLVM's memory model.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
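A CCAssignFn such as the ARM calling-convention functions listed above is driven through a CCState. A minimal sketch of the usual formal-argument analysis (the wrapper function is hypothetical; AAPCS is chosen only as an example convention):

#include "ARMCallingConv.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"

// Assign a location (register or stack slot) to every incoming formal
// argument under the AAPCS rules.
static void analyzeFormals(llvm::MachineFunction &MF,
                           llvm::CallingConv::ID CallConv, bool IsVarArg,
                           const llvm::SmallVectorImpl<llvm::ISD::InputArg> &Ins,
                           llvm::LLVMContext &Ctx,
                           llvm::SmallVectorImpl<llvm::CCValAssign> &ArgLocs) {
  llvm::CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, Ctx);
  CCInfo.AnalyzeFormalArguments(Ins, llvm::CC_ARM_AAPCS);
}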
CombineLevel
Definition: DAGCombine.h:15
@ BeforeLegalizeTypes
Definition: DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register, or 3 if a literal pool load is needed.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:244
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition: STLExtras.h:1921
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
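predOps and condCodeOp above supply the trailing predicate and optional CPSR-def operands that ARM's predicable instructions expect. A minimal sketch of the common BuildMI idiom; the opcode, registers and insertion point are illustrative placeholders, not code from this file:

#include "ARMBaseInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

static void emitPlainMov(llvm::MachineBasicBlock &MBB,
                         llvm::MachineBasicBlock::iterator InsertPt,
                         const llvm::DebugLoc &DL,
                         const llvm::TargetInstrInfo &TII,
                         llvm::Register Dst, llvm::Register Src) {
  llvm::BuildMI(MBB, InsertPt, DL, TII.get(llvm::ARM::MOVr), Dst)
      .addReg(Src)
      .add(llvm::predOps(llvm::ARMCC::AL)) // always-executed predicate
      .add(llvm::condCodeOp());            // no 'S' bit, so no CPSR def
}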
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize (the order of the elements within each block of the vector is reversed).
unsigned gettBLXrOpcode(const MachineFunction &MF)
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
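createSequentialMask above builds masks of the form Start, Start+1, ... followed by undef lanes. A tiny, self-contained illustration (the wrapper is hypothetical):

#include "llvm/Analysis/VectorUtils.h"

// Builds the mask {0, 1, 2, 3, 4, 5, 6, 7} with no trailing undef lanes.
static llvm::SmallVector<int, 16> identityMask8() {
  return llvm::createSequentialMask(/*Start=*/0, /*NumInts=*/8, /*NumUndefs=*/0);
}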
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
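The SDValue constant predicates above (isNullConstant, isOneConstant, isAllOnesConstant, isConstOrConstSplat) are the standard way DAG combines test operands against simple immediates. A minimal sketch, assuming the declarations come from llvm/CodeGen/SelectionDAGNodes.h (the helper itself is hypothetical):

#include "llvm/CodeGen/SelectionDAGNodes.h"

// True when V is the scalar constant 0, 1 or all-ones.
static bool isTrivialImmediate(llvm::SDValue V) {
  return llvm::isNullConstant(V) || llvm::isOneConstant(V) ||
         llvm::isAllOnesConstant(V);
}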
static const unsigned PerfectShuffleTable[6561+1]
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analysis infrastructure.
Definition: Metadata.h:760
static constexpr roundingMode rmTowardZero
Definition: APFloat.h:250
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:274
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:340
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:274
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:203
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type that is chosen by the caller.
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:298
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
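The EVT interface summarised above is how the lowering code reasons about value types without committing to a concrete MVT. A minimal sketch (hypothetical helper) that derives a related type the way several combines do:

#include <cassert>
#include "llvm/CodeGen/ValueTypes.h"

// For a 128-bit vector such as v4f32, return the integer vector with the same
// lane width but half the lane count, e.g. v4f32 -> v4i32 -> v2i32.
static llvm::EVT getHalvedIntegerVector(llvm::LLVMContext &Ctx, llvm::EVT VT) {
  assert(VT.isVector() && VT.is128BitVector() && "expected a 128-bit vector");
  llvm::EVT IntVT = VT.changeVectorElementTypeToInteger();
  return IntVT.getHalfNumVectorElementsVT(Ctx);
}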
InputArg - This struct carries flags and type information about a single incoming (formal) argument or incoming (from the perspective of the caller) return value virtual register.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:290
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:62
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:161
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:70
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:300
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition: KnownBits.h:169
static KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition: KnownBits.cpp:51
static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:797
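KnownBits, summarised above, carries the per-bit Zero/One knowledge that computeKnownBits-style hooks exchange. A minimal, self-contained sketch of how those pieces compose (the helper and its inputs are illustrative only; WideBits must exceed the constant's width for the zero-extension to be valid):

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"

// Known bits of (zext C) + (zext C): start from a fully known constant,
// zero-extend it, then feed it through the add transfer function.
static llvm::KnownBits knownBitsOfDoubledConstant(const llvm::APInt &C,
                                                  unsigned WideBits) {
  llvm::KnownBits K = llvm::KnownBits::makeConstant(C).zext(WideBits);
  return llvm::KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false,
                                           /*NUW=*/false, K, K);
}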
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
static MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
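The MachinePointerInfo factories above attach an IR-level or pseudo-source location to a MachineMemOperand so later alias analysis can reason about the access. A minimal sketch (hypothetical helper):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

// Describe a byte within a fixed stack slot: take the frame-index record and
// shift the reference by Offset.
static llvm::MachinePointerInfo getSlotByteInfo(llvm::MachineFunction &MF,
                                                int FrameIndex, int64_t Offset) {
  return llvm::MachinePointerInfo::getFixedStack(MF, FrameIndex)
      .getWithOffset(Offset);
}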
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setDiscardResult(bool Value=true)
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
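CallLoweringInfo, configured through the chained setters above, is what a target hands to LowerCallTo when it expands an operation into a runtime call. A minimal sketch of that idiom; the calling convention and helper name are illustrative assumptions, and Chain, Callee, RetTy and Args are presumed to be prepared by the caller:

#include <utility>
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/CallingConv.h"

static std::pair<llvm::SDValue, llvm::SDValue>
emitRuntimeCall(llvm::SelectionDAG &DAG, const llvm::TargetLowering &TLI,
                const llvm::SDLoc &dl, llvm::SDValue Chain,
                llvm::SDValue Callee, llvm::Type *RetTy,
                llvm::TargetLowering::ArgListTy &&Args) {
  llvm::TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(llvm::CallingConv::ARM_AAPCS, RetTy, Callee,
                    std::move(Args))
      .setDiscardResult(false);
  return TLI.LowerCallTo(CLI); // {call result, output chain}
}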
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.
bool CombineTo(SDValue O, SDValue N)