// ARMISelLowering.cpp (LLVM 23.0.0git) — extracted from a Doxygen source listing.
//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
66#include "llvm/IR/Attributes.h"
67#include "llvm/IR/CallingConv.h"
68#include "llvm/IR/Constant.h"
69#include "llvm/IR/Constants.h"
70#include "llvm/IR/DataLayout.h"
71#include "llvm/IR/DebugLoc.h"
73#include "llvm/IR/Function.h"
74#include "llvm/IR/GlobalAlias.h"
75#include "llvm/IR/GlobalValue.h"
77#include "llvm/IR/IRBuilder.h"
78#include "llvm/IR/InlineAsm.h"
79#include "llvm/IR/Instruction.h"
82#include "llvm/IR/Intrinsics.h"
83#include "llvm/IR/IntrinsicsARM.h"
84#include "llvm/IR/Module.h"
85#include "llvm/IR/Type.h"
86#include "llvm/IR/User.h"
87#include "llvm/IR/Value.h"
88#include "llvm/MC/MCInstrDesc.h"
90#include "llvm/MC/MCSchedule.h"
97#include "llvm/Support/Debug.h"
105#include <algorithm>
106#include <cassert>
107#include <cstdint>
108#include <cstdlib>
109#include <iterator>
110#include <limits>
111#include <optional>
112#include <tuple>
113#include <utility>
114#include <vector>
115
116using namespace llvm;
117
118#define DEBUG_TYPE "arm-isel"
119
120STATISTIC(NumTailCalls, "Number of tail calls");
121STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
122STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
123STATISTIC(NumConstpoolPromoted,
124 "Number of constants with their storage promoted into constant pools");
125
126static cl::opt<bool>
127ARMInterworking("arm-interworking", cl::Hidden,
128 cl::desc("Enable / disable ARM interworking (for debugging only)"),
129 cl::init(true));
130
132 "arm-promote-constant", cl::Hidden,
133 cl::desc("Enable / disable promotion of unnamed_addr constants into "
134 "constant pools"),
135 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
137 "arm-promote-constant-max-size", cl::Hidden,
138 cl::desc("Maximum size of constant to promote into a constant pool"),
139 cl::init(64));
141 "arm-promote-constant-max-total", cl::Hidden,
142 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
143 cl::init(128));
144
146MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
147 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
148 cl::init(2));
149
151 "arm-max-base-updates-to-check", cl::Hidden,
152 cl::desc("Maximum number of base-updates to check generating postindex."),
153 cl::init(64));
154
155/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
156constexpr MVT FlagsVT = MVT::i32;
157
158// The APCS parameter registers.
159static const MCPhysReg GPRArgRegs[] = {
160 ARM::R0, ARM::R1, ARM::R2, ARM::R3
161};
162
164 SelectionDAG &DAG, const SDLoc &DL) {
166 assert(Arg.ArgVT.bitsLT(MVT::i32));
167 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
168 SDValue Ext =
170 MVT::i32, Trunc);
171 return Ext;
172}
173
174void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
175 if (VT != PromotedLdStVT) {
177 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
178
180 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
181 }
182
183 MVT ElemTy = VT.getVectorElementType();
184 if (ElemTy != MVT::f64)
188 if (ElemTy == MVT::i32) {
193 } else {
198 }
207 if (VT.isInteger()) {
211 }
212
213 // Neon does not support vector divide/remainder operations.
222
223 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
224 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
226 setOperationAction(Opcode, VT, Legal);
227 if (!VT.isFloatingPoint())
228 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
229 setOperationAction(Opcode, VT, Legal);
230}
231
232void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
233 addRegisterClass(VT, &ARM::DPRRegClass);
234 addTypeForNEON(VT, MVT::f64);
235}
236
237void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
238 addRegisterClass(VT, &ARM::DPairRegClass);
239 addTypeForNEON(VT, MVT::v2f64);
240}
241
242void ARMTargetLowering::setAllExpand(MVT VT) {
243 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
245
246 // We support these really simple operations even on types where all
247 // the actual arithmetic has to be broken down into simpler
248 // operations or turned into library calls.
253}
254
255void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
256 LegalizeAction Action) {
257 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
258 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
259 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
260}
261
262void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
263 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
264
265 for (auto VT : IntTypes) {
266 addRegisterClass(VT, &ARM::MQPRRegClass);
297
298 // No native support for these.
308
309 // Vector reductions
319
320 if (!HasMVEFP) {
325 } else {
328 }
329
330 // Pre and Post inc are supported on loads and stores
331 for (unsigned im = (unsigned)ISD::PRE_INC;
332 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
337 }
338 }
339
340 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
341 for (auto VT : FloatTypes) {
342 addRegisterClass(VT, &ARM::MQPRRegClass);
343 if (!HasMVEFP)
344 setAllExpand(VT);
345
346 // These are legal or custom whether we have MVE.fp or not
359
360 // Pre and Post inc are supported on loads and stores
361 for (unsigned im = (unsigned)ISD::PRE_INC;
362 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
367 }
368
369 if (HasMVEFP) {
377 }
382
383 // No native support for these.
398 }
399 }
400
401 // Custom Expand smaller than legal vector reductions to prevent false zero
402 // items being added.
411
412 // We 'support' these types up to bitcast/load/store level, regardless of
413 // MVE integer-only / float support. Only doing FP data processing on the FP
414 // vector types is inhibited at integer-only level.
415 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
416 for (auto VT : LongTypes) {
417 addRegisterClass(VT, &ARM::MQPRRegClass);
418 setAllExpand(VT);
424 }
426
427 // We can do bitwise operations on v2i64 vectors
428 setOperationAction(ISD::AND, MVT::v2i64, Legal);
429 setOperationAction(ISD::OR, MVT::v2i64, Legal);
430 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
431
432 // It is legal to extload from v4i8 to v4i16 or v4i32.
433 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
434 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
435 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
436
437 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
443
444 // Some truncating stores are legal too.
445 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
446 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
447 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
448
449 // Pre and Post inc on these are legal, given the correct extends
450 for (unsigned im = (unsigned)ISD::PRE_INC;
451 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
452 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
457 }
458 }
459
460 // Predicate types
461 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
462 for (auto VT : pTypes) {
463 addRegisterClass(VT, &ARM::VCCRRegClass);
478
479 if (!HasMVEFP) {
484 }
485 }
489 setOperationAction(ISD::OR, MVT::v2i1, Expand);
495
504}
505
507 return static_cast<const ARMBaseTargetMachine &>(getTargetMachine());
508}
509
511 const ARMSubtarget &STI)
512 : TargetLowering(TM_, STI), Subtarget(&STI),
513 RegInfo(Subtarget->getRegisterInfo()),
514 Itins(Subtarget->getInstrItineraryData()) {
515 const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
516
519
520 const Triple &TT = TM.getTargetTriple();
521
522 if (Subtarget->isThumb1Only())
523 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
524 else
525 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
526
527 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
528 Subtarget->hasFPRegs()) {
529 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
530 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
531
536
537 if (!Subtarget->hasVFP2Base()) {
538 setAllExpand(MVT::f32);
539 } else {
542 setOperationAction(Op, MVT::f32, Legal);
543 }
544 if (!Subtarget->hasFP64()) {
545 setAllExpand(MVT::f64);
546 } else {
549 setOperationAction(Op, MVT::f64, Legal);
550
552 }
553 }
554
555 if (Subtarget->hasFullFP16()) {
558 setOperationAction(Op, MVT::f16, Legal);
559
560 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
563
568 }
569
570 if (Subtarget->hasBF16()) {
571 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
572 setAllExpand(MVT::bf16);
573 if (!Subtarget->hasFullFP16())
575 } else {
580 }
581
583 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
584 setTruncStoreAction(VT, InnerVT, Expand);
585 addAllExtLoads(VT, InnerVT, Expand);
586 }
587
590
592 }
593
594 if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
596
597 if (!Subtarget->hasV8_1MMainlineOps())
599
600 if (!Subtarget->isThumb1Only())
602
605
608
609 if (Subtarget->hasMVEIntegerOps())
610 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
611
612 // Combine low-overhead loop intrinsics so that we can lower i1 types.
613 if (Subtarget->hasLOB()) {
615 }
616
617 if (Subtarget->hasNEON()) {
618 addDRTypeForNEON(MVT::v2f32);
619 addDRTypeForNEON(MVT::v8i8);
620 addDRTypeForNEON(MVT::v4i16);
621 addDRTypeForNEON(MVT::v2i32);
622 addDRTypeForNEON(MVT::v1i64);
623
624 addQRTypeForNEON(MVT::v4f32);
625 addQRTypeForNEON(MVT::v2f64);
626 addQRTypeForNEON(MVT::v16i8);
627 addQRTypeForNEON(MVT::v8i16);
628 addQRTypeForNEON(MVT::v4i32);
629 addQRTypeForNEON(MVT::v2i64);
630
631 if (Subtarget->hasFullFP16()) {
632 addQRTypeForNEON(MVT::v8f16);
633 addDRTypeForNEON(MVT::v4f16);
634 }
635
636 if (Subtarget->hasBF16()) {
637 addQRTypeForNEON(MVT::v8bf16);
638 addDRTypeForNEON(MVT::v4bf16);
639 }
640 }
641
642 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
643 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
644 // none of Neon, MVE or VFP supports any arithmetic operations on it.
645 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
646 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
647 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
648 // FIXME: Code duplication: FDIV and FREM are expanded always, see
649 // ARMTargetLowering::addTypeForNEON method for details.
650 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
651 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
652 // FIXME: Create unittest.
653 // In another words, find a way when "copysign" appears in DAG with vector
654 // operands.
656 // FIXME: Code duplication: SETCC has custom operation action, see
657 // ARMTargetLowering::addTypeForNEON method for details.
659 // FIXME: Create unittest for FNEG and for FABS.
660 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
661 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
663 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
664 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
665 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
666 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
667 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
670 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
679 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
680 }
681
682 if (Subtarget->hasNEON()) {
683 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
684 // supported for v4f32.
686 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
687 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
688 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
689 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
690 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
693 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
702
703 // Mark v2f32 intrinsics.
705 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
706 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
707 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
708 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
709 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
712 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
721
724 setOperationAction(Op, MVT::v4f16, Expand);
725 setOperationAction(Op, MVT::v8f16, Expand);
726 }
727
728 // Neon does not support some operations on v1i64 and v2i64 types.
729 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
730 // Custom handling for some quad-vector types to detect VMULL.
731 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
732 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
733 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
734 // Custom handling for some vector types to avoid expensive expansions
735 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
737 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
739 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
740 // a destination type that is wider than the source, and nor does
741 // it have a FP_TO_[SU]INT instruction with a narrower destination than
742 // source.
751
754
755 // NEON does not have single instruction CTPOP for vectors with element
756 // types wider than 8-bits. However, custom lowering can leverage the
757 // v8i8/v16i8 vcnt instruction.
764
765 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
766 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
767
768 // NEON does not have single instruction CTTZ for vectors.
770 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
771 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
772 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
773
774 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
775 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
776 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
777 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
778
783
788
792 }
793
794 // NEON only has FMA instructions as of VFP4.
795 if (!Subtarget->hasVFP4Base()) {
796 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
797 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
798 }
799
802
803 // It is legal to extload from v4i8 to v4i16 or v4i32.
804 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
805 MVT::v2i32}) {
810 }
811 }
812
813 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
814 MVT::v4i32}) {
819 }
820 }
821
822 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
829 }
830 if (Subtarget->hasMVEIntegerOps()) {
833 ISD::SETCC});
834 }
835 if (Subtarget->hasMVEFloatOps()) {
837 }
838
839 if (!Subtarget->hasFP64()) {
840 // When targeting a floating-point unit with only single-precision
841 // operations, f64 is legal for the few double-precision instructions which
842 // are present However, no double-precision operations other than moves,
843 // loads and stores are provided by the hardware.
880 }
881
884
885 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
888 if (Subtarget->hasFullFP16()) {
891 }
892 } else {
894 }
895
896 if (!Subtarget->hasFP16()) {
899 } else {
902 }
903
904 computeRegisterProperties(Subtarget->getRegisterInfo());
905
906 // ARM does not have floating-point extending loads.
907 for (MVT VT : MVT::fp_valuetypes()) {
908 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
909 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
910 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
911 }
912
913 // ... or truncating stores
914 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
915 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
916 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
917 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
918 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
919
920 // ARM does not have i1 sign extending load.
921 for (MVT VT : MVT::integer_valuetypes())
923
924 // ARM supports all 4 flavors of integer indexed load / store.
925 if (!Subtarget->isThumb1Only()) {
926 for (unsigned im = (unsigned)ISD::PRE_INC;
928 setIndexedLoadAction(im, MVT::i1, Legal);
929 setIndexedLoadAction(im, MVT::i8, Legal);
930 setIndexedLoadAction(im, MVT::i16, Legal);
931 setIndexedLoadAction(im, MVT::i32, Legal);
932 setIndexedStoreAction(im, MVT::i1, Legal);
933 setIndexedStoreAction(im, MVT::i8, Legal);
934 setIndexedStoreAction(im, MVT::i16, Legal);
935 setIndexedStoreAction(im, MVT::i32, Legal);
936 }
937 } else {
938 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
941 }
942
943 // Custom loads/stores to possible use __aeabi_uread/write*
944 if (TT.isTargetAEABI() && !Subtarget->allowsUnalignedMem()) {
949 }
950
955
956 if (!Subtarget->isThumb1Only()) {
959 }
960
963 if (Subtarget->hasDSP()) {
972 }
973 if (Subtarget->hasBaseDSP()) {
976 }
977
978 // i64 operation support.
981 if (Subtarget->isThumb1Only()) {
984 }
985 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
986 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
988
998
999 // MVE lowers 64 bit shifts to lsll and lsrl
1000 // assuming that ISD::SRL and SRA of i64 are already marked custom
1001 if (Subtarget->hasMVEIntegerOps())
1003
1004 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1005 if (Subtarget->isThumb1Only()) {
1009 }
1010
1011 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1013
1014 // ARM does not have ROTL.
1019 }
1021 // TODO: These two should be set to LibCall, but this currently breaks
1022 // the Linux kernel build. See #101786.
1025 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1028 }
1029
1030 // @llvm.readcyclecounter requires the Performance Monitors extension.
1031 // Default to the 0 expansion on unsupported platforms.
1032 // FIXME: Technically there are older ARM CPUs that have
1033 // implementation-specific ways of obtaining this information.
1034 if (Subtarget->hasPerfMon())
1036
1037 // Only ARMv6 has BSWAP.
1038 if (!Subtarget->hasV6Ops())
1040
1041 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1042 : Subtarget->hasDivideInARMMode();
1043 if (!hasDivide) {
1044 // These are expanded into libcalls if the cpu doesn't have HW divider.
1047 }
1048
1049 if (TT.isOSWindows() && !Subtarget->hasDivideInThumbMode()) {
1052
1055 }
1056
1059
1060 // Register based DivRem for AEABI (RTABI 4.2)
1061 if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
1062 TT.isTargetMuslAEABI() || TT.isOSFuchsia() || TT.isOSWindows()) {
1065 HasStandaloneRem = false;
1066
1071 } else {
1074 }
1075
1080
1081 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1083
1084 // Use the default implementation.
1086 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1088 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1091
1092 if (TT.isOSWindows())
1094 else
1096
1097 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1098 // the default expansion.
1099 InsertFencesForAtomic = false;
1100 if (Subtarget->hasAnyDataBarrier() &&
1101 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1102 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1103 // to ldrex/strex loops already.
1105 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1107
1108 // On v8, we have particularly efficient implementations of atomic fences
1109 // if they can be combined with nearby atomic loads and stores.
1110 if (!Subtarget->hasAcquireRelease() ||
1111 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1112 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1113 InsertFencesForAtomic = true;
1114 }
1115 } else {
1116 // If there's anything we can use as a barrier, go through custom lowering
1117 // for ATOMIC_FENCE.
1118 // If target has DMB in thumb, Fences can be inserted.
1119 if (Subtarget->hasDataBarrier())
1120 InsertFencesForAtomic = true;
1121
1123 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1124
1125 // Set them all for libcall, which will force libcalls.
1138 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1139 // Unordered/Monotonic case.
1140 if (!InsertFencesForAtomic) {
1143 }
1144 }
1145
1146 // Compute supported atomic widths.
1147 if (TT.isOSLinux() || (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1148 // For targets where __sync_* routines are reliably available, we use them
1149 // if necessary.
1150 //
1151 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1152 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1153 //
1154 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1155 // such targets should provide __sync_* routines, which use the ARM mode
1156 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1157 // encoding; see ARMISD::MEMBARRIER_MCR.)
1159 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1160 Subtarget->hasForced32BitAtomics()) {
1161 // Cortex-M (besides Cortex-M0) have 32-bit atomics.
1163 } else {
1164 // We can't assume anything about other targets; just use libatomic
1165 // routines.
1167 }
1168
1170
1172
1173 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1174 if (!Subtarget->hasV6Ops()) {
1177 }
1179
1180 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1181 !Subtarget->isThumb1Only()) {
1182 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1183 // iff target supports vfp2.
1193 }
1194
1195 // We want to custom lower some of our intrinsics.
1200
1210 if (Subtarget->hasFullFP16()) {
1214 }
1215
1217
1220 if (Subtarget->hasFullFP16())
1224 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1225
1226 // We don't support sin/cos/fmod/copysign/pow
1235 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1236 !Subtarget->isThumb1Only()) {
1239 }
1242
1243 if (!Subtarget->hasVFP4Base()) {
1246 }
1247
1248 // Various VFP goodness
1249 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1250 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1251 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1256 }
1257
1258 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1259 if (!Subtarget->hasFP16()) {
1264 }
1265
1266 // Strict floating-point comparisons need custom lowering.
1273 }
1274
1277
1278 // FP-ARMv8 implements a lot of rounding-like FP operations.
1279 if (Subtarget->hasFPARMv8Base()) {
1280 for (auto Op :
1287 setOperationAction(Op, MVT::f32, Legal);
1288
1289 if (Subtarget->hasFP64())
1290 setOperationAction(Op, MVT::f64, Legal);
1291 }
1292
1293 if (Subtarget->hasNEON()) {
1298 }
1299 }
1300
1301 // FP16 often need to be promoted to call lib functions
1302 // clang-format off
1303 if (Subtarget->hasFullFP16()) {
1307
1308 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
1322 setOperationAction(Op, MVT::f16, Promote);
1323 }
1324
1325 // Round-to-integer need custom lowering for fp16, as Promote doesn't work
1326 // because the result type is integer.
1328 setOperationAction(Op, MVT::f16, Custom);
1329
1335 setOperationAction(Op, MVT::f16, Legal);
1336 }
1337 // clang-format on
1338 }
1339
1340 if (Subtarget->hasNEON()) {
1341 // vmin and vmax aren't available in a scalar form, so we can use
1342 // a NEON instruction with an undef lane instead.
1351
1352 if (Subtarget->hasV8Ops()) {
1357 setOperationAction(Op, MVT::v2f32, Legal);
1358 setOperationAction(Op, MVT::v4f32, Legal);
1359 }
1360 }
1361
1362 if (Subtarget->hasFullFP16()) {
1367
1372
1377 setOperationAction(Op, MVT::v4f16, Legal);
1378 setOperationAction(Op, MVT::v8f16, Legal);
1379 }
1380 }
1381 }
1382
1383 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1384 // it, but it's just a wrapper around ldexp.
1385 if (TT.isOSWindows()) {
1387 if (isOperationExpand(Op, MVT::f32))
1388 setOperationAction(Op, MVT::f32, Promote);
1389 }
1390
1391 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1392 // isn't legal.
1394 if (isOperationExpand(Op, MVT::f16))
1395 setOperationAction(Op, MVT::f16, Promote);
1396
1397 // We have target-specific dag combine patterns for the following nodes:
1398 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1401
1402 if (Subtarget->hasMVEIntegerOps())
1404
1405 if (Subtarget->hasV6Ops())
1407 if (Subtarget->isThumb1Only())
1409 // Attempt to lower smin/smax to ssat/usat
1410 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1411 Subtarget->isThumb2()) {
1413 }
1414
1416
1417 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1418 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1420 else
1422
1423 //// temporary - rewrite interface to use type
1426 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1428 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1430
1431 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1432 // are at least 4 bytes aligned.
1434
1435 // Prefer likely predicted branches to selects on out-of-order cores.
1436 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1437
1438 setPrefLoopAlignment(Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1440 Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1441
1442 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1443
1444 IsStrictFPEnabled = true;
1445}
1446
1448 return Subtarget->useSoftFloat();
1449}
1450
1452 return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1453}
1454
1455// FIXME: It might make sense to define the representative register class as the
1456// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1457// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
1458// SPR's representative would be DPR_VFP2. This should work well if register
1459// pressure tracking were modified such that a register use would increment the
1460// pressure of the register class's representative and all of it's super
1461// classes' representatives transitively. We have not implemented this because
1462// of the difficulty prior to coalescing of modeling operand register classes
1463// due to the common occurrence of cross class copies and subregister insertions
1464// and extractions.
1465std::pair<const TargetRegisterClass *, uint8_t>
1467 MVT VT) const {
1468 const TargetRegisterClass *RRC = nullptr;
1469 uint8_t Cost = 1;
1470 switch (VT.SimpleTy) {
1471 default:
1473 // Use DPR as representative register class for all floating point
1474 // and vector types. Since there are 32 SPR registers and 32 DPR registers so
1475 // the cost is 1 for both f32 and f64.
1476 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1477 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1478 RRC = &ARM::DPRRegClass;
1479 // When NEON is used for SP, only half of the register file is available
1480 // because operations that define both SP and DP results will be constrained
1481 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1482 // coalescing by double-counting the SP regs. See the FIXME above.
1483 if (Subtarget->useNEONForSinglePrecisionFP())
1484 Cost = 2;
1485 break;
1486 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1487 case MVT::v4f32: case MVT::v2f64:
1488 RRC = &ARM::DPRRegClass;
1489 Cost = 2;
1490 break;
1491 case MVT::v4i64:
1492 RRC = &ARM::DPRRegClass;
1493 Cost = 4;
1494 break;
1495 case MVT::v8i64:
1496 RRC = &ARM::DPRRegClass;
1497 Cost = 8;
1498 break;
1499 }
1500 return std::make_pair(RRC, Cost);
1501}
1502
1504 EVT VT) const {
1505 if (!VT.isVector())
1506 return getPointerTy(DL);
1507
1508 // MVE has a predicate register.
1509 if ((Subtarget->hasMVEIntegerOps() &&
1510 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1511 VT == MVT::v16i8)) ||
1512 (Subtarget->hasMVEFloatOps() &&
1513 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1514 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1516}
1517
1518/// getRegClassFor - Return the register class that should be used for the
1519/// specified value type.
1520const TargetRegisterClass *
1521ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1522 (void)isDivergent;
1523 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1524 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1525 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1526 // MVE Q registers.
1527 if (Subtarget->hasNEON()) {
1528 if (VT == MVT::v4i64)
1529 return &ARM::QQPRRegClass;
1530 if (VT == MVT::v8i64)
1531 return &ARM::QQQQPRRegClass;
1532 }
1533 if (Subtarget->hasMVEIntegerOps()) {
1534 if (VT == MVT::v4i64)
1535 return &ARM::MQQPRRegClass;
1536 if (VT == MVT::v8i64)
1537 return &ARM::MQQQQPRRegClass;
1538 }
1540}
1541
1542// memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
1543// source/dest is aligned and the copy size is large enough. We therefore want
1544// to align such objects passed to memory intrinsics.
1546 Align &PrefAlign) const {
1547 if (!isa<MemIntrinsic>(CI))
1548 return false;
1549 MinSize = 8;
1550 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1551 // cycle faster than 4-byte aligned LDM.
1552 PrefAlign =
1553 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1554 return true;
1555}
1556
1557// Create a fast isel object.
1559 FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo,
1560 const LibcallLoweringInfo *libcallLowering) const {
1561 return ARM::createFastISel(funcInfo, libInfo, libcallLowering);
1562}
1563
1565 unsigned NumVals = N->getNumValues();
1566 if (!NumVals)
1567 return Sched::RegPressure;
1568
1569 for (unsigned i = 0; i != NumVals; ++i) {
1570 EVT VT = N->getValueType(i);
1571 if (VT == MVT::Glue || VT == MVT::Other)
1572 continue;
1573 if (VT.isFloatingPoint() || VT.isVector())
1574 return Sched::ILP;
1575 }
1576
1577 if (!N->isMachineOpcode())
1578 return Sched::RegPressure;
1579
1580 // Load are scheduled for latency even if there instruction itinerary
1581 // is not available.
1582 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1583 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1584
1585 if (MCID.getNumDefs() == 0)
1586 return Sched::RegPressure;
1587 if (!Itins->isEmpty() &&
1588 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1589 return Sched::ILP;
1590
1591 return Sched::RegPressure;
1592}
1593
1594//===----------------------------------------------------------------------===//
1595// Lowering Code
1596//===----------------------------------------------------------------------===//
1597
1598static bool isSRL16(const SDValue &Op) {
1599 if (Op.getOpcode() != ISD::SRL)
1600 return false;
1601 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1602 return Const->getZExtValue() == 16;
1603 return false;
1604}
1605
1606static bool isSRA16(const SDValue &Op) {
1607 if (Op.getOpcode() != ISD::SRA)
1608 return false;
1609 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1610 return Const->getZExtValue() == 16;
1611 return false;
1612}
1613
1614static bool isSHL16(const SDValue &Op) {
1615 if (Op.getOpcode() != ISD::SHL)
1616 return false;
1617 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1618 return Const->getZExtValue() == 16;
1619 return false;
1620}
1621
1622// Check for a signed 16-bit value. We special case SRA because it makes it
1623// more simple when also looking for SRAs that aren't sign extending a
1624// smaller value. Without the check, we'd need to take extra care with
1625// checking order for some operations.
1626static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1627 if (isSRA16(Op))
1628 return isSHL16(Op.getOperand(0));
1629 return DAG.ComputeNumSignBits(Op) == 17;
1630}
1631
/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
  switch (CC) {
  default: llvm_unreachable("Unknown condition code!");
  case ISD::SETNE: return ARMCC::NE;
  case ISD::SETEQ: return ARMCC::EQ;
  // Signed comparisons.
  case ISD::SETGT: return ARMCC::GT;
  case ISD::SETGE: return ARMCC::GE;
  case ISD::SETLT: return ARMCC::LT;
  case ISD::SETLE: return ARMCC::LE;
  // Unsigned comparisons.
  case ISD::SETUGT: return ARMCC::HI;
  case ISD::SETUGE: return ARMCC::HS;
  case ISD::SETULT: return ARMCC::LO;
  case ISD::SETULE: return ARMCC::LS;
  }
}
1648
/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. A few FP
/// conditions (SETONE, SETUEQ) need a pair of ARM condition checks; for those
/// CondCode2 receives the second condition, otherwise it stays ARMCC::AL.
                         ARMCC::CondCodes &CondCode2) {
  CondCode2 = ARMCC::AL;
  switch (CC) {
  default: llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
  case ISD::SETGT:
  case ISD::SETOGT: CondCode = ARMCC::GT; break;
  case ISD::SETGE:
  case ISD::SETOGE: CondCode = ARMCC::GE; break;
  case ISD::SETOLT: CondCode = ARMCC::MI; break;
  case ISD::SETOLE: CondCode = ARMCC::LS; break;
  // Needs a second condition check (CondCode OR CondCode2).
  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
  case ISD::SETO:   CondCode = ARMCC::VC; break;
  case ISD::SETUO:  CondCode = ARMCC::VS; break;
  // Needs a second condition check (CondCode OR CondCode2).
  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
  case ISD::SETUGT: CondCode = ARMCC::HI; break;
  case ISD::SETUGE: CondCode = ARMCC::PL; break;
  case ISD::SETLT:
  case ISD::SETULT: CondCode = ARMCC::LT; break;
  case ISD::SETLE:
  case ISD::SETULE: CondCode = ARMCC::LE; break;
  case ISD::SETNE:
  case ISD::SETUNE: CondCode = ARMCC::NE; break;
  }
}
1677
1678//===----------------------------------------------------------------------===//
1679// Calling Convention Implementation
1680//===----------------------------------------------------------------------===//
1681
/// getEffectiveCallingConv - Get the effective calling convention, taking into
/// account presence of floating point hardware and calling convention
/// limitations, such as support for variadic functions.
ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
                                           bool isVarArg) const {
  switch (CC) {
  default:
    report_fatal_error("Unsupported calling convention");
  case CallingConv::GHC:
    return CC;
  case CallingConv::Swift:
  case CallingConv::C:
  case CallingConv::Tail:
    if (!getTM().isAAPCS_ABI())
      return CallingConv::ARM_APCS;
    // Hard-float argument passing requires FP registers, not Thumb1-only,
    // a hard-float ABI, and a non-variadic signature.
    else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
             getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
             !isVarArg)
    else
  case CallingConv::Fast:
    if (!getTM().isAAPCS_ABI()) {
      // Fast CC can only be kept when FP registers are usable and the call
      // is not variadic.
      if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() && !isVarArg)
        return CallingConv::Fast;
      return CallingConv::ARM_APCS;
    } else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
               !isVarArg)
    else
  }
}
1727
                                                 bool isVarArg) const {
  // Argument-passing variant (Return = false).
  return CCAssignFnForNode(CC, false, isVarArg);
}
1732
                                                   bool isVarArg) const {
  // Return-value variant (Return = true).
  return CCAssignFnForNode(CC, true, isVarArg);
}
1737
/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
/// CallingConvention. \p Return selects between the return-value and the
/// argument assignment functions.
CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
                                                 bool Return,
                                                 bool isVarArg) const {
  // Dispatch on the *effective* convention, which folds in ABI and FP
  // hardware constraints.
  switch (getEffectiveCallingConv(CC, isVarArg)) {
  default:
    report_fatal_error("Unsupported calling convention");
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
    return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
  case CallingConv::Fast:
    return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
  case CallingConv::GHC:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
  }
}
1764
1765SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
1766 MVT LocVT, MVT ValVT, SDValue Val) const {
1767 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
1768 Val);
1769 if (Subtarget->hasFullFP16()) {
1770 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
1771 } else {
1772 Val = DAG.getNode(ISD::TRUNCATE, dl,
1773 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1774 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
1775 }
1776 return Val;
1777}
1778
1779SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
1780 MVT LocVT, MVT ValVT,
1781 SDValue Val) const {
1782 if (Subtarget->hasFullFP16()) {
1783 Val = DAG.getNode(ARMISD::VMOVrh, dl,
1784 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1785 } else {
1786 Val = DAG.getNode(ISD::BITCAST, dl,
1787 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1788 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
1789 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1790 }
1791 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
1792}
1793
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue ARMTargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
    SDValue ThisVal, bool isCmseNSCall) const {
  // Assign locations to each value returned by this call.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    // Deliberately a copy: VA is reassigned below when a value is split
    // across several locations.
    CCValAssign VA = RVLocs[i];

    // Pass 'this' value directly from the argument to return value, to avoid
    // reg unit interference
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    SDValue Val;
    if (VA.needsCustom() &&
        (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
      // Handle f64 or half of a v2f64.
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InGlue);
      Chain = Lo.getValue(1);
      InGlue = Lo.getValue(2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InGlue);
      Chain = Hi.getValue(1);
      InGlue = Hi.getValue(2);
      // The two halves arrive in opposite order on big-endian targets.
      if (!Subtarget->isLittle())
        std::swap (Lo, Hi);
      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);

      // For v2f64, repeat the copy for the second f64 lane and insert both
      // into a fresh vector.
      if (VA.getLocVT() == MVT::v2f64) {
        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(0, dl, MVT::i32));

        VA = RVLocs[++i]; // skip ahead to next loc
        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
        Chain = Lo.getValue(1);
        InGlue = Lo.getValue(2);
        VA = RVLocs[++i]; // skip ahead to next loc
        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
        Chain = Hi.getValue(1);
        InGlue = Hi.getValue(2);
        if (!Subtarget->isLittle())
          std::swap (Lo, Hi);
        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(1, dl, MVT::i32));
      }
    } else {
      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                               InGlue);
      Chain = Val.getValue(1);
      InGlue = Val.getValue(2);
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
      break;
    }

    // f16 arguments have their size extended to 4 bytes and passed as if they
    // had been copied to the LSBs of a 32-bit register.
    // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
    if (VA.needsCustom() &&
        (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
      Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);

    // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
    // is less than 32 bits must be sign- or zero-extended after the call for
    // security reasons. Although the ABI mandates an extension done by the
    // callee, the latter cannot be trusted to follow the rules of the ABI.
    const ISD::InputArg &Arg = Ins[VA.getValNo()];
    if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
        VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
      Val = handleCMSEValue(Val, Arg, DAG, dl);

    InVals.push_back(Val);
  }

  return Chain;
}
1892
// Compute the address (and pointer info) of the stack slot for an outgoing
// call argument described by \p VA.
std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
    const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
    bool IsTailCall, int SPDiff) const {
  SDValue DstAddr;
  MachinePointerInfo DstInfo;
  int32_t Offset = VA.getLocMemOffset();
  MachineFunction &MF = DAG.getMachineFunction();

  if (IsTailCall) {
    // Tail calls store into the caller's incoming-argument area, biased by
    // SPDiff, via a fixed frame object rather than an SP-relative address.
    Offset += SPDiff;
    auto PtrVT = getPointerTy(DAG.getDataLayout());
    int Size = VA.getLocVT().getFixedSizeInBits() / 8;
    int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
    DstAddr = DAG.getFrameIndex(FI, PtrVT);
    DstInfo =
  } else {
    // Normal calls address the outgoing-argument area as SP + Offset.
    SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
    DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                          StackPtr, PtrOff);
    DstInfo =
  }

  return std::make_pair(DstAddr, DstInfo);
}
1919
// Returns the type of copying which is required to set up a byval argument to
// a tail-called function. This isn't needed for non-tail calls, because they
// always need the equivalent of CopyOnce, but tail-calls sometimes need two to
// avoid clobbering another argument (CopyViaTemp), and sometimes can be
// optimised to zero copies when forwarding an argument from the caller's
// caller (NoCopy).
ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall(
    SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();

  // Globals are always safe to copy from.
    return CopyOnce;

  // Can only analyse frame index nodes, conservatively assume we need a
  // temporary.
  auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
  auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
  if (!SrcFrameIdxNode || !DstFrameIdxNode)
    return CopyViaTemp;

  int SrcFI = SrcFrameIdxNode->getIndex();
  int DstFI = DstFrameIdxNode->getIndex();
  assert(MFI.isFixedObjectIndex(DstFI) &&
         "byval passed in non-fixed stack slot");

  int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
  int64_t DstOffset = MFI.getObjectOffset(DstFI);

  // If the source is in the local frame, then the copy to the argument memory
  // is always valid.
  bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
  // NOTE(review): the "FixedSrc &&" conjunct below is redundant — the second
  // half of the || is only evaluated when FixedSrc is true.
  if (!FixedSrc ||
      (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize()))
    return CopyOnce;

  // In the case of byval arguments split between registers and the stack,
  // computeAddrForCallArg returns a FrameIndex which corresponds only to the
  // stack portion, but the Src SDValue will refer to the full value, including
  // the local stack memory that the register portion gets stored into. We only
  // need to compare them for equality, so normalise on the full value version.
  uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI);
  DstOffset -= RegSize;

  // If the value is already in the correct location, then no copying is
  // needed. If not, then we need to copy via a temporary.
  if (SrcOffset == DstOffset)
    return NoCopy;
  else
    return CopyViaTemp;
}
1972
// Pass an f64 call argument that has been split into two i32 locations:
// the first half always goes in a register (VA), the second half (NextVA)
// goes in a register or to the stack.
void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
                                         SDValue Chain, SDValue &Arg,
                                         RegsToPassVector &RegsToPass,
                                         CCValAssign &VA, CCValAssign &NextVA,
                                         SDValue &StackPtr,
                                         SmallVectorImpl<SDValue> &MemOpChains,
                                         bool IsTailCall,
                                         int SPDiff) const {
  // Split the f64 into a pair of i32 halves.
  SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                              DAG.getVTList(MVT::i32, MVT::i32), Arg);
  // Big-endian targets pass the halves in the opposite order.
  unsigned id = Subtarget->isLittle() ? 0 : 1;
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));

  if (NextVA.isRegLoc())
    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
  else {
    assert(NextVA.isMemLoc());
    // Materialize the stack pointer lazily — only when a half actually goes
    // to memory.
    if (!StackPtr.getNode())
      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,

    SDValue DstAddr;
    MachinePointerInfo DstInfo;
    std::tie(DstAddr, DstInfo) =
        computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
    MemOpChains.push_back(
        DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
  }
}
2002
// Whether tail-call optimization must be guaranteed (rather than merely
// attempted) for the given calling convention.
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
  return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
}
2007
2008/// LowerCall - Lowering a call into a callseq_start <-
2009/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
2010/// nodes.
2011SDValue
2012ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2013 SmallVectorImpl<SDValue> &InVals) const {
2014 SelectionDAG &DAG = CLI.DAG;
2015 SDLoc &dl = CLI.DL;
2016 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2017 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2018 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2019 SDValue Chain = CLI.Chain;
2020 SDValue Callee = CLI.Callee;
2021 bool &isTailCall = CLI.IsTailCall;
2022 CallingConv::ID CallConv = CLI.CallConv;
2023 bool doesNotRet = CLI.DoesNotReturn;
2024 bool isVarArg = CLI.IsVarArg;
2025 const CallBase *CB = CLI.CB;
2026
2027 MachineFunction &MF = DAG.getMachineFunction();
2028 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2029 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2030 MachineFunction::CallSiteInfo CSInfo;
2031 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2032 bool isThisReturn = false;
2033 bool isCmseNSCall = false;
2034 bool isSibCall = false;
2035 bool PreferIndirect = false;
2036 bool GuardWithBTI = false;
2037
2038 // Analyze operands of the call, assigning locations to each operand.
2040 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2041 *DAG.getContext());
2042 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2043
2044 // Lower 'returns_twice' calls to a pseudo-instruction.
2045 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2046 !Subtarget->noBTIAtReturnTwice())
2047 GuardWithBTI = AFI->branchTargetEnforcement();
2048
2049 // Set type id for call site info.
2050 setTypeIdForCallsiteInfo(CB, MF, CSInfo);
2051
2052 // Determine whether this is a non-secure function call.
2053 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2054 isCmseNSCall = true;
2055
2056 // Disable tail calls if they're not supported.
2057 if (!Subtarget->supportsTailCall())
2058 isTailCall = false;
2059
2060 // For both the non-secure calls and the returns from a CMSE entry function,
2061 // the function needs to do some extra work after the call, or before the
2062 // return, respectively, thus it cannot end with a tail call
2063 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2064 isTailCall = false;
2065
2066 if (isa<GlobalAddressSDNode>(Callee)) {
2067 // If we're optimizing for minimum size and the function is called three or
2068 // more times in this block, we can improve codesize by calling indirectly
2069 // as BLXr has a 16-bit encoding.
2070 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2071 if (CLI.CB) {
2072 auto *BB = CLI.CB->getParent();
2073 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2074 count_if(GV->users(), [&BB](const User *U) {
2075 return isa<Instruction>(U) &&
2076 cast<Instruction>(U)->getParent() == BB;
2077 }) > 2;
2078 }
2079 }
2080 if (isTailCall) {
2081 // Check if it's really possible to do a tail call.
2082 isTailCall =
2083 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2084
2085 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2086 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2087 isSibCall = true;
2088
2089 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2090 // detected sibcalls.
2091 if (isTailCall)
2092 ++NumTailCalls;
2093 }
2094
2095 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2096 report_fatal_error("failed to perform tail call elimination on a call "
2097 "site marked musttail");
2098
2099 // Get a count of how many bytes are to be pushed on the stack.
2100 unsigned NumBytes = CCInfo.getStackSize();
2101
2102 // SPDiff is the byte offset of the call's argument area from the callee's.
2103 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2104 // by this amount for a tail call. In a sibling call it must be 0 because the
2105 // caller will deallocate the entire stack and the callee still expects its
2106 // arguments to begin at SP+0. Completely unused for non-tail calls.
2107 int SPDiff = 0;
2108
2109 if (isTailCall && !isSibCall) {
2110 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2111 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2112
2113 // Since callee will pop argument stack as a tail call, we must keep the
2114 // popped size 16-byte aligned.
2115 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
2116 assert(StackAlign && "data layout string is missing stack alignment");
2117 NumBytes = alignTo(NumBytes, *StackAlign);
2118
2119 // SPDiff will be negative if this tail call requires more space than we
2120 // would automatically have in our incoming argument space. Positive if we
2121 // can actually shrink the stack.
2122 SPDiff = NumReusableBytes - NumBytes;
2123
2124 // If this call requires more stack than we have available from
2125 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2126 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2127 AFI->setArgRegsSaveSize(-SPDiff);
2128 }
2129
2130 if (isSibCall) {
2131 // For sibling tail calls, memory operands are available in our caller's stack.
2132 NumBytes = 0;
2133 } else {
2134 // Adjust the stack pointer for the new arguments...
2135 // These operations are automatically eliminated by the prolog/epilog pass
2136 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2137 }
2138
2140 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2141
2142 RegsToPassVector RegsToPass;
2143 SmallVector<SDValue, 8> MemOpChains;
2144
2145 // If we are doing a tail-call, any byval arguments will be written to stack
2146 // space which was used for incoming arguments. If any the values being used
2147 // are incoming byval arguments to this function, then they might be
2148 // overwritten by the stores of the outgoing arguments. To avoid this, we
2149 // need to make a temporary copy of them in local stack space, then copy back
2150 // to the argument area.
2151 DenseMap<unsigned, SDValue> ByValTemporaries;
2152 SDValue ByValTempChain;
2153 if (isTailCall) {
2154 SmallVector<SDValue, 8> ByValCopyChains;
2155 for (const CCValAssign &VA : ArgLocs) {
2156 unsigned ArgIdx = VA.getValNo();
2157 SDValue Src = OutVals[ArgIdx];
2158 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2159
2160 if (!Flags.isByVal())
2161 continue;
2162
2163 SDValue Dst;
2164 MachinePointerInfo DstInfo;
2165 std::tie(Dst, DstInfo) =
2166 computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
2167 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2168
2169 if (Copy == NoCopy) {
2170 // If the argument is already at the correct offset on the stack
2171 // (because we are forwarding a byval argument from our caller), we
2172 // don't need any copying.
2173 continue;
2174 } else if (Copy == CopyOnce) {
2175 // If the argument is in our local stack frame, no other argument
2176 // preparation can clobber it, so we can copy it to the final location
2177 // later.
2178 ByValTemporaries[ArgIdx] = Src;
2179 } else {
2180 assert(Copy == CopyViaTemp && "unexpected enum value");
2181 // If we might be copying this argument from the outgoing argument
2182 // stack area, we need to copy via a temporary in the local stack
2183 // frame.
2184 int TempFrameIdx = MFI.CreateStackObject(
2185 Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2186 SDValue Temp =
2187 DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
2188
2189 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2190 SDValue AlignNode =
2191 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2192
2193 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2194 SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode};
2195 ByValCopyChains.push_back(
2196 DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops));
2197 ByValTemporaries[ArgIdx] = Temp;
2198 }
2199 }
2200 if (!ByValCopyChains.empty())
2201 ByValTempChain =
2202 DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
2203 }
2204
2205 // During a tail call, stores to the argument area must happen after all of
2206 // the function's incoming arguments have been loaded because they may alias.
2207 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2208 // there's no point in doing so repeatedly so this tracks whether that's
2209 // happened yet.
2210 bool AfterFormalArgLoads = false;
2211
2212 // Walk the register/memloc assignments, inserting copies/loads. In the case
2213 // of tail call optimization, arguments are handled later.
2214 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2215 i != e;
2216 ++i, ++realArgIdx) {
2217 CCValAssign &VA = ArgLocs[i];
2218 SDValue Arg = OutVals[realArgIdx];
2219 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2220 bool isByVal = Flags.isByVal();
2221
2222 // Promote the value if needed.
2223 switch (VA.getLocInfo()) {
2224 default: llvm_unreachable("Unknown loc info!");
2225 case CCValAssign::Full: break;
2226 case CCValAssign::SExt:
2227 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2228 break;
2229 case CCValAssign::ZExt:
2230 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2231 break;
2232 case CCValAssign::AExt:
2233 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2234 break;
2235 case CCValAssign::BCvt:
2236 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2237 break;
2238 }
2239
2240 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2241 Chain = DAG.getStackArgumentTokenFactor(Chain);
2242 if (ByValTempChain) {
2243 // In case of large byval copies, re-using the stackframe for tail-calls
2244 // can lead to overwriting incoming arguments on the stack. Force
2245 // loading these stack arguments before the copy to avoid that.
2246 SmallVector<SDValue, 8> IncomingLoad;
2247 for (unsigned I = 0; I < OutVals.size(); ++I) {
2248 if (Outs[I].Flags.isByVal())
2249 continue;
2250
2251 SDValue OutVal = OutVals[I];
2252 LoadSDNode *OutLN = dyn_cast_or_null<LoadSDNode>(OutVal);
2253 if (!OutLN)
2254 continue;
2255
2256 FrameIndexSDNode *FIN =
2258 if (!FIN)
2259 continue;
2260
2261 if (!MFI.isFixedObjectIndex(FIN->getIndex()))
2262 continue;
2263
2264 for (const CCValAssign &VA : ArgLocs) {
2265 if (VA.isMemLoc())
2266 IncomingLoad.push_back(OutVal.getValue(1));
2267 }
2268 }
2269
2270 // Update the chain to force loads for potentially clobbered argument
2271 // loads to happen before the byval copy.
2272 if (!IncomingLoad.empty()) {
2273 IncomingLoad.push_back(Chain);
2274 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, IncomingLoad);
2275 }
2276
2277 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
2278 ByValTempChain);
2279 }
2280 AfterFormalArgLoads = true;
2281 }
2282
2283 // f16 arguments have their size extended to 4 bytes and passed as if they
2284 // had been copied to the LSBs of a 32-bit register.
2285 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2286 if (VA.needsCustom() &&
2287 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2288 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2289 } else {
2290 // f16 arguments could have been extended prior to argument lowering.
2291 // Mask them arguments if this is a CMSE nonsecure call.
2292 auto ArgVT = Outs[realArgIdx].ArgVT;
2293 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2294 auto LocBits = VA.getLocVT().getSizeInBits();
2295 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2296 SDValue Mask =
2297 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2298 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2299 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2300 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2301 }
2302 }
2303
2304 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2305 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2306 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2307 DAG.getConstant(0, dl, MVT::i32));
2308 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2309 DAG.getConstant(1, dl, MVT::i32));
2310
2311 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2312 StackPtr, MemOpChains, isTailCall, SPDiff);
2313
2314 VA = ArgLocs[++i]; // skip ahead to next loc
2315 if (VA.isRegLoc()) {
2316 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2317 StackPtr, MemOpChains, isTailCall, SPDiff);
2318 } else {
2319 assert(VA.isMemLoc());
2320 SDValue DstAddr;
2321 MachinePointerInfo DstInfo;
2322 std::tie(DstAddr, DstInfo) =
2323 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2324 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2325 }
2326 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2327 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2328 StackPtr, MemOpChains, isTailCall, SPDiff);
2329 } else if (VA.isRegLoc()) {
2330 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2331 Outs[0].VT == MVT::i32) {
2332 assert(VA.getLocVT() == MVT::i32 &&
2333 "unexpected calling convention register assignment");
2334 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2335 "unexpected use of 'returned'");
2336 isThisReturn = true;
2337 }
2338 const TargetOptions &Options = DAG.getTarget().Options;
2339 if (Options.EmitCallSiteInfo)
2340 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2341 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2342 } else if (isByVal) {
2343 assert(VA.isMemLoc());
2344 unsigned offset = 0;
2345
2346 // True if this byval aggregate will be split between registers
2347 // and memory.
2348 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2349 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2350
2351 SDValue ByValSrc;
2352 bool NeedsStackCopy;
2353 if (auto It = ByValTemporaries.find(realArgIdx);
2354 It != ByValTemporaries.end()) {
2355 ByValSrc = It->second;
2356 NeedsStackCopy = true;
2357 } else {
2358 ByValSrc = Arg;
2359 NeedsStackCopy = !isTailCall;
2360 }
2361
2362 // If part of the argument is in registers, load them.
2363 if (CurByValIdx < ByValArgsCount) {
2364 unsigned RegBegin, RegEnd;
2365 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2366
2367 EVT PtrVT = getPointerTy(DAG.getDataLayout());
2368 unsigned int i, j;
2369 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2370 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2371 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const);
2372 SDValue Load =
2373 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2374 DAG.InferPtrAlign(AddArg));
2375 MemOpChains.push_back(Load.getValue(1));
2376 RegsToPass.push_back(std::make_pair(j, Load));
2377 }
2378
2379 // If parameter size outsides register area, "offset" value
2380 // helps us to calculate stack slot for remained part properly.
2381 offset = RegEnd - RegBegin;
2382
2383 CCInfo.nextInRegsParam();
2384 }
2385
2386 // If the memory part of the argument isn't already in the correct place
2387 // (which can happen with tail calls), copy it into the argument area.
2388 if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) {
2389 auto PtrVT = getPointerTy(DAG.getDataLayout());
2390 SDValue Dst;
2391 MachinePointerInfo DstInfo;
2392 std::tie(Dst, DstInfo) =
2393 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2394 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2395 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset);
2396 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2397 MVT::i32);
2398 SDValue AlignNode =
2399 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2400
2401 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2402 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2403 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2404 Ops));
2405 }
2406 } else {
2407 assert(VA.isMemLoc());
2408 SDValue DstAddr;
2409 MachinePointerInfo DstInfo;
2410 std::tie(DstAddr, DstInfo) =
2411 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2412
2413 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2414 MemOpChains.push_back(Store);
2415 }
2416 }
2417
2418 if (!MemOpChains.empty())
2419 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2420
2421 // Build a sequence of copy-to-reg nodes chained together with token chain
2422 // and flag operands which copy the outgoing args into the appropriate regs.
2423 SDValue InGlue;
2424 for (const auto &[Reg, N] : RegsToPass) {
2425 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2426 InGlue = Chain.getValue(1);
2427 }
2428
2429 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2430 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2431 // node so that legalize doesn't hack it.
2432 bool isDirect = false;
2433
2434 const TargetMachine &TM = getTargetMachine();
2435 const Triple &TT = TM.getTargetTriple();
2436 const GlobalValue *GVal = nullptr;
2437 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2438 GVal = G->getGlobal();
2439 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && TT.isOSBinFormatMachO();
2440
2441 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2442 bool isLocalARMFunc = false;
2443 auto PtrVt = getPointerTy(DAG.getDataLayout());
2444
2445 if (Subtarget->genLongCalls()) {
2446 assert((!isPositionIndependent() || TT.isOSWindows()) &&
2447 "long-calls codegen is not position independent!");
2448 // Handle a global address or an external symbol. If it's not one of
2449 // those, the target's already in a register, so we don't need to do
2450 // anything extra.
2451 if (isa<GlobalAddressSDNode>(Callee)) {
2452 if (Subtarget->genExecuteOnly()) {
2453 if (Subtarget->useMovt())
2454 ++NumMovwMovt;
2455 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2456 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2457 } else {
2458 // Create a constant pool entry for the callee address
2459 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2460 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2461 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2462
2463 // Get the address of the callee into a register
2464 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2465 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2466 Callee = DAG.getLoad(
2467 PtrVt, dl, DAG.getEntryNode(), Addr,
2469 }
2470 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2471 const char *Sym = S->getSymbol();
2472
2473 if (Subtarget->genExecuteOnly()) {
2474 if (Subtarget->useMovt())
2475 ++NumMovwMovt;
2476 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2477 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2478 } else {
2479 // Create a constant pool entry for the callee address
2480 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2481 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2482 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2483
2484 // Get the address of the callee into a register
2485 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2486 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2487 Callee = DAG.getLoad(
2488 PtrVt, dl, DAG.getEntryNode(), Addr,
2490 }
2491 }
2492 } else if (isa<GlobalAddressSDNode>(Callee)) {
2493 if (!PreferIndirect) {
2494 isDirect = true;
2495 bool isDef = GVal->isStrongDefinitionForLinker();
2496
2497 // ARM call to a local ARM function is predicable.
2498 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2499 // tBX takes a register source operand.
2500 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2501 assert(TT.isOSBinFormatMachO() && "WrapperPIC use on non-MachO?");
2502 Callee = DAG.getNode(
2503 ARMISD::WrapperPIC, dl, PtrVt,
2504 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2505 Callee = DAG.getLoad(
2506 PtrVt, dl, DAG.getEntryNode(), Callee,
2510 } else if (Subtarget->isTargetCOFF()) {
2511 assert(Subtarget->isTargetWindows() &&
2512 "Windows is the only supported COFF target");
2513 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2514 if (GVal->hasDLLImportStorageClass())
2515 TargetFlags = ARMII::MO_DLLIMPORT;
2516 else if (!TM.shouldAssumeDSOLocal(GVal))
2517 TargetFlags = ARMII::MO_COFFSTUB;
2518 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2519 TargetFlags);
2520 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2521 Callee =
2522 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2523 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2525 } else {
2526 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2527 }
2528 }
2529 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2530 isDirect = true;
2531 // tBX takes a register source operand.
2532 const char *Sym = S->getSymbol();
2533 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2534 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2535 ARMConstantPoolValue *CPV =
2537 ARMPCLabelIndex, 4);
2538 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2539 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2540 Callee = DAG.getLoad(
2541 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2543 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2544 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2545 } else {
2546 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2547 }
2548 }
2549
2550 if (isCmseNSCall) {
2551 assert(!isARMFunc && !isDirect &&
2552 "Cannot handle call to ARM function or direct call");
2553 if (NumBytes > 0) {
2554 DAG.getContext()->diagnose(
2555 DiagnosticInfoUnsupported(DAG.getMachineFunction().getFunction(),
2556 "call to non-secure function would require "
2557 "passing arguments on stack",
2558 dl.getDebugLoc()));
2559 }
2560 if (isStructRet) {
2561 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2563 "call to non-secure function would return value through pointer",
2564 dl.getDebugLoc()));
2565 }
2566 }
2567
2568 // FIXME: handle tail calls differently.
2569 unsigned CallOpc;
2570 if (Subtarget->isThumb()) {
2571 if (GuardWithBTI)
2572 CallOpc = ARMISD::t2CALL_BTI;
2573 else if (isCmseNSCall)
2574 CallOpc = ARMISD::tSECALL;
2575 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2576 CallOpc = ARMISD::CALL_NOLINK;
2577 else
2578 CallOpc = ARMISD::CALL;
2579 } else {
2580 if (!isDirect && !Subtarget->hasV5TOps())
2581 CallOpc = ARMISD::CALL_NOLINK;
2582 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2583 // Emit regular call when code size is the priority
2584 !Subtarget->hasMinSize())
2585 // "mov lr, pc; b _foo" to avoid confusing the RSP
2586 CallOpc = ARMISD::CALL_NOLINK;
2587 else
2588 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2589 }
2590
2591 // We don't usually want to end the call-sequence here because we would tidy
2592 // the frame up *after* the call, however in the ABI-changing tail-call case
2593 // we've carefully laid out the parameters so that when sp is reset they'll be
2594 // in the correct location.
2595 if (isTailCall && !isSibCall) {
2596 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2597 InGlue = Chain.getValue(1);
2598 }
2599
2600 std::vector<SDValue> Ops;
2601 Ops.push_back(Chain);
2602 Ops.push_back(Callee);
2603
2604 if (isTailCall) {
2605 Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32));
2606 }
2607
2608 // Add argument registers to the end of the list so that they are known live
2609 // into the call.
2610 for (const auto &[Reg, N] : RegsToPass)
2611 Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2612
2613 // Add a register mask operand representing the call-preserved registers.
2614 const uint32_t *Mask;
2615 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2616 if (isThisReturn) {
2617 // For 'this' returns, use the R0-preserving mask if applicable
2618 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2619 if (!Mask) {
2620 // Set isThisReturn to false if the calling convention is not one that
2621 // allows 'returned' to be modeled in this way, so LowerCallResult does
2622 // not try to pass 'this' straight through
2623 isThisReturn = false;
2624 Mask = ARI->getCallPreservedMask(MF, CallConv);
2625 }
2626 } else
2627 Mask = ARI->getCallPreservedMask(MF, CallConv);
2628
2629 assert(Mask && "Missing call preserved mask for calling convention");
2630 Ops.push_back(DAG.getRegisterMask(Mask));
2631
2632 if (InGlue.getNode())
2633 Ops.push_back(InGlue);
2634
2635 if (isTailCall) {
2637 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops);
2638 if (CLI.CFIType)
2639 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2640 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2641 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2642 return Ret;
2643 }
2644
2645 // Returns a chain and a flag for retval copy to use.
2646 Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops);
2647 if (CLI.CFIType)
2648 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2649 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2650 InGlue = Chain.getValue(1);
2651 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2652
2653 // If we're guaranteeing tail-calls will be honoured, the callee must
2654 // pop its own argument stack on return. But this call is *not* a tail call so
2655 // we need to undo that after it returns to restore the status-quo.
2656 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2657 uint64_t CalleePopBytes =
2658 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1U;
2659
2660 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2661 if (!Ins.empty())
2662 InGlue = Chain.getValue(1);
2663
2664 // Handle result values, copying them out of physregs into vregs that we
2665 // return.
2666 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2667 InVals, isThisReturn,
2668 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2669}
2670
2671/// HandleByVal - Every parameter *after* a byval parameter is passed
2672/// on the stack. Remember the next parameter register to allocate,
2673/// and then confiscate the rest of the parameter registers to insure
2674/// this.
2675void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2676 Align Alignment) const {
2677 // Byval (as with any stack) slots are always at least 4 byte aligned.
2678 Alignment = std::max(Alignment, Align(4));
2679
// Try to claim the next argument GPR. If none remain, the byval lives
// entirely on the stack and there is nothing to record here.
2680 MCRegister Reg = State->AllocateReg(GPRArgRegs);
2681 if (!Reg)
2682 return;
2683
// Skip ("waste") registers until Reg satisfies the byval's alignment,
// measured in 4-byte register units: e.g. an 8-byte-aligned byval must
// start in an even-numbered register.
2684 unsigned AlignInRegs = Alignment.value() / 4;
2685 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2686 for (unsigned i = 0; i < Waste; ++i)
2687 Reg = State->AllocateReg(GPRArgRegs);
2688
2689 if (!Reg)
2690 return;
2691
// Number of bytes that still fit in the remaining arg registers [Reg, r4).
2692 unsigned Excess = 4 * (ARM::R4 - Reg);
2693
2694 // Special case when NSAA != SP and parameter size greater than size of
2695 // all remained GPR regs. In that case we can't split parameter, we must
2696 // send it to stack. We also must set NCRN to R4, so waste all
2697 // remained registers.
2698 const unsigned NSAAOffset = State->getStackSize();
2699 if (NSAAOffset != 0 && Size > Excess) {
2700 while (State->AllocateReg(GPRArgRegs))
2701 ;
2702 return;
2703 }
2704
2705 // First register for byval parameter is the first register that wasn't
2706 // allocated before this method call, so it would be "reg".
2707 // If parameter is small enough to be saved in range [reg, r4), then
2708 // the end (first after last) register would be reg + param-size-in-regs,
2709 // else parameter would be splitted between registers and stack,
2710 // end register would be r4 in this case.
2711 unsigned ByValRegBegin = Reg;
2712 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2713 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2714 // Note, first register is allocated in the beginning of function already,
2715 // allocate remained amount of registers we need.
2716 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2717 State->AllocateReg(GPRArgRegs);
2718 // A byval parameter that is split between registers and memory needs its
2719 // size truncated here.
2720 // In the case where the entire structure fits in registers, we set the
2721 // size in memory to zero.
2722 Size = std::max<int>(Size - Excess, 0);
2723}
2724
2725/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2726/// for tail call optimization. Targets which want to do tail call
2727/// optimization should implement this function. Note that this function also
2728/// processes musttail calls, so when this function returns false on a valid
2729/// musttail call, a fatal backend error occurs.
2730bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2732 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
2733 CallingConv::ID CalleeCC = CLI.CallConv;
2734 SDValue Callee = CLI.Callee;
2735 bool isVarArg = CLI.IsVarArg;
2736 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2737 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2738 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2739 const SelectionDAG &DAG = CLI.DAG;
2740 MachineFunction &MF = DAG.getMachineFunction();
2741 const Function &CallerF = MF.getFunction();
2742 CallingConv::ID CallerCC = CallerF.getCallingConv();
2743
2744 assert(Subtarget->supportsTailCall());
2745
2746 // Indirect tail-calls require a register to hold the target address. That
2747 // register must be:
2748 // * Allocatable (i.e. r0-r7 if the target is Thumb1).
2749 // * Not callee-saved, so must be one of r0-r3 or r12.
2750 // * Not used to hold an argument to the tail-called function, which might be
2751 // in r0-r3.
2752 // * Not used to hold the return address authentication code, which is in r12
2753 // if enabled.
2754 // Sometimes, no register matches all of these conditions, so we can't do a
2755 // tail-call.
2756 if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
2757 SmallSet<MCPhysReg, 5> AddressRegisters = {ARM::R0, ARM::R1, ARM::R2,
2758 ARM::R3};
2759 if (!(Subtarget->isThumb1Only() ||
2760 MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)))
2761 AddressRegisters.insert(ARM::R12);
// Registers already carrying outgoing arguments are live into the call
// and so cannot also hold the call target.
2762 for (const CCValAssign &AL : ArgLocs)
2763 if (AL.isRegLoc())
2764 AddressRegisters.erase(AL.getLocReg());
2765 if (AddressRegisters.empty()) {
2766 LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n");
2767 return false;
2768 }
2769 }
2770
2771 // Look for obvious safe cases to perform tail call optimization that do not
2772 // require ABI changes. This is what gcc calls sibcall.
2773
2774 // Exception-handling functions need a special set of instructions to indicate
2775 // a return to the hardware. Tail-calling another function would probably
2776 // break this.
2777 if (CallerF.hasFnAttribute("interrupt")) {
2778 LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n");
2779 return false;
2780 }
2781
// Under guaranteed tail-call optimization, tail-calling is permitted
// exactly when the calling conventions of caller and callee match.
2782 if (canGuaranteeTCO(CalleeCC,
2783 getTargetMachine().Options.GuaranteedTailCallOpt)) {
2784 LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false")
2785 << " (guaranteed tail-call CC)\n");
2786 return CalleeCC == CallerCC;
2787 }
2788
2789 // Also avoid sibcall optimization if either caller or callee uses struct
2790 // return semantics.
2791 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
2792 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
2793 if (isCalleeStructRet != isCallerStructRet) {
2794 LLVM_DEBUG(dbgs() << "false (struct-ret)\n");
2795 return false;
2796 }
2797
2798 // Externally-defined functions with weak linkage should not be
2799 // tail-called on ARM when the OS does not support dynamic
2800 // pre-emption of symbols, as the AAELF spec requires normal calls
2801 // to undefined weak functions to be replaced with a NOP or jump to the
2802 // next instruction. The behaviour of branch instructions in this
2803 // situation (as used for tail calls) is implementation-defined, so we
2804 // cannot rely on the linker replacing the tail call with a return.
2805 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2806 const GlobalValue *GV = G->getGlobal();
2807 const Triple &TT = getTargetMachine().getTargetTriple();
2808 if (GV->hasExternalWeakLinkage() &&
2809 (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
2810 TT.isOSBinFormatMachO())) {
2811 LLVM_DEBUG(dbgs() << "false (external weak linkage)\n");
2812 return false;
2813 }
2814 }
2815
2816 // Check that the call results are passed in the same way.
2817 LLVMContext &C = *DAG.getContext();
2819 getEffectiveCallingConv(CalleeCC, isVarArg),
2820 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
2821 CCAssignFnForReturn(CalleeCC, isVarArg),
2822 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) {
2823 LLVM_DEBUG(dbgs() << "false (incompatible results)\n");
2824 return false;
2825 }
2826 // The callee has to preserve all registers the caller needs to preserve.
2827 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2828 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2829 if (CalleeCC != CallerCC) {
2830 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2831 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) {
2832 LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n");
2833 return false;
2834 }
2835 }
2836
2837 // If Caller's vararg argument has been split between registers and stack, do
2838 // not perform tail call, since part of the argument is in caller's local
2839 // frame.
2840 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
2841 if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
2842 LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
2843 return false;
2844 }
2845
2846 // Outgoing values passed in callee-saved registers must match the values
2847 // the caller already holds in those registers (see parametersInCSRMatch).
2848 const MachineRegisterInfo &MRI = MF.getRegInfo();
2849 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
2850 LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
2851 return false;
2852 }
2853
2854 // If the stack arguments for this call do not fit into our own save area then
2855 // the call cannot be made tail.
2856 if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
2857 return false;
2858
2859 LLVM_DEBUG(dbgs() << "true\n");
2860 return true;
2861}
2862
/// Return true if every outgoing return value in \p Outs can be assigned a
/// location (i.e. lowered in registers) under the ARM return-value calling
/// convention for \p CallConv. Used to decide whether sret demotion is needed.
2863bool
2864ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2865 MachineFunction &MF, bool isVarArg,
2867 LLVMContext &Context, const Type *RetTy) const {
2869 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2870 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2871}
2872
2874 const SDLoc &DL, SelectionDAG &DAG) {
2875 const MachineFunction &MF = DAG.getMachineFunction();
2876 const Function &F = MF.getFunction();
2877
// The "interrupt" attribute's string value selects the exception kind.
2878 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
2879
2880 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
2881 // version of the "preferred return address". These offsets affect the return
2882 // instruction if this is a return from PL1 without hypervisor extensions.
2883 // IRQ/FIQ: +4 "subs pc, lr, #4"
2884 // SWI: 0 "subs pc, lr, #0"
2885 // ABORT: +4 "subs pc, lr, #4"
2886 // UNDEF: +4/+2 "subs pc, lr, #0"
2887 // UNDEF varies depending on where the exception came from ARM or Thumb
2888 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
2889
2890 int64_t LROffset;
2891 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
2892 IntKind == "ABORT")
2893 LROffset = 4;
2894 else if (IntKind == "SWI" || IntKind == "UNDEF")
2895 LROffset = 0;
2896 else
2897 report_fatal_error("Unsupported interrupt attribute. If present, value "
2898 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2899
// Insert the LR adjustment right after the chain operand (operand #0), so
// the return is emitted as "subs pc, lr, #LROffset".
2900 RetOps.insert(RetOps.begin() + 1,
2901 DAG.getConstant(LROffset, DL, MVT::i32, false));
2902
2903 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
2904}
2905
/// Lower an IR 'ret' by copying the return values into their assigned
/// physical registers and emitting ARMISD::RET_GLUE (or SERET_GLUE for a
/// CMSE nonsecure entry function, or INTRET_GLUE for interrupt handlers).
2906SDValue
2907ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2908 bool isVarArg,
2910 const SmallVectorImpl<SDValue> &OutVals,
2911 const SDLoc &dl, SelectionDAG &DAG) const {
2912 // CCValAssign - represent the assignment of the return value to a location.
2914
2915 // CCState - Info about the registers and stack slots.
2916 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2917 *DAG.getContext());
2918
2919 // Analyze outgoing return values.
2920 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2921
2922 SDValue Glue;
2924 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2925 bool isLittleEndian = Subtarget->isLittle();
2926
2927 MachineFunction &MF = DAG.getMachineFunction();
2928 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2929 AFI->setReturnRegsCount(RVLocs.size());
2930
2931 // Report error if cmse entry function returns structure through first ptr arg.
2932 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
2933 // Note: using an empty SDLoc(), as the first line of the function is a
2934 // better place to report than the last line.
2935 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2937 "secure entry function would return value through pointer",
2938 SDLoc().getDebugLoc()));
2939 }
2940
2941 // Copy the result values into the output registers.
// 'i' walks RVLocs and may advance faster than 'realRVLocIdx' (which walks
// OutVals) when one value is split across several registers below.
2942 for (unsigned i = 0, realRVLocIdx = 0;
2943 i != RVLocs.size();
2944 ++i, ++realRVLocIdx) {
2945 CCValAssign &VA = RVLocs[i];
2946 assert(VA.isRegLoc() && "Can only return in registers!");
2947
2948 SDValue Arg = OutVals[realRVLocIdx];
2949 bool ReturnF16 = false;
2950
2951 if (Subtarget->hasFullFP16() && getTM().isTargetHardFloat()) {
2952 // Half-precision return values can be returned like this:
2953 //
2954 // t11 f16 = fadd ...
2955 // t12: i16 = bitcast t11
2956 // t13: i32 = zero_extend t12
2957 // t14: f32 = bitcast t13 <~~~~~~~ Arg
2958 //
2959 // to avoid code generation for bitcasts, we simply set Arg to the node
2960 // that produces the f16 value, t11 in this case.
2961 //
2962 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
2963 SDValue ZE = Arg.getOperand(0);
2964 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
2965 SDValue BC = ZE.getOperand(0);
2966 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
2967 Arg = BC.getOperand(0);
2968 ReturnF16 = true;
2969 }
2970 }
2971 }
2972 }
2973
2974 switch (VA.getLocInfo()) {
2975 default: llvm_unreachable("Unknown loc info!");
2976 case CCValAssign::Full: break;
2977 case CCValAssign::BCvt:
2978 if (!ReturnF16)
2979 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2980 break;
2981 }
2982
2983 // Mask f16 arguments if this is a CMSE nonsecure entry.
// Only the low 16 bits are the return value; clear the rest so no secure
// state leaks to the nonsecure caller through the upper bits.
2984 auto RetVT = Outs[realRVLocIdx].ArgVT;
2985 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
2986 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
2987 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2988 } else {
2989 auto LocBits = VA.getLocVT().getSizeInBits();
2990 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
2991 SDValue Mask =
2992 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2993 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2994 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2995 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2996 }
2997 }
2998
// f64 (and each f64 half of v2f64) is returned in a pair of i32 GPRs via
// ARMISD::VMOVRRD; endianness selects which half goes in the lower register.
2999 if (VA.needsCustom() &&
3000 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3001 if (VA.getLocVT() == MVT::v2f64) {
3002 // Extract the first half and return it in two registers.
3003 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3004 DAG.getConstant(0, dl, MVT::i32));
3005 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3006 DAG.getVTList(MVT::i32, MVT::i32), Half);
3007
3008 Chain =
3009 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3010 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3011 Glue = Chain.getValue(1);
3012 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3013 VA = RVLocs[++i]; // skip ahead to next loc
3014 Chain =
3015 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3016 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3017 Glue = Chain.getValue(1);
3018 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3019 VA = RVLocs[++i]; // skip ahead to next loc
3020
3021 // Extract the 2nd half and fall through to handle it as an f64 value.
3022 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3023 DAG.getConstant(1, dl, MVT::i32));
3024 }
3025 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3026 // available.
3027 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3028 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3029 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3030 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3031 Glue = Chain.getValue(1);
3032 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3033 VA = RVLocs[++i]; // skip ahead to next loc
3034 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3035 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3036 } else
3037 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3038
3039 // Guarantee that all emitted copies are
3040 // stuck together, avoiding something bad.
3041 Glue = Chain.getValue(1);
3042 RetOps.push_back(DAG.getRegister(
3043 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3044 }
// Add any registers the target asks to be preserved via explicit copies
// (queried through getCalleeSavedRegsViaCopy) as implicit return operands.
3045 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3046 const MCPhysReg *I =
3047 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3048 if (I) {
3049 for (; *I; ++I) {
3050 if (ARM::GPRRegClass.contains(*I))
3051 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3052 else if (ARM::DPRRegClass.contains(*I))
3054 else
3055 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3056 }
3057 }
3058
3059 // Update chain and glue.
3060 RetOps[0] = Chain;
3061 if (Glue.getNode())
3062 RetOps.push_back(Glue);
3063
3064 // CPUs which aren't M-class use a special sequence to return from
3065 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3066 // though we use "subs pc, lr, #N").
3067 //
3068 // M-class CPUs actually use a normal return sequence with a special
3069 // (hardware-provided) value in LR, so the normal code path works.
3070 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3071 !Subtarget->isMClass()) {
3072 if (Subtarget->isThumb1Only())
3073 report_fatal_error("interrupt attribute is not supported in Thumb1");
3074 return LowerInterruptReturn(RetOps, dl, DAG);
3075 }
3076
3077 unsigned RetNode =
3078 AFI->isCmseNSEntryFunction() ? ARMISD::SERET_GLUE : ARMISD::RET_GLUE;
3079 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3080}
3081
3082bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3083 if (N->getNumValues() != 1)
3084 return false;
3085 if (!N->hasNUsesOfValue(1, 0))
3086 return false;
3087
3088 SDValue TCChain = Chain;
3089 SDNode *Copy = *N->user_begin();
3090 if (Copy->getOpcode() == ISD::CopyToReg) {
3091 // If the copy has a glue operand, we conservatively assume it isn't safe to
3092 // perform a tail call.
3093 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3094 return false;
3095 TCChain = Copy->getOperand(0);
3096 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3097 SDNode *VMov = Copy;
3098 // f64 returned in a pair of GPRs.
3099 SmallPtrSet<SDNode*, 2> Copies;
3100 for (SDNode *U : VMov->users()) {
3101 if (U->getOpcode() != ISD::CopyToReg)
3102 return false;
3103 Copies.insert(U);
3104 }
3105 if (Copies.size() > 2)
3106 return false;
3107
3108 for (SDNode *U : VMov->users()) {
3109 SDValue UseChain = U->getOperand(0);
3110 if (Copies.count(UseChain.getNode()))
3111 // Second CopyToReg
3112 Copy = U;
3113 else {
3114 // We are at the top of this chain.
3115 // If the copy has a glue operand, we conservatively assume it
3116 // isn't safe to perform a tail call.
3117 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3118 return false;
3119 // First CopyToReg
3120 TCChain = UseChain;
3121 }
3122 }
3123 } else if (Copy->getOpcode() == ISD::BITCAST) {
3124 // f32 returned in a single GPR.
3125 if (!Copy->hasOneUse())
3126 return false;
3127 Copy = *Copy->user_begin();
3128 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3129 return false;
3130 // If the copy has a glue operand, we conservatively assume it isn't safe to
3131 // perform a tail call.
3132 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3133 return false;
3134 TCChain = Copy->getOperand(0);
3135 } else {
3136 return false;
3137 }
3138
3139 bool HasRet = false;
3140 for (const SDNode *U : Copy->users()) {
3141 if (U->getOpcode() != ARMISD::RET_GLUE &&
3142 U->getOpcode() != ARMISD::INTRET_GLUE)
3143 return false;
3144 HasRet = true;
3145 }
3146
3147 if (!HasRet)
3148 return false;
3149
3150 Chain = TCChain;
3151 return true;
3152}
3153
3154bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3155 if (!Subtarget->supportsTailCall())
3156 return false;
3157
3158 if (!CI->isTailCall())
3159 return false;
3160
3161 return true;
3162}
3163
3164// Trying to write a 64 bit value so need to split into two 32 bit values first,
3165// and pass the lower and high parts through.
3167 SDLoc DL(Op);
3168 SDValue WriteValue = Op->getOperand(2);
3169
3170 // This function is only supposed to be called for i64 type argument.
3171 assert(WriteValue.getValueType() == MVT::i64
3172 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3173
3174 SDValue Lo, Hi;
3175 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
// Keep the chain (operand 0) and register identifier (operand 1), replacing
// the i64 payload with its two i32 halves.
3176 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3177 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3178}
3179
3180// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3181// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3182// one of the above mentioned nodes. It has to be wrapped because otherwise
3183// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3184// be used to form addressing mode. These wrapped nodes will be selected
3185// into MOVi.
3186SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3187 SelectionDAG &DAG) const {
3188 EVT PtrVT = Op.getValueType();
3189 // FIXME there is no actual debug info here
3190 SDLoc dl(Op);
3191 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3192 SDValue Res;
3193
3194 // When generating execute-only code Constant Pools must be promoted to the
3195 // global data section. It's a bit ugly that we can't share them across basic
3196 // blocks, but this way we guarantee that execute-only behaves correct with
3197 // position-independent addressing modes.
3198 if (Subtarget->genExecuteOnly()) {
3199 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3200 auto *T = CP->getType();
3201 auto C = const_cast<Constant*>(CP->getConstVal());
3202 auto M = DAG.getMachineFunction().getFunction().getParent();
// Promote the constant into an internal-linkage global with a unique,
// per-function name, then lower it like any other global address.
3203 auto GV = new GlobalVariable(
3204 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3205 Twine(DAG.getDataLayout().getInternalSymbolPrefix()) + "CP" +
3206 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3207 Twine(AFI->createPICLabelUId()));
3209 dl, PtrVT);
3210 return LowerGlobalAddress(GA, DAG);
3211 }
3212
3213 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3214 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3215 Align CPAlign = CP->getAlign();
3216 if (Subtarget->isThumb1Only())
3217 CPAlign = std::max(CPAlign, Align(4));
3219 Res =
3220 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3221 else
3222 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3223 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3224}
3225
3227 // If we don't have a 32-bit pc-relative branch instruction then the jump
3228 // table consists of block addresses. Usually this is inline, but for
3229 // execute-only it must be placed out-of-line.
// Without v8-M baseline ops, execute-only code lacks the 32-bit pc-relative
// branch, so the inline jump-table encoding is unavailable here.
3230 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3233}
3234
/// Lower a blockaddress reference by materializing it through a constant-pool
/// load; position-independent (or ROPI) code additionally adds the PC via
/// ARMISD::PIC_ADD to form the final address.
3235SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3236 SelectionDAG &DAG) const {
3239 unsigned ARMPCLabelIndex = 0;
3240 SDLoc DL(Op);
3241 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3242 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3243 SDValue CPAddr;
3244 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3245 if (!IsPositionIndependent) {
3246 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3247 } else {
// PC reads ahead of the executing instruction (8 bytes in ARM state, 4 in
// Thumb), so the constant-pool entry compensates with this adjustment.
3248 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3249 ARMPCLabelIndex = AFI->createPICLabelUId();
3251 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3252 ARMCP::CPBlockAddress, PCAdj);
3253 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3254 }
3255 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3256 SDValue Result = DAG.getLoad(
3257 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3259 if (!IsPositionIndependent)
3260 return Result;
3261 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3262 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3263}
3264
3265/// Convert a TLS address reference into the correct sequence of loads
3266/// and calls to compute the variable's address for Darwin, and return an
3267/// SDValue containing the final node.
3268
3269/// Darwin only has one TLS scheme which must be capable of dealing with the
3270/// fully general situation, in the worst case. This means:
3271/// + "extern __thread" declaration.
3272/// + Defined in a possibly unknown dynamic library.
3273///
3274/// The general system is that each __thread variable has a [3 x i32] descriptor
3275/// which contains information used by the runtime to calculate the address. The
3276/// only part of this the compiler needs to know about is the first word, which
3277/// contains a function pointer that must be called with the address of the
3278/// entire descriptor in "r0".
3279///
3280/// Since this descriptor may be in a different unit, in general access must
3281/// proceed along the usual ARM rules. A common sequence to produce is:
3282///
3283/// movw rT1, :lower16:_var$non_lazy_ptr
3284/// movt rT1, :upper16:_var$non_lazy_ptr
3285/// ldr r0, [rT1]
3286/// ldr rT2, [r0]
3287/// blx rT2
3288/// [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(getTargetMachine().getTargetTriple().isOSDarwin() &&
         "This function expects a Darwin target");
  SDLoc DL(Op);

  // First step is to get the address of the actual global symbol. This is
  // where the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      MVT::i32, DL, Chain, DescAddr,
  Chain = FuncTLVGet.getValue(1);

  MachineFunction &F = DAG.getMachineFunction();
  MachineFrameInfo &MFI = F.getFrameInfo();
  // The call below adjusts the stack; record that for frame lowering.
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
  // silly).
  auto TRI =
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call. This is just a degenerate version of a
  // normal ARM call node: r0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
  Chain =
      DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
}
3332
SDValue
ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(getTargetMachine().getTargetTriple().isOSWindows() &&
         "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  // Load the current TEB (thread environment block).
  // The constants below are the coprocessor operands of
  // "mrc p15, #0, <Rt>, c13, c0, #2", which reads the register holding the
  // TEB pointer.
  SDValue Ops[] = {Chain,
                   DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                   DAG.getTargetConstant(15, DL, MVT::i32),
                   DAG.getTargetConstant(0, DL, MVT::i32),
                   DAG.getTargetConstant(13, DL, MVT::i32),
                   DAG.getTargetConstant(0, DL, MVT::i32),
                   DAG.getTargetConstant(2, DL, MVT::i32)};
  SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                   DAG.getVTList(MVT::i32, MVT::Other), Ops);

  SDValue TEB = CurrentTEB.getValue(0);
  Chain = CurrentTEB.getValue(1);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x2c from the TEB.
  SDValue TLSArray =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
  // offset into the TLSArray.

  // Load the TLS index from the C runtime
  SDValue TLSIndex =
      DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
  TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
  TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());

  // Slot = TLSIndex * 4 (shift left by 2).
  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                             DAG.getConstant(2, DL, MVT::i32));
  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
                            MachinePointerInfo());

  // Get the offset of the start of the .tls section (section base)
  const auto *GA = cast<GlobalAddressSDNode>(Op);
  auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
  SDValue Offset = DAG.getLoad(
      PtrVT, DL, Chain,
      DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
                  DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),

  // The variable's address is the thread's TLS area plus its SECREL offset.
  return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
}
3389
// Lower ISD::GlobalTLSAddress using the "general dynamic" model
SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(GA);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // pc reads 4 (Thumb) or 8 (ARM) bytes ahead of the reading instruction.
  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  // Build a TLSGD constant-pool entry for the variable, pc-adjusted and
  // tagged with a fresh pic label.
  ARMConstantPoolValue *CPV =
    ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                    ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
  Argument = DAG.getLoad(
      PtrVT, dl, DAG.getEntryNode(), Argument,
  SDValue Chain = Argument.getValue(1);

  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);

  // call __tls_get_addr.
  Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext()));

  // FIXME: is there useful debug info available here?
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
      DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));

  // The runtime call returns the variable's address directly.
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.first;
}
3426
// Lower ISD::GlobalTLSAddress using the "initial exec" or
// "local exec" model.
SDValue
ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                        SelectionDAG &DAG,
                                        TLSModel::Model model) const {
  const GlobalValue *GV = GA->getGlobal();
  SDLoc dl(GA);
  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // Get the Thread Pointer
  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);

  if (model == TLSModel::InitialExec) {
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    // Initial exec model.
    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                      true);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    // First load: fetch the pc-relative pool entry.
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
    Chain = Offset.getValue(1);

    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);

    // Second load: initial-exec goes through one more level of indirection
    // to obtain the variable's offset from the thread pointer.
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
  } else {
    // local exec model
    assert(model == TLSModel::LocalExec);
    ARMConstantPoolValue *CPV =
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}
3480
SDValue
ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  // Targets configured for emulated TLS take a separate, generic path.
  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  // Darwin and Windows each have their own single TLS scheme.
  const Triple &TT = getTargetMachine().getTargetTriple();
  if (TT.isOSDarwin())
    return LowerGlobalTLSAddressDarwin(Op, DAG);

  if (TT.isOSWindows())
    return LowerGlobalTLSAddressWindows(Op, DAG);

  // TODO: implement the "local dynamic" model
  assert(TT.isOSBinFormatELF() && "Only ELF implemented here");

  // ELF: dispatch on the TLS model chosen for this global.
  switch (model) {
      return LowerToTLSGeneralDynamicModel(GA, DAG);
      return LowerToTLSExecModels(GA, DAG, model);
  }
  llvm_unreachable("bogus TLS model");
}
3508
3509/// Return true if all users of V are within function F, looking through
3510/// ConstantExprs.
3511static bool allUsersAreInFunction(const Value *V, const Function *F) {
3512 SmallVector<const User*,4> Worklist(V->users());
3513 while (!Worklist.empty()) {
3514 auto *U = Worklist.pop_back_val();
3515 if (isa<ConstantExpr>(U)) {
3516 append_range(Worklist, U->users());
3517 continue;
3518 }
3519
3520 auto *I = dyn_cast<Instruction>(U);
3521 if (!I || I->getParent()->getParent() != F)
3522 return false;
3523 }
3524 return true;
3525}
3526
    const GlobalValue *GV, SelectionDAG &DAG,
    EVT PtrVT, const SDLoc &dl) {
  // If we're creating a pool entry for a constant global with unnamed address,
  // and the global is small enough, we can emit it inline into the constant pool
  // to save ourselves an indirection.
  //
  // This is a win if the constant is only used in one function (so it doesn't
  // need to be duplicated) or duplicating the constant wouldn't increase code
  // size (implying the constant is no larger than 4 bytes).
  const Function &F = DAG.getMachineFunction().getFunction();

  // We rely on this decision to inline being idempotent and unrelated to the
  // use-site. We know that if we inline a variable at one use site, we'll
  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
  // doesn't know about this optimization, so bail out if it's enabled else
  // we could decide to inline here (and thus never emit the GV) but require
  // the GV from fast-isel generated code.
    return SDValue();

  // Only local, constant, unnamed_addr globals with an initializer are
  // candidates for inlining.
  auto *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer() ||
      !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
      !GVar->hasLocalLinkage())
    return SDValue();

  // If we inline a value that contains relocations, we move the relocations
  // from .data to .text. This is not allowed in position-independent code.
  auto *Init = GVar->getInitializer();
  if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
      Init->needsDynamicRelocation())
    return SDValue();

  // The constant islands pass can only really deal with alignment requests
  // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
  // any type wanting greater alignment requirements than 4 bytes. We also
  // can only promote constants that are multiples of 4 bytes in size or
  // are paddable to a multiple of 4. Currently we only try and pad constants
  // that are strings for simplicity.
  auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
  unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
  Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
  unsigned RequiredPadding = 4 - (Size % 4);
  bool PaddingPossible =
    RequiredPadding == 4 || (CDAInit && CDAInit->isString());
  if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
      Size == 0)
    return SDValue();

  unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // We can't bloat the constant pool too much, else the ConstantIslands pass
  // may fail to converge. If we haven't promoted this global yet (it may have
  // multiple uses), and promoting it would increase the constant pool size (Sz
  // > 4), ensure we have space to do so up to MaxTotal.
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
    if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
      return SDValue();

  // This is only valid if all users are in a single function; we can't clone
  // the constant in general. The LLVM IR unnamed_addr allows merging
  // constants, but not cloning them.
  //
  // We could potentially allow cloning if we could prove all uses of the
  // constant in the current function don't care about the address, like
  // printf format strings. But that isn't implemented for now.
  if (!allUsersAreInFunction(GVar, &F))
    return SDValue();

  // We're going to inline this global. Pad it out if needed.
  if (RequiredPadding != 4) {
    StringRef S = CDAInit->getAsString();

    std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
    while (RequiredPadding--)
      V.push_back(0);
  }

  auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
  SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
  // Record the promotion the first time this global is seen so the pool-size
  // accounting above stays accurate across multiple uses.
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
                                      PaddedSize - 4);
  }
  ++NumConstpoolPromoted;
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
}
3622
  // Look through aliases: an alias is read-only iff its aliasee object is.
  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
    if (!(GV = GA->getAliaseeObject()))
      return false;
  if (const auto *V = dyn_cast<GlobalVariable>(GV))
    return V->isConstant();
  // Function code is read-only; anything else is not known to be.
  return isa<Function>(GV);
}
3631
3632SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3633 SelectionDAG &DAG) const {
3634 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3635 default: llvm_unreachable("unknown object format");
3636 case Triple::COFF:
3637 return LowerGlobalAddressWindows(Op, DAG);
3638 case Triple::ELF:
3639 return LowerGlobalAddressELF(Op, DAG);
3640 case Triple::MachO:
3641 return LowerGlobalAddressDarwin(Op, DAG);
3642 }
3643}
3644
SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc dl(Op);
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  bool IsRO = isReadOnly(GV);

  // promoteToConstantPool only if not generating XO text section
  if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
    if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
      return V;

  if (isPositionIndependent()) {
        GV, dl, PtrVT, 0, GV->isDSOLocal() ? 0 : ARMII::MO_GOT);
    SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
    // Non-DSO-local symbols need an extra load through their GOT entry.
    if (!GV->isDSOLocal())
      Result =
          DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
    return Result;
  } else if (Subtarget->isROPI() && IsRO) {
    // PC-relative.
    SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
    SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
    return Result;
  } else if (Subtarget->isRWPI() && !IsRO) {
    // SB-relative.
    SDValue RelAddr;
    if (Subtarget->useMovt()) {
      ++NumMovwMovt;
      SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
      RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
    } else { // use literal pool for address constant
      ARMConstantPoolValue *CPV =
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      RelAddr = DAG.getLoad(
          PtrVT, dl, DAG.getEntryNode(), CPAddr,
    }
    // R9 holds the static base (SB) in RWPI mode; add the SB-relative offset.
    SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
    SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
    return Result;
  }

  // If we have T2 ops, we can materialize the address directly via movt/movw
  // pair. This is always cheaper. If need to generate Execute Only code, and we
  // only have Thumb1 available, we can't use a constant pool and are forced to
  // use immediate relocations.
  if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
    if (Subtarget->useMovt())
      ++NumMovwMovt;
    // FIXME: Once remat is capable of dealing with instructions with register
    // operands, expand this into two nodes.
    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
  } else {
    SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    return DAG.getLoad(
        PtrVT, dl, DAG.getEntryNode(), CPAddr,
  }
}
3711
SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported for Darwin");
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc dl(Op);
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

  if (Subtarget->useMovt())
    ++NumMovwMovt;

  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into multiple nodes
  unsigned Wrapper =
      isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;

  SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
  SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);

  // Symbols reached via a non-lazy pointer need one extra load.
  if (Subtarget->isGVIndirectSymbol(GV))
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
  return Result;
}
3736
SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
                                                     SelectionDAG &DAG) const {
  assert(getTargetMachine().getTargetTriple().isOSWindows() &&
         "non-Windows COFF is not supported");
  assert(Subtarget->useMovt() &&
         "Windows on ARM expects to use movw/movt");
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported for Windows");

  const TargetMachine &TM = getTargetMachine();
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  // dllimport'ed and non-DSO-local symbols are reached indirectly through a
  // stub pointer rather than directly.
  ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
  if (GV->hasDLLImportStorageClass())
    TargetFlags = ARMII::MO_DLLIMPORT;
  else if (!TM.shouldAssumeDSOLocal(GV))
    TargetFlags = ARMII::MO_COFFSTUB;
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  ++NumMovwMovt;

  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into two nodes.
  Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
                       DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
                                                  TargetFlags));
  // Stub-based accesses load the real address out of the stub slot.
  if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
  return Result;
}
3769
3770SDValue
3771ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3772 SDLoc dl(Op);
3773 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3774 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3775 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3776 Op.getOperand(1), Val);
3777}
3778
3779SDValue
3780ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3781 SDLoc dl(Op);
3782 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3783 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3784}
3785
3786SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3787 SelectionDAG &DAG) const {
3788 SDLoc dl(Op);
3789 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3790 Op.getOperand(0));
3791}
3792
SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
    SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
  // If operand 0 is the chain (MVT::Other) the intrinsic id is operand 1,
  // otherwise it is operand 0 — the comparison yields the right index.
  unsigned IntNo =
      Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
  switch (IntNo) {
  default:
    return SDValue(); // Don't custom lower most intrinsics.
  case Intrinsic::arm_gnu_eabi_mcount: {
    MachineFunction &MF = DAG.getMachineFunction();
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDLoc dl(Op);
    SDValue Chain = Op.getOperand(0);
    // call "\01__gnu_mcount_nc"
    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
    const uint32_t *Mask =
    assert(Mask && "Missing call preserved mask for calling convention");
    // Mark LR an implicit live-in.
    Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
    SDValue ReturnAddress =
        DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
    constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
    SDValue Callee =
        DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
    // Thumb variant additionally carries the predicate operand pair
    // (condition code + predicate register).
    if (Subtarget->isThumb())
      return SDValue(
          DAG.getMachineNode(
              ARM::tBL_PUSHLR, dl, ResultTys,
              {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
               DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
          0);
    return SDValue(
        DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
                           {ReturnAddress, Callee, RegisterMask, Chain}),
        0);
  }
  }
}
3832
SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                                           const ARMSubtarget *Subtarget) const {
  // Custom-lower the chainless ARM intrinsics that map onto generic or
  // target-specific DAG nodes; everything else is left to the default path.
  unsigned IntNo = Op.getConstantOperandVal(0);
  SDLoc dl(Op);
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.
  case Intrinsic::thread_pointer: {
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
  }
  case Intrinsic::arm_cls: {
    // Note: arm_cls and arm_cls64 intrinsics are expanded directly here
    // in LowerINTRINSIC_WO_CHAIN since there's no native scalar CLS
    // instruction.
    const SDValue &Operand = Op.getOperand(1);
    const EVT VTy = Op.getValueType();
    return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
  }
  case Intrinsic::arm_cls64: {
    // arm_cls64 returns i32 but takes i64 input.
    // Use ISD::CTLS for i64 and truncate the result.
    SDValue CTLS64 = DAG.getNode(ISD::CTLS, dl, MVT::i64, Op.getOperand(1));
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, CTLS64);
  }
  case Intrinsic::arm_neon_vcls:
  case Intrinsic::arm_mve_vcls: {
    // Lower vector CLS intrinsics to ISD::CTLS.
    // Vector CTLS is Legal when NEON/MVE is available (set elsewhere).
    const EVT VTy = Op.getValueType();
    return DAG.getNode(ISD::CTLS, dl, VTy, Op.getOperand(1));
  }
  case Intrinsic::eh_sjlj_lsda: {
    // Materialise the address of this function's LSDA via a constant-pool
    // entry (pc-relative plus PIC_ADD when position independent).
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDValue CPAddr;
    bool IsPositionIndependent = isPositionIndependent();
    unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
                                      ARMCP::CPLSDA, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    SDValue Result = DAG.getLoad(
        PtrVT, dl, DAG.getEntryNode(), CPAddr,

    if (IsPositionIndependent) {
      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
      Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
    }
    return Result;
  }
  case Intrinsic::arm_neon_vabs:
    return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::arm_neon_vabds:
    // vabds is only mapped to ABDS for integer types.
    if (Op.getValueType().isInteger())
      return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
    return SDValue();
  case Intrinsic::arm_neon_vabdu:
    return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::arm_neon_vmulls:
  case Intrinsic::arm_neon_vmullu: {
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
                                  ? ARMISD::VMULLs : ARMISD::VMULLu;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vminnm:
  case Intrinsic::arm_neon_vmaxnm: {
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
                                  ? ISD::FMINNUM : ISD::FMAXNUM;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vminu:
  case Intrinsic::arm_neon_vmaxu: {
    if (Op.getValueType().isFloatingPoint())
      return SDValue();
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
                                  ? ISD::UMIN : ISD::UMAX;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vmins:
  case Intrinsic::arm_neon_vmaxs: {
    // v{min,max}s is overloaded between signed integers and floats.
    if (!Op.getValueType().isFloatingPoint()) {
      unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
                                    ? ISD::SMIN : ISD::SMAX;
      return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
    }
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
                                  ? ISD::FMINIMUM : ISD::FMAXIMUM;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vtbl1:
    return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::arm_neon_vtbl2:
    return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::arm_mve_pred_i2v:
  case Intrinsic::arm_mve_pred_v2i:
    return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::arm_mve_vreinterpretq:
    return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::arm_mve_lsll:
    return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::arm_mve_asrl:
    return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::arm_mve_vsli:
    return DAG.getNode(ARMISD::VSLIIMM, SDLoc(Op), Op->getVTList(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::arm_mve_vsri:
    return DAG.getNode(ARMISD::VSRIIMM, SDLoc(Op), Op->getVTList(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  }
}
3963
                                 const ARMSubtarget *Subtarget) {
  SDLoc dl(Op);
  // A fence scoped to a single thread needs no hardware barrier at all.
  auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
  if (SSID == SyncScope::SingleThread)
    return Op;

  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
           "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
                       DAG.getConstant(0, dl, MVT::i32));
  }

  AtomicOrdering Ord =
      static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
  if (Subtarget->isMClass()) {
    // Only a full system barrier exists in the M-class architectures.
  } else if (Subtarget->preferISHSTBarriers() &&
             Ord == AtomicOrdering::Release) {
    // Swift happens to implement ISHST barriers in a way that's compatible with
    // Release semantics but weaker than ISH so we'd be fools not to use
    // it. Beware: other processors probably don't!
  }

  // Emit a DMB with the selected barrier domain operand.
  return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
                     DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
                     DAG.getConstant(Domain, dl, MVT::i32));
}
3999
                             const ARMSubtarget *Subtarget) {
  // ARM pre v5TE and Thumb1 does not have preload instructions.
  if (!(Subtarget->isThumb2() ||
        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
    // Just preserve the chain.
    return Op.getOperand(0);

  SDLoc dl(Op);
  // Operand 2 is the rw flag; invert bit 0 so 1 means "read".
  unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
  if (!isRead &&
      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
    // ARMv7 with MP extension has PLDW.
    return Op.getOperand(0);

  unsigned isData = Op.getConstantOperandVal(4);
  if (Subtarget->isThumb()) {
    // Invert the bits.
    isRead = ~isRead & 1;
    isData = ~isData & 1;
  }

  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
                     DAG.getConstant(isData, dl, MVT::i32));
}
4026
  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();

  // vastart just stores the address of the VarArgsFrameIndex slot into the
  // memory location argument.
  SDLoc dl(Op);
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  // Operand 1 is the va_list pointer; store the frame address through it.
  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}
4040
SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
                                                CCValAssign &NextVA,
                                                SDValue &Root,
                                                SelectionDAG &DAG,
                                                const SDLoc &dl) const {
  // Reassemble an f64 formal argument that the calling convention split into
  // two i32 locations (two registers, or a register plus a stack slot).
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  const TargetRegisterClass *RC;
  if (AFI->isThumb1OnlyFunction())
    RC = &ARM::tGPRRegClass;
  else
    RC = &ARM::GPRRegClass;

  // Transform the arguments stored in physical registers into virtual ones.
  Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);

  SDValue ArgValue2;
  if (NextVA.isMemLoc()) {
    // Second half lives on the stack: make a fixed object and load from it.
    MachineFrameInfo &MFI = MF.getFrameInfo();
    int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);

    // Create load node to retrieve arguments from the stack.
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
    ArgValue2 = DAG.getLoad(
        MVT::i32, dl, Root, FIN,
  } else {
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  }
  // Big-endian targets hold the two halves in the opposite order.
  if (!Subtarget->isLittle())
    std::swap (ArgValue, ArgValue2);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
}
4077
// The remaining GPRs hold either the beginning of variable-argument
// data, or the beginning of an aggregate passed by value (usually
// byval). Either way, we allocate stack slots adjacent to the data
// provided by our caller, and store the unallocated registers there.
// If this is a variadic function, the va_list pointer will begin with
// these values; otherwise, this reassembles a (byval) structure that
// was split between registers and memory.
// Returns the frame index that the registers were stored into.
4086int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4087 const SDLoc &dl, SDValue &Chain,
4088 const Value *OrigArg,
4089 unsigned InRegsParamRecordIdx,
4090 int ArgOffset, unsigned ArgSize) const {
4091 // Currently, two use-cases possible:
4092 // Case #1. Non-var-args function, and we meet first byval parameter.
4093 // Setup first unallocated register as first byval register;
4094 // eat all remained registers
4095 // (these two actions are performed by HandleByVal method).
4096 // Then, here, we initialize stack frame with
4097 // "store-reg" instructions.
4098 // Case #2. Var-args function, that doesn't contain byval parameters.
4099 // The same: eat all remained unallocated registers,
4100 // initialize stack frame.
4101
4102 MachineFunction &MF = DAG.getMachineFunction();
4103 MachineFrameInfo &MFI = MF.getFrameInfo();
4104 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4105 unsigned RBegin, REnd;
4106 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4107 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4108 } else {
4109 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4110 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4111 REnd = ARM::R4;
4112 }
4113
4114 if (REnd != RBegin)
4115 ArgOffset = -4 * (ARM::R4 - RBegin);
4116
4117 auto PtrVT = getPointerTy(DAG.getDataLayout());
4118 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4119 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4120
4122 const TargetRegisterClass *RC =
4123 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4124
4125 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4126 Register VReg = MF.addLiveIn(Reg, RC);
4127 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4128 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4129 MachinePointerInfo(OrigArg, 4 * i));
4130 MemOps.push_back(Store);
4131 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4132 }
4133
4134 if (!MemOps.empty())
4135 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4136 return FrameIndex;
4137}
4138
4139// Setup stack frame, the va_list pointer will start from.
4140void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4141 const SDLoc &dl, SDValue &Chain,
4142 unsigned ArgOffset,
4143 unsigned TotalArgRegsSaveSize,
4144 bool ForceMutable) const {
4145 MachineFunction &MF = DAG.getMachineFunction();
4146 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4147
4148 // Try to store any remaining integer argument regs
4149 // to their spots on the stack so that they may be loaded by dereferencing
4150 // the result of va_next.
4151 // If there is no regs to be stored, just point address after last
4152 // argument passed via stack.
4153 int FrameIndex = StoreByValRegs(
4154 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4155 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4156 AFI->setVarArgsFrameIndex(FrameIndex);
4157}
4158
4159bool ARMTargetLowering::splitValueIntoRegisterParts(
4160 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4161 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4162 EVT ValueVT = Val.getValueType();
4163 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4164 unsigned ValueBits = ValueVT.getSizeInBits();
4165 unsigned PartBits = PartVT.getSizeInBits();
4166 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4167 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4168 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4169 Parts[0] = Val;
4170 return true;
4171 }
4172 return false;
4173}
4174
4175SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4176 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4177 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4178 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4179 unsigned ValueBits = ValueVT.getSizeInBits();
4180 unsigned PartBits = PartVT.getSizeInBits();
4181 SDValue Val = Parts[0];
4182
4183 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4184 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4185 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4186 return Val;
4187 }
4188 return SDValue();
4189}
4190
4191SDValue ARMTargetLowering::LowerFormalArguments(
4192 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4193 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4194 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4195 MachineFunction &MF = DAG.getMachineFunction();
4196 MachineFrameInfo &MFI = MF.getFrameInfo();
4197
4198 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4199
4200 // Assign locations to all of the incoming arguments.
4202 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4203 *DAG.getContext());
4204 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4205
4207 unsigned CurArgIdx = 0;
4208
4209 // Initially ArgRegsSaveSize is zero.
4210 // Then we increase this value each time we meet byval parameter.
4211 // We also increase this value in case of varargs function.
4212 AFI->setArgRegsSaveSize(0);
4213
4214 // Calculate the amount of stack space that we need to allocate to store
4215 // byval and variadic arguments that are passed in registers.
4216 // We need to know this before we allocate the first byval or variadic
4217 // argument, as they will be allocated a stack slot below the CFA (Canonical
4218 // Frame Address, the stack pointer at entry to the function).
4219 unsigned ArgRegBegin = ARM::R4;
4220 for (const CCValAssign &VA : ArgLocs) {
4221 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4222 break;
4223
4224 unsigned Index = VA.getValNo();
4225 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4226 if (!Flags.isByVal())
4227 continue;
4228
4229 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4230 unsigned RBegin, REnd;
4231 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4232 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4233
4234 CCInfo.nextInRegsParam();
4235 }
4236 CCInfo.rewindByValRegsInfo();
4237
4238 int lastInsIndex = -1;
4239 if (isVarArg && MFI.hasVAStart()) {
4240 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4241 if (RegIdx != std::size(GPRArgRegs))
4242 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4243 }
4244
4245 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4246 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4247 auto PtrVT = getPointerTy(DAG.getDataLayout());
4248
4249 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4250 CCValAssign &VA = ArgLocs[i];
4251 if (Ins[VA.getValNo()].isOrigArg()) {
4252 std::advance(CurOrigArg,
4253 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4254 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4255 }
4256 // Arguments stored in registers.
4257 if (VA.isRegLoc()) {
4258 EVT RegVT = VA.getLocVT();
4259 SDValue ArgValue;
4260
4261 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4262 // f64 and vector types are split up into multiple registers or
4263 // combinations of registers and stack slots.
4264 SDValue ArgValue1 =
4265 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4266 VA = ArgLocs[++i]; // skip ahead to next loc
4267 SDValue ArgValue2;
4268 if (VA.isMemLoc()) {
4269 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4270 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4271 ArgValue2 = DAG.getLoad(
4272 MVT::f64, dl, Chain, FIN,
4274 } else {
4275 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4276 }
4277 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4278 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4279 ArgValue1, DAG.getIntPtrConstant(0, dl));
4280 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4281 ArgValue2, DAG.getIntPtrConstant(1, dl));
4282 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4283 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4284 } else {
4285 const TargetRegisterClass *RC;
4286
4287 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4288 RC = &ARM::HPRRegClass;
4289 else if (RegVT == MVT::f32)
4290 RC = &ARM::SPRRegClass;
4291 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4292 RegVT == MVT::v4bf16)
4293 RC = &ARM::DPRRegClass;
4294 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4295 RegVT == MVT::v8bf16)
4296 RC = &ARM::QPRRegClass;
4297 else if (RegVT == MVT::i32)
4298 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4299 : &ARM::GPRRegClass;
4300 else
4301 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4302
4303 // Transform the arguments in physical registers into virtual ones.
4304 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4305 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4306
4307 // If this value is passed in r0 and has the returned attribute (e.g.
4308 // C++ 'structors), record this fact for later use.
4309 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4310 AFI->setPreservesR0();
4311 }
4312 }
4313
4314 // If this is an 8 or 16-bit value, it is really passed promoted
4315 // to 32 bits. Insert an assert[sz]ext to capture this, then
4316 // truncate to the right size.
4317 switch (VA.getLocInfo()) {
4318 default: llvm_unreachable("Unknown loc info!");
4319 case CCValAssign::Full: break;
4320 case CCValAssign::BCvt:
4321 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4322 break;
4323 }
4324
4325 // f16 arguments have their size extended to 4 bytes and passed as if they
4326 // had been copied to the LSBs of a 32-bit register.
4327 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
4328 if (VA.needsCustom() &&
4329 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4330 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4331
4332 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4333 // less than 32 bits must be sign- or zero-extended in the callee for
4334 // security reasons. Although the ABI mandates an extension done by the
4335 // caller, the latter cannot be trusted to follow the rules of the ABI.
4336 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4337 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4338 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4339 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4340
4341 InVals.push_back(ArgValue);
4342 } else { // VA.isRegLoc()
4343 // Only arguments passed on the stack should make it here.
4344 assert(VA.isMemLoc());
4345 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4346
4347 int index = VA.getValNo();
4348
4349 // Some Ins[] entries become multiple ArgLoc[] entries.
4350 // Process them only once.
4351 if (index != lastInsIndex)
4352 {
4353 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4354 // FIXME: For now, all byval parameter objects are marked mutable.
4355 // This can be changed with more analysis.
4356 // In case of tail call optimization mark all arguments mutable.
4357 // Since they could be overwritten by lowering of arguments in case of
4358 // a tail call.
4359 if (Flags.isByVal()) {
4360 assert(Ins[index].isOrigArg() &&
4361 "Byval arguments cannot be implicit");
4362 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4363
4364 int FrameIndex = StoreByValRegs(
4365 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4366 VA.getLocMemOffset(), Flags.getByValSize());
4367 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4368 CCInfo.nextInRegsParam();
4369 } else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 ||
4370 VA.getValVT() == MVT::bf16)) {
4371 // f16 and bf16 values are passed in the least-significant half of
4372 // a 4 byte stack slot. This is done as-if the extension was done
4373 // in a 32-bit register, so the actual bytes used for the value
4374 // differ between little and big endian.
4375 assert(VA.getLocVT().getSizeInBits() == 32);
4376 unsigned FIOffset = VA.getLocMemOffset();
4377 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
4378 FIOffset, true);
4379
4380 SDValue Addr = DAG.getFrameIndex(FI, PtrVT);
4381 if (DAG.getDataLayout().isBigEndian())
4382 Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2));
4383
4384 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr,
4386 DAG.getMachineFunction(), FI)));
4387
4388 } else {
4389 unsigned FIOffset = VA.getLocMemOffset();
4390 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4391 FIOffset, true);
4392
4393 // Create load nodes to retrieve arguments from the stack.
4394 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4395 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4397 DAG.getMachineFunction(), FI)));
4398 }
4399 lastInsIndex = index;
4400 }
4401 }
4402 }
4403
4404 // varargs
4405 if (isVarArg && MFI.hasVAStart()) {
4406 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4407 TotalArgRegsSaveSize);
4408 if (AFI->isCmseNSEntryFunction()) {
4409 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4411 "secure entry function must not be variadic", dl.getDebugLoc()));
4412 }
4413 }
4414
4415 unsigned StackArgSize = CCInfo.getStackSize();
4416 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4417 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4418 // The only way to guarantee a tail call is if the callee restores its
4419 // argument area, but it must also keep the stack aligned when doing so.
4420 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
4421 assert(StackAlign && "data layout string is missing stack alignment");
4422 StackArgSize = alignTo(StackArgSize, *StackAlign);
4423
4424 AFI->setArgumentStackToRestore(StackArgSize);
4425 }
4426 AFI->setArgumentStackSize(StackArgSize);
4427
4428 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4429 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4431 "secure entry function requires arguments on stack", dl.getDebugLoc()));
4432 }
4433
4434 return Chain;
4435}
4436
4437/// isFloatingPointZero - Return true if this is +0.0.
4440 return CFP->getValueAPF().isPosZero();
4441 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4442 // Maybe this has already been legalized into the constant pool?
4443 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4444 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4446 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4447 return CFP->getValueAPF().isPosZero();
4448 }
4449 } else if (Op->getOpcode() == ISD::BITCAST &&
4450 Op->getValueType(0) == MVT::f64) {
4451 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4452 // created by LowerConstantFP().
4453 SDValue BitcastOp = Op->getOperand(0);
4454 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4455 isNullConstant(BitcastOp->getOperand(0)))
4456 return true;
4457 }
4458 return false;
4459}
4460
4461/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
4462/// the given operands.
4463SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4464 SDValue &ARMcc, SelectionDAG &DAG,
4465 const SDLoc &dl) const {
4466 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4467 unsigned C = RHSC->getZExtValue();
4468 if (!isLegalICmpImmediate((int32_t)C)) {
4469 // Constant does not fit, try adjusting it by one.
4470 switch (CC) {
4471 default: break;
4472 case ISD::SETLT:
4473 case ISD::SETGE:
4474 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4475 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4476 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4477 }
4478 break;
4479 case ISD::SETULT:
4480 case ISD::SETUGE:
4481 if (C != 0 && isLegalICmpImmediate(C-1)) {
4482 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4483 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4484 }
4485 break;
4486 case ISD::SETLE:
4487 case ISD::SETGT:
4488 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4489 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4490 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4491 }
4492 break;
4493 case ISD::SETULE:
4494 case ISD::SETUGT:
4495 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4496 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4497 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4498 }
4499 break;
4500 }
4501 }
4502 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4504 // In ARM and Thumb-2, the compare instructions can shift their second
4505 // operand.
4507 std::swap(LHS, RHS);
4508 }
4509
4510 // Thumb1 has very limited immediate modes, so turning an "and" into a
4511 // shift can save multiple instructions.
4512 //
4513 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4514 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4515 // own. If it's the operand to an unsigned comparison with an immediate,
4516 // we can eliminate one of the shifts: we transform
4517 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4518 //
4519 // We avoid transforming cases which aren't profitable due to encoding
4520 // details:
4521 //
4522 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4523 // would not; in that case, we're essentially trading one immediate load for
4524 // another.
4525 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4526 // 3. C2 is zero; we have other code for this special case.
4527 //
4528 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4529 // instruction, since the AND is always one instruction anyway, but we could
4530 // use narrow instructions in some cases.
4531 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4532 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4533 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4534 !isSignedIntSetCC(CC)) {
4535 unsigned Mask = LHS.getConstantOperandVal(1);
4536 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4537 uint64_t RHSV = RHSC->getZExtValue();
4538 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4539 unsigned ShiftBits = llvm::countl_zero(Mask);
4540 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4541 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4542 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4543 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4544 }
4545 }
4546 }
4547
4548 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4549 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4550 // way a cmp would.
4551 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4552 // some tweaks to the heuristics for the previous and->shift transform.
4553 // FIXME: Optimize cases where the LHS isn't a shift.
4554 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4555 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4556 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4557 LHS.getConstantOperandVal(1) < 31) {
4558 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4559 SDValue Shift =
4560 DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, FlagsVT),
4561 LHS.getOperand(0), DAG.getConstant(ShiftAmt, dl, MVT::i32));
4562 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4563 return Shift.getValue(1);
4564 }
4565
4567
4568 // If the RHS is a constant zero then the V (overflow) flag will never be
4569 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4570 // simpler for other passes (like the peephole optimiser) to deal with.
4571 if (isNullConstant(RHS)) {
4572 switch (CondCode) {
4573 default: break;
4574 case ARMCC::GE:
4576 break;
4577 case ARMCC::LT:
4579 break;
4580 }
4581 }
4582
4583 unsigned CompareType;
4584 switch (CondCode) {
4585 default:
4586 CompareType = ARMISD::CMP;
4587 break;
4588 case ARMCC::EQ:
4589 case ARMCC::NE:
4590 // Uses only Z Flag
4591 CompareType = ARMISD::CMPZ;
4592 break;
4593 }
4594 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4595 return DAG.getNode(CompareType, dl, FlagsVT, LHS, RHS);
4596}
4597
4598/// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4599SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4600 SelectionDAG &DAG, const SDLoc &dl,
4601 bool Signaling) const {
4602 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4603 SDValue Flags;
4605 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, dl, FlagsVT,
4606 LHS, RHS);
4607 else
4608 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, dl,
4609 FlagsVT, LHS);
4610 return DAG.getNode(ARMISD::FMSTAT, dl, FlagsVT, Flags);
4611}
4612
4613// This function returns three things: the arithmetic computation itself
4614// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4615// comparison and the condition code define the case in which the arithmetic
4616// computation *does not* overflow.
4617std::pair<SDValue, SDValue>
4618ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4619 SDValue &ARMcc) const {
4620 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4621
4622 SDValue Value, OverflowCmp;
4623 SDValue LHS = Op.getOperand(0);
4624 SDValue RHS = Op.getOperand(1);
4625 SDLoc dl(Op);
4626
4627 // FIXME: We are currently always generating CMPs because we don't support
4628 // generating CMN through the backend. This is not as good as the natural
4629 // CMP case because it causes a register dependency and cannot be folded
4630 // later.
4631
4632 switch (Op.getOpcode()) {
4633 default:
4634 llvm_unreachable("Unknown overflow instruction!");
4635 case ISD::SADDO:
4636 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4637 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4638 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4639 break;
4640 case ISD::UADDO:
4641 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4642 // We use ADDC here to correspond to its use in LowerALUO.
4643 // We do not use it in the USUBO case as Value may not be used.
4644 Value = DAG.getNode(ARMISD::ADDC, dl,
4645 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4646 .getValue(0);
4647 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4648 break;
4649 case ISD::SSUBO:
4650 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4651 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4652 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4653 break;
4654 case ISD::USUBO:
4655 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4656 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4657 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4658 break;
4659 case ISD::UMULO:
4660 // We generate a UMUL_LOHI and then check if the high word is 0.
4661 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4662 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4663 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4664 LHS, RHS);
4665 OverflowCmp = DAG.getNode(ARMISD::CMPZ, dl, FlagsVT, Value.getValue(1),
4666 DAG.getConstant(0, dl, MVT::i32));
4667 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4668 break;
4669 case ISD::SMULO:
4670 // We generate a SMUL_LOHI and then check if all the bits of the high word
4671 // are the same as the sign bit of the low word.
4672 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4673 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4674 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4675 LHS, RHS);
4676 OverflowCmp = DAG.getNode(ARMISD::CMPZ, dl, FlagsVT, Value.getValue(1),
4677 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4678 Value.getValue(0),
4679 DAG.getConstant(31, dl, MVT::i32)));
4680 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4681 break;
4682 } // switch (...)
4683
4684 return std::make_pair(Value, OverflowCmp);
4685}
4686
4688 SelectionDAG &DAG) {
4689 SDLoc DL(BoolCarry);
4690 EVT CarryVT = BoolCarry.getValueType();
4691
4692 // This converts the boolean value carry into the carry flag by doing
4693 // ARMISD::SUBC Carry, 1
4694 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4695 DAG.getVTList(CarryVT, MVT::i32),
4696 BoolCarry, DAG.getConstant(1, DL, CarryVT));
4697 return Carry.getValue(1);
4698}
4699
4701 SelectionDAG &DAG) {
4702 SDLoc DL(Flags);
4703
4704 // Now convert the carry flag into a boolean carry. We do this
4705 // using ARMISD:ADDE 0, 0, Carry
4706 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4707 DAG.getConstant(0, DL, MVT::i32),
4708 DAG.getConstant(0, DL, MVT::i32), Flags);
4709}
4710
4711SDValue ARMTargetLowering::LowerALUO(SDValue Op, SelectionDAG &DAG) const {
4712 // Let legalize expand this if it isn't a legal type yet.
4713 if (!isTypeLegal(Op.getValueType()))
4714 return SDValue();
4715
4716 SDValue LHS = Op.getOperand(0);
4717 SDValue RHS = Op.getOperand(1);
4718 SDLoc dl(Op);
4719
4720 EVT VT = Op.getValueType();
4721 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4722 SDValue Value;
4723 SDValue Overflow;
4724 switch (Op.getOpcode()) {
4725 case ISD::UADDO:
4726 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4727 // Convert the carry flag into a boolean value.
4728 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4729 break;
4730 case ISD::USUBO:
4731 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4732 // Convert the carry flag into a boolean value.
4733 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4734 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
4735 // value. So compute 1 - C.
4736 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4737 DAG.getConstant(1, dl, MVT::i32), Overflow);
4738 break;
4739 default: {
4740 // Handle other operations with getARMXALUOOp
4741 SDValue OverflowCmp, ARMcc;
4742 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4743 // We use 0 and 1 as false and true values.
4744 // ARMcc represents the "no overflow" condition (e.g., VC for signed ops).
4745 // CMOV operand order is (FalseVal, TrueVal), so we put 1 in FalseVal
4746 // position to get Overflow=1 when the "no overflow" condition is false.
4747 Overflow =
4748 DAG.getNode(ARMISD::CMOV, dl, MVT::i32,
4749 DAG.getConstant(1, dl, MVT::i32), // FalseVal: overflow
4750 DAG.getConstant(0, dl, MVT::i32), // TrueVal: no overflow
4751 ARMcc, OverflowCmp);
4752 break;
4753 }
4754 }
4755
4756 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4757}
4758
4760 const ARMSubtarget *Subtarget) {
4761 EVT VT = Op.getValueType();
4762 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
4763 return SDValue();
4764 if (!VT.isSimple())
4765 return SDValue();
4766
4767 unsigned NewOpcode;
4768 switch (VT.getSimpleVT().SimpleTy) {
4769 default:
4770 return SDValue();
4771 case MVT::i8:
4772 switch (Op->getOpcode()) {
4773 case ISD::UADDSAT:
4774 NewOpcode = ARMISD::UQADD8b;
4775 break;
4776 case ISD::SADDSAT:
4777 NewOpcode = ARMISD::QADD8b;
4778 break;
4779 case ISD::USUBSAT:
4780 NewOpcode = ARMISD::UQSUB8b;
4781 break;
4782 case ISD::SSUBSAT:
4783 NewOpcode = ARMISD::QSUB8b;
4784 break;
4785 }
4786 break;
4787 case MVT::i16:
4788 switch (Op->getOpcode()) {
4789 case ISD::UADDSAT:
4790 NewOpcode = ARMISD::UQADD16b;
4791 break;
4792 case ISD::SADDSAT:
4793 NewOpcode = ARMISD::QADD16b;
4794 break;
4795 case ISD::USUBSAT:
4796 NewOpcode = ARMISD::UQSUB16b;
4797 break;
4798 case ISD::SSUBSAT:
4799 NewOpcode = ARMISD::QSUB16b;
4800 break;
4801 }
4802 break;
4803 }
4804
4805 SDLoc dl(Op);
4806 SDValue Add =
4807 DAG.getNode(NewOpcode, dl, MVT::i32,
4808 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
4809 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
4810 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
4811}
4812
4813SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
4814 SDValue Cond = Op.getOperand(0);
4815 SDValue SelectTrue = Op.getOperand(1);
4816 SDValue SelectFalse = Op.getOperand(2);
4817 SDLoc dl(Op);
4818 unsigned Opc = Cond.getOpcode();
4819
4820 if (Cond.getResNo() == 1 &&
4821 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
4822 Opc == ISD::USUBO)) {
4823 if (!isTypeLegal(Cond->getValueType(0)))
4824 return SDValue();
4825
4826 SDValue Value, OverflowCmp;
4827 SDValue ARMcc;
4828 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
4829 EVT VT = Op.getValueType();
4830
4831 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, OverflowCmp, DAG);
4832 }
4833
4834 // Convert:
4835 //
4836 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
4837 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
4838 //
4839 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
4840 const ConstantSDNode *CMOVTrue =
4841 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
4842 const ConstantSDNode *CMOVFalse =
4843 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
4844
4845 if (CMOVTrue && CMOVFalse) {
4846 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
4847 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
4848
4849 SDValue True;
4850 SDValue False;
4851 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
4852 True = SelectTrue;
4853 False = SelectFalse;
4854 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
4855 True = SelectFalse;
4856 False = SelectTrue;
4857 }
4858
4859 if (True.getNode() && False.getNode())
4860 return getCMOV(dl, Op.getValueType(), True, False, Cond.getOperand(2),
4861 Cond.getOperand(3), DAG);
4862 }
4863 }
4864
4865 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
4866 // undefined bits before doing a full-word comparison with zero.
4867 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
4868 DAG.getConstant(1, dl, Cond.getValueType()));
4869
4870 return DAG.getSelectCC(dl, Cond,
4871 DAG.getConstant(0, dl, Cond.getValueType()),
4872 SelectTrue, SelectFalse, ISD::SETNE);
4873}
4874
                                 bool &swpCmpOps, bool &swpVselOps) {
  // Start by selecting the GE condition code for opcodes that return true for
  // 'equality'.
  if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
      CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
    CondCode = ARMCC::GE;

  // and GT for opcodes that return false for 'equality'.
  else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
           CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
    CondCode = ARMCC::GT;

  // Since we are constrained to GE/GT, if the opcode contains 'less', we need
  // to swap the compare operands.
  if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
      CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
    swpCmpOps = true;

  // Both GT and GE are ordered comparisons, and return false for 'unordered'.
  // If we have an unordered opcode, we need to swap the operands to the VSEL
  // instruction (effectively negating the condition).
  //
  // This also has the effect of swapping which one of 'less' or 'greater'
  // returns true, so we also swap the compare operands. It also switches
  // whether we return true for 'equality', so we compensate by picking the
  // opposite condition code to our original choice.
  if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
      CC == ISD::SETUGT) {
    swpCmpOps = !swpCmpOps;
    swpVselOps = !swpVselOps;
    // Flip GE <-> GT to compensate for the equality change described above.
    CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
  }

  // 'ordered' is 'anything but unordered', so use the VS condition code and
  // swap the VSEL operands.
  if (CC == ISD::SETO) {
    CondCode = ARMCC::VS;
    swpVselOps = true;
  }

  // 'unordered or not equal' is 'anything but equal', so use the EQ condition
  // code and swap the VSEL operands. Also do this if we don't care about the
  // unordered case.
  if (CC == ISD::SETUNE || CC == ISD::SETNE) {
    CondCode = ARMCC::EQ;
    swpVselOps = true;
  }
}
4924
4925SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
4926 SDValue TrueVal, SDValue ARMcc,
4927 SDValue Flags, SelectionDAG &DAG) const {
4928 if (!Subtarget->hasFP64() && VT == MVT::f64) {
4929 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4930 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
4931 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4932 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
4933
4934 SDValue TrueLow = TrueVal.getValue(0);
4935 SDValue TrueHigh = TrueVal.getValue(1);
4936 SDValue FalseLow = FalseVal.getValue(0);
4937 SDValue FalseHigh = FalseVal.getValue(1);
4938
4939 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
4940 ARMcc, Flags);
4941 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
4942 ARMcc, Flags);
4943
4944 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
4945 }
4946 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, Flags);
4947}
4948
4949static bool isGTorGE(ISD::CondCode CC) {
4950 return CC == ISD::SETGT || CC == ISD::SETGE;
4951}
4952
4953static bool isLTorLE(ISD::CondCode CC) {
4954 return CC == ISD::SETLT || CC == ISD::SETLE;
4955}
4956
4957// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
4958// All of these conditions (and their <= and >= counterparts) will do:
4959// x < k ? k : x
4960// x > k ? x : k
4961// k < x ? x : k
4962// k > x ? k : x
4963static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
4964 const SDValue TrueVal, const SDValue FalseVal,
4965 const ISD::CondCode CC, const SDValue K) {
4966 return (isGTorGE(CC) &&
4967 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
4968 (isLTorLE(CC) &&
4969 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
4970}
4971
// Check if two chained conditionals could be converted into SSAT or USAT.
//
// SSAT can replace a set of two conditional selectors that bound a number to an
// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
//
//     x < -k ? -k : (x > k ? k : x)
//     x < -k ? -k : (x < k ? x : k)
//     x > -k ? (x > k ? k : x) : -k
//     x < k ? (x < -k ? -k : x) : k
//     etc.
//
// LLVM canonicalizes these to either a min(max()) or a max(min())
// pattern. This function tries to match one of these and will return a SSAT
// node if successful.
//
// USAT works similarly to SSAT but bounds on the interval [0, k] where k + 1
// is a power of 2.
  EVT VT = Op.getValueType();
  // Decompose the outer SELECT_CC: (select_cc V1, K1, TrueVal1, FalseVal1, CC1)
  SDValue V1 = Op.getOperand(0);
  SDValue K1 = Op.getOperand(1);
  SDValue TrueVal1 = Op.getOperand(2);
  SDValue FalseVal1 = Op.getOperand(3);
  ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  // The inner select is whichever select operand is not the constant bound.
  const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
  if (Op2.getOpcode() != ISD::SELECT_CC)
    return SDValue();

  // Decompose the inner SELECT_CC the same way.
  SDValue V2 = Op2.getOperand(0);
  SDValue K2 = Op2.getOperand(1);
  SDValue TrueVal2 = Op2.getOperand(2);
  SDValue FalseVal2 = Op2.getOperand(3);
  ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();

  SDValue V1Tmp = V1;
  SDValue V2Tmp = V2;

  // Check that the registers and the constants match a max(min()) or min(max())
  // pattern: each select must pick the variable on "true" and the bound on
  // "false", and the two comparisons must point in opposite directions.
  if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
      K2 != FalseVal2 ||
      !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
    return SDValue();

  // Check that the constant in the lower-bound check is
  // the opposite of the constant in the upper-bound check
  // in 1's complement.
    return SDValue();

  int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
  int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
  int64_t PosVal = std::max(Val1, Val2);
  int64_t NegVal = std::min(Val1, Val2);

  // The larger bound must be checked with the "less" comparison, and the
  // upper bound + 1 must be a power of two for SSAT/USAT encoding.
  if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
      !isPowerOf2_64(PosVal + 1))
    return SDValue();

  // Handle the difference between USAT (unsigned) and SSAT (signed)
  // saturation.
  // At this point, PosVal is guaranteed to be positive.
  uint64_t K = PosVal;
  SDLoc dl(Op);
  // Bounds [~k, k]: signed saturation; countr_one(K) yields the bit width.
  if (Val1 == ~Val2)
    return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
                       DAG.getConstant(llvm::countr_one(K), dl, VT));
  // Bounds [0, k]: unsigned saturation.
  if (NegVal == 0)
    return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
                       DAG.getConstant(llvm::countr_one(K), dl, VT));

  return SDValue();
}
5046
// Check if a condition of the type x < k ? k : x can be converted into a
// bit operation instead of conditional moves.
// Currently this is allowed given:
// - The conditions and values match up
// - k is 0 or -1 (all ones)
// This function will not check the last condition, that's up to the caller.
// It returns true if the transformation can be made, and in such case
// returns x in V, and k in SatK.
                                         SDValue &SatK)
{
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);

             ? &RHS
             : nullptr;

  // No constant operation in comparison, early out
  if (!K)
    return false;

  // KTmp is the constant select operand; V is then the other (variable) one,
  // and VTmp is the compare operand on the opposite side of the constant.
  SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
  V = (KTmp == TrueVal) ? FalseVal : TrueVal;
  SDValue VTmp = (K && *K == LHS) ? RHS : LHS;

  // If the constant on left and right side, or variable on left and right,
  // does not match, early out
  if (*K != KTmp || V != VTmp)
    return false;

  // Finally require one of the canonical lower-saturating shapes.
  if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
    SatK = *K;
    return true;
  }

  return false;
}
5088
5089bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5090 if (VT == MVT::f32)
5091 return !Subtarget->hasVFP2Base();
5092 if (VT == MVT::f64)
5093 return !Subtarget->hasFP64();
5094 if (VT == MVT::f16)
5095 return !Subtarget->hasFullFP16();
5096 return false;
5097}
5098
// Try to match a select between two integer constants against one of the
// ARMv8.1-M conditional-select forms, where the second constant can be
// derived from the first:
//   CSINV: TVal == ~FVal
//   CSNEG: TVal == -FVal  (~FVal + 1 in two's complement)
//   CSINC: TVal == FVal + 1 or FVal == TVal + 1
// On success, returns the value the caller should feed to both operands of
// the new node, sets Opcode to the chosen ARMISD opcode, and sets InvertCond
// when the caller must invert the condition code. Returns a null SDValue when
// no form applies or the subtarget lacks v8.1-M Mainline.
static SDValue matchCSET(unsigned &Opcode, bool &InvertCond, SDValue TrueVal,
                         SDValue FalseVal, const ARMSubtarget *Subtarget) {
  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
  if (!CFVal || !CTVal || !Subtarget->hasV8_1MMainlineOps())
    return SDValue();

  unsigned TVal = CTVal->getZExtValue();
  unsigned FVal = CFVal->getZExtValue();

  Opcode = 0;
  InvertCond = false;
  if (TVal == ~FVal) {
    Opcode = ARMISD::CSINV;
  } else if (TVal == ~FVal + 1) {
    Opcode = ARMISD::CSNEG;
  } else if (TVal + 1 == FVal) {
    Opcode = ARMISD::CSINC;
  } else if (TVal == FVal + 1) {
    // Same as the previous case with the constants' roles swapped; swap them
    // back and invert the condition to compensate.
    Opcode = ARMISD::CSINC;
    std::swap(TrueVal, FalseVal);
    std::swap(TVal, FVal);
    InvertCond = !InvertCond;
  } else {
    return SDValue();
  }

  // If one of the constants is cheaper than another, materialise the
  // cheaper one and let the csel generate the other.
  if (Opcode != ARMISD::CSINC &&
      HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
    std::swap(TrueVal, FalseVal);
    std::swap(TVal, FVal);
    InvertCond = !InvertCond;
  }

  // Attempt to use ZR checking TVal is 0, possibly inverting the condition
  // to get there. CSINC is not invertible like the other two (~(~a) == a,
  // -(-a) == a, but (a+1)+1 != a).
  if (FVal == 0 && Opcode != ARMISD::CSINC) {
    std::swap(TrueVal, FalseVal);
    std::swap(TVal, FVal);
    InvertCond = !InvertCond;
  }

  return TrueVal;
}
5146
// Lower ISD::SELECT_CC: (select_cc lhs, rhs, truev, falsev, cc). Tries a
// series of pattern-based lowerings (SSAT/USAT, sign-mask bit tricks, CSEL
// family, VSEL) before falling back to a generic compare + CMOV sequence.
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // Try to convert two saturating conditional selects into a single SSAT.
  if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
    if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
      return SatValue;

  // Try to convert expressions of the form x < k ? k : x (and similar forms)
  // into more efficient bit operations, which is possible when k is 0 or -1
  // On ARM and Thumb-2 which have flexible operand 2 this will result in
  // single instructions. On Thumb the shift and the bit operation will be two
  // instructions.
  // Only allow this transformation on full-width (32-bit) operations
  SDValue LowerSatConstant;
  SDValue SatValue;
  if (VT == MVT::i32 &&
      isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
    // ShiftV = x >> 31 (arithmetic): all-ones when x is negative, else zero.
    SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
                                 DAG.getConstant(31, dl, VT));
    if (isNullConstant(LowerSatConstant)) {
      // max(x, 0)  ->  x & ~(x >> 31)
      SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
                                      DAG.getAllOnesConstant(dl, VT));
      return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
    } else if (isAllOnesConstant(LowerSatConstant))
      // max(x, -1) ->  x | (x >> 31)
      return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
  }

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);
  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
  ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
  if (Op.getValueType().isInteger()) {

    // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
    // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
    // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
    // Both require less instructions than compare and conditional select.
    if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TrueVal && RHSC &&
        RHSC->isZero() && CFVal && CFVal->isZero() &&
        LHS.getValueType() == RHS.getValueType()) {
      EVT VT = LHS.getValueType();
      SDValue Shift =
          DAG.getNode(ISD::SRA, dl, VT, LHS,
                      DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));

      if (CC == ISD::SETGT)
        Shift = DAG.getNOT(dl, Shift, VT);

      return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
    }

    // (SELECT_CC setlt, x, 0, 1, 0) -> SRL(x, bw-1): extract the sign bit.
    if (CC == ISD::SETLT && isNullConstant(RHS) && isOneConstant(TrueVal) &&
        isNullConstant(FalseVal) && LHS.getValueType() == VT)
      return DAG.getNode(ISD::SRL, dl, VT, LHS,
                         DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try the v8.1-M CSINC/CSINV/CSNEG forms for constant-constant selects.
    unsigned Opcode;
    bool InvertCond;
    if (SDValue Op =
            matchCSET(Opcode, InvertCond, TrueVal, FalseVal, Subtarget)) {
      if (InvertCond)
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());

      SDValue ARMcc;
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      EVT VT = Op.getValueType();
      // Feed the matched value to both operands; the opcode derives the other.
      return DAG.getNode(Opcode, dl, VT, Op, Op, ARMcc, Cmp);
    }
  }

  if (isUnsupportedFloatingType(LHS.getValueType())) {
    softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try to generate VSEL on ARMv8.
    // The VSEL instruction can't use all the usual ARM condition
    // codes: it only has two bits to select the condition code, so it's
    // constrained to use only GE, GT, VS and EQ.
    //
    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
    // swap the operands of the previous compare instruction (effectively
    // inverting the compare condition, swapping 'less' and 'greater') and
    // sometimes need to swap the operands to the VSEL (which inverts the
    // condition in the sense of firing whenever the previous condition didn't)
    if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
                                        TrueVal.getValueType() == MVT::f32 ||
                                        TrueVal.getValueType() == MVT::f64)) {
      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
        // Map onto a VSEL-encodable code by inverting and swapping operands.
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
        std::swap(TrueVal, FalseVal);
      }
    }

    SDValue ARMcc;
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    // Choose GE over PL, which VSEL cannot encode.
    if (ARMcc->getAsZExtVal() == ARMCC::PL)
      ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
  // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
  // must use VSEL (limited condition codes), due to not having conditional f16
  // moves.
  if (Subtarget->hasFPARMv8Base() &&
      !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
      (TrueVal.getValueType() == MVT::f16 ||
       TrueVal.getValueType() == MVT::f32 ||
       TrueVal.getValueType() == MVT::f64)) {
    bool swpCmpOps = false;
    bool swpVselOps = false;
    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);

    // Only apply the swaps when the constraint check landed on a code VSEL
    // can actually encode.
    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
      if (swpCmpOps)
        std::swap(LHS, RHS);
      if (swpVselOps)
        std::swap(TrueVal, FalseVal);
    }
  }

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
  // Some FP conditions require two ARM condition codes; chain a second CMOV
  // reusing the same compare when that happens.
  if (CondCode2 != ARMCC::AL) {
    SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
    Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, Cmp, DAG);
  }
  return Result;
}
5300
5301/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5302/// to morph to an integer compare sequence.
5303static bool canChangeToInt(SDValue Op, bool &SeenZero,
5304 const ARMSubtarget *Subtarget) {
5305 SDNode *N = Op.getNode();
5306 if (!N->hasOneUse())
5307 // Otherwise it requires moving the value from fp to integer registers.
5308 return false;
5309 if (!N->getNumValues())
5310 return false;
5311 EVT VT = Op.getValueType();
5312 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5313 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5314 // vmrs are very slow, e.g. cortex-a8.
5315 return false;
5316
5317 if (isFloatingPointZero(Op)) {
5318 SeenZero = true;
5319 return true;
5320 }
5321 return ISD::isNormalLoad(N);
5322}
5323
    // An FP zero bitcasts to the i32 zero constant.
    return DAG.getConstant(0, SDLoc(Op), MVT::i32);

    // A plain f32 load can be re-issued as an i32 load of the same location,
    // preserving pointer info, alignment and memory-operand flags.
    return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
                       Ld->getPointerInfo(), Ld->getAlign(),
                       Ld->getMemOperand()->getFlags());

  // canChangeToInt() guarantees only zeros and normal loads reach here.
  llvm_unreachable("Unknown VFP cmp argument!");
}
5335
                           SDValue &RetVal1, SDValue &RetVal2) {
  SDLoc dl(Op);

  // An FP zero expands to a pair of i32 zero constants.
  if (isFloatingPointZero(Op)) {
    RetVal1 = DAG.getConstant(0, dl, MVT::i32);
    RetVal2 = DAG.getConstant(0, dl, MVT::i32);
    return;
  }

  // A plain f64 load is split into two i32 loads: one at the original
  // address and one 4 bytes past it.
  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
    SDValue Ptr = Ld->getBasePtr();
    RetVal1 =
        DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Ld->getAlign(), Ld->getMemOperand()->getFlags());

    EVT PtrType = Ptr.getValueType();
    SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
                                 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
    // The second half's alignment is the original alignment clamped to the
    // 4-byte offset.
    RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
                          Ld->getPointerInfo().getWithOffset(4),
                          commonAlignment(Ld->getAlign(), 4),
                          Ld->getMemOperand()->getFlags());
    return;
  }

  // canChangeToInt() guarantees only zeros and normal loads reach here.
  llvm_unreachable("Unknown VFP cmp argument!");
}
5364
/// OptimizeVFPBrcond - With nnan and without daz, it's legal to optimize some
/// f32 and even f64 comparisons to integer ones.
SDValue
ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  // Both operands must be single-use zeros or plain loads, and at least one
  // side must be a floating-point zero for the bit trick below to be valid.
  bool LHSSeenZero = false;
  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
  bool RHSSeenZero = false;
  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
    // If unsafe fp math optimization is enabled and there are no other uses of
    // the CMP operands, and the condition code is EQ or NE, we can optimize it
    // to an integer comparison.
    if (CC == ISD::SETOEQ)
      CC = ISD::SETEQ;
    else if (CC == ISD::SETUNE)
      CC = ISD::SETNE;

    // Mask off the sign bit (bit 31) so +0.0 and -0.0 compare as equal.
    SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
    SDValue ARMcc;
    if (LHS.getValueType() == MVT::f32) {
      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(LHS, DAG), Mask);
      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(RHS, DAG), Mask);
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
                         Cmp);
    }

    // f64: compare as two i32 pairs via the BCC_i64 pseudo, masking the sign
    // bit out of each high half.
    SDValue LHS1, LHS2;
    SDValue RHS1, RHS2;
    expandf64Toi32(LHS, DAG, LHS1, LHS2);
    expandf64Toi32(RHS, DAG, RHS1, RHS2);
    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
    ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
    return DAG.getNode(ARMISD::BCC_i64, dl, MVT::Other, Ops);
  }

  return SDValue();
}
5415
5416// Generate CMP + CMOV for integer abs.
5417SDValue ARMTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5418 SDLoc DL(Op);
5419
5420 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, MVT::i32);
5421
5422 // Generate CMP & CMOV.
5423 SDValue Cmp = DAG.getNode(ARMISD::CMP, DL, FlagsVT, Op.getOperand(0),
5424 DAG.getConstant(0, DL, MVT::i32));
5425 return DAG.getNode(ARMISD::CMOV, DL, MVT::i32, Op.getOperand(0), Neg,
5426 DAG.getConstant(ARMCC::MI, DL, MVT::i32), Cmp);
5427}
5428
  // Decode the ARM condition-code constant, replace it with its opposite via
  // ARMCC::getOppositeCondition, and re-materialise it as an i32 constant.
  ARMCC::CondCodes CondCode =
      (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
  CondCode = ARMCC::getOppositeCondition(CondCode);
  return DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
}
5435
5436SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5437 SDValue Chain = Op.getOperand(0);
5438 SDValue Cond = Op.getOperand(1);
5439 SDValue Dest = Op.getOperand(2);
5440 SDLoc dl(Op);
5441
5442 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5443 // instruction.
5444 unsigned Opc = Cond.getOpcode();
5445 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5446 !Subtarget->isThumb1Only();
5447 if (Cond.getResNo() == 1 &&
5448 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5449 Opc == ISD::USUBO || OptimizeMul)) {
5450 // Only lower legal XALUO ops.
5451 if (!isTypeLegal(Cond->getValueType(0)))
5452 return SDValue();
5453
5454 // The actual operation with overflow check.
5455 SDValue Value, OverflowCmp;
5456 SDValue ARMcc;
5457 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5458
5459 // Reverse the condition code.
5460 ARMcc = getInvertedARMCondCode(ARMcc, DAG);
5461
5462 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5463 OverflowCmp);
5464 }
5465
5466 return SDValue();
5467}
5468
// Lower ISD::BR_CC: (br_cc chain, cc, lhs, rhs, dest). Softens unsupported FP
// compares, recognises overflow-flag branches, and otherwise emits an
// integer or VFP compare followed by one (or two) conditional branches.
SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  if (isUnsupportedFloatingType(LHS.getValueType())) {
    softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction. Matches (br_cc (xaluo).overflow, 0/1, seteq/setne).
  unsigned Opc = LHS.getOpcode();
  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                     !Subtarget->isThumb1Only();
  if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || OptimizeMul) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // Only lower legal XALUO ops.
    if (!isTypeLegal(LHS->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);

    // Invert when the branch fires on the opposite sense of the overflow bit
    // (i.e. (o != 1) or (o == 0) rather than (o != 0) / (o == 1)).
    if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
      // Reverse the condition code.
      ARMcc = getInvertedARMCondCode(ARMcc, DAG);
    }

    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
                       OverflowCmp);
  }

  // Plain integer compare-and-branch.
  if (LHS.getValueType() == MVT::i32) {
    SDValue ARMcc;
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
  }

  // With nnan and IEEE denormals, some FP equality branches can be turned
  // into integer compares.
  SDNodeFlags Flags = Op->getFlags();
  if (Flags.hasNoNaNs() &&
      DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
      DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE() &&
      (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
       CC == ISD::SETUNE)) {
    if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
      return Result;
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue Ops[] = {Chain, Dest, ARMcc, Cmp};
  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
  // Some FP conditions need a second ARM condition code; chain another
  // conditional branch off the first, reusing the same compare.
  if (CondCode2 != ARMCC::AL) {
    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
    SDValue Ops[] = {Res, Dest, ARMcc, Cmp};
    Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
  }
  return Res;
}
5545
// Lower ISD::BR_JT (jump-table branch): compute the address of the selected
// entry and emit the appropriate jump-table branch pseudo.
SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Table = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PTy = getPointerTy(DAG.getDataLayout());
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
  // Each jump-table entry is 4 bytes wide: Addr = Table + Index * 4.
  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
  if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
    // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
    // which does another jump to the destination. This also makes it easier
    // to translate it to TBB / TBH later (Thumb2 only).
    // FIXME: This might not work if the function is extremely large.
    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
                       Addr, Op.getOperand(2), JTI);
  }
  if (isPositionIndependent() || Subtarget->isROPI()) {
    // PIC/ROPI: entries hold offsets relative to the table, so load the entry
    // and add it to the table base before branching.
    Addr =
        DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
    Chain = Addr.getValue(1);
    Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  } else {
    // Absolute addressing: the loaded entry is the destination itself.
    Addr =
        DAG.getLoad(PTy, dl, Chain, Addr,
    Chain = Addr.getValue(1);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  }
}
5581
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // i32 element results: f32 sources are left as-is, anything else is
  // unrolled to scalar conversions.
  if (Op.getValueType().getVectorElementType() == MVT::i32) {
    if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();

  // Choose an integer vector type whose element width matches the FP source.
  EVT NewTy;
  const EVT OpTy = Op.getOperand(0).getValueType();
  if (OpTy == MVT::v4f32)
    NewTy = MVT::v4i32;
  else if (OpTy == MVT::v4f16 && HasFullFP16)
    NewTy = MVT::v4i16;
  else if (OpTy == MVT::v8f16 && HasFullFP16)
    NewTy = MVT::v8i16;
  else
    llvm_unreachable("Invalid type for custom lowering!");

  // Only v4i16/v8i16 results take the convert-then-truncate path below.
  if (VT != MVT::v4i16 && VT != MVT::v8i16)
    return DAG.UnrollVectorOp(Op.getNode());

  // Convert at the matching width, then truncate to the requested type.
  Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
}
5611
// Lower (STRICT_)FP_TO_SINT/FP_TO_UINT: vectors are dispatched to the vector
// path; unsupported scalar FP source types become conversion libcalls.
SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (VT.isVector())
    return LowerVectorFP_TO_INT(Op, DAG);

  bool IsStrict = Op->isStrictFPOpcode();
  // Strict nodes carry the chain as operand 0; the FP source follows it.
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

  if (isUnsupportedFloatingType(SrcVal.getValueType())) {
    // No usable FP hardware for this type: pick the matching runtime-library
    // conversion and emit a call to it.
    RTLIB::Libcall LC;
    if (Op.getOpcode() == ISD::FP_TO_SINT ||
        Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
      LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
                              Op.getValueType());
    else
      LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
                              Op.getValueType());
    SDLoc Loc(Op);
    MakeLibCallOptions CallOptions;
    SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
    std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
                                          CallOptions, Loc, Chain);
    // Strict callers expect {result, chain}.
    return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
  }

  // FIXME: Remove this when we have strict fp instruction selection patterns
  if (IsStrict) {
    SDLoc Loc(Op);
    SDValue Result =
        Loc, Op.getValueType(), SrcVal);
    return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
  }

  return Op;
}
5650
                                  const ARMSubtarget *Subtarget) {
  EVT VT = Op.getValueType();
  EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  EVT FromVT = Op.getOperand(0).getValueType();

  // Combinations the subtarget can select directly are left untouched.
  if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
    return Op;
  if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
      Subtarget->hasFP64())
    return Op;
  if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
      Subtarget->hasFullFP16())
    return Op;
  if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
      Subtarget->hasMVEFloatOps())
    return Op;
  if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
      Subtarget->hasMVEFloatOps())
    return Op;

  // Only these vector source types get the explicit-clamp expansion below.
  if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
    return SDValue();

  SDLoc DL(Op);
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
  // BW excludes the sign bit for signed saturation, so the bounds below are
  // [-(1 << BW), (1 << BW) - 1] signed and [0, (1 << BW) - 1] unsigned.
  unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
  SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
                            DAG.getValueType(VT.getScalarType()));
  // Clamp the converted result to the saturation interval with min/max.
  SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
                            DAG.getConstant((1 << BW) - 1, DL, VT));
  if (IsSigned)
    Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
                      DAG.getSignedConstant(-(1 << BW), DL, VT));
  return Max;
}
5687
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // i32 element sources: f32 results are left as-is, anything else is
  // unrolled to scalar conversions.
  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
    if (VT.getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
          Op.getOperand(0).getValueType() == MVT::v8i16) &&
         "Invalid type for custom lowering!");

  const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();

  // Choose an integer vector type whose element width matches the FP result.
  EVT DestVecType;
  if (VT == MVT::v4f32)
    DestVecType = MVT::v4i32;
  else if (VT == MVT::v4f16 && HasFullFP16)
    DestVecType = MVT::v4i16;
  else if (VT == MVT::v8f16 && HasFullFP16)
    DestVecType = MVT::v8i16;
  else
    return DAG.UnrollVectorOp(Op.getNode());

  // Widen the i16 source with the matching extension (sign for SINT_TO_FP,
  // zero for UINT_TO_FP), then perform the conversion at full width.
  unsigned CastOpc;
  unsigned Opc;
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Invalid opcode!");
  case ISD::SINT_TO_FP:
    CastOpc = ISD::SIGN_EXTEND;
    break;
  case ISD::UINT_TO_FP:
    CastOpc = ISD::ZERO_EXTEND;
    break;
  }

  Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
  return DAG.getNode(Opc, dl, VT, Op);
}
5731
5732SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5733 EVT VT = Op.getValueType();
5734 if (VT.isVector())
5735 return LowerVectorINT_TO_FP(Op, DAG);
5736 if (isUnsupportedFloatingType(VT)) {
5737 RTLIB::Libcall LC;
5738 if (Op.getOpcode() == ISD::SINT_TO_FP)
5739 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5740 Op.getValueType());
5741 else
5742 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5743 Op.getValueType());
5744 MakeLibCallOptions CallOptions;
5745 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5746 CallOptions, SDLoc(Op)).first;
5747 }
5748
5749 return Op;
5750}
5751
SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  // Implement fcopysign with a fabs and a conditional fneg.
  SDValue Tmp0 = Op.getOperand(0); // Value supplying the magnitude.
  SDValue Tmp1 = Op.getOperand(1); // Value supplying the sign.
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT SrcVT = Tmp1.getValueType();
  // If the magnitude operand already comes from the GPR bank (a bitcast or a
  // VMOVDRR), prefer the integer-register implementation below instead of
  // moving everything onto the NEON bank.
  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
    Tmp0.getOpcode() == ARMISD::VMOVDRR;
  bool UseNEON = !InGPR && Subtarget->hasNEON();

  if (UseNEON) {
    // Use VBSL to copy the sign bit.
    // 0x6/0x80 is the VMOV modified-immediate encoding used to materialize
    // the per-word sign-bit mask.
    unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
                               DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
    // For f64 the sign bit lives in the high word, so shift the mask up.
    if (VT == MVT::f64)
      Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
                         DAG.getConstant(32, dl, MVT::i32));
    else /*if (VT == MVT::f32)*/
      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
    // Align the sign operand with the mask position when source and result
    // widths differ.
    if (SrcVT == MVT::f32) {
      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
      if (VT == MVT::f64)
        Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
                           DAG.getConstant(32, dl, MVT::i32));
    } else if (VT == MVT::f32)
      Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
                         DAG.getConstant(32, dl, MVT::i32));
    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);

    // NOTE(review): the declaration of `AllOnes` (an all-ones constant used
    // to build the inverted mask) starts on a line elided from this view.
                                            dl, MVT::i32);
    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));

    // Res = (Tmp1 & Mask) | (Tmp0 & ~Mask): sign from Tmp1, rest from Tmp0.
    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
    if (VT == MVT::f32) {
      // Extract the scalar f32 result from lane 0 of the vector.
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
                        DAG.getConstant(0, dl, MVT::i32));
    } else {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
    }

    return Res;
  }

  // Bitcast operand 1 to i32.
  // For an f64 sign source, only the high word (which holds the sign bit)
  // is needed.
  if (SrcVT == MVT::f64)
    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                       Tmp1).getValue(1);
  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);

  // Or in the signbit with integer operations.
  SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); // sign bit
  SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); // magnitude bits
  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
  if (VT == MVT::f32) {
    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
  }

  // f64: Or the high part with signbit and then combine two parts.
  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                     Tmp0);
  SDValue Lo = Tmp0.getValue(0);
  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
}
5833
5834SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
5835 MachineFunction &MF = DAG.getMachineFunction();
5836 MachineFrameInfo &MFI = MF.getFrameInfo();
5837 MFI.setReturnAddressIsTaken(true);
5838
5839 EVT VT = Op.getValueType();
5840 SDLoc dl(Op);
5841 unsigned Depth = Op.getConstantOperandVal(0);
5842 if (Depth) {
5843 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
5844 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
5845 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
5846 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
5847 MachinePointerInfo());
5848 }
5849
5850 // Return LR, which contains the return address. Mark it an implicit live-in.
5851 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
5852 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
5853}
5854
5855SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
5856 const ARMBaseRegisterInfo &ARI =
5857 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
5858 MachineFunction &MF = DAG.getMachineFunction();
5859 MachineFrameInfo &MFI = MF.getFrameInfo();
5860 MFI.setFrameAddressIsTaken(true);
5861
5862 EVT VT = Op.getValueType();
5863 SDLoc dl(Op); // FIXME probably not meaningful
5864 unsigned Depth = Op.getConstantOperandVal(0);
5865 Register FrameReg = ARI.getFrameRegister(MF);
5866 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
5867 while (Depth--)
5868 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
5869 MachinePointerInfo());
5870 return FrameAddr;
5871}
5872
5873// FIXME? Maybe this could be a TableGen attribute on some registers and
5874// this table could be generated automatically from RegInfo.
5875Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
5876 const MachineFunction &MF) const {
5877 return StringSwitch<Register>(RegName)
5878 .Case("sp", ARM::SP)
5879 .Default(Register());
5880}
5881
// Result is 64 bit value so split into two 32 bit values and return as a
// pair of values.
// NOTE(review): the function signature line is elided from this view;
// `N`, `Results` and `Read` are declared on elided lines.
                                SelectionDAG &DAG) {
  SDLoc DL(N);

  // This function is only supposed to be called for i64 type destination.
  assert(N->getValueType(0) == MVT::i64
         && "ExpandREAD_REGISTER called for non-i64 type result.");

  // Read the register as two i32 halves plus a chain.
                             DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
                             N->getOperand(0),
                             N->getOperand(1));

  // Reassemble the two i32 halves into the i64 value the caller expects.
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
                                Read.getValue(1)));
  Results.push_back(Read.getValue(2)); // Chain
}
5901
/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
/// When \p DstVT, the destination type of \p BC, is on the vector
/// register bank and the source of bitcast, \p Op, operates on the same bank,
/// it might be possible to combine them, such that everything stays on the
/// vector register bank.
/// \p return The node that would replace \p BC, if the combine
/// is possible.
                                                SelectionDAG &DAG) {
  SDValue Op = BC->getOperand(0);
  EVT DstVT = BC->getValueType(0);

  // The only vector instruction that can produce a scalar (remember,
  // since the bitcast was about to be turned into VMOVDRR, the source
  // type is i64) from a vector is EXTRACT_VECTOR_ELT.
  // Moreover, we can do this combine only if there is one use.
  // Finally, if the destination type is not a vector, there is not
  // much point on forcing everything on the vector bank.
  if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !Op.hasOneUse())
    return SDValue();

  // If the index is not constant, we will introduce an additional
  // multiply that will stick.
  // Give up in that case.
  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!Index)
    return SDValue();
  unsigned DstNumElt = DstVT.getVectorNumElements();

  // Compute the new index: each i64 source element corresponds to DstNumElt
  // elements of the destination type, so NewIndex = Index * DstNumElt.
  const APInt &APIntIndex = Index->getAPIntValue();
  APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
  NewIndex *= APIntIndex;
  // Check if the new constant index fits into i32.
  if (NewIndex.getBitWidth() > 32)
    return SDValue();

  // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
  // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
  SDLoc dl(Op);
  SDValue ExtractSrc = Op.getOperand(0);
  EVT VecVT = EVT::getVectorVT(
      *DAG.getContext(), DstVT.getScalarType(),
      ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
  SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
                     DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
}
5951
/// ExpandBITCAST - If the target supports VFP, this function is called to
/// expand a bit convert where either the source or destination type is i64 to
/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
/// operand type is illegal (e.g., v2f32 for a target that doesn't support
/// vectors), since the legalizer won't know what to do with that.
SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
                                         const ARMSubtarget *Subtarget) const {
  SDLoc dl(N);
  SDValue Op = N->getOperand(0);

  // This function is only supposed to be called for i16 and i64 types, either
  // as the source or destination of the bit convert.
  EVT SrcVT = Op.getValueType();
  EVT DstVT = N->getValueType(0);

  // i16/i32 -> f16/bf16: zero-extend to i32 and move the bits into a
  // half-precision register.
  if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
      (DstVT == MVT::f16 || DstVT == MVT::bf16))
    return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
                     DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));

  // f16/bf16 -> i16/i32: move the bits out of the half-precision register and
  // truncate to the destination width.
  if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
      (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
    if (Subtarget->hasFullFP16() && !Subtarget->hasBF16())
      Op = DAG.getBitcast(MVT::f16, Op);
    return DAG.getNode(
        ISD::TRUNCATE, SDLoc(N), DstVT,
        MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
  }

  if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
    return SDValue();

  // Turn i64->f64 into VMOVDRR.
  if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
    // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
    // if we can combine the bitcast with its source.
    // NOTE(review): the guard producing `Val` (presumably the
    // CombineVMOVDRRCandidateWithVecOp call) is on a line elided from this
    // view.
      return Val;
    SDValue Lo, Hi;
    std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
    return DAG.getNode(ISD::BITCAST, dl, DstVT,
                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
  }

  // Turn f64->i64 into VMOVRRD.
  if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
    SDValue Cvt;
    // Big-endian multi-element vectors are reversed first so the two i32
    // halves come out in the expected order.
    if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
        SrcVT.getVectorNumElements() > 1)
      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
                        DAG.getVTList(MVT::i32, MVT::i32),
                        DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
    else
      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
                        DAG.getVTList(MVT::i32, MVT::i32), Op);
    // Merge the pieces into a single i64 value.
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
  }

  return SDValue();
}
6013
6014/// getZeroVector - Returns a vector of specified type with all zero elements.
6015/// Zero vectors are used to represent vector negation and in those cases
6016/// will be implemented with the NEON VNEG instruction. However, VNEG does
6017/// not support i64 elements, so sometimes the zero vectors will need to be
6018/// explicitly constructed. Regardless, use a canonical VMOV to create the
6019/// zero vector.
6020static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6021 assert(VT.isVector() && "Expected a vector type");
6022 // The canonical modified immediate encoding of a zero vector is....0!
6023 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6024 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6025 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6026 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6027}
6028
6029/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
6030/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6031SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6032 SelectionDAG &DAG) const {
6033 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6034 EVT VT = Op.getValueType();
6035 unsigned VTBits = VT.getSizeInBits();
6036 SDLoc dl(Op);
6037 SDValue ShOpLo = Op.getOperand(0);
6038 SDValue ShOpHi = Op.getOperand(1);
6039 SDValue ShAmt = Op.getOperand(2);
6040 SDValue ARMcc;
6041 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6042
6043 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6044
6045 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6046 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6047 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6048 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6049 DAG.getConstant(VTBits, dl, MVT::i32));
6050 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6051 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6052 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6053 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6054 ISD::SETGE, ARMcc, DAG, dl);
6055 SDValue Lo =
6056 DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CmpLo);
6057
6058 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6059 SDValue HiBigShift = Opc == ISD::SRA
6060 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6061 DAG.getConstant(VTBits - 1, dl, VT))
6062 : DAG.getConstant(0, dl, VT);
6063 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6064 ISD::SETGE, ARMcc, DAG, dl);
6065 SDValue Hi =
6066 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6067
6068 SDValue Ops[2] = { Lo, Hi };
6069 return DAG.getMergeValues(Ops, dl);
6070}
6071
6072/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6073/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6074SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6075 SelectionDAG &DAG) const {
6076 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6077 EVT VT = Op.getValueType();
6078 unsigned VTBits = VT.getSizeInBits();
6079 SDLoc dl(Op);
6080 SDValue ShOpLo = Op.getOperand(0);
6081 SDValue ShOpHi = Op.getOperand(1);
6082 SDValue ShAmt = Op.getOperand(2);
6083 SDValue ARMcc;
6084
6085 assert(Op.getOpcode() == ISD::SHL_PARTS);
6086 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6087 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6088 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6089 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6090 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6091
6092 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6093 DAG.getConstant(VTBits, dl, MVT::i32));
6094 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6095 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6096 ISD::SETGE, ARMcc, DAG, dl);
6097 SDValue Hi =
6098 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6099
6100 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6101 ISD::SETGE, ARMcc, DAG, dl);
6102 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6103 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6104 DAG.getConstant(0, dl, VT), ARMcc, CmpLo);
6105
6106 SDValue Ops[2] = { Lo, Hi };
6107 return DAG.getMergeValues(Ops, dl);
6108}
6109
6110SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6111 SelectionDAG &DAG) const {
6112 // The rounding mode is in bits 23:22 of the FPSCR.
6113 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6114 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
6115 // so that the shift + and get folded into a bitfield extract.
6116 SDLoc dl(Op);
6117 SDValue Chain = Op.getOperand(0);
6118 SDValue Ops[] = {Chain,
6119 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6120
6121 SDValue FPSCR =
6122 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6123 Chain = FPSCR.getValue(1);
6124 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6125 DAG.getConstant(1U << 22, dl, MVT::i32));
6126 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6127 DAG.getConstant(22, dl, MVT::i32));
6128 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6129 DAG.getConstant(3, dl, MVT::i32));
6130 return DAG.getMergeValues({And, Chain}, dl);
6131}
6132
6133SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6134 SelectionDAG &DAG) const {
6135 SDLoc DL(Op);
6136 SDValue Chain = Op->getOperand(0);
6137 SDValue RMValue = Op->getOperand(1);
6138
6139 // The rounding mode is in bits 23:22 of the FPSCR.
6140 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6141 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6142 // ((arg - 1) & 3) << 22).
6143 //
6144 // It is expected that the argument of llvm.set.rounding is within the
6145 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is
6146 // responsibility of the code generated llvm.set.rounding to ensure this
6147 // condition.
6148
6149 // Calculate new value of FPSCR[23:22].
6150 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6151 DAG.getConstant(1, DL, MVT::i32));
6152 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6153 DAG.getConstant(0x3, DL, MVT::i32));
6154 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6155 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6156
6157 // Get current value of FPSCR.
6158 SDValue Ops[] = {Chain,
6159 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6160 SDValue FPSCR =
6161 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6162 Chain = FPSCR.getValue(1);
6163 FPSCR = FPSCR.getValue(0);
6164
6165 // Put new rounding mode into FPSCR[23:22].
6166 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6167 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6168 DAG.getConstant(RMMask, DL, MVT::i32));
6169 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6170 SDValue Ops2[] = {
6171 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6172 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6173}
6174
6175SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6176 SelectionDAG &DAG) const {
6177 SDLoc DL(Op);
6178 SDValue Chain = Op->getOperand(0);
6179 SDValue Mode = Op->getOperand(1);
6180
6181 // Generate nodes to build:
6182 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6183 SDValue Ops[] = {Chain,
6184 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6185 SDValue FPSCR =
6186 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6187 Chain = FPSCR.getValue(1);
6188 FPSCR = FPSCR.getValue(0);
6189
6190 SDValue FPSCRMasked =
6191 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6192 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6193 SDValue InputMasked =
6194 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6195 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6196 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6197
6198 SDValue Ops2[] = {
6199 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6200 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6201}
6202
SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op->getOperand(0);

  // To get the default FP mode all control bits are cleared:
  // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
  SDValue Ops[] = {Chain,
                   DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
  SDValue FPSCR =
      DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
  Chain = FPSCR.getValue(1);
  FPSCR = FPSCR.getValue(0);

  // Mask FPSCR so only status and reserved bits survive, then write the
  // result back. NOTE(review): the constant mask operand of this AND is on
  // a line elided from this view.
  SDValue FPSCRMasked = DAG.getNode(
      ISD::AND, DL, MVT::i32, FPSCR,
  SDValue Ops2[] = {Chain,
                    DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
                    FPSCRMasked};
  return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
}
6225
                         const ARMSubtarget *ST) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  // Vector path (NEON): count trailing zeros by isolating the least
  // significant set bit and mapping it through CTPOP or CTLZ, neither of
  // which needs a native count-trailing-zeros instruction.
  if (VT.isVector() && ST->hasNEON()) {

    // Compute the least significant set bit: LSB = X & -X
    SDValue X = N->getOperand(0);
    SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);

    EVT ElemTy = VT.getVectorElementType();

    if (ElemTy == MVT::i8) {
      // Compute with: cttz(x) = ctpop(lsb - 1)
      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(1, dl, ElemTy));
      SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
      return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
    }

    if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
        (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
      // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
      unsigned NumBits = ElemTy.getSizeInBits();
      SDValue WidthMinus1 =
          DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                      DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
      SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
    }

    // Compute with: cttz(x) = ctpop(lsb - 1)

    // Compute LSB - 1.
    SDValue Bits;
    if (ElemTy == MVT::i64) {
      // Load constant 0xffff'ffff'ffff'ffff to register.
      // (0x1eff is the pre-encoded VMOV modified immediate for all-ones;
      // adding all-ones is equivalent to subtracting 1.)
      SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                               DAG.getTargetConstant(0x1eff, dl, MVT::i32));
      Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
    } else {
      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(1, dl, ElemTy));
      Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
    }
    return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
  }

  // Scalar path: cttz(x) = ctlz(bitreverse(x)); needs v6T2 ops.
  if (!ST->hasV6T2Ops())
    return SDValue();

  SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
}
6281
                          const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
         "Unexpected type for custom ctpop lowering");

  // Count bits per byte with an 8-bit-element vector CTPOP, then widen the
  // partial sums up to the requested element size.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
  SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
  Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);

  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
  unsigned EltSize = 8;
  unsigned NumElts = VT.is64BitVector() ? 8 : 16;
  while (EltSize != VT.getScalarSizeInBits()) {
    // NOTE(review): the declaration of `Ops` (the intrinsic operand list)
    // is on a line elided from this view.
    Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
                                  TLI.getPointerTy(DAG.getDataLayout())));
    Ops.push_back(Res);

    // Each vpaddlu step doubles the element size and halves the count.
    EltSize *= 2;
    NumElts /= 2;
    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
  }

  return Res;
}
6314
/// Getvshiftimm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  // NOTE(review): the declaration of `BVN` (presumably a dyn_cast of Op to
  // BuildVectorSDNode) is on a line elided from this view.
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  // Accept only a constant splat whose value fits in the element width.
  if (!BVN ||
      !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
                            ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  // Report the sign-extended splat value as the shift count.
  Cnt = SplatBits.getSExtValue();
  return true;
}
6334
6335/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6336/// operand of a vector shift left operation. That value must be in the range:
6337/// 0 <= Value < ElementBits for a left shift; or
6338/// 0 <= Value <= ElementBits for a long left shift.
6339static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6340 assert(VT.isVector() && "vector shift count is not a vector type");
6341 int64_t ElementBits = VT.getScalarSizeInBits();
6342 if (!getVShiftImm(Op, ElementBits, Cnt))
6343 return false;
6344 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6345}
6346
6347/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6348/// operand of a vector shift right operation. For a shift opcode, the value
6349/// is positive, but for an intrinsic the value count must be negative. The
6350/// absolute value must be in the range:
6351/// 1 <= |Value| <= ElementBits for a right shift; or
6352/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6353static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6354 int64_t &Cnt) {
6355 assert(VT.isVector() && "vector shift count is not a vector type");
6356 int64_t ElementBits = VT.getScalarSizeInBits();
6357 if (!getVShiftImm(Op, ElementBits, Cnt))
6358 return false;
6359 if (!isIntrinsic)
6360 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6361 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6362 Cnt = -Cnt;
6363 return true;
6364 }
6365 return false;
6366}
6367
                          const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  int64_t Cnt;

  // Only vector shifts are lowered here.
  if (!VT.isVector())
    return SDValue();

  // We essentially have two forms here. Shift by an immediate and shift by a
  // vector register (there are also shift by a gpr, but that is just handled
  // with a tablegen pattern). We cannot easily match shift by an immediate in
  // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
  // For shifting by a vector, we don't have VSHR, only VSHL (which can be
  // signed or unsigned, and a negative shift indicates a shift right).
  if (N->getOpcode() == ISD::SHL) {
    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
      return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
                       N->getOperand(1));
  }

  assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
         "unexpected vector shift opcode");

  // Right shift by a uniform constant: use the immediate forms directly.
  if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
    unsigned VShiftOpc =
        (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
    return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
                       DAG.getConstant(Cnt, dl, MVT::i32));
  }

  // Other right shifts we don't have operations for (we use a shift left by a
  // negative number).
  EVT ShiftVT = N->getOperand(1).getValueType();
  SDValue NegatedCount = DAG.getNode(
      ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
  unsigned VShiftOpc =
      (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
  return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
}
6410
                                const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDLoc dl(N);

  // We can get here for a node like i32 = ISD::SHL i32, i64
  if (VT != MVT::i64)
    return SDValue();

  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SHL) &&
         "Unknown shift to lower!");

  unsigned ShOpc = N->getOpcode();
  // With MVE integer ops, use the LSLL/LSRL/ASRL nodes that shift a pair of
  // i32 registers as one 64-bit value.
  if (ST->hasMVEIntegerOps()) {
    SDValue ShAmt = N->getOperand(1);
    unsigned ShPartsOpc = ARMISD::LSLL;
    // NOTE(review): the declaration of `Con` (presumably a dyn_cast of ShAmt
    // to ConstantSDNode) is on a line elided from this view.

    // If the shift amount is greater than 32 or has a greater bitwidth than 64
    // then do the default optimisation
    if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
        (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
      return SDValue();

    // Extract the lower 32 bits of the shift amount if it's not an i32
    if (ShAmt->getValueType(0) != MVT::i32)
      ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);

    if (ShOpc == ISD::SRL) {
      if (!Con)
        // There is no t2LSRLr instruction so negate and perform an lsll if the
        // shift amount is in a register, emulating a right shift.
        ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                            DAG.getConstant(0, dl, MVT::i32), ShAmt);
      else
        // Else generate an lsrl on the immediate shift amount
        ShPartsOpc = ARMISD::LSRL;
    } else if (ShOpc == ISD::SRA)
      ShPartsOpc = ARMISD::ASRL;

    // Split Lower/Upper 32 bits of the destination/source
    SDValue Lo, Hi;
    std::tie(Lo, Hi) =
        DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
    // Generate the shift operation as computed above
    Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
                     ShAmt);
    // The upper 32 bits come from the second return value of lsll
    Hi = SDValue(Lo.getNode(), 1);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
  }

  // We only lower SRA, SRL of 1 here, all others use generic lowering.
  if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
    return SDValue();

  // If we are in thumb mode, we don't have RRX.
  if (ST->isThumb1Only())
    return SDValue();

  // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);

  // First, build a LSRS1/ASRS1 op, which shifts the top part by one and
  // captures the shifted out bit into a carry flag.
  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::LSRS1 : ARMISD::ASRS1;
  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, FlagsVT), Hi);

  // The low part is an ARMISD::RRX operand, which shifts the carry in.
  Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));

  // Merge the pieces into a single i64 value.
  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
}
6487
6489 const ARMSubtarget *ST) {
6490 bool Invert = false;
6491 bool Swap = false;
6492 unsigned Opc = ARMCC::AL;
6493
6494 SDValue Op0 = Op.getOperand(0);
6495 SDValue Op1 = Op.getOperand(1);
6496 SDValue CC = Op.getOperand(2);
6497 EVT VT = Op.getValueType();
6498 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6499 SDLoc dl(Op);
6500
6501 EVT CmpVT;
6502 if (ST->hasNEON())
6504 else {
6505 assert(ST->hasMVEIntegerOps() &&
6506 "No hardware support for integer vector comparison!");
6507
6508 if (Op.getValueType().getVectorElementType() != MVT::i1)
6509 return SDValue();
6510
6511 // Make sure we expand floating point setcc to scalar if we do not have
6512 // mve.fp, so that we can handle them from there.
6513 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6514 return SDValue();
6515
6516 CmpVT = VT;
6517 }
6518
6519 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6520 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6521 // Special-case integer 64-bit equality comparisons. They aren't legal,
6522 // but they can be lowered with a few vector instructions.
6523 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6524 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6525 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6526 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6527 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6528 DAG.getCondCode(ISD::SETEQ));
6529 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6530 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6531 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6532 if (SetCCOpcode == ISD::SETNE)
6533 Merged = DAG.getNOT(dl, Merged, CmpVT);
6534 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6535 return Merged;
6536 }
6537
6538 if (CmpVT.getVectorElementType() == MVT::i64)
6539 // 64-bit comparisons are not legal in general.
6540 return SDValue();
6541
6542 if (Op1.getValueType().isFloatingPoint()) {
6543 switch (SetCCOpcode) {
6544 default: llvm_unreachable("Illegal FP comparison");
6545 case ISD::SETUNE:
6546 case ISD::SETNE:
6547 if (ST->hasMVEFloatOps()) {
6548 Opc = ARMCC::NE; break;
6549 } else {
6550 Invert = true; [[fallthrough]];
6551 }
6552 case ISD::SETOEQ:
6553 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6554 case ISD::SETOLT:
6555 case ISD::SETLT: Swap = true; [[fallthrough]];
6556 case ISD::SETOGT:
6557 case ISD::SETGT: Opc = ARMCC::GT; break;
6558 case ISD::SETOLE:
6559 case ISD::SETLE: Swap = true; [[fallthrough]];
6560 case ISD::SETOGE:
6561 case ISD::SETGE: Opc = ARMCC::GE; break;
6562 case ISD::SETUGE: Swap = true; [[fallthrough]];
6563 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6564 case ISD::SETUGT: Swap = true; [[fallthrough]];
6565 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6566 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6567 case ISD::SETONE: {
6568 // Expand this to (OLT | OGT).
6569 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6570 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6571 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6572 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6573 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6574 if (Invert)
6575 Result = DAG.getNOT(dl, Result, VT);
6576 return Result;
6577 }
6578 case ISD::SETUO: Invert = true; [[fallthrough]];
6579 case ISD::SETO: {
6580 // Expand this to (OLT | OGE).
6581 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6582 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6583 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6584 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6585 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6586 if (Invert)
6587 Result = DAG.getNOT(dl, Result, VT);
6588 return Result;
6589 }
6590 }
6591 } else {
6592 // Integer comparisons.
6593 switch (SetCCOpcode) {
6594 default: llvm_unreachable("Illegal integer comparison");
6595 case ISD::SETNE:
6596 if (ST->hasMVEIntegerOps()) {
6597 Opc = ARMCC::NE; break;
6598 } else {
6599 Invert = true; [[fallthrough]];
6600 }
6601 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6602 case ISD::SETLT: Swap = true; [[fallthrough]];
6603 case ISD::SETGT: Opc = ARMCC::GT; break;
6604 case ISD::SETLE: Swap = true; [[fallthrough]];
6605 case ISD::SETGE: Opc = ARMCC::GE; break;
6606 case ISD::SETULT: Swap = true; [[fallthrough]];
6607 case ISD::SETUGT: Opc = ARMCC::HI; break;
6608 case ISD::SETULE: Swap = true; [[fallthrough]];
6609 case ISD::SETUGE: Opc = ARMCC::HS; break;
6610 }
6611
6612 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6613 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6614 SDValue AndOp;
6616 AndOp = Op0;
6617 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6618 AndOp = Op1;
6619
6620 // Ignore bitconvert.
6621 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6622 AndOp = AndOp.getOperand(0);
6623
6624 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6625 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6626 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6627 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6628 if (!Invert)
6629 Result = DAG.getNOT(dl, Result, VT);
6630 return Result;
6631 }
6632 }
6633 }
6634
6635 if (Swap)
6636 std::swap(Op0, Op1);
6637
6638 // If one of the operands is a constant vector zero, attempt to fold the
6639 // comparison to a specialized compare-against-zero form.
6641 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6642 Opc == ARMCC::NE)) {
6643 if (Opc == ARMCC::GE)
6644 Opc = ARMCC::LE;
6645 else if (Opc == ARMCC::GT)
6646 Opc = ARMCC::LT;
6647 std::swap(Op0, Op1);
6648 }
6649
6650 SDValue Result;
6652 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6653 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6654 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6655 DAG.getConstant(Opc, dl, MVT::i32));
6656 else
6657 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6658 DAG.getConstant(Opc, dl, MVT::i32));
6659
6660 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6661
6662 if (Invert)
6663 Result = DAG.getNOT(dl, Result, VT);
6664
6665 return Result;
6666}
6667
6669 SDValue LHS = Op.getOperand(0);
6670 SDValue RHS = Op.getOperand(1);
6671 SDValue Carry = Op.getOperand(2);
6672 SDValue Cond = Op.getOperand(3);
6673 SDLoc DL(Op);
6674
6675 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6676
6677 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6678 // have to invert the carry first.
6679 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6680 DAG.getConstant(1, DL, MVT::i32), Carry);
6681 // This converts the boolean value carry into the carry flag.
6682 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6683
6684 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6685 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6686
6687 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6688 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6689 SDValue ARMcc = DAG.getConstant(
6690 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6691 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6692 Cmp.getValue(1));
6693}
6694
/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
/// valid vector constant for a NEON or MVE instruction with a "modified
/// immediate" operand (e.g., VMOV). If so, return the encoded value.
///
/// \param SplatBits    the replicated value, \p SplatBitSize bits wide.
/// \param SplatUndef   bits that came from undef lanes and may be chosen
///                     freely when matching an encoding.
/// \param SplatBitSize smallest element size that splats the vector
///                     (8, 16, 32 or 64).
/// \param VT           [out] on success, the vector type to build the
///                     immediate with (64- vs 128-bit form is picked from
///                     \p VectorVT).
/// \param VectorVT     type of the vector being materialized.
/// \param type         instruction class; VORR/VBIC (OtherModImm) and MVE
///                     VMVN support fewer cmode encodings than plain VMOV.
/// \returns the Op/Cmode/Imm encoding as a target constant, or an empty
///          SDValue if the splat has no modified-immediate encoding.
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
                                 unsigned SplatBitSize, SelectionDAG &DAG,
                                 const SDLoc &dl, EVT &VT, EVT VectorVT,
                                 VMOVModImmType type) {
  unsigned OpCmode, Imm;
  bool is128Bits = VectorVT.is128BitVector();

  // SplatBitSize is set to the smallest size that splats the vector, so a
  // zero vector will always have SplatBitSize == 8. However, NEON modified
  // immediate instructions other than VMOV do not support the 8-bit encoding
  // of a zero vector, and the default encoding of zero is supposed to be the
  // 32-bit version.
  if (SplatBits == 0)
    SplatBitSize = 32;

  switch (SplatBitSize) {
  case 8:
    // Only plain VMOV has an 8-bit modified-immediate form.
    if (type != VMOVModImm)
      return SDValue();
    // Any 1-byte value is OK. Op=0, Cmode=1110.
    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
    OpCmode = 0xe;
    Imm = SplatBits;
    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
    break;

  case 16:
    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x00nn: Op=x, Cmode=100x.
      OpCmode = 0x8;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0xnn00: Op=x, Cmode=101x.
      OpCmode = 0xa;
      Imm = SplatBits >> 8;
      break;
    }
    return SDValue();

  case 32:
    // NEON's 32-bit VMOV supports splat values where:
    // * only one byte is nonzero, or
    // * the least significant byte is 0xff and the second byte is nonzero, or
    // * the least significant 2 bytes are 0xff and the third is nonzero.
    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x000000nn: Op=x, Cmode=000x.
      OpCmode = 0;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0x0000nn00: Op=x, Cmode=001x.
      OpCmode = 0x2;
      Imm = SplatBits >> 8;
      break;
    }
    if ((SplatBits & ~0xff0000) == 0) {
      // Value = 0x00nn0000: Op=x, Cmode=010x.
      OpCmode = 0x4;
      Imm = SplatBits >> 16;
      break;
    }
    if ((SplatBits & ~0xff000000) == 0) {
      // Value = 0xnn000000: Op=x, Cmode=011x.
      OpCmode = 0x6;
      Imm = SplatBits >> 24;
      break;
    }

    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
    if (type == OtherModImm) return SDValue();

    // "Shifted ones" forms: undef bits are allowed to supply the 0xff fill.
    if ((SplatBits & ~0xffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
      // Value = 0x0000nnff: Op=x, Cmode=1100.
      OpCmode = 0xc;
      Imm = SplatBits >> 8;
      break;
    }

    // cmode == 0b1101 is not supported for MVE VMVN
    if (type == MVEVMVNModImm)
      return SDValue();

    if ((SplatBits & ~0xffffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
      // Value = 0x00nnffff: Op=x, Cmode=1101.
      OpCmode = 0xd;
      Imm = SplatBits >> 16;
      break;
    }

    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
    // VMOV.I32. A (very) minor optimization would be to replicate the value
    // and fall through here to test for a valid 64-bit splat. But, then the
    // caller would also need to check and handle the change in size.
    return SDValue();

  case 64: {
    if (type != VMOVModImm)
      return SDValue();
    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
    // Each byte maps to one bit of the 8-bit immediate; undef bits may be
    // treated as 0xff, but a partially-set defined byte cannot be encoded.
    uint64_t BitMask = 0xff;
    unsigned ImmMask = 1;
    Imm = 0;
    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
        Imm |= ImmMask;
      } else if ((SplatBits & BitMask) != 0) {
        return SDValue();
      }
      BitMask <<= 8;
      ImmMask <<= 1;
    }

    // Op=1, Cmode=1110.
    OpCmode = 0x1e;
    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
    break;
  }

  default:
    llvm_unreachable("unexpected size for isVMOVModifiedImm");
  }

  unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
  return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
}
6832
6833SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
6834 const ARMSubtarget *ST) const {
6835 EVT VT = Op.getValueType();
6836 bool IsDouble = (VT == MVT::f64);
6837 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
6838 const APFloat &FPVal = CFP->getValueAPF();
6839
6840 // Prevent floating-point constants from using literal loads
6841 // when execute-only is enabled.
6842 if (ST->genExecuteOnly()) {
6843 // We shouldn't trigger this for v6m execute-only
6844 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
6845 "Unexpected architecture");
6846
6847 // If we can represent the constant as an immediate, don't lower it
6848 if (isFPImmLegal(FPVal, VT))
6849 return Op;
6850 // Otherwise, construct as integer, and move to float register
6851 APInt INTVal = FPVal.bitcastToAPInt();
6852 SDLoc DL(CFP);
6853 switch (VT.getSimpleVT().SimpleTy) {
6854 default:
6855 llvm_unreachable("Unknown floating point type!");
6856 break;
6857 case MVT::f64: {
6858 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
6859 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
6860 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
6861 }
6862 case MVT::f32:
6863 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
6864 DAG.getConstant(INTVal, DL, MVT::i32));
6865 }
6866 }
6867
6868 if (!ST->hasVFP3Base())
6869 return SDValue();
6870
6871 // Use the default (constant pool) lowering for double constants when we have
6872 // an SP-only FPU
6873 if (IsDouble && !Subtarget->hasFP64())
6874 return SDValue();
6875
6876 // Try splatting with a VMOV.f32...
6877 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
6878
6879 if (ImmVal != -1) {
6880 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
6881 // We have code in place to select a valid ConstantFP already, no need to
6882 // do any mangling.
6883 return Op;
6884 }
6885
6886 // It's a float and we are trying to use NEON operations where
6887 // possible. Lower it to a splat followed by an extract.
6888 SDLoc DL(Op);
6889 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
6890 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
6891 NewVal);
6892 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
6893 DAG.getConstant(0, DL, MVT::i32));
6894 }
6895
6896 // The rest of our options are NEON only, make sure that's allowed before
6897 // proceeding..
6898 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
6899 return SDValue();
6900
6901 EVT VMovVT;
6902 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
6903
6904 // It wouldn't really be worth bothering for doubles except for one very
6905 // important value, which does happen to match: 0.0. So make sure we don't do
6906 // anything stupid.
6907 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
6908 return SDValue();
6909
6910 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
6911 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
6912 VMovVT, VT, VMOVModImm);
6913 if (NewVal != SDValue()) {
6914 SDLoc DL(Op);
6915 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
6916 NewVal);
6917 if (IsDouble)
6918 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6919
6920 // It's a float: cast and extract a vector element.
6921 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6922 VecConstant);
6923 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6924 DAG.getConstant(0, DL, MVT::i32));
6925 }
6926
6927 // Finally, try a VMVN.i32
6928 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
6929 VT, VMVNModImm);
6930 if (NewVal != SDValue()) {
6931 SDLoc DL(Op);
6932 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
6933
6934 if (IsDouble)
6935 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6936
6937 // It's a float: cast and extract a vector element.
6938 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6939 VecConstant);
6940 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6941 DAG.getConstant(0, DL, MVT::i32));
6942 }
6943
6944 return SDValue();
6945}
6946
6947// check if an VEXT instruction can handle the shuffle mask when the
6948// vector sources of the shuffle are the same.
6949static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
6950 unsigned NumElts = VT.getVectorNumElements();
6951
6952 // Assume that the first shuffle index is not UNDEF. Fail if it is.
6953 if (M[0] < 0)
6954 return false;
6955
6956 Imm = M[0];
6957
6958 // If this is a VEXT shuffle, the immediate value is the index of the first
6959 // element. The other shuffle indices must be the successive elements after
6960 // the first one.
6961 unsigned ExpectedElt = Imm;
6962 for (unsigned i = 1; i < NumElts; ++i) {
6963 // Increment the expected index. If it wraps around, just follow it
6964 // back to index zero and keep going.
6965 ++ExpectedElt;
6966 if (ExpectedElt == NumElts)
6967 ExpectedElt = 0;
6968
6969 if (M[i] < 0) continue; // ignore UNDEF indices
6970 if (ExpectedElt != static_cast<unsigned>(M[i]))
6971 return false;
6972 }
6973
6974 return true;
6975}
6976
6977static bool isVEXTMask(ArrayRef<int> M, EVT VT,
6978 bool &ReverseVEXT, unsigned &Imm) {
6979 unsigned NumElts = VT.getVectorNumElements();
6980 ReverseVEXT = false;
6981
6982 // Assume that the first shuffle index is not UNDEF. Fail if it is.
6983 if (M[0] < 0)
6984 return false;
6985
6986 Imm = M[0];
6987
6988 // If this is a VEXT shuffle, the immediate value is the index of the first
6989 // element. The other shuffle indices must be the successive elements after
6990 // the first one.
6991 unsigned ExpectedElt = Imm;
6992 for (unsigned i = 1; i < NumElts; ++i) {
6993 // Increment the expected index. If it wraps around, it may still be
6994 // a VEXT but the source vectors must be swapped.
6995 ExpectedElt += 1;
6996 if (ExpectedElt == NumElts * 2) {
6997 ExpectedElt = 0;
6998 ReverseVEXT = true;
6999 }
7000
7001 if (M[i] < 0) continue; // ignore UNDEF indices
7002 if (ExpectedElt != static_cast<unsigned>(M[i]))
7003 return false;
7004 }
7005
7006 // Adjust the index value if the source operands will be swapped.
7007 if (ReverseVEXT)
7008 Imm -= NumElts;
7009
7010 return true;
7011}
7012
7013static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7014 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7015 // range, then 0 is placed into the resulting vector. So pretty much any mask
7016 // of 8 elements can work here.
7017 return VT == MVT::v8i8 && M.size() == 8;
7018}
7019
7020static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7021 unsigned Index) {
7022 if (Mask.size() == Elements * 2)
7023 return Index / Elements;
7024 return Mask[Index] == 0 ? 0 : 1;
7025}
7026
7027// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7028// checking that pairs of elements in the shuffle mask represent the same index
7029// in each vector, incrementing the expected index by 2 at each step.
7030// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7031// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7032// v2={e,f,g,h}
7033// WhichResult gives the offset for each element in the mask based on which
7034// of the two results it belongs to.
7035//
7036// The transpose can be represented either as:
7037// result1 = shufflevector v1, v2, result1_shuffle_mask
7038// result2 = shufflevector v1, v2, result2_shuffle_mask
7039// where v1/v2 and the shuffle masks have the same number of elements
7040// (here WhichResult (see below) indicates which result is being checked)
7041//
7042// or as:
7043// results = shufflevector v1, v2, shuffle_mask
7044// where both results are returned in one vector and the shuffle mask has twice
7045// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
7046// want to check the low half and high half of the shuffle mask as if it were
7047// the other case
7048static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7049 unsigned EltSz = VT.getScalarSizeInBits();
7050 if (EltSz == 64)
7051 return false;
7052
7053 unsigned NumElts = VT.getVectorNumElements();
7054 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7055 return false;
7056
7057 // If the mask is twice as long as the input vector then we need to check the
7058 // upper and lower parts of the mask with a matching value for WhichResult
7059 // FIXME: A mask with only even values will be rejected in case the first
7060 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7061 // M[0] is used to determine WhichResult
7062 for (unsigned i = 0; i < M.size(); i += NumElts) {
7063 WhichResult = SelectPairHalf(NumElts, M, i);
7064 for (unsigned j = 0; j < NumElts; j += 2) {
7065 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7066 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7067 return false;
7068 }
7069 }
7070
7071 if (M.size() == NumElts*2)
7072 WhichResult = 0;
7073
7074 return true;
7075}
7076
7077/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7078/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7079/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7080static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7081 unsigned EltSz = VT.getScalarSizeInBits();
7082 if (EltSz == 64)
7083 return false;
7084
7085 unsigned NumElts = VT.getVectorNumElements();
7086 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7087 return false;
7088
7089 for (unsigned i = 0; i < M.size(); i += NumElts) {
7090 WhichResult = SelectPairHalf(NumElts, M, i);
7091 for (unsigned j = 0; j < NumElts; j += 2) {
7092 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7093 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7094 return false;
7095 }
7096 }
7097
7098 if (M.size() == NumElts*2)
7099 WhichResult = 0;
7100
7101 return true;
7102}
7103
7104// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7105// that the mask elements are either all even and in steps of size 2 or all odd
7106// and in steps of size 2.
7107// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7108// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7109// v2={e,f,g,h}
7110// Requires similar checks to that of isVTRNMask with
7111// respect the how results are returned.
7112static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7113 unsigned EltSz = VT.getScalarSizeInBits();
7114 if (EltSz == 64)
7115 return false;
7116
7117 unsigned NumElts = VT.getVectorNumElements();
7118 if (M.size() != NumElts && M.size() != NumElts*2)
7119 return false;
7120
7121 for (unsigned i = 0; i < M.size(); i += NumElts) {
7122 WhichResult = SelectPairHalf(NumElts, M, i);
7123 for (unsigned j = 0; j < NumElts; ++j) {
7124 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7125 return false;
7126 }
7127 }
7128
7129 if (M.size() == NumElts*2)
7130 WhichResult = 0;
7131
7132 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7133 if (VT.is64BitVector() && EltSz == 32)
7134 return false;
7135
7136 return true;
7137}
7138
7139/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7140/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7141/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
7142static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7143 unsigned EltSz = VT.getScalarSizeInBits();
7144 if (EltSz == 64)
7145 return false;
7146
7147 unsigned NumElts = VT.getVectorNumElements();
7148 if (M.size() != NumElts && M.size() != NumElts*2)
7149 return false;
7150
7151 unsigned Half = NumElts / 2;
7152 for (unsigned i = 0; i < M.size(); i += NumElts) {
7153 WhichResult = SelectPairHalf(NumElts, M, i);
7154 for (unsigned j = 0; j < NumElts; j += Half) {
7155 unsigned Idx = WhichResult;
7156 for (unsigned k = 0; k < Half; ++k) {
7157 int MIdx = M[i + j + k];
7158 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7159 return false;
7160 Idx += 2;
7161 }
7162 }
7163 }
7164
7165 if (M.size() == NumElts*2)
7166 WhichResult = 0;
7167
7168 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7169 if (VT.is64BitVector() && EltSz == 32)
7170 return false;
7171
7172 return true;
7173}
7174
7175// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7176// that pairs of elements of the shufflemask represent the same index in each
7177// vector incrementing sequentially through the vectors.
7178// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7179// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7180// v2={e,f,g,h}
7181// Requires similar checks to that of isVTRNMask with respect the how results
7182// are returned.
7183static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7184 unsigned EltSz = VT.getScalarSizeInBits();
7185 if (EltSz == 64)
7186 return false;
7187
7188 unsigned NumElts = VT.getVectorNumElements();
7189 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7190 return false;
7191
7192 for (unsigned i = 0; i < M.size(); i += NumElts) {
7193 WhichResult = SelectPairHalf(NumElts, M, i);
7194 unsigned Idx = WhichResult * NumElts / 2;
7195 for (unsigned j = 0; j < NumElts; j += 2) {
7196 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7197 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7198 return false;
7199 Idx += 1;
7200 }
7201 }
7202
7203 if (M.size() == NumElts*2)
7204 WhichResult = 0;
7205
7206 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7207 if (VT.is64BitVector() && EltSz == 32)
7208 return false;
7209
7210 return true;
7211}
7212
7213/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7214/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7215/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7216static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7217 unsigned EltSz = VT.getScalarSizeInBits();
7218 if (EltSz == 64)
7219 return false;
7220
7221 unsigned NumElts = VT.getVectorNumElements();
7222 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7223 return false;
7224
7225 for (unsigned i = 0; i < M.size(); i += NumElts) {
7226 WhichResult = SelectPairHalf(NumElts, M, i);
7227 unsigned Idx = WhichResult * NumElts / 2;
7228 for (unsigned j = 0; j < NumElts; j += 2) {
7229 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7230 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7231 return false;
7232 Idx += 1;
7233 }
7234 }
7235
7236 if (M.size() == NumElts*2)
7237 WhichResult = 0;
7238
7239 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7240 if (VT.is64BitVector() && EltSz == 32)
7241 return false;
7242
7243 return true;
7244}
7245
7246/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7247/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7248static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7249 unsigned &WhichResult,
7250 bool &isV_UNDEF) {
7251 isV_UNDEF = false;
7252 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7253 return ARMISD::VTRN;
7254 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7255 return ARMISD::VUZP;
7256 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7257 return ARMISD::VZIP;
7258
7259 isV_UNDEF = true;
7260 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7261 return ARMISD::VTRN;
7262 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7263 return ARMISD::VUZP;
7264 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7265 return ARMISD::VZIP;
7266
7267 return 0;
7268}
7269
7270/// \return true if this is a reverse operation on an vector.
7271static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7272 unsigned NumElts = VT.getVectorNumElements();
7273 // Make sure the mask has the right size.
7274 if (NumElts != M.size())
7275 return false;
7276
7277 // Look for <15, ..., 3, -1, 1, 0>.
7278 for (unsigned i = 0; i != NumElts; ++i)
7279 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7280 return false;
7281
7282 return true;
7283}
7284
7285static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7286 unsigned NumElts = VT.getVectorNumElements();
7287 // Make sure the mask has the right size.
7288 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7289 return false;
7290
7291 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7292 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7293 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7294 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7295 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7296 int Ofs = Top ? 1 : 0;
7297 int Upper = SingleSource ? 0 : NumElts;
7298 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7299 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7300 return false;
7301 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7302 return false;
7303 }
7304 return true;
7305}
7306
7307static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7308 unsigned NumElts = VT.getVectorNumElements();
7309 // Make sure the mask has the right size.
7310 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7311 return false;
7312
7313 // If Top
7314 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7315 // This inserts Input2 into Input1
7316 // else if not Top
7317 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7318 // This inserts Input1 into Input2
7319 unsigned Offset = Top ? 0 : 1;
7320 unsigned N = SingleSource ? 0 : NumElts;
7321 for (unsigned i = 0; i < NumElts; i += 2) {
7322 if (M[i] >= 0 && M[i] != (int)i)
7323 return false;
7324 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7325 return false;
7326 }
7327
7328 return true;
7329}
7330
7331static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7332 unsigned NumElts = ToVT.getVectorNumElements();
7333 if (NumElts != M.size())
7334 return false;
7335
7336 // Test if the Trunc can be convertible to a VMOVN with this shuffle. We are
7337 // looking for patterns of:
7338 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7339 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7340
7341 unsigned Off0 = rev ? NumElts / 2 : 0;
7342 unsigned Off1 = rev ? 0 : NumElts / 2;
7343 for (unsigned i = 0; i < NumElts; i += 2) {
7344 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7345 return false;
7346 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7347 return false;
7348 }
7349
7350 return true;
7351}
7352
7353// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7354// from a pair of inputs. For example:
7355// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7356// FP_ROUND(EXTRACT_ELT(Y, 0),
7357// FP_ROUND(EXTRACT_ELT(X, 1),
7358// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
7360 const ARMSubtarget *ST) {
7361 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7362 if (!ST->hasMVEFloatOps())
7363 return SDValue();
7364
7365 SDLoc dl(BV);
7366 EVT VT = BV.getValueType();
7367 if (VT != MVT::v8f16)
7368 return SDValue();
7369
7370 // We are looking for a buildvector of fptrunc elements, where all the
7371 // elements are interleavingly extracted from two sources. Check the first two
7372 // items are valid enough and extract some info from them (they are checked
7373 // properly in the loop below).
7374 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7377 return SDValue();
7378 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7381 return SDValue();
7382 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7383 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7384 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7385 return SDValue();
7386
7387 // Check all the values in the BuildVector line up with our expectations.
7388 for (unsigned i = 1; i < 4; i++) {
7389 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7390 return Trunc.getOpcode() == ISD::FP_ROUND &&
7392 Trunc.getOperand(0).getOperand(0) == Op &&
7393 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7394 };
7395 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7396 return SDValue();
7397 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7398 return SDValue();
7399 }
7400
7401 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7402 DAG.getConstant(0, dl, MVT::i32));
7403 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7404 DAG.getConstant(1, dl, MVT::i32));
7405}
7406
7407// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7408// from a single input on alternating lanes. For example:
7409// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7410// FP_ROUND(EXTRACT_ELT(X, 2),
7411// FP_ROUND(EXTRACT_ELT(X, 4), ...)
7413 const ARMSubtarget *ST) {
7414 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7415 if (!ST->hasMVEFloatOps())
7416 return SDValue();
7417
7418 SDLoc dl(BV);
7419 EVT VT = BV.getValueType();
7420 if (VT != MVT::v4f32)
7421 return SDValue();
7422
7423 // We are looking for a buildvector of fptext elements, where all the
7424 // elements are alternating lanes from a single source. For example <0,2,4,6>
7425 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7426 // info from them (they are checked properly in the loop below).
7427 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7429 return SDValue();
7430 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7432 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7433 return SDValue();
7434
7435 // Check all the values in the BuildVector line up with our expectations.
7436 for (unsigned i = 1; i < 4; i++) {
7437 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7438 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7440 Trunc.getOperand(0).getOperand(0) == Op &&
7441 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7442 };
7443 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7444 return SDValue();
7445 }
7446
7447 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7448 DAG.getConstant(Offset, dl, MVT::i32));
7449}
7450
7451// If N is an integer constant that can be moved into a register in one
7452// instruction, return an SDValue of such a constant (will become a MOV
7453// instruction). Otherwise return null.
7455 const ARMSubtarget *ST, const SDLoc &dl) {
7456 uint64_t Val;
7457 if (!isa<ConstantSDNode>(N))
7458 return SDValue();
7459 Val = N->getAsZExtVal();
7460
7461 if (ST->isThumb1Only()) {
7462 if (Val <= 255 || ~Val <= 255)
7463 return DAG.getConstant(Val, dl, MVT::i32);
7464 } else {
7465 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7466 return DAG.getConstant(Val, dl, MVT::i32);
7467 }
7468 return SDValue();
7469}
7470
7472 const ARMSubtarget *ST) {
7473 SDLoc dl(Op);
7474 EVT VT = Op.getValueType();
7475
7476 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7477
7478 unsigned NumElts = VT.getVectorNumElements();
7479 unsigned BoolMask;
7480 unsigned BitsPerBool;
7481 if (NumElts == 2) {
7482 BitsPerBool = 8;
7483 BoolMask = 0xff;
7484 } else if (NumElts == 4) {
7485 BitsPerBool = 4;
7486 BoolMask = 0xf;
7487 } else if (NumElts == 8) {
7488 BitsPerBool = 2;
7489 BoolMask = 0x3;
7490 } else if (NumElts == 16) {
7491 BitsPerBool = 1;
7492 BoolMask = 0x1;
7493 } else
7494 return SDValue();
7495
7496 // If this is a single value copied into all lanes (a splat), we can just sign
7497 // extend that single value
7498 SDValue FirstOp = Op.getOperand(0);
7499 if (!isa<ConstantSDNode>(FirstOp) &&
7500 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7501 return U.get().isUndef() || U.get() == FirstOp;
7502 })) {
7503 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7504 DAG.getValueType(MVT::i1));
7505 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7506 }
7507
7508 // First create base with bits set where known
7509 unsigned Bits32 = 0;
7510 for (unsigned i = 0; i < NumElts; ++i) {
7511 SDValue V = Op.getOperand(i);
7512 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7513 continue;
7514 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7515 if (BitSet)
7516 Bits32 |= BoolMask << (i * BitsPerBool);
7517 }
7518
7519 // Add in unknown nodes
7520 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7521 DAG.getConstant(Bits32, dl, MVT::i32));
7522 for (unsigned i = 0; i < NumElts; ++i) {
7523 SDValue V = Op.getOperand(i);
7524 if (isa<ConstantSDNode>(V) || V.isUndef())
7525 continue;
7526 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7527 DAG.getConstant(i, dl, MVT::i32));
7528 }
7529
7530 return Base;
7531}
7532
7534 const ARMSubtarget *ST) {
7535 if (!ST->hasMVEIntegerOps())
7536 return SDValue();
7537
7538 // We are looking for a buildvector where each element is Op[0] + i*N
7539 EVT VT = Op.getValueType();
7540 SDValue Op0 = Op.getOperand(0);
7541 unsigned NumElts = VT.getVectorNumElements();
7542
7543 // Get the increment value from operand 1
7544 SDValue Op1 = Op.getOperand(1);
7545 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7547 return SDValue();
7548 unsigned N = Op1.getConstantOperandVal(1);
7549 if (N != 1 && N != 2 && N != 4 && N != 8)
7550 return SDValue();
7551
7552 // Check that each other operand matches
7553 for (unsigned I = 2; I < NumElts; I++) {
7554 SDValue OpI = Op.getOperand(I);
7555 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7557 OpI.getConstantOperandVal(1) != I * N)
7558 return SDValue();
7559 }
7560
7561 SDLoc DL(Op);
7562 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7563 DAG.getConstant(N, DL, MVT::i32));
7564}
7565
7566// Returns true if the operation N can be treated as qr instruction variant at
7567// operand Op.
7568static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7569 switch (N->getOpcode()) {
7570 case ISD::ADD:
7571 case ISD::MUL:
7572 case ISD::SADDSAT:
7573 case ISD::UADDSAT:
7574 case ISD::AVGFLOORS:
7575 case ISD::AVGFLOORU:
7576 return true;
7577 case ISD::SUB:
7578 case ISD::SSUBSAT:
7579 case ISD::USUBSAT:
7580 return N->getOperand(1).getNode() == Op;
7582 switch (N->getConstantOperandVal(0)) {
7583 case Intrinsic::arm_mve_add_predicated:
7584 case Intrinsic::arm_mve_mul_predicated:
7585 case Intrinsic::arm_mve_qadd_predicated:
7586 case Intrinsic::arm_mve_vhadd:
7587 case Intrinsic::arm_mve_hadd_predicated:
7588 case Intrinsic::arm_mve_vqdmulh:
7589 case Intrinsic::arm_mve_qdmulh_predicated:
7590 case Intrinsic::arm_mve_vqrdmulh:
7591 case Intrinsic::arm_mve_qrdmulh_predicated:
7592 case Intrinsic::arm_mve_vqdmull:
7593 case Intrinsic::arm_mve_vqdmull_predicated:
7594 return true;
7595 case Intrinsic::arm_mve_sub_predicated:
7596 case Intrinsic::arm_mve_qsub_predicated:
7597 case Intrinsic::arm_mve_vhsub:
7598 case Intrinsic::arm_mve_hsub_predicated:
7599 return N->getOperand(2).getNode() == Op;
7600 default:
7601 return false;
7602 }
7603 default:
7604 return false;
7605 }
7606}
7607
7608// If this is a case we can't handle, return null and let the default
7609// expansion code take care of it.
7610SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7611 const ARMSubtarget *ST) const {
7612 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7613 SDLoc dl(Op);
7614 EVT VT = Op.getValueType();
7615
7616 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7617 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7618
7619 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7620 return R;
7621
7622 APInt SplatBits, SplatUndef;
7623 unsigned SplatBitSize;
7624 bool HasAnyUndefs;
7625 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7626 if (SplatUndef.isAllOnes())
7627 return DAG.getUNDEF(VT);
7628
7629 // If all the users of this constant splat are qr instruction variants,
7630 // generate a vdup of the constant.
7631 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7632 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7633 all_of(BVN->users(),
7634 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7635 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7636 : SplatBitSize == 16 ? MVT::v8i16
7637 : MVT::v16i8;
7638 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7639 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7640 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7641 }
7642
7643 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7644 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7645 // Check if an immediate VMOV works.
7646 EVT VmovVT;
7647 SDValue Val =
7648 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7649 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7650
7651 if (Val.getNode()) {
7652 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7653 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7654 }
7655
7656 // Try an immediate VMVN.
7657 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7658 Val = isVMOVModifiedImm(
7659 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7660 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7661 if (Val.getNode()) {
7662 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7663 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7664 }
7665
7666 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7667 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7668 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7669 if (ImmVal != -1) {
7670 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7671 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7672 }
7673 }
7674
7675 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7676 // type.
7677 if (ST->hasMVEIntegerOps() &&
7678 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7679 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7680 : SplatBitSize == 16 ? MVT::v8i16
7681 : MVT::v16i8;
7682 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7683 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7684 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7685 }
7686 }
7687 }
7688
7689 // Scan through the operands to see if only one value is used.
7690 //
7691 // As an optimisation, even if more than one value is used it may be more
7692 // profitable to splat with one value then change some lanes.
7693 //
7694 // Heuristically we decide to do this if the vector has a "dominant" value,
7695 // defined as splatted to more than half of the lanes.
7696 unsigned NumElts = VT.getVectorNumElements();
7697 bool isOnlyLowElement = true;
7698 bool usesOnlyOneValue = true;
7699 bool hasDominantValue = false;
7700 bool isConstant = true;
7701
7702 // Map of the number of times a particular SDValue appears in the
7703 // element list.
7704 DenseMap<SDValue, unsigned> ValueCounts;
7705 SDValue Value;
7706 for (unsigned i = 0; i < NumElts; ++i) {
7707 SDValue V = Op.getOperand(i);
7708 if (V.isUndef())
7709 continue;
7710 if (i > 0)
7711 isOnlyLowElement = false;
7713 isConstant = false;
7714
7715 unsigned &Count = ValueCounts[V];
7716
7717 // Is this value dominant? (takes up more than half of the lanes)
7718 if (++Count > (NumElts / 2)) {
7719 hasDominantValue = true;
7720 Value = V;
7721 }
7722 }
7723 if (ValueCounts.size() != 1)
7724 usesOnlyOneValue = false;
7725 if (!Value.getNode() && !ValueCounts.empty())
7726 Value = ValueCounts.begin()->first;
7727
7728 if (ValueCounts.empty())
7729 return DAG.getUNDEF(VT);
7730
7731 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7732 // Keep going if we are hitting this case.
7733 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
7734 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7735
7736 unsigned EltSize = VT.getScalarSizeInBits();
7737
7738 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7739 // i32 and try again.
7740 if (hasDominantValue && EltSize <= 32) {
7741 if (!isConstant) {
7742 SDValue N;
7743
7744 // If we are VDUPing a value that comes directly from a vector, that will
7745 // cause an unnecessary move to and from a GPR, where instead we could
7746 // just use VDUPLANE. We can only do this if the lane being extracted
7747 // is at a constant index, as the VDUP from lane instructions only have
7748 // constant-index forms.
7749 ConstantSDNode *constIndex;
7750 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7751 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7752 // We need to create a new undef vector to use for the VDUPLANE if the
7753 // size of the vector from which we get the value is different than the
7754 // size of the vector that we need to create. We will insert the element
7755 // such that the register coalescer will remove unnecessary copies.
7756 if (VT != Value->getOperand(0).getValueType()) {
7757 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7759 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7760 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7761 Value, DAG.getConstant(index, dl, MVT::i32)),
7762 DAG.getConstant(index, dl, MVT::i32));
7763 } else
7764 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7765 Value->getOperand(0), Value->getOperand(1));
7766 } else
7767 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7768
7769 if (!usesOnlyOneValue) {
7770 // The dominant value was splatted as 'N', but we now have to insert
7771 // all differing elements.
7772 for (unsigned I = 0; I < NumElts; ++I) {
7773 if (Op.getOperand(I) == Value)
7774 continue;
7776 Ops.push_back(N);
7777 Ops.push_back(Op.getOperand(I));
7778 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7779 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7780 }
7781 }
7782 return N;
7783 }
7786 MVT FVT = VT.getVectorElementType().getSimpleVT();
7787 assert(FVT == MVT::f32 || FVT == MVT::f16);
7788 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
7789 for (unsigned i = 0; i < NumElts; ++i)
7790 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
7791 Op.getOperand(i)));
7792 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
7793 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
7794 Val = LowerBUILD_VECTOR(Val, DAG, ST);
7795 if (Val.getNode())
7796 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7797 }
7798 if (usesOnlyOneValue) {
7799 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
7800 if (isConstant && Val.getNode())
7801 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
7802 }
7803 }
7804
7805 // If all elements are constants and the case above didn't get hit, fall back
7806 // to the default expansion, which will generate a load from the constant
7807 // pool.
7808 if (isConstant)
7809 return SDValue();
7810
7811 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
7812 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
7813 // length <= 2.
7814 if (NumElts >= 4)
7815 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
7816 return shuffle;
7817
7818 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
7819 // VCVT's
7820 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
7821 return VCVT;
7822 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
7823 return VCVT;
7824
7825 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
7826 // If we haven't found an efficient lowering, try splitting a 128-bit vector
7827 // into two 64-bit vectors; we might discover a better way to lower it.
7828 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
7829 EVT ExtVT = VT.getVectorElementType();
7830 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
7831 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
7832 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
7833 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
7834 SDValue Upper =
7835 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
7836 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
7837 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
7838 if (Lower && Upper)
7839 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
7840 }
7841
7842 // Vectors with 32- or 64-bit elements can be built by directly assigning
7843 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
7844 // will be legalized.
7845 if (EltSize >= 32) {
7846 // Do the expansion with floating-point types, since that is what the VFP
7847 // registers are defined to use, and since i64 is not legal.
7848 EVT EltVT = EVT::getFloatingPointVT(EltSize);
7849 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
7851 for (unsigned i = 0; i < NumElts; ++i)
7852 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
7853 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
7854 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7855 }
7856
7857 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
7858 // know the default expansion would otherwise fall back on something even
7859 // worse. For a vector with one or two non-undef values, that's
7860 // scalar_to_vector for the elements followed by a shuffle (provided the
7861 // shuffle is valid for the target) and materialization element by element
7862 // on the stack followed by a load for everything else.
7863 if (!isConstant && !usesOnlyOneValue) {
7864 SDValue Vec = DAG.getUNDEF(VT);
7865 for (unsigned i = 0 ; i < NumElts; ++i) {
7866 SDValue V = Op.getOperand(i);
7867 if (V.isUndef())
7868 continue;
7869 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
7870 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
7871 }
7872 return Vec;
7873 }
7874
7875 return SDValue();
7876}
7877
7878// Gather data to see if the operation can be modelled as a
7879// shuffle in combination with VEXTs.
7880SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
7881 SelectionDAG &DAG) const {
7882 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7883 SDLoc dl(Op);
7884 EVT VT = Op.getValueType();
7885 unsigned NumElts = VT.getVectorNumElements();
7886
7887 struct ShuffleSourceInfo {
7888 SDValue Vec;
7889 unsigned MinElt = std::numeric_limits<unsigned>::max();
7890 unsigned MaxElt = 0;
7891
7892 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
7893 // be compatible with the shuffle we intend to construct. As a result
7894 // ShuffleVec will be some sliding window into the original Vec.
7895 SDValue ShuffleVec;
7896
7897 // Code should guarantee that element i in Vec starts at element "WindowBase
7898 // + i * WindowScale in ShuffleVec".
7899 int WindowBase = 0;
7900 int WindowScale = 1;
7901
7902 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
7903
7904 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
7905 };
7906
7907 // First gather all vectors used as an immediate source for this BUILD_VECTOR
7908 // node.
7910 for (unsigned i = 0; i < NumElts; ++i) {
7911 SDValue V = Op.getOperand(i);
7912 if (V.isUndef())
7913 continue;
7914 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
7915 // A shuffle can only come from building a vector from various
7916 // elements of other vectors.
7917 return SDValue();
7918 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
7919 // Furthermore, shuffles require a constant mask, whereas extractelts
7920 // accept variable indices.
7921 return SDValue();
7922 }
7923
7924 // Add this element source to the list if it's not already there.
7925 SDValue SourceVec = V.getOperand(0);
7926 auto Source = llvm::find(Sources, SourceVec);
7927 if (Source == Sources.end())
7928 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
7929
7930 // Update the minimum and maximum lane number seen.
7931 unsigned EltNo = V.getConstantOperandVal(1);
7932 Source->MinElt = std::min(Source->MinElt, EltNo);
7933 Source->MaxElt = std::max(Source->MaxElt, EltNo);
7934 }
7935
7936 // Currently only do something sane when at most two source vectors
7937 // are involved.
7938 if (Sources.size() > 2)
7939 return SDValue();
7940
7941 // Find out the smallest element size among result and two sources, and use
7942 // it as element size to build the shuffle_vector.
7943 EVT SmallestEltTy = VT.getVectorElementType();
7944 for (auto &Source : Sources) {
7945 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
7946 if (SrcEltTy.bitsLT(SmallestEltTy))
7947 SmallestEltTy = SrcEltTy;
7948 }
7949 unsigned ResMultiplier =
7950 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
7951 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
7952 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
7953
7954 // If the source vector is too wide or too narrow, we may nevertheless be able
7955 // to construct a compatible shuffle either by concatenating it with UNDEF or
7956 // extracting a suitable range of elements.
7957 for (auto &Src : Sources) {
7958 EVT SrcVT = Src.ShuffleVec.getValueType();
7959
7960 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
7961 uint64_t VTSize = VT.getFixedSizeInBits();
7962 if (SrcVTSize == VTSize)
7963 continue;
7964
7965 // This stage of the search produces a source with the same element type as
7966 // the original, but with a total width matching the BUILD_VECTOR output.
7967 EVT EltVT = SrcVT.getVectorElementType();
7968 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
7969 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
7970
7971 if (SrcVTSize < VTSize) {
7972 if (2 * SrcVTSize != VTSize)
7973 return SDValue();
7974 // We can pad out the smaller vector for free, so if it's part of a
7975 // shuffle...
7976 Src.ShuffleVec =
7977 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
7978 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
7979 continue;
7980 }
7981
7982 if (SrcVTSize != 2 * VTSize)
7983 return SDValue();
7984
7985 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
7986 // Span too large for a VEXT to cope
7987 return SDValue();
7988 }
7989
7990 if (Src.MinElt >= NumSrcElts) {
7991 // The extraction can just take the second half
7992 Src.ShuffleVec =
7993 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7994 DAG.getConstant(NumSrcElts, dl, MVT::i32));
7995 Src.WindowBase = -NumSrcElts;
7996 } else if (Src.MaxElt < NumSrcElts) {
7997 // The extraction can just take the first half
7998 Src.ShuffleVec =
7999 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8000 DAG.getConstant(0, dl, MVT::i32));
8001 } else {
8002 // An actual VEXT is needed
8003 SDValue VEXTSrc1 =
8004 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8005 DAG.getConstant(0, dl, MVT::i32));
8006 SDValue VEXTSrc2 =
8007 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8008 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8009
8010 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8011 VEXTSrc2,
8012 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8013 Src.WindowBase = -Src.MinElt;
8014 }
8015 }
8016
8017 // Another possible incompatibility occurs from the vector element types. We
8018 // can fix this by bitcasting the source vectors to the same type we intend
8019 // for the shuffle.
8020 for (auto &Src : Sources) {
8021 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8022 if (SrcEltTy == SmallestEltTy)
8023 continue;
8024 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8025 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
8026 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8027 Src.WindowBase *= Src.WindowScale;
8028 }
8029
8030 // Final check before we try to actually produce a shuffle.
8031 LLVM_DEBUG({
8032 for (auto Src : Sources)
8033 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
8034 });
8035
8036 // The stars all align, our next step is to produce the mask for the shuffle.
8037 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8038 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8039 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8040 SDValue Entry = Op.getOperand(i);
8041 if (Entry.isUndef())
8042 continue;
8043
8044 auto Src = llvm::find(Sources, Entry.getOperand(0));
8045 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8046
8047 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8048 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8049 // segment.
8050 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8051 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8052 VT.getScalarSizeInBits());
8053 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8054
8055 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8056 // starting at the appropriate offset.
8057 int *LaneMask = &Mask[i * ResMultiplier];
8058
8059 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8060 ExtractBase += NumElts * (Src - Sources.begin());
8061 for (int j = 0; j < LanesDefined; ++j)
8062 LaneMask[j] = ExtractBase + j;
8063 }
8064
8065
8066 // We can't handle more than two sources. This should have already
8067 // been checked before this point.
8068 assert(Sources.size() <= 2 && "Too many sources!");
8069
8070 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8071 for (unsigned i = 0; i < Sources.size(); ++i)
8072 ShuffleOps[i] = Sources[i].ShuffleVec;
8073
8074 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8075 ShuffleOps[1], Mask, DAG);
8076 if (!Shuffle)
8077 return SDValue();
8078 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8079}
8080
// Opcodes stored in the 4-bit op field of a perfect-shuffle table entry.
// The relative order matters: GeneratePerfectShuffle computes lane numbers
// as OpNum-OP_VDUP0, immediates as OpNum-OP_VEXT1+1 and result numbers as
// OpNum-OP_VUZPL/OP_VZIPL/OP_VTRNL.
enum ShuffleOpCodes {
  OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  OP_VREV,     // VREV, reverse within each double word
  OP_VDUP0,    // VDUP lane 0
  OP_VDUP1,    // VDUP lane 1
  OP_VDUP2,    // VDUP lane 2
  OP_VDUP3,    // VDUP lane 3
  OP_VEXT1,    // VEXT, immediate 1
  OP_VEXT2,    // VEXT, immediate 2
  OP_VEXT3,    // VEXT, immediate 3
  OP_VUZPL,    // VUZP, left result
  OP_VUZPR,    // VUZP, right result
  OP_VZIPL,    // VZIP, left result
  OP_VZIPR,    // VZIP, right result
  OP_VTRNL,    // VTRN, left result
  OP_VTRNR     // VTRN, right result
};
8098
8099static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8100 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8101 switch (OpNum) {
8102 case OP_COPY:
8103 case OP_VREV:
8104 case OP_VDUP0:
8105 case OP_VDUP1:
8106 case OP_VDUP2:
8107 case OP_VDUP3:
8108 return true;
8109 }
8110 return false;
8111}
8112
8113/// isShuffleMaskLegal - Targets can use this to indicate that they only
8114/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8115/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8116/// are assumed to be legal.
8118 if (VT.getVectorNumElements() == 4 &&
8119 (VT.is128BitVector() || VT.is64BitVector())) {
8120 unsigned PFIndexes[4];
8121 for (unsigned i = 0; i != 4; ++i) {
8122 if (M[i] < 0)
8123 PFIndexes[i] = 8;
8124 else
8125 PFIndexes[i] = M[i];
8126 }
8127
8128 // Compute the index in the perfect shuffle table.
8129 unsigned PFTableIndex =
8130 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8131 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8132 unsigned Cost = (PFEntry >> 30);
8133
8134 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8135 return true;
8136 }
8137
8138 bool ReverseVEXT, isV_UNDEF;
8139 unsigned Imm, WhichResult;
8140
8141 unsigned EltSize = VT.getScalarSizeInBits();
8142 if (EltSize >= 32 ||
8144 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8145 isVREVMask(M, VT, 64) ||
8146 isVREVMask(M, VT, 32) ||
8147 isVREVMask(M, VT, 16))
8148 return true;
8149 else if (Subtarget->hasNEON() &&
8150 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8151 isVTBLMask(M, VT) ||
8152 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8153 return true;
8154 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8155 isReverseMask(M, VT))
8156 return true;
8157 else if (Subtarget->hasMVEIntegerOps() &&
8158 (isVMOVNMask(M, VT, true, false) ||
8159 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8160 return true;
8161 else if (Subtarget->hasMVEIntegerOps() &&
8162 (isTruncMask(M, VT, false, false) ||
8163 isTruncMask(M, VT, false, true) ||
8164 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8165 return true;
8166 else
8167 return false;
8168}
8169
8170/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8171/// the specified operations to build the shuffle.
8173 SDValue RHS, SelectionDAG &DAG,
8174 const SDLoc &dl) {
8175 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8176 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8177 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8178
8179 if (OpNum == OP_COPY) {
8180 if (LHSID == (1*9+2)*9+3) return LHS;
8181 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8182 return RHS;
8183 }
8184
8185 SDValue OpLHS, OpRHS;
8186 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8187 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8188 EVT VT = OpLHS.getValueType();
8189
8190 switch (OpNum) {
8191 default: llvm_unreachable("Unknown shuffle opcode!");
8192 case OP_VREV:
8193 // VREV divides the vector in half and swaps within the half.
8194 if (VT.getScalarSizeInBits() == 32)
8195 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8196 // vrev <4 x i16> -> VREV32
8197 if (VT.getScalarSizeInBits() == 16)
8198 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8199 // vrev <4 x i8> -> VREV16
8200 assert(VT.getScalarSizeInBits() == 8);
8201 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8202 case OP_VDUP0:
8203 case OP_VDUP1:
8204 case OP_VDUP2:
8205 case OP_VDUP3:
8206 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8207 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8208 case OP_VEXT1:
8209 case OP_VEXT2:
8210 case OP_VEXT3:
8211 return DAG.getNode(ARMISD::VEXT, dl, VT,
8212 OpLHS, OpRHS,
8213 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8214 case OP_VUZPL:
8215 case OP_VUZPR:
8216 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8217 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8218 case OP_VZIPL:
8219 case OP_VZIPR:
8220 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8221 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8222 case OP_VTRNL:
8223 case OP_VTRNR:
8224 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8225 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8226 }
8227}
8228
8230 ArrayRef<int> ShuffleMask,
8231 SelectionDAG &DAG) {
8232 // Check to see if we can use the VTBL instruction.
8233 SDValue V1 = Op.getOperand(0);
8234 SDValue V2 = Op.getOperand(1);
8235 SDLoc DL(Op);
8236
8237 SmallVector<SDValue, 8> VTBLMask;
8238 for (int I : ShuffleMask)
8239 VTBLMask.push_back(DAG.getSignedConstant(I, DL, MVT::i32));
8240
8241 if (V2.getNode()->isUndef())
8242 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8243 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8244
8245 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8246 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8247}
8248
8250 SDLoc DL(Op);
8251 EVT VT = Op.getValueType();
8252
8253 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8254 "Expect an v8i16/v16i8 type");
8255 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8256 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8257 // extract the first 8 bytes into the top double word and the last 8 bytes
8258 // into the bottom double word, through a new vector shuffle that will be
8259 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
8260 std::vector<int> NewMask;
8261 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8262 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8263 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8264 NewMask.push_back(i);
8265 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8266}
8267
8269 switch (VT.getSimpleVT().SimpleTy) {
8270 case MVT::v2i1:
8271 return MVT::v2f64;
8272 case MVT::v4i1:
8273 return MVT::v4i32;
8274 case MVT::v8i1:
8275 return MVT::v8i16;
8276 case MVT::v16i1:
8277 return MVT::v16i8;
8278 default:
8279 llvm_unreachable("Unexpected vector predicate type");
8280 }
8281}
8282
8284 SelectionDAG &DAG) {
8285 // Converting from boolean predicates to integers involves creating a vector
8286 // of all ones or all zeroes and selecting the lanes based upon the real
8287 // predicate.
8289 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8290 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8291
8292 SDValue AllZeroes =
8293 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8294 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8295
8296 // Get full vector type from predicate type
8298
8299 SDValue RecastV1;
8300 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
8301 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8302 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8303 // since we know in hardware the sizes are really the same.
8304 if (VT != MVT::v16i1)
8305 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8306 else
8307 RecastV1 = Pred;
8308
8309 // Select either all ones or zeroes depending upon the real predicate bits.
8310 SDValue PredAsVector =
8311 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8312
8313 // Recast our new predicate-as-integer v16i8 vector into something
8314 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8315 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8316}
8317
// Lower an ISD::VECTOR_SHUFFLE producing an MVE i1 predicate vector. A
// reverse mask is handled by moving the predicate into a GPR, bit-reversing
// it, and shifting the (16-bit) predicate back down. Every other mask is
// handled by promoting the predicate(s) to byte-per-lane integer vectors,
// doing an ordinary integer shuffle, and converting back to a predicate with
// a compare-against-zero (VCMPZ, ARMCC::NE).
// NOTE(review): the signature line and the cast producing SVN (a
// ShuffleVectorSDNode) were dropped by the doxygen extraction.
8319 const ARMSubtarget *ST) {
8320 EVT VT = Op.getValueType();
8322 ArrayRef<int> ShuffleMask = SVN->getMask();
8323
8324 assert(ST->hasMVEIntegerOps() &&
8325 "No support for vector shuffle of boolean predicates");
8326
8327 SDValue V1 = Op.getOperand(0);
8328 SDValue V2 = Op.getOperand(1);
8329 SDLoc dl(Op);
8330 if (isReverseMask(ShuffleMask, VT)) {
// Predicate lives in the low 16 bits of the GPR: BITREVERSE then shift the
// reversed bits back down by 16.
8331 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8332 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8333 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8334 DAG.getConstant(16, dl, MVT::i32));
8335 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8336 }
8337
8338 // Until we can come up with optimised cases for every single vector
8339 // shuffle in existence we have chosen the least painful strategy. This is
8340 // to essentially promote the boolean predicate to a 8-bit integer, where
8341 // each predicate represents a byte. Then we fall back on a normal integer
8342 // vector shuffle and convert the result back into a predicate vector. In
8343 // many cases the generated code might be even better than scalar code
8344 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8345 // fields in a register into 8 other arbitrary 2-bit fields!
8346 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8347 EVT NewVT = PredAsVector1.getValueType();
8348 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8349 : PromoteMVEPredVector(dl, V2, VT, DAG);
8350 assert(PredAsVector2.getValueType() == NewVT &&
8351 "Expected identical vector type in expanded i1 shuffle!");
8352
8353 // Do the shuffle!
8354 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8355 PredAsVector2, ShuffleMask);
8356
8357 // Now return the result of comparing the shuffled vector with zero,
8358 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8359 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8360 if (VT == MVT::v2i1) {
8361 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8362 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8363 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8364 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8365 }
8366 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8367 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8368 }
8369
// Lower a shuffle of sub-32-bit-element vectors using (up to four) whole
// 32-bit lane moves. Each 32-bit quarter of the result that is exactly one
// contiguous 32-bit lane of an input is fetched with a single f32
// extract_vector_elt; any quarters that don't match are produced by a reduced
// shuffle on the original operands. Returns SDValue() when no quarter matches
// so other lowering strategies can be tried.
// NOTE(review): the function's signature line was dropped by the doxygen
// extraction.
8371 ArrayRef<int> ShuffleMask,
8372 SelectionDAG &DAG) {
8373 // Attempt to lower the vector shuffle using as many whole register movs as
8374 // possible. This is useful for types smaller than 32bits, which would
8375 // often otherwise become a series for grp movs.
8376 SDLoc dl(Op);
8377 EVT VT = Op.getValueType();
8378 if (VT.getScalarSizeInBits() >= 32)
8379 return SDValue();
8380
8381 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8382 "Unexpected vector type");
8383 int NumElts = VT.getVectorNumElements();
8384 int QuarterSize = NumElts / 4;
8385 // The four final parts of the vector, as i32's
8386 SDValue Parts[4];
8387
8388 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8389 // <u,u,u,u>), returning the vmov lane index
8390 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8391 // Detect which mov lane this would be from the first non-undef element.
8392 int MovIdx = -1;
8393 for (int i = 0; i < Length; i++) {
8394 if (ShuffleMask[Start + i] >= 0) {
8395 if (ShuffleMask[Start + i] % Length != i)
8396 return -1;
8397 MovIdx = ShuffleMask[Start + i] / Length;
8398 break;
8399 }
8400 }
8401 // If all items are undef, leave this for other combines
8402 if (MovIdx == -1)
8403 return -1;
8404 // Check the remaining values are the correct part of the same mov
8405 for (int i = 1; i < Length; i++) {
8406 if (ShuffleMask[Start + i] >= 0 &&
8407 (ShuffleMask[Start + i] / Length != MovIdx ||
8408 ShuffleMask[Start + i] % Length != i))
8409 return -1;
8410 }
8411 return MovIdx;
8412 };
8413
8414 for (int Part = 0; Part < 4; ++Part) {
8415 // Does this part look like a mov
8416 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8417 if (Elt != -1) {
8418 SDValue Input = Op->getOperand(0);
// 32-bit lane indices 4..7 refer to the second shuffle operand.
8419 if (Elt >= 4) {
8420 Input = Op->getOperand(1);
8421 Elt -= 4;
8422 }
// Extract the whole 32-bit lane through a v4f32 view of the input.
8423 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8424 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8425 DAG.getConstant(Elt, dl, MVT::i32));
8426 }
8427 }
8428
8429 // Nothing interesting found, just return
8430 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8431 return SDValue();
8432
8433 // The other parts need to be built with the old shuffle vector, cast to a
8434 // v4i32 and extract_vector_elts
8435 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8436 SmallVector<int, 16> NewShuffleMask;
// Quarters already handled become undef in the reduced mask.
8437 for (int Part = 0; Part < 4; ++Part)
8438 for (int i = 0; i < QuarterSize; i++)
8439 NewShuffleMask.push_back(
8440 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8441 SDValue NewShuffle = DAG.getVectorShuffle(
8442 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8443 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8444
8445 for (int Part = 0; Part < 4; ++Part)
8446 if (!Parts[Part])
8447 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8448 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8449 }
8450 // Build a vector out of the various parts and bitcast it back to the original
8451 // type.
8452 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8453 return DAG.getBitcast(VT, NewVec);
8454 }
8455
// Lower a shuffle whose mask is an identity from a single source except for
// exactly one out-of-place element (a "one-off identity") as an
// EXTRACT_VECTOR_ELT / INSERT_VECTOR_ELT pair instead of a full shuffle.
// Returns SDValue() when the mask does not match.
// NOTE(review): the function's signature line was dropped by the doxygen
// extraction.
8457 ArrayRef<int> ShuffleMask,
8458 SelectionDAG &DAG) {
8459 SDValue V1 = Op.getOperand(0);
8460 SDValue V2 = Op.getOperand(1);
8461 EVT VT = Op.getValueType();
8462 unsigned NumElts = VT.getVectorNumElements();
8463
8464 // An One-Off Identity mask is one that is mostly an identity mask from as
8465 // single source but contains a single element out-of-place, either from a
8466 // different vector or from another position in the same vector. As opposed to
8467 // lowering this via a ARMISD::BUILD_VECTOR we can generate an extract/insert
8468 // pair directly.
8469 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8470 int &OffElement) {
8471 OffElement = -1;
8472 int NonUndef = 0;
8473 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8474 if (Mask[i] == -1)
8475 continue;
8476 NonUndef++;
8477 if (Mask[i] != i + BaseOffset) {
// At most one element may be out of place.
8478 if (OffElement == -1)
8479 OffElement = i;
8480 else
8481 return false;
8482 }
8483 }
// Require more than two defined lanes so the transform is worthwhile, and
// require that one lane actually is out of place.
8484 return NonUndef > 2 && OffElement != -1;
8485 };
8486 int OffElement;
8487 SDValue VInput;
// Check identity relative to V1 (base offset 0) then V2 (base offset NumElts).
8488 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8489 VInput = V1;
8490 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8491 VInput = V2;
8492 else
8493 return SDValue();
8494
8495 SDLoc dl(Op);
// i8/i16 lanes are moved through an i32 scalar; wider/float lanes keep their
// own scalar type.
8496 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8497 ? MVT::i32
8498 : VT.getScalarType();
8499 SDValue Elt = DAG.getNode(
8500 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8501 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8502 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8503 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8504 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8505 }
8506
// Main ISD::VECTOR_SHUFFLE lowering for NEON and MVE. Strategies are tried in
// order: i1 predicate shuffles; splats (VDUP / VDUPLANE); VEXT; VREV64/32/16;
// single-source VEXT; NEON two-result shuffles (also recognised through a
// CONCAT_VECTORS operand); MVE VMOVN patterns; MVE one-off-identity and
// truncating (MVETRUNC) patterns; the perfect-shuffle table for 4-element
// vectors; BUILD_VECTOR expansion for >=32-bit elements; reverse-mask and
// byte-wise (v8i8 / whole-register-mov) fallbacks. Returns SDValue() when
// nothing matches.
// NOTE(review): the doxygen extraction dropped the signature line, the cast
// producing SVN, one condition line of the BUILD_VECTOR splat check, and the
// declaration of Ops used for the >=32-bit expansion.
8508 const ARMSubtarget *ST) {
8509 SDValue V1 = Op.getOperand(0);
8510 SDValue V2 = Op.getOperand(1);
8511 SDLoc dl(Op);
8512 EVT VT = Op.getValueType();
8514 unsigned EltSize = VT.getScalarSizeInBits();
8515
8516 if (ST->hasMVEIntegerOps() && EltSize == 1)
8517 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8518
8519 // Convert shuffles that are directly supported on NEON to target-specific
8520 // DAG nodes, instead of keeping them as shuffles and matching them again
8521 // during code selection. This is more efficient and avoids the possibility
8522 // of inconsistencies between legalization and selection.
8523 // FIXME: floating-point vectors should be canonicalized to integer vectors
8524 // of the same time so that they get CSEd properly.
8525 ArrayRef<int> ShuffleMask = SVN->getMask();
8526
8527 if (EltSize <= 32) {
8528 if (SVN->isSplat()) {
8529 int Lane = SVN->getSplatIndex();
8530 // If this is undef splat, generate it via "just" vdup, if possible.
8531 if (Lane == -1) Lane = 0;
8532
8533 // Test if V1 is a SCALAR_TO_VECTOR.
8534 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8535 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8536 }
8537 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8538 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8539 // reaches it).
8540 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8542 bool IsScalarToVector = true;
// All operands past the first must be undef for the VDUP to be valid.
8543 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8544 if (!V1.getOperand(i).isUndef()) {
8545 IsScalarToVector = false;
8546 break;
8547 }
8548 if (IsScalarToVector)
8549 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8550 }
8551 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8552 DAG.getConstant(Lane, dl, MVT::i32));
8553 }
8554
8555 bool ReverseVEXT = false;
8556 unsigned Imm = 0;
8557 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8558 if (ReverseVEXT)
8559 std::swap(V1, V2);
8560 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8561 DAG.getConstant(Imm, dl, MVT::i32));
8562 }
8563
8564 if (isVREVMask(ShuffleMask, VT, 64))
8565 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8566 if (isVREVMask(ShuffleMask, VT, 32))
8567 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8568 if (isVREVMask(ShuffleMask, VT, 16))
8569 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8570
8571 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8572 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8573 DAG.getConstant(Imm, dl, MVT::i32));
8574 }
8575
8576 // Check for Neon shuffles that modify both input vectors in place.
8577 // If both results are used, i.e., if there are two shuffles with the same
8578 // source operands and with masks corresponding to both results of one of
8579 // these operations, DAG memoization will ensure that a single node is
8580 // used for both shuffles.
8581 unsigned WhichResult = 0;
8582 bool isV_UNDEF = false;
8583 if (ST->hasNEON()) {
8584 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8585 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8586 if (isV_UNDEF)
8587 V2 = V1;
8588 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8589 .getValue(WhichResult);
8590 }
8591 }
8592 if (ST->hasMVEIntegerOps()) {
// The three VMOVN forms: bottom-into-top, top-into-bottom, and the
// single-source top variant, distinguished by the operand order and the
// immediate (0 = bottom, 1 = top).
8593 if (isVMOVNMask(ShuffleMask, VT, false, false))
8594 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8595 DAG.getConstant(0, dl, MVT::i32));
8596 if (isVMOVNMask(ShuffleMask, VT, true, false))
8597 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8598 DAG.getConstant(1, dl, MVT::i32));
8599 if (isVMOVNMask(ShuffleMask, VT, true, true))
8600 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8601 DAG.getConstant(1, dl, MVT::i32));
8602 }
8603
8604 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8605 // shuffles that produce a result larger than their operands with:
8606 // shuffle(concat(v1, undef), concat(v2, undef))
8607 // ->
8608 // shuffle(concat(v1, v2), undef)
8609 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8610 //
8611 // This is useful in the general case, but there are special cases where
8612 // native shuffles produce larger results: the two-result ops.
8613 //
8614 // Look through the concat when lowering them:
8615 // shuffle(concat(v1, v2), undef)
8616 // ->
8617 // concat(VZIP(v1, v2):0, :1)
8618 //
8619 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8620 SDValue SubV1 = V1->getOperand(0);
8621 SDValue SubV2 = V1->getOperand(1);
8622 EVT SubVT = SubV1.getValueType();
8623
8624 // We expect these to have been canonicalized to -1.
8625 assert(llvm::all_of(ShuffleMask, [&](int i) {
8626 return i < (int)VT.getVectorNumElements();
8627 }) && "Unexpected shuffle index into UNDEF operand!");
8628
8629 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8630 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8631 if (isV_UNDEF)
8632 SubV2 = SubV1;
8633 assert((WhichResult == 0) &&
8634 "In-place shuffle of concat can only have one result!");
8635 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8636 SubV1, SubV2);
8637 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8638 Res.getValue(1));
8639 }
8640 }
8641 }
8642
8643 if (ST->hasMVEIntegerOps() && EltSize <= 32) {
8644 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8645 return V;
8646
// Try the four trunc-mask variants (bottom/top x two-source/single-source);
// a "top" mask first shifts the wide lanes right by EltSize.
8647 for (bool Top : {false, true}) {
8648 for (bool SingleSource : {false, true}) {
8649 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8650 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8651 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8652 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8653 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8654 SingleSource ? V1 : V2);
8655 if (Top) {
8656 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8657 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8658 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8659 }
8660 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8661 }
8662 }
8663 }
8664 }
8665
8666 // If the shuffle is not directly supported and it has 4 elements, use
8667 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8668 unsigned NumElts = VT.getVectorNumElements();
8669 if (NumElts == 4) {
8670 unsigned PFIndexes[4];
// Undef mask entries are encoded as 8 in the perfect-shuffle table index.
8671 for (unsigned i = 0; i != 4; ++i) {
8672 if (ShuffleMask[i] < 0)
8673 PFIndexes[i] = 8;
8674 else
8675 PFIndexes[i] = ShuffleMask[i];
8676 }
8677
8678 // Compute the index in the perfect shuffle table.
8679 unsigned PFTableIndex =
8680 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8681 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8682 unsigned Cost = (PFEntry >> 30);
8683
8684 if (Cost <= 4) {
8685 if (ST->hasNEON())
8686 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
// MVE can only use the table entry if the op and both of its recursive
// sub-operations are legal MVE shuffles.
8687 else if (isLegalMVEShuffleOp(PFEntry)) {
8688 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8689 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8690 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8691 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8692 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8693 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8694 }
8695 }
8696 }
8697
8698 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8699 if (EltSize >= 32) {
8700 // Do the expansion with floating-point types, since that is what the VFP
8701 // registers are defined to use, and since i64 is not legal.
8702 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8703 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8704 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8705 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
8707 for (unsigned i = 0; i < NumElts; ++i) {
8708 if (ShuffleMask[i] < 0)
8709 Ops.push_back(DAG.getUNDEF(EltVT));
8710 else
8711 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8712 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8713 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8714 dl, MVT::i32)));
8715 }
8716 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8717 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8718 }
8719
8720 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8721 isReverseMask(ShuffleMask, VT))
8722 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
8723
8724 if (ST->hasNEON() && VT == MVT::v8i8)
8725 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8726 return NewOp;
8727
8728 if (ST->hasMVEIntegerOps())
8729 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8730 return NewOp;
8731
8732 return SDValue();
8733 }
8734
// Lower INSERT_VECTOR_ELT into an MVE i1 predicate vector: move the predicate
// into an i32 GPR with PREDICATE_CAST, sign-extend the inserted i1 value, and
// merge its bits into the lane's position with a BFI, then cast back to the
// predicate type.
// NOTE(review): the signature line and the line computing LaneWidth were
// dropped by the doxygen extraction.
8736 const ARMSubtarget *ST) {
8737 EVT VecVT = Op.getOperand(0).getValueType();
8738 SDLoc dl(Op);
8739
8740 assert(ST->hasMVEIntegerOps() &&
8741 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8742
8743 SDValue Conv =
8744 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8745 unsigned Lane = Op.getConstantOperandVal(2);
8746 unsigned LaneWidth =
// Mask selecting the destination lane's bits inside the predicate register.
8748 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
// Sign-extend so a true value becomes all-ones across the lane's bits.
8749 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8750 Op.getOperand(1), DAG.getValueType(MVT::i1));
8751 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8752 DAG.getConstant(~Mask, dl, MVT::i32));
8753 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8754 }
8755
// Lower INSERT_VECTOR_ELT. Only constant lane indices are handled (otherwise
// SDValue() is returned so the generic expansion runs). MVE i1 predicate
// inserts are delegated to LowerINSERT_VECTOR_ELT_i1; element types whose
// type action would promote them as floats (e.g. f16) are rerouted through
// equivalent integer types so no promotion to f32 happens.
8756SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8757 SelectionDAG &DAG) const {
8758 // INSERT_VECTOR_ELT is legal only for immediate indexes.
8759 SDValue Lane = Op.getOperand(2);
8760 if (!isa<ConstantSDNode>(Lane))
8761 return SDValue();
8762
8763 SDValue Elt = Op.getOperand(1);
8764 EVT EltVT = Elt.getValueType();
8765
8766 if (Subtarget->hasMVEIntegerOps() &&
8767 Op.getValueType().getScalarSizeInBits() == 1)
8768 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
8769
// NOTE(review): the right-hand side of this comparison (the type-action enum
// value, presumably TypePromoteFloat) was dropped by the doxygen extraction.
8770 if (getTypeAction(*DAG.getContext(), EltVT) ==
8772 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
8773 // but the type system will try to do that if we don't intervene.
8774 // Reinterpret any such vector-element insertion as one with the
8775 // corresponding integer types.
8776
8777 SDLoc dl(Op);
8778
8779 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8780 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8782
8783 SDValue VecIn = Op.getOperand(0);
8784 EVT VecVT = VecIn.getValueType();
8785 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8786 VecVT.getVectorNumElements());
8787
// Bitcast element and vector to integer, insert, and cast back.
8788 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
8789 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
8790 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
8791 IVecIn, IElt, Lane);
8792 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
8793 }
8794
8795 return Op;
8796 }
8797
// Lower EXTRACT_VECTOR_ELT from an MVE i1 predicate vector: cast the
// predicate into an i32 GPR and shift the requested lane's bits down to the
// bottom of the register.
// NOTE(review): the assert message below names LowerINSERT_VECTOR_ELT_i1 --
// it appears copy-pasted from the insert lowering; consider fixing the text.
// NOTE(review): the signature line and the line computing LaneWidth were
// dropped by the doxygen extraction.
8799 const ARMSubtarget *ST) {
8800 EVT VecVT = Op.getOperand(0).getValueType();
8801 SDLoc dl(Op);
8802
8803 assert(ST->hasMVEIntegerOps() &&
8804 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8805
8806 SDValue Conv =
8807 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8808 unsigned Lane = Op.getConstantOperandVal(1);
8809 unsigned LaneWidth =
8811 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
8812 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
8813 return Shift;
8814 }
8815
// Lower EXTRACT_VECTOR_ELT. Only constant lane indices are handled. MVE i1
// predicate extracts are delegated to LowerEXTRACT_VECTOR_ELT_i1; an i32
// extract of a narrower lane becomes a VGETLANEu (zero-extending lane move);
// everything else is left as-is.
// NOTE(review): the function's signature line was dropped by the doxygen
// extraction.
8817 const ARMSubtarget *ST) {
8818 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
8819 SDValue Lane = Op.getOperand(1);
8820 if (!isa<ConstantSDNode>(Lane))
8821 return SDValue();
8822
8823 SDValue Vec = Op.getOperand(0);
8824 EVT VT = Vec.getValueType();
8825
8826 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8827 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
8828
8829 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
8830 SDLoc dl(Op);
8831 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
8832 }
8833
8834 return Op;
8835 }
8836
// Lower CONCAT_VECTORS of MVE i1 predicate vectors. Operand pairs are
// concatenated recursively (the operand count is asserted to be a power of
// two): each pair is promoted to integer vectors, combined either with an
// MVETRUNC (v4i1/v8i1 inputs) or by element-wise extract/insert (v2i1
// inputs, which are held as i64 lanes), and converted back to a predicate
// with VCMPZ against zero.
// NOTE(review): the signature line and the line computing ElType were dropped
// by the doxygen extraction.
8838 const ARMSubtarget *ST) {
8839 SDLoc dl(Op);
8840 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
8841 "Unexpected custom CONCAT_VECTORS lowering");
8842 assert(isPowerOf2_32(Op.getNumOperands()) &&
8843 "Unexpected custom CONCAT_VECTORS lowering");
8844 assert(ST->hasMVEIntegerOps() &&
8845 "CONCAT_VECTORS lowering only supported for MVE");
8846
8847 auto ConcatPair = [&](SDValue V1, SDValue V2) {
8848 EVT Op1VT = V1.getValueType();
8849 EVT Op2VT = V2.getValueType();
8850 assert(Op1VT == Op2VT && "Operand types don't match!");
8851 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
8852 "Unexpected i1 concat operations!");
8853 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
8854
8855 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8856 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
8857
8858 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
8859 // promoted to v8i16, etc.
8860 MVT ElType =
8862 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
8863
8864 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
8865 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
8866 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
8867 // ConcatVT.
8868 SDValue ConVec =
8869 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
8870 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8871 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8872 }
8873
8874 // Extract the vector elements from Op1 and Op2 one by one and truncate them
8875 // to be the right size for the destination. For example, if Op1 is v4i1
8876 // then the promoted vector is v4i32. The result of concatenation gives a
8877 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
8878 // needs truncating to i16 and inserting in the result.
8879 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
8880 EVT NewVT = NewV.getValueType();
8881 EVT ConcatVT = ConVec.getValueType();
8882 unsigned ExtScale = 1;
// A v2i1 promotes to v2f64; view it as v4i32 and step by two i32 lanes
// per logical element.
8883 if (NewVT == MVT::v2f64) {
8884 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
8885 ExtScale = 2;
8886 }
8887 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
8888 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
8889 DAG.getIntPtrConstant(i * ExtScale, dl));
8890 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
8891 DAG.getConstant(j, dl, MVT::i32));
8892 }
8893 return ConVec;
8894 };
8895 unsigned j = 0;
8896 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
8897 ConVec = ExtractInto(NewV1, ConVec, j);
8898 ConVec = ExtractInto(NewV2, ConVec, j);
8899
8900 // Now return the result of comparing the subvector with zero, which will
8901 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8902 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8903 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8904 };
8905
8906 // Concat each pair of subvectors and pack into the lower half of the array.
8907 SmallVector<SDValue> ConcatOps(Op->ops());
8908 while (ConcatOps.size() > 1) {
8909 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
8910 SDValue V1 = ConcatOps[I];
8911 SDValue V2 = ConcatOps[I + 1];
8912 ConcatOps[I / 2] = ConcatPair(V1, V2);
8913 }
8914 ConcatOps.resize(ConcatOps.size() / 2);
8915 }
8916 return ConcatOps[0];
8917 }
8918
// Lower CONCAT_VECTORS. MVE i1 predicate results go through
// LowerCONCAT_VECTORS_i1; otherwise the (asserted) two-64-bit-into-128-bit
// case is built by inserting both halves, viewed as f64, into a v2f64 that is
// bitcast back to the result type. Undef halves are skipped.
// NOTE(review): the function's signature line was dropped by the doxygen
// extraction.
8920 const ARMSubtarget *ST) {
8921 EVT VT = Op->getValueType(0);
8922 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8923 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
8924
8925 // The only time a CONCAT_VECTORS operation can have legal types is when
8926 // two 64-bit vectors are concatenated to a 128-bit vector.
8927 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
8928 "unexpected CONCAT_VECTORS");
8929 SDLoc dl(Op);
8930 SDValue Val = DAG.getUNDEF(MVT::v2f64);
8931 SDValue Op0 = Op.getOperand(0);
8932 SDValue Op1 = Op.getOperand(1);
8933 if (!Op0.isUndef())
8934 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8935 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
8936 DAG.getIntPtrConstant(0, dl));
8937 if (!Op1.isUndef())
8938 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8939 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
8940 DAG.getIntPtrConstant(1, dl));
8941 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
8942 }
8943
// Lower EXTRACT_SUBVECTOR of an MVE i1 predicate vector (asserted). The
// source predicate is promoted to an integer vector, the requested elements
// are copied out one by one into a smaller integer vector, and the result is
// converted back to a predicate with VCMPZ. A v2i1 result takes a special
// path: each element is written into two adjacent v4i32 lanes and the v4i1
// compare result is PREDICATE_CAST down to v2i1.
// NOTE(review): the signature line and the line computing ElType were dropped
// by the doxygen extraction.
8945 const ARMSubtarget *ST) {
8946 SDValue V1 = Op.getOperand(0);
8947 SDValue V2 = Op.getOperand(1);
8948 SDLoc dl(Op);
8949 EVT VT = Op.getValueType();
8950 EVT Op1VT = V1.getValueType();
8951 unsigned NumElts = VT.getVectorNumElements();
// Operand 1 of EXTRACT_SUBVECTOR is the constant start index.
8952 unsigned Index = V2->getAsZExtVal();
8953
8954 assert(VT.getScalarSizeInBits() == 1 &&
8955 "Unexpected custom EXTRACT_SUBVECTOR lowering");
8956 assert(ST->hasMVEIntegerOps() &&
8957 "EXTRACT_SUBVECTOR lowering only supported for MVE");
8958
8959 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8960
8961 // We now have Op1 promoted to a vector of integers, where v8i1 gets
8962 // promoted to v8i16, etc.
8963
8965
8966 if (NumElts == 2) {
8967 EVT SubVT = MVT::v4i32;
8968 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
// Duplicate each extracted element into two adjacent i32 lanes so the two
// halves of each i64 lane agree.
8969 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
8970 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
8971 DAG.getIntPtrConstant(i, dl));
8972 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
8973 DAG.getConstant(j, dl, MVT::i32));
8974 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
8975 DAG.getConstant(j + 1, dl, MVT::i32));
8976 }
8977 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
8978 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8979 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8980 }
8981
8982 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
8983 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
8984 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
8985 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
8986 DAG.getIntPtrConstant(i, dl));
8987 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
8988 DAG.getConstant(j, dl, MVT::i32));
8989 }
8990
8991 // Now return the result of comparing the subvector with zero,
8992 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8993 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
8994 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8995 }
8996
8997// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
// The low bit of each source lane decides the predicate bit: mask with 1 and
// SETCC-not-equal-zero yields the i1 vector.
// NOTE(review): the function's signature line was dropped by the doxygen
// extraction.
8999 const ARMSubtarget *ST) {
9000 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
9001 EVT VT = N->getValueType(0);
9002 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9003 "Expected a vector i1 type!");
9004 SDValue Op = N->getOperand(0);
9005 EVT FromVT = Op.getValueType();
9006 SDLoc DL(N);
9007
9008 SDValue And =
9009 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9010 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9011 DAG.getCondCode(ISD::SETNE));
9013
// Lower ISD::TRUNCATE for MVE. Truncates to i1 vectors go through
// LowerTruncatei1; v8i32->v8i16 and v16i16->v16i8 truncates are split into
// two halves and wrapped in an ARMISD::MVETRUNC node (see the long comment
// below for the rationale); anything else is left to generic lowering.
// NOTE(review): the function's signature line was dropped by the doxygen
// extraction.
9015 const ARMSubtarget *Subtarget) {
9016 if (!Subtarget->hasMVEIntegerOps())
9017 return SDValue();
9018
9019 EVT ToVT = N->getValueType(0);
9020 if (ToVT.getScalarType() == MVT::i1)
9021 return LowerTruncatei1(N, DAG, Subtarget);
9022
9023 // MVE does not have a single instruction to perform the truncation of a v4i32
9024 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9025 // Most of the instructions in MVE follow the 'Beats' system, where moving
9026 // values from different lanes is usually something that the instructions
9027 // avoid.
9028 //
9029 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9030 // which take a the top/bottom half of a larger lane and extend it (or do the
9031 // opposite, truncating into the top/bottom lane from a larger lane). Note
9032 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9033 // bottom 16bits from each vector lane. This works really well with T/B
9034 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9035 // to move order.
9036 //
9037 // But truncates and sext/zext are always going to be fairly common from llvm.
9038 // We have several options for how to deal with them:
9039 // - Wherever possible combine them into an instruction that makes them
9040 // "free". This includes loads/stores, which can perform the trunc as part
9041 // of the memory operation. Or certain shuffles that can be turned into
9042 // VMOVN/VMOVL.
9043 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9044 // trunc(mul(sext(a), sext(b))) may become
9045 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9046 // this case can use VMULL). This is performed in the
9047 // MVELaneInterleavingPass.
9048 // - Otherwise we have an option. By default we would expand the
9049 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9050 // registers. One for each vector lane in the vector. This can obviously be
9051 // very expensive.
9052 // - The other option is to use the fact that loads/store can extend/truncate
9053 // to turn a trunc into two truncating stack stores and a stack reload. This
9054 // becomes 3 back-to-back memory operations, but at least that is less than
9055 // all the insert/extracts.
9056 //
9057 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9058 // are either optimized where they can be, or eventually lowered into stack
9059 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9060 // two early, where other instructions would be better, and stops us from
9061 // having to reconstruct multiple buildvector shuffles into loads/stores.
9062 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9063 return SDValue();
9064 EVT FromVT = N->getOperand(0).getValueType();
9065 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9066 return SDValue();
9067
9068 SDValue Lo, Hi;
9069 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9070 SDLoc DL(N);
9071 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9072 }
9073
// Lower a vector sign/zero extend for MVE by emitting a two-result MVE extend
// node (see LowerTruncate above for the MVEEXT/MVETRUNC rationale) whose two
// halves are concatenated back together. The v16i8 -> v16i32 case extends in
// two steps: first to v8i16 halves, then each half to v8i32.
// NOTE(review): the function's signature line and the line initializing
// Opcode (presumably selecting ARMISD::MVESEXT vs MVEZEXT from N's opcode)
// were dropped by the doxygen extraction -- confirm upstream.
9075 const ARMSubtarget *Subtarget) {
9076 if (!Subtarget->hasMVEIntegerOps())
9077 return SDValue();
9078
9079 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9080
9081 EVT ToVT = N->getValueType(0);
9082 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9083 return SDValue();
9084 SDValue Op = N->getOperand(0);
9085 EVT FromVT = Op.getValueType();
9086 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9087 return SDValue();
9088
9089 SDLoc DL(N);
9090 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
9091 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9092 ExtVT = MVT::v8i16;
9093
9094 unsigned Opcode =
9096 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9097 SDValue Ext1 = Ext.getValue(1);
9098
// Second extension step for i8 -> i32: widen each v8i16 half to v8i32 using
// the original extend opcode.
9099 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9100 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9101 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9102 }
9103
9104 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9105 }
9106
9107/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9108/// element has been zero/sign-extended, depending on the isSigned parameter,
9109/// from an integer type half its size.
// NOTE(review): the signature line, the four dyn_cast lines producing
// Lo0/Hi0/Lo1/Hi1, and the per-element ConstantSDNode cast were dropped by
// the doxygen extraction -- confirm against the upstream source.
9111 bool isSigned) {
9112 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9113 EVT VT = N->getValueType(0);
9114 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9115 SDNode *BVN = N->getOperand(0).getNode();
9116 if (BVN->getValueType(0) != MVT::v4i32 ||
9117 BVN->getOpcode() != ISD::BUILD_VECTOR)
9118 return false;
// Endianness decides which v4i32 lane is the low half of each i64.
9119 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9120 unsigned HiElt = 1 - LoElt;
9125 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9126 return false;
9127 if (isSigned) {
// Hi half must be the sign-extension of the Lo half for both elements.
9128 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9129 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9130 return true;
9131 } else {
9132 if (Hi0->isZero() && Hi1->isZero())
9133 return true;
9134 }
9135 return false;
9136 }
9137
9138 if (N->getOpcode() != ISD::BUILD_VECTOR)
9139 return false;
9140
// Every constant element must fit in half the element width.
9141 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9142 SDNode *Elt = N->getOperand(i).getNode();
9144 unsigned EltSize = VT.getScalarSizeInBits();
9145 unsigned HalfSize = EltSize / 2;
9146 if (isSigned) {
9147 if (!isIntN(HalfSize, C->getSExtValue()))
9148 return false;
9149 } else {
9150 if (!isUIntN(HalfSize, C->getZExtValue()))
9151 return false;
9152 }
9153 continue;
9154 }
9155 return false;
9156 }
9157
9158 return true;
9159 }
9160
/// isSignExtended - Check if a node is a vector value that is sign-extended
/// or a constant BUILD_VECTOR with sign-extended elements.
  // SIGN_EXTEND nodes and sign-extending loads are trivially sign-extended.
  if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
    return true;
  // Otherwise accept constant BUILD_VECTORs whose elements fit in half the
  // element width when treated as signed.
  if (isExtendedBUILD_VECTOR(N, DAG, true))
    return true;
  return false;
}
9170
/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
  // ZERO_EXTEND/ANY_EXTEND nodes and zero-extending loads qualify directly.
  if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
    return true;
  // Otherwise accept constant BUILD_VECTORs whose elements fit in half the
  // element width when treated as unsigned.
  if (isExtendedBUILD_VECTOR(N, DAG, false))
    return true;
  return false;
}
9181
9182static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9183 if (OrigVT.getSizeInBits() >= 64)
9184 return OrigVT;
9185
9186 assert(OrigVT.isSimple() && "Expecting a simple value type");
9187
9188 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9189 switch (OrigSimpleTy) {
9190 default: llvm_unreachable("Unexpected Vector Type");
9191 case MVT::v2i8:
9192 case MVT::v2i16:
9193 return MVT::v2i32;
9194 case MVT::v4i8:
9195 return MVT::v4i16;
9196 }
9197}
9198
/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
/// We insert the required extension here to get the vector to fill a D register.
                                            const EVT &OrigTy,
                                            const EVT &ExtTy,
                                            unsigned ExtOpcode) {
  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
  // 64-bits we need to insert a new extension so that it will be 64-bits.
  assert(ExtTy.is128BitVector() && "Unexpected extension size");
  if (OrigTy.getSizeInBits() >= 64)
    return N;

  // Must extend size to at least 64 bits to be used as an operand for VMULL.
  EVT NewVT = getExtensionTo64Bits(OrigTy);

  // Re-extend (sign or zero, per ExtOpcode) just far enough to fill a D reg.
  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}
9218
/// SkipLoadExtensionForVMULL - return a load of the original vector size that
/// does not do any sign/zero extension. If the original vector is less
/// than 64 bits, an appropriate extension will be added after the load to
/// reach a total size of 64 bits. We have to add the extension separately
/// because ARM does not have a sign/zero extending load for vectors.
  EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());

  // The load already has the right type.
  if (ExtendedTy == LD->getMemoryVT())
    return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
                       LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
                       LD->getMemOperand()->getFlags());

  // We need to create a zextload/sextload. We cannot just create a load
  // followed by a zext/zext node because LowerMUL is also run during normal
  // operation legalization where we can't create illegal types.
  // Preserve the original extension kind (sext vs zext) of the source load.
  return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
                        LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
                        LD->getMemoryVT(), LD->getAlign(),
                        LD->getMemOperand()->getFlags());
}
9241
/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
/// the unextended value. The unextended vector should be 64 bits so that it can
/// be used as an operand to a VMULL instruction. If the original vector size
/// before extension is less than 64 bits we add a an extension to resize
/// the vector to 64 bits.
  if (N->getOpcode() == ISD::SIGN_EXTEND ||
      N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
                                        N->getOperand(0)->getValueType(0),
                                        N->getValueType(0),
                                        N->getOpcode());

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
           "Expected extending load");

    SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
    // Re-point chain users of the old extending load at the new plain load.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
    unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    // Other value users of the old load still expect the extended width, so
    // give them an explicit extend of the narrow load.
    SDValue extLoad =
        DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);

    return newLoad;
  }

  // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
  // have been legalized as a BITCAST from v4i32.
  if (N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
    unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    // Keep only the low (significant) half of each v2i64 lane.
    return DAG.getBuildVector(
        MVT::v2i32, SDLoc(N),
        {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
  }
  // Construct a new BUILD_VECTOR with elements truncated to half the size.
  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  EVT VT = N->getValueType(0);
  unsigned EltSize = VT.getScalarSizeInBits() / 2;
  unsigned NumElts = VT.getVectorNumElements();
  MVT TruncVT = MVT::getIntegerVT(EltSize);
  SDLoc dl(N);
  for (unsigned i = 0; i != NumElts; ++i) {
    const APInt &CInt = N->getConstantOperandAPInt(i);
    // Element types smaller than 32 bits are not legal, so use i32 elements.
    // The values are implicitly truncated so sext vs. zext doesn't matter.
    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}
9297
9298static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9299 unsigned Opcode = N->getOpcode();
9300 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9301 SDNode *N0 = N->getOperand(0).getNode();
9302 SDNode *N1 = N->getOperand(1).getNode();
9303 return N0->hasOneUse() && N1->hasOneUse() &&
9304 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9305 }
9306 return false;
9307}
9308
9309static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9310 unsigned Opcode = N->getOpcode();
9311 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9312 SDNode *N0 = N->getOperand(0).getNode();
9313 SDNode *N1 = N->getOperand(1).getNode();
9314 return N0->hasOneUse() && N1->hasOneUse() &&
9315 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9316 }
9317 return false;
9318}
9319
  // Lower ISD::MUL on 128-bit integer vectors, selecting ARMISD::VMULLs/VMULLu
  // (and the distributed multiply-accumulate form) when both operands come
  // from half-width extensions.
  // Multiplications are only custom-lowered for 128-bit vectors so that
  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
  EVT VT = Op.getValueType();
  assert(VT.is128BitVector() && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");
  SDNode *N0 = Op.getOperand(0).getNode();
  SDNode *N1 = Op.getOperand(1).getNode();
  unsigned NewOpc = 0;
  bool isMLA = false;
  bool isN0SExt = isSignExtended(N0, DAG);
  bool isN1SExt = isSignExtended(N1, DAG);
  if (isN0SExt && isN1SExt)
    NewOpc = ARMISD::VMULLs;
  else {
    bool isN0ZExt = isZeroExtended(N0, DAG);
    bool isN1ZExt = isZeroExtended(N1, DAG);
    if (isN0ZExt && isN1ZExt)
      NewOpc = ARMISD::VMULLu;
    else if (isN1SExt || isN1ZExt) {
      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
      if (isN1SExt && isAddSubSExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLs;
        isMLA = true;
      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
        // Canonicalize so the add/sub ends up in N0.
        std::swap(N0, N1);
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      }
    }

    if (!NewOpc) {
      if (VT == MVT::v2i64)
        // Fall through to expand this. It is not legal.
        return SDValue();
      else
        // Other vector multiplications are legal.
        return Op;
    }
  }

  // Legalize to a VMULL instruction.
  SDLoc DL(Op);
  SDValue Op0;
  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
  if (!isMLA) {
    Op0 = SkipExtensionForVMULL(N0, DAG);
           Op1.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  }

  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
  // isel lowering to take advantage of no-stall back to back vmul + vmla.
  // vmull q0, d4, d6
  // vmlal q0, d5, d6
  // is faster than
  // vaddl q0, d4, d5
  // vmovl q1, d6
  // vmul q0, q0, q1
  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
  EVT Op1VT = Op1.getValueType();
  // Distribute the multiply over N0's add/sub:
  //   (add/sub N00, N01) * Op1 -> add/sub (VMULL N00, Op1), (VMULL N01, Op1)
  return DAG.getNode(N0->getOpcode(), DL, VT,
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
9394
                              SelectionDAG &DAG) {
  // Approximate a v4i16 signed divide whose inputs are in i8 range (the
  // caller, LowerSDIV, sign-extends v8i8 halves into these lanes) using a
  // NEON float reciprocal estimate; no Newton refinement step is needed.
  // TODO: Should this propagate fast-math-flags?

  // Convert to float
  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
  // Get reciprocal estimate.
  // float4 recip = vrecpeq_f32(yf);
  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                  DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                  Y);
  // Because char has a smaller range than uchar, we can actually get away
  // without any newton steps. This requires that we use a weird bias
  // of 0xb000, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
  // The bias is added to the float's bit pattern, nudging it upward so the
  // subsequent truncating convert yields the exact quotient.
  Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
  // Convert back to short.
  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
  return X;
}
9425
                               SelectionDAG &DAG) {
  // Approximate a v4i16 signed divide with a NEON float reciprocal estimate
  // plus one Newton-Raphson refinement step (vrecps).
  // TODO: Should this propagate fast-math-flags?

  SDValue N2;
  // Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and one refinement step.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   N1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   N1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Because short has a smaller range than ushort, we can actually get away
  // with only a single newton step. This requires that we use a weird bias
  // of 89, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  // Bias the raw bit pattern upward before converting, to compensate for the
  // estimate being slightly low.
  N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_s32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}
9464
                         const ARMSubtarget *ST) {
  // Custom-lower ISD::SDIV for v4i16/v8i8: NEON has no integer divide, so the
  // division is approximated with float reciprocal helper routines.
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::SDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    // Widen to v8i16, split into low/high v4i16 halves, divide each half
    // with the i8-range helper, then concatenate and narrow back to v8i8.
    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);

    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG, ST);

    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
    return N0;
  }
  return LowerSDIV_v4i16(N0, N1, dl, DAG);
}
9500
                         const ARMSubtarget *ST) {
  // Custom-lower ISD::UDIV for v4i16/v8i8 using float reciprocal sequences;
  // NEON has no integer divide.
  // TODO: Should this propagate fast-math-flags?
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::UDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    // Widen to v8i16 and split into halves. Zero-extended u8 values fit in
    // the non-negative part of the signed i16 range, so the signed v4i16
    // helper computes the correct quotient.
    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);

    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG, ST);

    // Narrow back to v8i8 with the signed-to-unsigned saturating narrow.
    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
                                     MVT::i32),
                     N0);
    return N0;
  }

  // v4i16 udiv ... Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
  // Zero-extended u16 values are non-negative in i32, so SINT_TO_FP is exact.
  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and two refinement steps.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   BN1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Simply multiplying by the reciprocal estimate can leave us a few ulps
  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
  // and that it will never cause us to return an answer too large).
  // float4 result = as_float4(as_int4(xf*recip) + 2);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(2, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_u32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}
9577
  // Lower ISD::UADDO_CARRY / ISD::USUBO_CARRY to ARMISD::ADDE / ARMISD::SUBE,
  // translating between boolean carries and the flag representation.
  SDNode *N = Op.getNode();
  EVT VT = N->getValueType(0);
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  SDValue Carry = Op.getOperand(2);

  SDLoc DL(Op);

  SDValue Result;
  if (Op.getOpcode() == ISD::UADDO_CARRY) {
    // This converts the boolean value carry into the carry flag.
    Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

    // Do the addition proper using the carry flag we wanted.
    Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
                         Op.getOperand(1), Carry);

    // Now convert the carry flag into a boolean value.
    Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
  } else {
    // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
    // have to invert the carry first.
    Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                        DAG.getConstant(1, DL, MVT::i32), Carry);
    // This converts the boolean value carry into the carry flag.
    Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

    // Do the subtraction proper using the carry flag we wanted.
    Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
                         Op.getOperand(1), Carry);

    // Now convert the carry flag into a boolean value.
    Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
    // But the carry returned by ARMISD::SUBE is not a borrow as expected
    // by ISD::USUBO_CARRY, so compute 1 - C.
    Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                        DAG.getConstant(1, DL, MVT::i32), Carry);
  }

  // Return both values.
  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
}
9621
SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
                                                  bool Signed,
                                                  SDValue &Chain) const {
  // Emit a call to the Windows divrem runtime helper for an i32/i64 division.
  // Chain carries the incoming chain (typically the divide-by-zero check);
  // the call's first result (the quotient) is returned.
  EVT VT = Op.getValueType();
  assert((VT == MVT::i32 || VT == MVT::i64) &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  const auto &DL = DAG.getDataLayout();
  RTLIB::Libcall LC;
  if (Signed)
    LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
  else
    LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;

  RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
  SDValue ES = DAG.getExternalSymbol(LCImpl, getPointerTy(DL));


  // Operands are pushed in the order {1, 0}: divisor first, then dividend.
  for (auto AI : {1, 0}) {
    SDValue Operand = Op.getOperand(AI);
    Args.emplace_back(Operand,
                      Operand.getValueType().getTypeForEVT(*DAG.getContext()));
  }

  CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setCallee(
      VT.getTypeForEVT(*DAG.getContext()), ES, std::move(Args));

  return LowerCallTo(CLI).first;
}
9655
9656// This is a code size optimisation: return the original SDIV node to
9657// DAGCombiner when we don't want to expand SDIV into a sequence of
9658// instructions, and an empty node otherwise which will cause the
9659// SDIV to be expanded in DAGCombine.
9660SDValue
9661ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9662 SelectionDAG &DAG,
9663 SmallVectorImpl<SDNode *> &Created) const {
9664 // TODO: Support SREM
9665 if (N->getOpcode() != ISD::SDIV)
9666 return SDValue();
9667
9668 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
9669 const bool MinSize = ST.hasMinSize();
9670 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9671 : ST.hasDivideInARMMode();
9672
9673 // Don't touch vector types; rewriting this may lead to scalarizing
9674 // the int divs.
9675 if (N->getOperand(0).getValueType().isVector())
9676 return SDValue();
9677
9678 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
9679 // hwdiv support for this to be really profitable.
9680 if (!(MinSize && HasDivide))
9681 return SDValue();
9682
9683 // ARM mode is a bit simpler than Thumb: we can handle large power
9684 // of 2 immediates with 1 mov instruction; no further checks required,
9685 // just return the sdiv node.
9686 if (!ST.isThumb())
9687 return SDValue(N, 0);
9688
9689 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9690 // and thus lose the code size benefits of a MOVS that requires only 2.
9691 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9692 // but as it's doing exactly this, it's not worth the trouble to get TTI.
9693 if (Divisor.sgt(128))
9694 return SDValue();
9695
9696 return SDValue(N, 0);
9697}
9698
9699SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9700 bool Signed) const {
9701 assert(Op.getValueType() == MVT::i32 &&
9702 "unexpected type for custom lowering DIV");
9703 SDLoc dl(Op);
9704
9705 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9706 DAG.getEntryNode(), Op.getOperand(1));
9707
9708 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9709}
9710
  // Build a WIN__DBZCHK node that checks the denominator of the division
  // node N (operand 1) for zero; i64 denominators are checked by OR-ing
  // their two i32 halves.
  SDLoc DL(N);
  SDValue Op = N->getOperand(1);
  if (N->getValueType(0) == MVT::i32)
    return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
  // An i64 value is zero iff (lo | hi) is zero.
  return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
                     DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
}
9721
void ARMTargetLowering::ExpandDIV_Windows(
    SDValue Op, SelectionDAG &DAG, bool Signed,
  // Expand an i64 division for Windows targets: emit the divide-by-zero
  // check, call the divrem helper, and rebuild the i64 result from the two
  // i32 halves.
  const auto &DL = DAG.getDataLayout();

  assert(Op.getValueType() == MVT::i64 &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());

  SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);

  // Split the i64 quotient into lo/hi i32 halves and repack as BUILD_PAIR.
  SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
  SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
                              DAG.getConstant(32, dl, getPointerTy(DL)));
  Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
}
9742
std::pair<SDValue, SDValue>
ARMTargetLowering::LowerAEABIUnalignedLoad(SDValue Op,
                                           SelectionDAG &DAG) const {
  // If we have an unaligned load from a i32 or i64 that would normally be
  // split into separate ldrb's, we can use the __aeabi_uread4/__aeabi_uread8
  // functions instead.
  // Returns {value, chain}, or {SDValue(), SDValue()} when no custom
  // lowering applies.
  LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
  EVT MemVT = LD->getMemoryVT();
  if (MemVT != MVT::i32 && MemVT != MVT::i64)
    return std::make_pair(SDValue(), SDValue());

  const auto &MF = DAG.getMachineFunction();
  unsigned AS = LD->getAddressSpace();
  Align Alignment = LD->getAlign();
  const DataLayout &DL = DAG.getDataLayout();
  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();

  // The libcall route is only a win when optimising for size and the target
  // cannot do the unaligned access natively.
  if (MF.getFunction().hasMinSize() && !AllowsUnaligned &&
      Alignment <= llvm::Align(2)) {

    RTLIB::Libcall LC =
        (MemVT == MVT::i32) ? RTLIB::AEABI_UREAD4 : RTLIB::AEABI_UREAD8;

    MakeLibCallOptions Opts;
    SDLoc dl(Op);

    auto Pair = makeLibCall(DAG, LC, MemVT.getSimpleVT(), LD->getBasePtr(),
                            Opts, dl, LD->getChain());

    // If necessary, extend the node to 64bit
    if (LD->getExtensionType() != ISD::NON_EXTLOAD) {
      unsigned ExtType = LD->getExtensionType() == ISD::SEXTLOAD
      SDValue EN = DAG.getNode(ExtType, dl, LD->getValueType(0), Pair.first);
      Pair.first = EN;
    }
    return Pair;
  }

  // Default expand to individual loads
  if (!allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Alignment))
    return expandUnalignedLoad(LD, DAG);
  return std::make_pair(SDValue(), SDValue());
}
9788
9789SDValue ARMTargetLowering::LowerAEABIUnalignedStore(SDValue Op,
9790 SelectionDAG &DAG) const {
9791 // If we have an unaligned store to a i32 or i64 that would normally be
9792 // split into separate ldrb's, we can use the __aeabi_uwrite4/__aeabi_uwrite8
9793 // functions instead.
9794 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9795 EVT MemVT = ST->getMemoryVT();
9796 if (MemVT != MVT::i32 && MemVT != MVT::i64)
9797 return SDValue();
9798
9799 const auto &MF = DAG.getMachineFunction();
9800 unsigned AS = ST->getAddressSpace();
9801 Align Alignment = ST->getAlign();
9802 const DataLayout &DL = DAG.getDataLayout();
9803 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
9804
9805 if (MF.getFunction().hasMinSize() && !AllowsUnaligned &&
9806 Alignment <= llvm::Align(2)) {
9807
9808 SDLoc dl(Op);
9809
9810 // If necessary, trunc the value to 32bit
9811 SDValue StoreVal = ST->getOperand(1);
9812 if (ST->isTruncatingStore())
9813 StoreVal = DAG.getNode(ISD::TRUNCATE, dl, MemVT, ST->getOperand(1));
9814
9815 RTLIB::Libcall LC =
9816 (MemVT == MVT::i32) ? RTLIB::AEABI_UWRITE4 : RTLIB::AEABI_UWRITE8;
9817
9818 MakeLibCallOptions Opts;
9819 auto CallResult =
9820 makeLibCall(DAG, LC, MVT::isVoid, {StoreVal, ST->getBasePtr()}, Opts,
9821 dl, ST->getChain());
9822
9823 return CallResult.second;
9824 }
9825
9826 // Default expand to individual stores
9827 if (!allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Alignment))
9828 return expandUnalignedStore(ST, DAG);
9829 return SDValue();
9830}
9831
  // Lower a load of an MVE predicate vector (v2i1/v4i1/v8i1/v16i1).
  LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
  EVT MemVT = LD->getMemoryVT();
  assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
          MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == Op.getValueType());
  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
         "Expected a non-extending load");
  assert(LD->isUnindexed() && "Expected a unindexed load");

  // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit
  // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
  // need to make sure that 8/4/2 bits are actually loaded into the correct
  // place, which means loading the value and then shuffling the values into
  // the bottom bits of the predicate.
  // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
  // for BE).
  // Speaking of BE, apparently the rest of llvm will assume a reverse order to
  // a natural VMSR(load), so needs to be reversed.

  SDLoc dl(Op);
  // Load the predicate bits as a plain extending i32 load.
  SDValue Load = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
      LD->getMemOperand());
  SDValue Val = Load;
  // On big-endian targets, bit-reverse and shift the loaded bits down so
  // they land in the expected order.
  if (DAG.getDataLayout().isBigEndian())
    Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
                      DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
                      DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
  // Move the scalar bits into a full v16i1 predicate, then extract the
  // sub-predicate for the narrower types.
  SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
  if (MemVT != MVT::v16i1)
    Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
                       DAG.getConstant(0, dl, MVT::i32));
  return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
}
9869
void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                  SelectionDAG &DAG) const {
  // Custom load lowering: volatile, sufficiently-aligned i64 loads become a
  // single LDRD; other i32/i64 loads may use the AEABI unaligned helpers.
  // Results stays empty when no custom lowering applies.
  LoadSDNode *LD = cast<LoadSDNode>(N);
  EVT MemVT = LD->getMemoryVT();

  if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
      !Subtarget->isThumb1Only() && LD->isVolatile() &&
      LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
    assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
    SDLoc dl(N);
        ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
        {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
    // Pick which LDRD result is the low/high half based on endianness.
    SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
    SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
    SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
    Results.append({Pair, Result.getValue(2)});
  } else if (MemVT == MVT::i32 || MemVT == MVT::i64) {
    // Only push results if the AEABI helper path actually applied.
    auto Pair = LowerAEABIUnalignedLoad(SDValue(N, 0), DAG);
    if (Pair.first) {
      Results.push_back(Pair.first);
      Results.push_back(Pair.second);
    }
  }
}
9895
  // Lower a store of an MVE predicate vector (v2i1/v4i1/v8i1/v16i1).
  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  EVT MemVT = ST->getMemoryVT();
  assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
          MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == ST->getValue().getValueType());
  assert(!ST->isTruncatingStore() && "Expected a non-extending store");
  assert(ST->isUnindexed() && "Expected a unindexed store");

  // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with
  // top bits unset and a scalar store.
  SDLoc dl(Op);
  SDValue Build = ST->getValue();
  if (MemVT != MVT::v16i1) {
    // Spread the narrow predicate's lanes into a v16i1 (reversed on BE) with
    // the remaining lanes undefined.
    for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
      unsigned Elt = DAG.getDataLayout().isBigEndian()
                         ? MemVT.getVectorNumElements() - I - 1
                         : I;
      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
                                DAG.getConstant(Elt, dl, MVT::i32)));
    }
    for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
      Ops.push_back(DAG.getUNDEF(MVT::i32));
    Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
  }
  // Move the predicate into a scalar register, then store only the
  // MemVT-sized low part via a truncating store.
  SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
  if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
    GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
                      DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
                      DAG.getConstant(16, dl, MVT::i32));
  return DAG.getTruncStore(
      ST->getChain(), dl, GRP, ST->getBasePtr(),
      ST->getMemOperand());
}
9933
9934SDValue ARMTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG,
9935 const ARMSubtarget *Subtarget) const {
9936 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9937 EVT MemVT = ST->getMemoryVT();
9938
9939 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9940 !Subtarget->isThumb1Only() && ST->isVolatile() &&
9941 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
9942 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
9943 SDNode *N = Op.getNode();
9944 SDLoc dl(N);
9945
9946 SDValue Lo = DAG.getNode(
9947 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9948 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
9949 MVT::i32));
9950 SDValue Hi = DAG.getNode(
9951 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9952 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
9953 MVT::i32));
9954
9955 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
9956 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
9957 MemVT, ST->getMemOperand());
9958 } else if (Subtarget->hasMVEIntegerOps() &&
9959 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9960 MemVT == MVT::v16i1))) {
9961 return LowerPredicateStore(Op, DAG);
9962 } else if (MemVT == MVT::i32 || MemVT == MVT::i64) {
9963 return LowerAEABIUnalignedStore(Op, DAG);
9964 }
9965 return SDValue();
9966}
9967
9968static bool isZeroVector(SDValue N) {
9969 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
9970 (N->getOpcode() == ARMISD::VMOVIMM &&
9971 isNullConstant(N->getOperand(0))));
9972}
9973
  // Lower a masked load for MVE. MVE masked loads use zero as the passthru
  // value, so a non-zero passthru is handled by loading with a zero passthru
  // and then selecting between the load and the original passthru.
  MVT VT = Op.getSimpleValueType();
  SDValue Mask = N->getMask();
  SDValue PassThru = N->getPassThru();
  SDLoc dl(Op);

  // Already zero passthru: the node matches MVE semantics as-is.
  if (isZeroVector(PassThru))
    return Op;

  // MVE Masked loads use zero as the passthru value. Here we convert undef to
  // zero too, and other values are lowered to a select.
  SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(0, dl, MVT::i32));
  SDValue NewLoad = DAG.getMaskedLoad(
      VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
      N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
      N->getExtensionType(), N->isExpandingLoad());
  SDValue Combo = NewLoad;
  // A passthru that is a (bit/reg-)cast of a zero vector is still zero; skip
  // the select in that case too.
  bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
                             PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
                            isZeroVector(PassThru->getOperand(0));
  if (!PassThru.isUndef() && !PassThruIsCastZero)
    Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
  return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
}
10000
10002 const ARMSubtarget *ST) {
10003 if (!ST->hasMVEIntegerOps())
10004 return SDValue();
10005
10006 SDLoc dl(Op);
10007 unsigned BaseOpcode = 0;
10008 switch (Op->getOpcode()) {
10009 default: llvm_unreachable("Expected VECREDUCE opcode");
10010 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
10011 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
10012 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
10013 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
10014 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
10015 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
10016 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
10017 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
10018 }
10019
10020 SDValue Op0 = Op->getOperand(0);
10021 EVT VT = Op0.getValueType();
10022 EVT EltVT = VT.getVectorElementType();
10023 unsigned NumElts = VT.getVectorNumElements();
10024 unsigned NumActiveLanes = NumElts;
10025
10026 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10027 NumActiveLanes == 2) &&
10028 "Only expected a power 2 vector size");
10029
10030 // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
10031 // allows us to easily extract vector elements from the lanes.
10032 while (NumActiveLanes > 4) {
10033 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
10034 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
10035 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
10036 NumActiveLanes /= 2;
10037 }
10038
10039 SDValue Res;
10040 if (NumActiveLanes == 4) {
10041 // The remaining 4 elements are summed sequentially
10042 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10043 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
10044 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10045 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
10046 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10047 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
10048 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10049 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
10050 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10051 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
10052 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
10053 } else {
10054 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10055 DAG.getConstant(0, dl, MVT::i32));
10056 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10057 DAG.getConstant(1, dl, MVT::i32));
10058 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10059 }
10060
10061 // Result type may be wider than element type.
10062 if (EltVT != Op->getValueType(0))
10063 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
10064 return Res;
10065}
10066
10068 const ARMSubtarget *ST) {
10069 if (!ST->hasMVEFloatOps())
10070 return SDValue();
10071 return LowerVecReduce(Op, DAG, ST);
10072}
10073
10075 const ARMSubtarget *ST) {
10076 if (!ST->hasNEON())
10077 return SDValue();
10078
10079 SDLoc dl(Op);
10080 SDValue Op0 = Op->getOperand(0);
10081 EVT VT = Op0.getValueType();
10082 EVT EltVT = VT.getVectorElementType();
10083
10084 unsigned PairwiseIntrinsic = 0;
10085 switch (Op->getOpcode()) {
10086 default:
10087 llvm_unreachable("Expected VECREDUCE opcode");
10089 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10090 break;
10092 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10093 break;
10095 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10096 break;
10098 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10099 break;
10100 }
10101 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10102
10103 unsigned NumElts = VT.getVectorNumElements();
10104 unsigned NumActiveLanes = NumElts;
10105
10106 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10107 NumActiveLanes == 2) &&
10108 "Only expected a power 2 vector size");
10109
10110 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10111 if (VT.is128BitVector()) {
10112 SDValue Lo, Hi;
10113 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10114 VT = Lo.getValueType();
10115 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10116 NumActiveLanes /= 2;
10117 }
10118
10119 // Use pairwise reductions until one lane remains
10120 while (NumActiveLanes > 1) {
10121 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10122 NumActiveLanes /= 2;
10123 }
10124
10125 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10126 DAG.getConstant(0, dl, MVT::i32));
10127
10128 // Result type may be wider than element type.
10129 if (EltVT != Op.getValueType()) {
10130 unsigned Extend = 0;
10131 switch (Op->getOpcode()) {
10132 default:
10133 llvm_unreachable("Expected VECREDUCE opcode");
10136 Extend = ISD::ZERO_EXTEND;
10137 break;
10140 Extend = ISD::SIGN_EXTEND;
10141 break;
10142 }
10143 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10144 }
10145 return Res;
10146}
10147
10149 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10150 // Acquire/Release load/store is not legal for targets without a dmb or
10151 // equivalent available.
10152 return SDValue();
10153
10154 // Monotonic load/store is legal for all targets.
10155 return Op;
10156}
10157
10160 SelectionDAG &DAG,
10161 const ARMSubtarget *Subtarget) {
10162 SDLoc DL(N);
10163 // Under Power Management extensions, the cycle-count is:
10164 // mrc p15, #0, <Rt>, c9, c13, #0
10165 SDValue Ops[] = { N->getOperand(0), // Chain
10166 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10167 DAG.getTargetConstant(15, DL, MVT::i32),
10168 DAG.getTargetConstant(0, DL, MVT::i32),
10169 DAG.getTargetConstant(9, DL, MVT::i32),
10170 DAG.getTargetConstant(13, DL, MVT::i32),
10171 DAG.getTargetConstant(0, DL, MVT::i32)
10172 };
10173
10174 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10175 DAG.getVTList(MVT::i32, MVT::Other), Ops);
10176 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10177 DAG.getConstant(0, DL, MVT::i32)));
10178 Results.push_back(Cycles32.getValue(1));
10179}
10180
10182 SDValue V1) {
10183 SDLoc dl(V0.getNode());
10184 SDValue RegClass =
10185 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10186 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10187 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10188 const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
10189 return SDValue(
10190 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10191}
10192
10194 SDLoc dl(V.getNode());
10195 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10196 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10197 if (isBigEndian)
10198 std::swap(VLo, VHi);
10199 return createGPRPairNode2xi32(DAG, VLo, VHi);
10200}
10201
10204 SelectionDAG &DAG) {
10205 assert(N->getValueType(0) == MVT::i64 &&
10206 "AtomicCmpSwap on types less than 64 should be legal");
10207 SDValue Ops[] = {
10208 createGPRPairNode2xi32(DAG, N->getOperand(1),
10209 DAG.getUNDEF(MVT::i32)), // pointer, temp
10210 createGPRPairNodei64(DAG, N->getOperand(2)), // expected
10211 createGPRPairNodei64(DAG, N->getOperand(3)), // new
10212 N->getOperand(0), // chain in
10213 };
10214 SDNode *CmpSwap = DAG.getMachineNode(
10215 ARM::CMP_SWAP_64, SDLoc(N),
10216 DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);
10217
10218 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10219 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10220
10221 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10222
10223 SDValue Lo =
10224 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10225 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10226 SDValue Hi =
10227 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10228 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10229 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10230 Results.push_back(SDValue(CmpSwap, 2));
10231}
10232
// Lower STRICT_FSETCC / STRICT_FSETCCS. Operand 0 is the chain, operands 1-2
// are the FP values being compared and operand 3 is the condition code. The
// result is merged with the (possibly updated) chain.
SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  SDValue Chain = Op.getOperand(0);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
  // STRICT_FSETCCS raises an exception on quiet NaNs (signaling compare).
  bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;

  // If we don't have instructions of this float type then soften to a libcall
  // and use SETCC instead.
  if (isUnsupportedFloatingType(LHS.getValueType())) {
    // softenSetCCOperands rewrites LHS/RHS (and possibly CC and Chain) to the
    // libcall result; it may leave RHS null when the call already produced a
    // boolean, in which case compare against zero with SETNE.
    softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
                        Chain, IsSignaling);
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
    SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
                                 DAG.getCondCode(CC));
    return DAG.getMergeValues({Result, Chain}, dl);
  }

  // Map the IR condition onto up to two ARM condition codes; some FP
  // conditions (e.g. ONE) need a pair of predicated selects.
  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // Materialize the boolean with conditional moves over the VFP compare:
  // start from 0 and select 1 under CondCode (and CondCode2 if present).
  SDValue True = DAG.getConstant(1, dl, VT);
  SDValue False = DAG.getConstant(0, dl, VT);
  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
  SDValue Result = getCMOV(dl, VT, False, True, ARMcc, Cmp, DAG);
  if (CondCode2 != ARMCC::AL) {
    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
    Result = getCMOV(dl, VT, Result, True, ARMcc, Cmp, DAG);
  }
  return DAG.getMergeValues({Result, Chain}, dl);
}
10270
10271SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10272 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10273
10274 EVT VT = getPointerTy(DAG.getDataLayout());
10275 int FI = MFI.CreateFixedObject(4, 0, false);
10276 return DAG.getFrameIndex(FI, VT);
10277}
10278
10279SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
10280 SelectionDAG &DAG) const {
10281 SDLoc DL(Op);
10282 MakeLibCallOptions CallOptions;
10283 MVT SVT = Op.getOperand(0).getSimpleValueType();
10284 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
10285 SDValue Res =
10286 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
10287 return DAG.getBitcast(MVT::i32, Res);
10288}
10289
10290SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10291 SDLoc dl(Op);
10292 SDValue LHS = Op.getOperand(0);
10293 SDValue RHS = Op.getOperand(1);
10294
10295 // Determine if this is signed or unsigned comparison
10296 bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10297
10298 // Special case for Thumb1 UCMP only
10299 if (!IsSigned && Subtarget->isThumb1Only()) {
10300 // For Thumb unsigned comparison, use this sequence:
10301 // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10302 // sbc r2, r2 ; r2 = r2 - r2 - !carry
10303 // cmp r1, r0 ; compare RHS with LHS
10304 // sbc r1, r1 ; r1 = r1 - r1 - !carry
10305 // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
10306
10307 // First subtraction: LHS - RHS
10308 SDValue Sub1WithFlags = DAG.getNode(
10309 ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10310 SDValue Sub1Result = Sub1WithFlags.getValue(0);
10311 SDValue Flags1 = Sub1WithFlags.getValue(1);
10312
10313 // SUBE: Sub1Result - Sub1Result - !carry
10314 // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10315 SDValue Sbc1 =
10316 DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10317 Sub1Result, Sub1Result, Flags1);
10318 SDValue Sbc1Result = Sbc1.getValue(0);
10319
10320 // Second comparison: RHS vs LHS (reverse comparison)
10321 SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10322
10323 // SUBE: RHS - RHS - !carry
10324 // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned)
10325 SDValue Sbc2 = DAG.getNode(
10326 ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10327 SDValue Sbc2Result = Sbc2.getValue(0);
10328
10329 // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
10330 SDValue Result =
10331 DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
10332 if (Op.getValueType() != MVT::i32)
10333 Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10334
10335 return Result;
10336 }
10337
10338 // For the ARM assembly pattern:
10339 // subs r0, r0, r1 ; subtract RHS from LHS and set flags
10340 // movgt r0, #1 ; if LHS > RHS, set result to 1 (GT for signed, HI for
10341 // unsigned) mvnlt r0, #0 ; if LHS < RHS, set result to -1 (LT for
10342 // signed, LO for unsigned)
10343 // ; if LHS == RHS, result remains 0 from the subs
10344
10345 // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
10346 unsigned Opcode = ARMISD::SUBC;
10347
10348 // Check if RHS is a subtraction against 0: (0 - X)
10349 if (RHS.getOpcode() == ISD::SUB) {
10350 SDValue SubLHS = RHS.getOperand(0);
10351 SDValue SubRHS = RHS.getOperand(1);
10352
10353 // Check if it's 0 - X
10354 if (isNullConstant(SubLHS)) {
10355 bool CanUseAdd = false;
10356 if (IsSigned) {
10357 // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
10358 if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10360 .isMinSignedValue()) {
10361 CanUseAdd = true;
10362 }
10363 } else {
10364 // For UCMP: only if X is known to never be zero
10365 if (DAG.isKnownNeverZero(SubRHS)) {
10366 CanUseAdd = true;
10367 }
10368 }
10369
10370 if (CanUseAdd) {
10371 Opcode = ARMISD::ADDC;
10372 RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10373 // LHS - (0 - X)
10374 }
10375 }
10376 }
10377
10378 // Generate the operation with flags
10379 SDValue OpWithFlags =
10380 DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10381
10382 SDValue OpResult = OpWithFlags.getValue(0);
10383 SDValue Flags = OpWithFlags.getValue(1);
10384
10385 // Constants for conditional moves
10386 SDValue One = DAG.getConstant(1, dl, MVT::i32);
10387 SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10388
10389 // Select condition codes based on signed vs unsigned
10390 ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10391 ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10392
10393 // First conditional move: if greater than, set to 1
10394 SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10395 SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10396 GTCondValue, Flags);
10397
10398 // Second conditional move: if less than, set to -1
10399 SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10400 SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10401 LTCondValue, Flags);
10402
10403 if (Op.getValueType() != MVT::i32)
10404 Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10405
10406 return Result2;
10407}
10408
10410 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10411 switch (Op.getOpcode()) {
10412 default: llvm_unreachable("Don't know how to custom lower this!");
10413 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10414 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10415 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10416 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10417 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10418 case ISD::SELECT: return LowerSELECT(Op, DAG);
10419 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10420 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10421 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10422 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10423 case ISD::VASTART: return LowerVASTART(Op, DAG);
10424 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10425 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10426 case ISD::SINT_TO_FP:
10427 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10430 case ISD::FP_TO_SINT:
10431 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10433 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10434 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10435 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10436 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10437 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10438 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10439 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10440 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10441 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10442 Subtarget);
10443 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10444 case ISD::SHL:
10445 case ISD::SRL:
10446 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10447 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10448 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10449 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10450 case ISD::SRL_PARTS:
10451 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10452 case ISD::CTTZ:
10453 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10454 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10455 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10456 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10457 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10458 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10459 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10460 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10461 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10462 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10463 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10464 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10465 case ISD::SIGN_EXTEND:
10466 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10467 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10468 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10469 case ISD::SET_FPMODE:
10470 return LowerSET_FPMODE(Op, DAG);
10471 case ISD::RESET_FPMODE:
10472 return LowerRESET_FPMODE(Op, DAG);
10473 case ISD::MUL: return LowerMUL(Op, DAG);
10474 case ISD::SDIV:
10475 if (getTargetMachine().getTargetTriple().isOSWindows() &&
10476 !Op.getValueType().isVector())
10477 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10478 return LowerSDIV(Op, DAG, Subtarget);
10479 case ISD::UDIV:
10480 if (getTargetMachine().getTargetTriple().isOSWindows() &&
10481 !Op.getValueType().isVector())
10482 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10483 return LowerUDIV(Op, DAG, Subtarget);
10484 case ISD::UADDO_CARRY:
10485 case ISD::USUBO_CARRY:
10486 return LowerUADDSUBO_CARRY(Op, DAG);
10487 case ISD::UADDO:
10488 case ISD::USUBO:
10489 case ISD::UMULO:
10490 case ISD::SADDO:
10491 case ISD::SSUBO:
10492 case ISD::SMULO:
10493 return LowerALUO(Op, DAG);
10494 case ISD::SADDSAT:
10495 case ISD::SSUBSAT:
10496 case ISD::UADDSAT:
10497 case ISD::USUBSAT:
10498 return LowerADDSUBSAT(Op, DAG, Subtarget);
10499 case ISD::LOAD: {
10500 auto *LD = cast<LoadSDNode>(Op);
10501 EVT MemVT = LD->getMemoryVT();
10502 if (Subtarget->hasMVEIntegerOps() &&
10503 (MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10504 MemVT == MVT::v16i1))
10505 return LowerPredicateLoad(Op, DAG);
10506
10507 auto Pair = LowerAEABIUnalignedLoad(Op, DAG);
10508 if (Pair.first)
10509 return DAG.getMergeValues({Pair.first, Pair.second}, SDLoc(Pair.first));
10510 return SDValue();
10511 }
10512 case ISD::STORE:
10513 return LowerSTORE(Op, DAG, Subtarget);
10514 case ISD::MLOAD:
10515 return LowerMLOAD(Op, DAG);
10516 case ISD::VECREDUCE_MUL:
10517 case ISD::VECREDUCE_AND:
10518 case ISD::VECREDUCE_OR:
10519 case ISD::VECREDUCE_XOR:
10520 return LowerVecReduce(Op, DAG, Subtarget);
10525 return LowerVecReduceF(Op, DAG, Subtarget);
10530 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10531 case ISD::ATOMIC_LOAD:
10532 case ISD::ATOMIC_STORE:
10533 return LowerAtomicLoadStore(Op, DAG);
10534 case ISD::SDIVREM:
10535 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10537 if (getTargetMachine().getTargetTriple().isOSWindows())
10538 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10539 llvm_unreachable("Don't know how to custom lower this!");
10541 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10543 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10544 case ISD::STRICT_FSETCC:
10545 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10546 case ISD::SPONENTRY:
10547 return LowerSPONENTRY(Op, DAG);
10548 case ISD::FP_TO_BF16:
10549 return LowerFP_TO_BF16(Op, DAG);
10550 case ARMISD::WIN__DBZCHK: return SDValue();
10551 case ISD::UCMP:
10552 case ISD::SCMP:
10553 return LowerCMP(Op, DAG);
10554 case ISD::ABS:
10555 return LowerABS(Op, DAG);
10556 case ISD::STRICT_LROUND:
10558 case ISD::STRICT_LRINT:
10559 case ISD::STRICT_LLRINT: {
10560 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
10561 Op.getOperand(1).getValueType() == MVT::bf16) &&
10562 "Expected custom lowering of rounding operations only for f16");
10563 SDLoc DL(Op);
10564 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
10565 {Op.getOperand(0), Op.getOperand(1)});
10566 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
10567 {Ext.getValue(1), Ext.getValue(0)});
10568 }
10569 }
10570}
10571
10573 SelectionDAG &DAG) {
10574 unsigned IntNo = N->getConstantOperandVal(0);
10575 unsigned Opc = 0;
10576 if (IntNo == Intrinsic::arm_smlald)
10577 Opc = ARMISD::SMLALD;
10578 else if (IntNo == Intrinsic::arm_smlaldx)
10579 Opc = ARMISD::SMLALDX;
10580 else if (IntNo == Intrinsic::arm_smlsld)
10581 Opc = ARMISD::SMLSLD;
10582 else if (IntNo == Intrinsic::arm_smlsldx)
10583 Opc = ARMISD::SMLSLDX;
10584 else
10585 return;
10586
10587 SDLoc dl(N);
10588 SDValue Lo, Hi;
10589 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10590
10591 SDValue LongMul = DAG.getNode(Opc, dl,
10592 DAG.getVTList(MVT::i32, MVT::i32),
10593 N->getOperand(1), N->getOperand(2),
10594 Lo, Hi);
10595 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10596 LongMul.getValue(0), LongMul.getValue(1)));
10597}
10598
10599/// ReplaceNodeResults - Replace the results of node with an illegal result
10600/// type with new values built out of custom code.
10603 SelectionDAG &DAG) const {
10604 SDValue Res;
10605 switch (N->getOpcode()) {
10606 default:
10607 llvm_unreachable("Don't know how to custom expand this!");
10608 case ISD::READ_REGISTER:
10610 break;
10611 case ISD::BITCAST:
10612 Res = ExpandBITCAST(N, DAG, Subtarget);
10613 break;
10614 case ISD::SRL:
10615 case ISD::SRA:
10616 case ISD::SHL:
10617 Res = Expand64BitShift(N, DAG, Subtarget);
10618 break;
10619 case ISD::SREM:
10620 case ISD::UREM:
10621 Res = LowerREM(N, DAG);
10622 break;
10623 case ISD::SDIVREM:
10624 case ISD::UDIVREM:
10625 Res = LowerDivRem(SDValue(N, 0), DAG);
10626 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10627 Results.push_back(Res.getValue(0));
10628 Results.push_back(Res.getValue(1));
10629 return;
10630 case ISD::SADDSAT:
10631 case ISD::SSUBSAT:
10632 case ISD::UADDSAT:
10633 case ISD::USUBSAT:
10634 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10635 break;
10637 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10638 return;
10639 case ISD::UDIV:
10640 case ISD::SDIV:
10641 assert(getTargetMachine().getTargetTriple().isOSWindows() &&
10642 "can only expand DIV on Windows");
10643 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10644 Results);
10647 return;
10649 return ReplaceLongIntrinsic(N, Results, DAG);
10650 case ISD::LOAD:
10651 LowerLOAD(N, Results, DAG);
10652 break;
10653 case ISD::STORE:
10654 Res = LowerAEABIUnalignedStore(SDValue(N, 0), DAG);
10655 break;
10656 case ISD::TRUNCATE:
10657 Res = LowerTruncate(N, DAG, Subtarget);
10658 break;
10659 case ISD::SIGN_EXTEND:
10660 case ISD::ZERO_EXTEND:
10661 Res = LowerVectorExtend(N, DAG, Subtarget);
10662 break;
10665 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10666 break;
10667 }
10668 if (Res.getNode())
10669 Results.push_back(Res);
10670}
10671
10672//===----------------------------------------------------------------------===//
10673// ARM Scheduler Hooks
10674//===----------------------------------------------------------------------===//
10675
10676/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10677/// registers the function context.
10678void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10680 MachineBasicBlock *DispatchBB,
10681 int FI) const {
10682 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10683 "ROPI/RWPI not currently supported with SjLj");
10684 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10685 DebugLoc dl = MI.getDebugLoc();
10686 MachineFunction *MF = MBB->getParent();
10687 MachineRegisterInfo *MRI = &MF->getRegInfo();
10690 const Function &F = MF->getFunction();
10691
10692 bool isThumb = Subtarget->isThumb();
10693 bool isThumb2 = Subtarget->isThumb2();
10694
10695 unsigned PCLabelId = AFI->createPICLabelUId();
10696 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10698 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10699 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10700
10701 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10702 : &ARM::GPRRegClass;
10703
10704 // Grab constant pool and fixed stack memory operands.
10705 MachineMemOperand *CPMMO =
10708
10709 MachineMemOperand *FIMMOSt =
10712
10713 // Load the address of the dispatch MBB into the jump buffer.
10714 if (isThumb2) {
10715 // Incoming value: jbuf
10716 // ldr.n r5, LCPI1_1
10717 // orr r5, r5, #1
10718 // add r5, pc
10719 // str r5, [$jbuf, #+4] ; &jbuf[1]
10720 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10721 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10723 .addMemOperand(CPMMO)
10725 // Set the low bit because of thumb mode.
10726 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10727 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10728 .addReg(NewVReg1, RegState::Kill)
10729 .addImm(0x01)
10731 .add(condCodeOp());
10732 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10733 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10734 .addReg(NewVReg2, RegState::Kill)
10735 .addImm(PCLabelId);
10736 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10737 .addReg(NewVReg3, RegState::Kill)
10738 .addFrameIndex(FI)
10739 .addImm(36) // &jbuf[1] :: pc
10740 .addMemOperand(FIMMOSt)
10742 } else if (isThumb) {
10743 // Incoming value: jbuf
10744 // ldr.n r1, LCPI1_4
10745 // add r1, pc
10746 // mov r2, #1
10747 // orrs r1, r2
10748 // add r2, $jbuf, #+4 ; &jbuf[1]
10749 // str r1, [r2]
10750 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10751 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10753 .addMemOperand(CPMMO)
10755 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10756 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10757 .addReg(NewVReg1, RegState::Kill)
10758 .addImm(PCLabelId);
10759 // Set the low bit because of thumb mode.
10760 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10761 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10762 .addReg(ARM::CPSR, RegState::Define)
10763 .addImm(1)
10765 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10766 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10767 .addReg(ARM::CPSR, RegState::Define)
10768 .addReg(NewVReg2, RegState::Kill)
10769 .addReg(NewVReg3, RegState::Kill)
10771 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10772 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10773 .addFrameIndex(FI)
10774 .addImm(36); // &jbuf[1] :: pc
10775 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10776 .addReg(NewVReg4, RegState::Kill)
10777 .addReg(NewVReg5, RegState::Kill)
10778 .addImm(0)
10779 .addMemOperand(FIMMOSt)
10781 } else {
10782 // Incoming value: jbuf
10783 // ldr r1, LCPI1_1
10784 // add r1, pc, r1
10785 // str r1, [$jbuf, #+4] ; &jbuf[1]
10786 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10787 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10789 .addImm(0)
10790 .addMemOperand(CPMMO)
10792 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10793 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10794 .addReg(NewVReg1, RegState::Kill)
10795 .addImm(PCLabelId)
10797 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10798 .addReg(NewVReg2, RegState::Kill)
10799 .addFrameIndex(FI)
10800 .addImm(36) // &jbuf[1] :: pc
10801 .addMemOperand(FIMMOSt)
10803 }
10804}
10805
// Emit the SjLj exception-handling dispatch block: build an inline jump table
// over all landing pads, emit ISA-specific (Thumb2 / Thumb1 / ARM) code that
// loads the call-site index from the function context, bounds-checks it
// against the number of landing pads (trapping when out of range), and
// branches through the table; finally rewire every invoke block to the new
// dispatch block, which becomes the function's only landing pad.
10806void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10807                                              MachineBasicBlock *MBB) const {
10808  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10809  DebugLoc dl = MI.getDebugLoc();
10810  MachineFunction *MF = MBB->getParent();
10811  MachineRegisterInfo *MRI = &MF->getRegInfo();
10812  MachineFrameInfo &MFI = MF->getFrameInfo();
10813  int FI = MFI.getFunctionContextIndex();
10814
10815  const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10816                                                        : &ARM::GPRnopcRegClass;
10817
10818  // Get a mapping of the call site numbers to all of the landing pads they're
10819  // associated with.
10820  DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10821  unsigned MaxCSNum = 0;
10822  for (MachineBasicBlock &BB : *MF) {
10823    if (!BB.isEHPad())
10824      continue;
10825
10826    // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10827    // pad.
10828    for (MachineInstr &II : BB) {
10829      if (!II.isEHLabel())
10830        continue;
10831
10832      MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10833      if (!MF->hasCallSiteLandingPad(Sym)) continue;
10834
10835      SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10836      for (unsigned Idx : CallSiteIdxs) {
10837        CallSiteNumToLPad[Idx].push_back(&BB);
10838        MaxCSNum = std::max(MaxCSNum, Idx);
10839      }
10840      break;
10841    }
10842  }
10843
10844  // Get an ordered list of the machine basic blocks for the jump table.
10845  std::vector<MachineBasicBlock*> LPadList;
10846  SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10847  LPadList.reserve(CallSiteNumToLPad.size());
  // Call-site numbers are 1-based; visiting them in order keeps the jump
  // table ordered by call-site index. Also collect every predecessor of each
  // landing pad -- those are the invoke blocks rewired below.
10848  for (unsigned I = 1; I <= MaxCSNum; ++I) {
10849    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10850    for (MachineBasicBlock *MBB : MBBList) {
10851      LPadList.push_back(MBB);
10852      InvokeBBs.insert_range(MBB->predecessors());
10853    }
10854  }
10855
10856  assert(!LPadList.empty() &&
10857         "No landing pad destinations for the dispatch jump table!");
10858
10859  // Create the jump table and associated information.
10860  MachineJumpTableInfo *JTI =
10861      MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
10862  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
10863
10864  // Create the MBBs for the dispatch code.
10865
10866  // Shove the dispatch's address into the return slot in the function context.
10867  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
10868  DispatchBB->setIsEHPad();
10869
10870  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
10871
  // An out-of-range call-site index falls through to a trap instruction.
10872  BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
10873  DispatchBB->addSuccessor(TrapBB);
10874
10875  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
10876  DispatchBB->addSuccessor(DispContBB);
10877
10878  // Insert the new MBBs at the end of the function.
10879  MF->insert(MF->end(), DispatchBB);
10880  MF->insert(MF->end(), DispContBB);
10881  MF->insert(MF->end(), TrapBB);
10882
10883  // Insert code into the entry block that creates and registers the function
10884  // context.
10885  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
10886
10887  MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
10890
10891  MachineInstrBuilder MIB;
10892  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
10893
10894  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
10895  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
10896
10897  // Add a register mask with no preserved registers. This results in all
10898  // registers being marked as clobbered. This can't work if the dispatch block
10899  // is in a Thumb1 function and is linked with ARM code which uses the FP
10900  // registers, as there is no way to preserve the FP registers in Thumb1 mode.
10902
10903  bool IsPositionIndependent = isPositionIndependent();
10904  unsigned NumLPads = LPadList.size();
  // Emit the dispatch sequence: load the call-site index from the function
  // context, compare it against the landing-pad count, branch to TrapBB when
  // out of range, otherwise jump through the table. Three ISA variants below.
10905  if (Subtarget->isThumb2()) {
10906    Register NewVReg1 = MRI->createVirtualRegister(TRC);
10907    BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
10908        .addFrameIndex(FI)
10909        .addImm(4)
10910        .addMemOperand(FIMMOLd)
10912
10913    if (NumLPads < 256) {
10914      BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
10915          .addReg(NewVReg1)
10916          .addImm(LPadList.size())
10918    } else {
      // Count does not fit an 8-bit immediate: materialize it with MOVW and,
      // if the high half is non-zero, MOVT.
10919      Register VReg1 = MRI->createVirtualRegister(TRC);
10920      BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
10921          .addImm(NumLPads & 0xFFFF)
10923
10924      unsigned VReg2 = VReg1;
10925      if ((NumLPads & 0xFFFF0000) != 0) {
10926        VReg2 = MRI->createVirtualRegister(TRC);
10927        BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
10928            .addReg(VReg1)
10929            .addImm(NumLPads >> 16)
10931      }
10932
10933      BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
10934          .addReg(NewVReg1)
10935          .addReg(VReg2)
10937    }
10938
10939    BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
10940        .addMBB(TrapBB)
10942        .addReg(ARM::CPSR);
10943
10944    Register NewVReg3 = MRI->createVirtualRegister(TRC);
10945    BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
10946        .addJumpTableIndex(MJTI)
10948
10949    Register NewVReg4 = MRI->createVirtualRegister(TRC);
10950    BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
10951        .addReg(NewVReg3, RegState::Kill)
10952        .addReg(NewVReg1)
10955        .add(condCodeOp());
10956
10957    BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
10958        .addReg(NewVReg4, RegState::Kill)
10959        .addReg(NewVReg1)
10960        .addJumpTableIndex(MJTI);
10961  } else if (Subtarget->isThumb()) {
10962    Register NewVReg1 = MRI->createVirtualRegister(TRC);
10963    BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
10964        .addFrameIndex(FI)
10965        .addImm(1)
10966        .addMemOperand(FIMMOLd)
10968
10969    if (NumLPads < 256) {
10970      BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
10971          .addReg(NewVReg1)
10972          .addImm(NumLPads)
10974    } else {
      // Thumb1 has no movw/movt, so load the landing-pad count from the
      // constant pool.
10975      MachineConstantPool *ConstantPool = MF->getConstantPool();
10976      Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
10977      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
10978
10979      // MachineConstantPool wants an explicit alignment.
10980      Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
10981      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
10982
10983      Register VReg1 = MRI->createVirtualRegister(TRC);
10984      BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
10985          .addReg(VReg1, RegState::Define)
10988      BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
10989          .addReg(NewVReg1)
10990          .addReg(VReg1)
10992    }
10993
10994    BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
10995        .addMBB(TrapBB)
10997        .addReg(ARM::CPSR);
10998
10999    Register NewVReg2 = MRI->createVirtualRegister(TRC);
11000    BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
11001        .addReg(ARM::CPSR, RegState::Define)
11002        .addReg(NewVReg1)
11003        .addImm(2)
11005
11006    Register NewVReg3 = MRI->createVirtualRegister(TRC);
11007    BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
11008        .addJumpTableIndex(MJTI)
11010
11011    Register NewVReg4 = MRI->createVirtualRegister(TRC);
11012    BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
11013        .addReg(ARM::CPSR, RegState::Define)
11014        .addReg(NewVReg2, RegState::Kill)
11015        .addReg(NewVReg3)
11017
11018    MachineMemOperand *JTMMOLd =
11019        MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11021
11022    Register NewVReg5 = MRI->createVirtualRegister(TRC);
11023    BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
11024        .addReg(NewVReg4, RegState::Kill)
11025        .addImm(0)
11026        .addMemOperand(JTMMOLd)
11028
    // In PIC mode the jump-table entries are offsets, so add the table base
    // back in to form the destination address.
11029    unsigned NewVReg6 = NewVReg5;
11030    if (IsPositionIndependent) {
11031      NewVReg6 = MRI->createVirtualRegister(TRC);
11032      BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
11033          .addReg(ARM::CPSR, RegState::Define)
11034          .addReg(NewVReg5, RegState::Kill)
11035          .addReg(NewVReg3)
11037    }
11038
11039    BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
11040        .addReg(NewVReg6, RegState::Kill)
11041        .addJumpTableIndex(MJTI);
11042  } else {
11043    Register NewVReg1 = MRI->createVirtualRegister(TRC);
11044    BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
11045        .addFrameIndex(FI)
11046        .addImm(4)
11047        .addMemOperand(FIMMOLd)
11049
11050    if (NumLPads < 256) {
11051      BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
11052          .addReg(NewVReg1)
11053          .addImm(NumLPads)
11055    } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
11056      Register VReg1 = MRI->createVirtualRegister(TRC);
11057      BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
11058          .addImm(NumLPads & 0xFFFF)
11060
11061      unsigned VReg2 = VReg1;
11062      if ((NumLPads & 0xFFFF0000) != 0) {
11063        VReg2 = MRI->createVirtualRegister(TRC);
11064        BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
11065            .addReg(VReg1)
11066            .addImm(NumLPads >> 16)
11068      }
11069
11070      BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11071          .addReg(NewVReg1)
11072          .addReg(VReg2)
11074    } else {
      // No movw/movt available: load the landing-pad count from the constant
      // pool.
11075      MachineConstantPool *ConstantPool = MF->getConstantPool();
11076      Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11077      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11078
11079      // MachineConstantPool wants an explicit alignment.
11080      Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11081      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11082
11083      Register VReg1 = MRI->createVirtualRegister(TRC);
11084      BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
11085          .addReg(VReg1, RegState::Define)
11087          .addImm(0)
11089      BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11090          .addReg(NewVReg1)
11091          .addReg(VReg1, RegState::Kill)
11093    }
11094
11095    BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
11096        .addMBB(TrapBB)
11098        .addReg(ARM::CPSR);
11099
11100    Register NewVReg3 = MRI->createVirtualRegister(TRC);
11101    BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
11102        .addReg(NewVReg1)
11105        .add(condCodeOp());
11106    Register NewVReg4 = MRI->createVirtualRegister(TRC);
11107    BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
11108        .addJumpTableIndex(MJTI)
11110
11111    MachineMemOperand *JTMMOLd =
11112        MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11114    Register NewVReg5 = MRI->createVirtualRegister(TRC);
11115    BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11116        .addReg(NewVReg3, RegState::Kill)
11117        .addReg(NewVReg4)
11118        .addImm(0)
11119        .addMemOperand(JTMMOLd)
11121
11122    if (IsPositionIndependent) {
11123      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11124          .addReg(NewVReg5, RegState::Kill)
11125          .addReg(NewVReg4)
11126          .addJumpTableIndex(MJTI);
11127    } else {
11128      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11129          .addReg(NewVReg5, RegState::Kill)
11130          .addJumpTableIndex(MJTI);
11131    }
11132  }
11133
11134  // Add the jump table entries as successors to the MBB.
11135  SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
11136  for (MachineBasicBlock *CurMBB : LPadList) {
11137    if (SeenMBBs.insert(CurMBB).second)
11138      DispContBB->addSuccessor(CurMBB);
11139  }
11140
11141  // N.B. the order the invoke BBs are processed in doesn't matter here.
11142  const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11144  for (MachineBasicBlock *BB : InvokeBBs) {
11145
11146    // Remove the landing pad successor from the invoke block and replace it
11147    // with the new dispatch block.
11148    SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11149    while (!Successors.empty()) {
11150      MachineBasicBlock *SMBB = Successors.pop_back_val();
11151      if (SMBB->isEHPad()) {
11152        BB->removeSuccessor(SMBB);
        // NOTE(review): MBBLPads accumulates the displaced landing pads so
        // they can be demoted below; its declaration (line 11143) is not
        // visible in this chunk.
11153        MBBLPads.push_back(SMBB);
11154      }
11155    }
11156
11157    BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11158    BB->normalizeSuccProbs();
11159
11160    // Find the invoke call and mark all of the callee-saved registers as
11161    // 'implicit defined' so that they're spilled. This prevents code from
11162    // moving instructions to before the EH block, where they will never be
11163    // executed.
11165         II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11166      if (!II->isCall()) continue;
11167
11168      DenseSet<unsigned> DefRegs;
11170               OI = II->operands_begin(), OE = II->operands_end();
11171           OI != OE; ++OI) {
11172        if (!OI->isReg()) continue;
11173        DefRegs.insert(OI->getReg());
11174      }
11175
11176      MachineInstrBuilder MIB(*MF, &*II);
11177
11178      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11179        unsigned Reg = SavedRegs[i];
        // Skip callee-saved registers the current instruction set cannot
        // address.
11180        if (Subtarget->isThumb2() &&
11181            !ARM::tGPRRegClass.contains(Reg) &&
11182            !ARM::hGPRRegClass.contains(Reg))
11183          continue;
11184        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11185          continue;
11186        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11187          continue;
11188        if (!DefRegs.contains(Reg))
11190      }
11191
      // Only the last call in the block (found first in the reverse scan)
      // needs the implicit defs.
11192      break;
11193    }
11194  }
11195
11196  // Mark all former landing pads as non-landing pads. The dispatch is the only
11197  // landing pad now.
11198  for (MachineBasicBlock *MBBLPad : MBBLPads)
11199    MBBLPad->setIsEHPad(false);
11200
11201  // The instruction is gone now.
11202  MI.eraseFromParent();
11203}
11204
// Return the one successor of MBB that is not Succ. Intended for blocks with
// exactly two successors; reaching the end of the loop is unreachable.
11205static
11207  for (MachineBasicBlock *S : MBB->successors())
11208    if (S != Succ)
11209      return S;
11210  llvm_unreachable("Expecting a BB with two successors!");
11211}
11212
11213/// Return the load opcode for a given load size. If load size >= 8,
11214/// neon opcode will be returned.
11215static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11216 if (LdSize >= 8)
11217 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11218 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11219 if (IsThumb1)
11220 return LdSize == 4 ? ARM::tLDRi
11221 : LdSize == 2 ? ARM::tLDRHi
11222 : LdSize == 1 ? ARM::tLDRBi : 0;
11223 if (IsThumb2)
11224 return LdSize == 4 ? ARM::t2LDR_POST
11225 : LdSize == 2 ? ARM::t2LDRH_POST
11226 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11227 return LdSize == 4 ? ARM::LDR_POST_IMM
11228 : LdSize == 2 ? ARM::LDRH_POST
11229 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11230}
11231
11232/// Return the store opcode for a given store size. If store size >= 8,
11233/// neon opcode will be returned.
11234static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11235 if (StSize >= 8)
11236 return StSize == 16 ? ARM::VST1q32wb_fixed
11237 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11238 if (IsThumb1)
11239 return StSize == 4 ? ARM::tSTRi
11240 : StSize == 2 ? ARM::tSTRHi
11241 : StSize == 1 ? ARM::tSTRBi : 0;
11242 if (IsThumb2)
11243 return StSize == 4 ? ARM::t2STR_POST
11244 : StSize == 2 ? ARM::t2STRH_POST
11245 : StSize == 1 ? ARM::t2STRB_POST : 0;
11246 return StSize == 4 ? ARM::STR_POST_IMM
11247 : StSize == 2 ? ARM::STRH_POST
11248 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11249}
11250
11251/// Emit a post-increment load operation with given size. The instructions
11252/// will be added to BB at Pos.
/// \param LdSize  load width in bytes: 1, 2 or 4 (scalar), or 8/16 (NEON).
/// \param Data    virtual register that receives the loaded value.
/// \param AddrIn  base address register.
/// \param AddrOut defined to the post-incremented address (AddrIn + LdSize).
11254                          const TargetInstrInfo *TII, const DebugLoc &dl,
11255                          unsigned LdSize, unsigned Data, unsigned AddrIn,
11256                          unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11257  unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11258  assert(LdOpc != 0 && "Should have a load opcode");
11259  if (LdSize >= 8) {
    // NEON load with fixed post-increment writeback of the base register.
11260    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11261        .addReg(AddrOut, RegState::Define)
11262        .addReg(AddrIn)
11263        .addImm(0)
11265  } else if (IsThumb1) {
11266    // load + update AddrIn
11267    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11268        .addReg(AddrIn)
11269        .addImm(0)
    // Thumb1 has no post-indexed load, so increment the base separately.
11271    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11272        .add(t1CondCodeOp())
11273        .addReg(AddrIn)
11274        .addImm(LdSize)
11276  } else if (IsThumb2) {
    // Thumb2 post-indexed load updates the base in a single instruction.
11277    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11278        .addReg(AddrOut, RegState::Define)
11279        .addReg(AddrIn)
11280        .addImm(LdSize)
11282  } else { // arm
11283    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11284        .addReg(AddrOut, RegState::Define)
11285        .addReg(AddrIn)
11286        .addReg(0)
11287        .addImm(LdSize)
11289  }
11290}
11291
11292/// Emit a post-increment store operation with given size. The instructions
11293/// will be added to BB at Pos.
/// \param StSize  store width in bytes: 1, 2 or 4 (scalar), or 8/16 (NEON).
/// \param Data    register holding the value to store.
/// \param AddrIn  base address register.
/// \param AddrOut defined to the post-incremented address (AddrIn + StSize).
11295                          const TargetInstrInfo *TII, const DebugLoc &dl,
11296                          unsigned StSize, unsigned Data, unsigned AddrIn,
11297                          unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11298  unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11299  assert(StOpc != 0 && "Should have a store opcode");
11300  if (StSize >= 8) {
    // NEON store with fixed post-increment writeback of the base register.
11301    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11302        .addReg(AddrIn)
11303        .addImm(0)
11304        .addReg(Data)
11306  } else if (IsThumb1) {
11307    // store + update AddrIn
11308    BuildMI(*BB, Pos, dl, TII->get(StOpc))
11309        .addReg(Data)
11310        .addReg(AddrIn)
11311        .addImm(0)
    // Thumb1 has no post-indexed store, so increment the base separately.
11313    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11314        .add(t1CondCodeOp())
11315        .addReg(AddrIn)
11316        .addImm(StSize)
11318  } else if (IsThumb2) {
    // Thumb2 post-indexed store updates the base in a single instruction.
11319    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11320        .addReg(Data)
11321        .addReg(AddrIn)
11322        .addImm(StSize)
11324  } else { // arm
11325    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11326        .addReg(Data)
11327        .addReg(AddrIn)
11328        .addReg(0)
11329        .addImm(StSize)
11331  }
11332}
11333
// Expand the struct-byval copy pseudo (dst, src, size, alignment operands):
// copy `size` bytes from src to dest using the widest unit the alignment
// allows (NEON 16/8 bytes, word, halfword or byte). Small copies are fully
// unrolled; larger ones become a counted loop plus a byte-copy epilogue for
// the remainder.
11335ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11336                                   MachineBasicBlock *BB) const {
11337  // This pseudo instruction has 3 operands: dst, src, size
11338  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11339  // Otherwise, we will generate unrolled scalar copies.
11340  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11341  const BasicBlock *LLVM_BB = BB->getBasicBlock();
11343
11344  Register dest = MI.getOperand(0).getReg();
11345  Register src = MI.getOperand(1).getReg();
11346  unsigned SizeVal = MI.getOperand(2).getImm();
11347  unsigned Alignment = MI.getOperand(3).getImm();
11348  DebugLoc dl = MI.getDebugLoc();
11349
11350  MachineFunction *MF = BB->getParent();
11351  MachineRegisterInfo &MRI = MF->getRegInfo();
11352  unsigned UnitSize = 0;
11353  const TargetRegisterClass *TRC = nullptr;
11354  const TargetRegisterClass *VecTRC = nullptr;
11355
11356  bool IsThumb1 = Subtarget->isThumb1Only();
11357  bool IsThumb2 = Subtarget->isThumb2();
11358  bool IsThumb = Subtarget->isThumb();
11359
  // Derive the copy unit from the alignment: odd -> bytes, 2-aligned ->
  // halfwords, otherwise words or (when allowed and large enough) NEON units.
11360  if (Alignment & 1) {
11361    UnitSize = 1;
11362  } else if (Alignment & 2) {
11363    UnitSize = 2;
11364  } else {
11365    // Check whether we can use NEON instructions.
11366    if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11367        Subtarget->hasNEON()) {
11368      if ((Alignment % 16 == 0) && SizeVal >= 16)
11369        UnitSize = 16;
11370      else if ((Alignment % 8 == 0) && SizeVal >= 8)
11371        UnitSize = 8;
11372    }
11373    // Can't use NEON instructions.
11374    if (UnitSize == 0)
11375      UnitSize = 4;
11376  }
11377
11378  // Select the correct opcode and register class for unit size load/store
11379  bool IsNeon = UnitSize >= 8;
11380  TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11381  if (IsNeon)
11382    VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11383             : UnitSize == 8 ? &ARM::DPRRegClass
11384                             : nullptr;
11385
11386  unsigned BytesLeft = SizeVal % UnitSize;
11387  unsigned LoopSize = SizeVal - BytesLeft;
11388
11389  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11390    // Use LDR and STR to copy.
11391    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11392    // [destOut] = STR_POST(scratch, destIn, UnitSize)
11393    unsigned srcIn = src;
11394    unsigned destIn = dest;
    // Fully unrolled copy: thread the post-incremented address registers
    // through each load/store pair.
11395    for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11396      Register srcOut = MRI.createVirtualRegister(TRC);
11397      Register destOut = MRI.createVirtualRegister(TRC);
11398      Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11399      emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11400                 IsThumb1, IsThumb2);
11401      emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11402                 IsThumb1, IsThumb2);
11403      srcIn = srcOut;
11404      destIn = destOut;
11405    }
11406
11407    // Handle the leftover bytes with LDRB and STRB.
11408    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11409    // [destOut] = STRB_POST(scratch, destIn, 1)
11410    for (unsigned i = 0; i < BytesLeft; i++) {
11411      Register srcOut = MRI.createVirtualRegister(TRC);
11412      Register destOut = MRI.createVirtualRegister(TRC);
11413      Register scratch = MRI.createVirtualRegister(TRC);
11414      emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11415                 IsThumb1, IsThumb2);
11416      emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11417                 IsThumb1, IsThumb2);
11418      srcIn = srcOut;
11419      destIn = destOut;
11420    }
11421    MI.eraseFromParent();   // The instruction is gone now.
11422    return BB;
11423  }
11424
11425  // Expand the pseudo op to a loop.
11426  // thisMBB:
11427  //   ...
11428  //   movw varEnd, # --> with thumb2
11429  //   movt varEnd, #
11430  //   ldrcp varEnd, idx --> without thumb2
11431  //   fallthrough --> loopMBB
11432  // loopMBB:
11433  //   PHI varPhi, varEnd, varLoop
11434  //   PHI srcPhi, src, srcLoop
11435  //   PHI destPhi, dst, destLoop
11436  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11437  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11438  //   subs varLoop, varPhi, #UnitSize
11439  //   bne loopMBB
11440  //   fallthrough --> exitMBB
11441  // exitMBB:
11442  //   epilogue to handle left-over bytes
11443  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11444  //   [destOut] = STRB_POST(scratch, destLoop, 1)
11445  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11446  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11447  MF->insert(It, loopMBB);
11448  MF->insert(It, exitMBB);
11449
11450  // Set the call frame size on entry to the new basic blocks.
11451  unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11452  loopMBB->setCallFrameSize(CallFrameSize);
11453  exitMBB->setCallFrameSize(CallFrameSize);
11454
11455  // Transfer the remainder of BB and its successor edges to exitMBB.
11456  exitMBB->splice(exitMBB->begin(), BB,
11457                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
11459
11460  // Load an immediate to varEnd.
11461  Register varEnd = MRI.createVirtualRegister(TRC);
11462  if (Subtarget->useMovt()) {
11463    BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11464            varEnd)
11465        .addImm(LoopSize);
11466  } else if (Subtarget->genExecuteOnly()) {
11467    assert(IsThumb && "Non-thumb expected to have used movt");
11468    BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11469  } else {
    // No movw/movt and not execute-only: load the loop bound from the
    // constant pool.
11470    MachineConstantPool *ConstantPool = MF->getConstantPool();
11472    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11473
11474    // MachineConstantPool wants an explicit alignment.
11475    Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11476    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11477    MachineMemOperand *CPMMO =
11480
11481    if (IsThumb)
11482      BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11483          .addReg(varEnd, RegState::Define)
11486          .addMemOperand(CPMMO);
11487    else
11488      BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11489          .addReg(varEnd, RegState::Define)
11491          .addImm(0)
11493          .addMemOperand(CPMMO);
11494  }
11495  BB->addSuccessor(loopMBB);
11496
11497  // Generate the loop body:
11498  //   varPhi = PHI(varLoop, varEnd)
11499  //   srcPhi = PHI(srcLoop, src)
11500  //   destPhi = PHI(destLoop, dst)
11501  MachineBasicBlock *entryBB = BB;
11502  BB = loopMBB;
11503  Register varLoop = MRI.createVirtualRegister(TRC);
11504  Register varPhi = MRI.createVirtualRegister(TRC);
11505  Register srcLoop = MRI.createVirtualRegister(TRC);
11506  Register srcPhi = MRI.createVirtualRegister(TRC);
11507  Register destLoop = MRI.createVirtualRegister(TRC);
11508  Register destPhi = MRI.createVirtualRegister(TRC);
11509
11510  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11511      .addReg(varLoop).addMBB(loopMBB)
11512      .addReg(varEnd).addMBB(entryBB);
11513  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11514      .addReg(srcLoop).addMBB(loopMBB)
11515      .addReg(src).addMBB(entryBB);
11516  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11517      .addReg(destLoop).addMBB(loopMBB)
11518      .addReg(dest).addMBB(entryBB);
11519
11520  // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11521  // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11522  Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11523  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11524             IsThumb1, IsThumb2);
11525  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11526             IsThumb1, IsThumb2);
11527
11528  // Decrement loop variable by UnitSize.
11529  if (IsThumb1) {
11530    BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11531        .add(t1CondCodeOp())
11532        .addReg(varPhi)
11533        .addImm(UnitSize)
11535  } else {
11536    MachineInstrBuilder MIB =
11537        BuildMI(*BB, BB->end(), dl,
11538                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11539    MIB.addReg(varPhi)
11540        .addImm(UnitSize)
11542        .add(condCodeOp());
    // Force the optional CC operand to define CPSR so the subtract sets the
    // flags consumed by the conditional branch below.
11543    MIB->getOperand(5).setReg(ARM::CPSR);
11544    MIB->getOperand(5).setIsDef(true);
11545  }
11546  BuildMI(*BB, BB->end(), dl,
11547          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11548      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11549
11550  // loopMBB can loop back to loopMBB or fall through to exitMBB.
11551  BB->addSuccessor(loopMBB);
11552  BB->addSuccessor(exitMBB);
11553
11554  // Add epilogue to handle BytesLeft.
11555  BB = exitMBB;
11556  auto StartOfExit = exitMBB->begin();
11557
11558  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11559  //   [destOut] = STRB_POST(scratch, destLoop, 1)
11560  unsigned srcIn = srcLoop;
11561  unsigned destIn = destLoop;
11562  for (unsigned i = 0; i < BytesLeft; i++) {
11563    Register srcOut = MRI.createVirtualRegister(TRC);
11564    Register destOut = MRI.createVirtualRegister(TRC);
11565    Register scratch = MRI.createVirtualRegister(TRC);
11566    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11567               IsThumb1, IsThumb2);
11568    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11569               IsThumb1, IsThumb2);
11570    srcIn = srcOut;
11571    destIn = destOut;
11572  }
11573
11574  MI.eraseFromParent();   // The instruction is gone now.
11575  return BB;
11576}
11577
// Lower the Windows stack-probe pseudo: call __chkstk (R4 holds the number
// of words to allocate; R4 returns the adjustment in bytes), then subtract
// the returned amount from SP.
11579ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11580                                       MachineBasicBlock *MBB) const {
11581  const TargetMachine &TM = getTargetMachine();
11582  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11583  DebugLoc DL = MI.getDebugLoc();
11584
11585  assert(TM.getTargetTriple().isOSWindows() &&
11586         "__chkstk is only supported on Windows");
11587  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11588
11589  // __chkstk takes the number of words to allocate on the stack in R4, and
11590  // returns the stack adjustment in number of bytes in R4. This will not
11591  // clobber any other registers (other than the obvious lr).
11592  //
11593  // Although, technically, IP should be considered a register which may be
11594  // clobbered, the call itself will not touch it. Windows on ARM is a pure
11595  // thumb-2 environment, so there is no interworking required. As a result, we
11596  // do not expect a veneer to be emitted by the linker, clobbering IP.
11597  //
11598  // Each module receives its own copy of __chkstk, so no import thunk is
11599  // required, again, ensuring that IP is not clobbered.
11600  //
11601  // Finally, although some linkers may theoretically provide a trampoline for
11602  // out of range calls (which is quite common due to a 32M range limitation of
11603  // branches for Thumb), we can generate the long-call version via
11604  // -mcmodel=large, alleviating the need for the trampoline which may clobber
11605  // IP.
11606
11607  RTLIB::LibcallImpl ChkStkLibcall = getLibcallImpl(RTLIB::STACK_PROBE);
11608  if (ChkStkLibcall == RTLIB::Unsupported)
11609    reportFatalUsageError("no available implementation of __chkstk");
11610
11611  const char *ChkStk = getLibcallImplName(ChkStkLibcall).data();
  // Emit the call: a direct BL for the near code models, or (large model) an
  // indirect call through a register materialized with t2MOVi32imm.
11612  switch (TM.getCodeModel()) {
11613  case CodeModel::Tiny:
11614    llvm_unreachable("Tiny code model not available on ARM.");
11615  case CodeModel::Small:
11616  case CodeModel::Medium:
11617  case CodeModel::Kernel:
11618    BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11620        .addExternalSymbol(ChkStk)
11623        .addReg(ARM::R12,
11625        .addReg(ARM::CPSR,
11627    break;
11628  case CodeModel::Large: {
11629    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
11630    Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11631
11632    BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11633        .addExternalSymbol(ChkStk);
11639        .addReg(ARM::R12,
11641        .addReg(ARM::CPSR,
11643    break;
11644  }
11645  }
11646
  // SP -= R4 (the byte adjustment returned by __chkstk).
11647  BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11648      .addReg(ARM::SP, RegState::Kill)
11649      .addReg(ARM::R4, RegState::Kill)
11652      .add(condCodeOp());
11653
11654  MI.eraseFromParent();
11655  return MBB;
11656}
11657
// Lower the Windows divide-by-zero check pseudo: compare operand 0 against
// zero and conditionally branch to a block containing the __brkdiv0
// breakpoint; execution otherwise continues in the new ContBB.
11659ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11660                                       MachineBasicBlock *MBB) const {
11661  DebugLoc DL = MI.getDebugLoc();
11662  MachineFunction *MF = MBB->getParent();
11663  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11664
  // Split the block: everything after the pseudo moves to ContBB.
11665  MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
11666  MF->insert(++MBB->getIterator(), ContBB);
11667  ContBB->splice(ContBB->begin(), MBB,
11668                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11670  MBB->addSuccessor(ContBB);
11671
  // The trap block contains only the __brkdiv0 breakpoint instruction.
11672  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11673  BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11674  MF->push_back(TrapBB);
11675  MBB->addSuccessor(TrapBB);
11676
  // Compare the checked register against zero and conditionally branch to the
  // trap block.
11677  BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11678      .addReg(MI.getOperand(0).getReg())
11679      .addImm(0)
11681  BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11682      .addMBB(TrapBB)
11684      .addReg(ARM::CPSR);
11685
11686  MI.eraseFromParent();
11687  return ContBB;
11688}
11689
11690// The CPSR operand of SelectItr might be missing a kill marker
11691// because there were multiple uses of CPSR, and ISel didn't know
11692// which to mark. Figure out whether SelectItr should have had a
11693// kill marker, and set it if it should. Returns the correct kill
11694// marker value.
11697                                   const TargetRegisterInfo* TRI) {
11698  // Scan forward through BB for a use/def of CPSR.
11699  MachineBasicBlock::iterator miI(std::next(SelectItr));
11700  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11701    const MachineInstr& mi = *miI;
    // A later read means CPSR is live past SelectItr: no kill flag.
11702    if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11703      return false;
11704    if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11705      break; // Should have kill-flag - update below.
11706  }
11707
11708  // If we hit the end of the block, check whether CPSR is live into a
11709  // successor.
11710  if (miI == BB->end()) {
11711    for (MachineBasicBlock *Succ : BB->successors())
11712      if (Succ->isLiveIn(ARM::CPSR))
11713        return false;
11714  }
11715
11716  // We found a def, or hit the end of the basic block and CPSR wasn't live
11717  // out. SelectMI should have a kill flag on CPSR.
11718  SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11719  return true;
11720}
11721
11722/// Adds logic in loop entry MBB to calculate loop iteration count and adds
11723/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop
/// \param TpEntry    loop pre-header block the count computation is added to.
/// \param TpLoopBody tail-predicated loop body (unconditional branch target).
/// \param TpExit     block branched to when the iteration count is zero.
/// \param OpSizeReg  register holding the total element count n.
/// \return register holding the total iteration count, ceil(n / 16).
11725                                   MachineBasicBlock *TpLoopBody,
11726                                   MachineBasicBlock *TpExit, Register OpSizeReg,
11727                                   const TargetInstrInfo *TII, DebugLoc Dl,
11728                                   MachineRegisterInfo &MRI) {
11729  // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
11730  Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11731  BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11732      .addUse(OpSizeReg)
11733      .addImm(15)
11735      .addReg(0);
11736
11737  Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11738  BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11739      .addUse(AddDestReg, RegState::Kill)
11740      .addImm(4)
11742      .addReg(0);
11743
  // t2WhileLoopSetup/t2WhileLoopStart form the WLS: branch to TpExit when the
  // iteration count is zero, otherwise enter the loop body.
11744  Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11745  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11746      .addUse(LsrDestReg, RegState::Kill);
11747
11748  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11749      .addUse(TotalIterationsReg)
11750      .addMBB(TpExit);
11751
11752  BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11753      .addMBB(TpLoopBody)
11755
11756  return TotalIterationsReg;
11757}
11758
11759/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
11760/// t2DoLoopEnd. These are used by later passes to generate tail predicated
11761/// loops.
11762 static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11763 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11764 const TargetInstrInfo *TII, DebugLoc Dl,
11765 MachineRegisterInfo &MRI, Register OpSrcReg,
11766 Register OpDestReg, Register ElementCountReg,
11767 Register TotalIterationsReg, bool IsMemcpy) {
11768 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
11769 // array, loop iteration counter, predication counter.
11770
11771 Register SrcPhiReg, CurrSrcReg;
11772 if (IsMemcpy) {
11773 // Current position in the src array
// PHI: OpSrcReg on entry from TpEntry, CurrSrcReg (post-incremented by the
// VLDR below) when re-entering from TpLoopBody itself.
11774 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11775 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11776 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11777 .addUse(OpSrcReg)
11778 .addMBB(TpEntry)
11779 .addUse(CurrSrcReg)
11780 .addMBB(TpLoopBody);
11781 }
11782
11783 // Current position in the dest array
11784 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11785 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11786 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11787 .addUse(OpDestReg)
11788 .addMBB(TpEntry)
11789 .addUse(CurrDestReg)
11790 .addMBB(TpLoopBody);
11791
11792 // Current loop counter
// GPRlr class: the low-overhead-loop counter lives in LR for t2LoopDec/End.
11793 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11794 Register RemainingLoopIterationsReg =
11795 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11796 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11797 .addUse(TotalIterationsReg)
11798 .addMBB(TpEntry)
11799 .addUse(RemainingLoopIterationsReg)
11800 .addMBB(TpLoopBody);
11801
11802 // Predication counter
11803 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11804 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11805 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11806 .addUse(ElementCountReg)
11807 .addMBB(TpEntry)
11808 .addUse(RemainingElementsReg)
11809 .addMBB(TpLoopBody);
11810
11811 // Pass predication counter to VCTP
// MVE_VCTP8 turns the remaining-element count into a per-byte-lane predicate
// (VCCR) so the final, partial vector iteration is masked correctly.
11812 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11813 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11814 .addUse(PredCounterPhiReg)
11816 .addReg(0)
11817 .addReg(0);
11818
// Each iteration consumes one 16-byte vector, so the element counter is
// decremented by 16.
11819 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11820 .addUse(PredCounterPhiReg)
11821 .addImm(16)
11823 .addReg(0);
11824
11825 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11826 Register SrcValueReg;
11827 if (IsMemcpy) {
11828 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
// Post-indexed load: defines both the advanced pointer (CurrSrcReg) and the
// loaded vector value; predicated on the VCTP lane mask.
11829 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11830 .addDef(CurrSrcReg)
11831 .addDef(SrcValueReg)
11832 .addReg(SrcPhiReg)
11833 .addImm(16)
11835 .addUse(VccrReg)
11836 .addReg(0);
11837 } else
// For memset the value to store is the original source operand itself.
11838 SrcValueReg = OpSrcReg;
11839
11840 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11841 .addDef(CurrDestReg)
11842 .addUse(SrcValueReg)
11843 .addReg(DestPhiReg)
11844 .addImm(16)
11846 .addUse(VccrReg)
11847 .addReg(0);
11848
11849 // Add the pseudoInstrs for decrementing the loop counter and marking the
11850 // end:t2DoLoopDec and t2DoLoopEnd
11851 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11852 .addUse(LoopCounterPhiReg)
11853 .addImm(1);
11854
// t2LoopEnd branches back to the loop body while iterations remain.
11855 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
11856 .addUse(RemainingLoopIterationsReg)
11857 .addMBB(TpLoopBody);
11858
// Fallthrough-replacement branch to the exit block once the loop is done.
11859 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
11860 .addMBB(TpExit)
11862}
11863
11865 // KCFI (kernel control-flow integrity) bundles are supported unconditionally
// in both ARM and Thumb modes; no subtarget feature gating is required.
11866 return true;
11867}
11868
11872 const TargetInstrInfo *TII) const {
// Emits a KCFI_CHECK pseudo in front of an indirect call/tail-call so later
// expansion can verify the callee's type hash (MBBI->getCFIType()).
11873 assert(MBBI->isCall() && MBBI->getCFIType() &&
11874 "Invalid call instruction for a KCFI check");
11875
// Locate the operand holding the indirect call target; its index depends on
// the opcode's operand layout (see the per-case comments below).
11876 MachineOperand *TargetOp = nullptr;
11877 switch (MBBI->getOpcode()) {
11878 // ARM mode opcodes
11879 case ARM::BLX:
11880 case ARM::BLX_pred:
11881 case ARM::BLX_noip:
11882 case ARM::BLX_pred_noip:
11883 case ARM::BX_CALL:
11884 TargetOp = &MBBI->getOperand(0);
11885 break;
11886 case ARM::TCRETURNri:
11887 case ARM::TCRETURNrinotr12:
11888 case ARM::TAILJMPr:
11889 case ARM::TAILJMPr4:
11890 TargetOp = &MBBI->getOperand(0);
11891 break;
11892 // Thumb mode opcodes (Thumb1 and Thumb2)
11893 // Note: Most Thumb call instructions have predicate operands before the
11894 // target register Format: tBLXr pred, predreg, target_register, ...
11895 case ARM::tBLXr: // Thumb1/Thumb2: BLX register (requires V5T)
11896 case ARM::tBLXr_noip: // Thumb1/Thumb2: BLX register, no IP clobber
11897 case ARM::tBX_CALL: // Thumb1 only: BX call (push LR, BX)
11898 TargetOp = &MBBI->getOperand(2);
11899 break;
11900 // Tail call instructions don't have predicates, target is operand 0
11901 case ARM::tTAILJMPr: // Thumb1/Thumb2: Tail call via register
11902 TargetOp = &MBBI->getOperand(0);
11903 break;
11904 default:
11905 llvm_unreachable("Unexpected CFI call opcode");
11906 }
11907
11908 assert(TargetOp && TargetOp->isReg() && "Invalid target operand");
// Pin the register name: the KCFI_CHECK built below reads the same register
// as the call, so it must not be renamed between the two.
11909 TargetOp->setIsRenamable(false);
11910
11911 // Select the appropriate KCFI_CHECK variant based on the instruction set
11912 unsigned KCFICheckOpcode;
11913 if (Subtarget->isThumb()) {
11914 if (Subtarget->isThumb2()) {
11915 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb2;
11916 } else {
11917 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb1;
11918 }
11919 } else {
11920 KCFICheckOpcode = ARM::KCFI_CHECK_ARM;
11921 }
11922
// The check pseudo carries the target register and the expected type hash.
11923 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(KCFICheckOpcode))
11924 .addReg(TargetOp->getReg())
11925 .addImm(MBBI->getCFIType())
11926 .getInstr();
11927}
11928
11931 MachineBasicBlock *BB) const {
11932 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11933 DebugLoc dl = MI.getDebugLoc();
11934 bool isThumb2 = Subtarget->isThumb2();
// Expands pseudo instructions that need custom MBB-level lowering. Most cases
// expand in place and return BB; the MVE memcpy/memset case builds a whole
// tail-predicated loop and returns the new exit block.
11935 switch (MI.getOpcode()) {
11936 default: {
11937 MI.print(errs());
11938 llvm_unreachable("Unexpected instr type to insert");
11939 }
11940
11941 // Thumb1 post-indexed loads are really just single-register LDMs.
11942 case ARM::tLDR_postidx: {
11943 MachineOperand Def(MI.getOperand(1));
11944 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
11945 .add(Def) // Rn_wb
11946 .add(MI.getOperand(2)) // Rn
11947 .add(MI.getOperand(3)) // PredImm
11948 .add(MI.getOperand(4)) // PredReg
11949 .add(MI.getOperand(0)) // Rt
11950 .cloneMemRefs(MI);
11951 MI.eraseFromParent();
11952 return BB;
11953 }
11954
11955 case ARM::MVE_MEMCPYLOOPINST:
11956 case ARM::MVE_MEMSETLOOPINST: {
11957
11958 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
11959 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
11960 // the iteration count =ceil(size_in_bytes/16)) in the TP entry block and
11961 // adds the relevant instructions in the TP loop Body for generation of a
11962 // WLSTP loop.
11963
11964 // Below is relevant portion of the CFG after the transformation.
11965 // The Machine Basic Blocks are shown along with branch conditions (in
11966 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
11967 // portion of the CFG and may not necessarily be the entry/exit of the
11968 // function.
11969
11970 // (Relevant) CFG after transformation:
11971 // TP entry MBB
11972 // |
11973 // |-----------------|
11974 // (n <= 0) (n > 0)
11975 // | |
11976 // | TP loop Body MBB<--|
11977 // | | |
11978 // \ |___________|
11979 // \ /
11980 // TP exit MBB
11981
11982 MachineFunction *MF = BB->getParent();
11983 MachineFunctionProperties &Properties = MF->getProperties();
11984 MachineRegisterInfo &MRI = MF->getRegInfo();
11985
// Pseudo operands: dest pointer, src pointer (or set value), byte count.
11986 Register OpDestReg = MI.getOperand(0).getReg();
11987 Register OpSrcReg = MI.getOperand(1).getReg();
11988 Register OpSizeReg = MI.getOperand(2).getReg();
11989
11990 // Allocate the required MBBs and add to parent function.
11991 MachineBasicBlock *TpEntry = BB;
11992 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
11993 MachineBasicBlock *TpExit;
11994
11995 MF->push_back(TpLoopBody);
11996
11997 // If any instructions are present in the current block after
11998 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
11999 // move the instructions into the newly created exit block. If there are no
12000 // instructions add an explicit branch to the FallThrough block and then
12001 // split.
12002 //
12003 // The split is required for two reasons:
12004 // 1) A terminator(t2WhileLoopStart) will be placed at that site.
12005 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
12006 // need to be updated. splitAt() already handles this.
12007 TpExit = BB->splitAt(MI, false);
12008 if (TpExit == BB) {
12009 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
12010 "block containing memcpy/memset Pseudo");
12011 TpExit = BB->getFallThrough();
12012 BuildMI(BB, dl, TII->get(ARM::t2B))
12013 .addMBB(TpExit)
12015 TpExit = BB->splitAt(MI, false);
12016 }
12017
12018 // Add logic for iteration count
12019 Register TotalIterationsReg =
12020 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
12021
12022 // Add the vectorized (and predicated) loads/store instructions
12023 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
12024 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
12025 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
12026
12027 // Required to avoid conflict with the MachineVerifier during testing.
12028 Properties.resetNoPHIs();
12029
12030 // Connect the blocks
12031 TpEntry->addSuccessor(TpLoopBody);
12032 TpLoopBody->addSuccessor(TpLoopBody);
12033 TpLoopBody->addSuccessor(TpExit);
12034
12035 // Reorder for a more natural layout
12036 TpLoopBody->moveAfter(TpEntry);
12037 TpExit->moveAfter(TpLoopBody);
12038
12039 // Finally, remove the memcpy Pseudo Instruction
12040 MI.eraseFromParent();
12041
12042 // Return the exit block as it may contain other instructions requiring a
12043 // custom inserter
12044 return TpExit;
12045 }
12046
12047 // The Thumb2 pre-indexed stores have the same MI operands, they just
12048 // define them differently in the .td files from the isel patterns, so
12049 // they need pseudos.
12050 case ARM::t2STR_preidx:
12051 MI.setDesc(TII->get(ARM::t2STR_PRE));
12052 return BB;
12053 case ARM::t2STRB_preidx:
12054 MI.setDesc(TII->get(ARM::t2STRB_PRE));
12055 return BB;
12056 case ARM::t2STRH_preidx:
12057 MI.setDesc(TII->get(ARM::t2STRH_PRE));
12058 return BB;
12059
12060 case ARM::STRi_preidx:
12061 case ARM::STRBi_preidx: {
12062 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
12063 : ARM::STRB_PRE_IMM;
12064 // Decode the offset.
// The AM2 addressing-mode immediate packs a sub/add flag with the offset;
// unpack it into a signed offset for the _PRE_IMM form.
12065 unsigned Offset = MI.getOperand(4).getImm();
12066 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
12068 if (isSub)
12069 Offset = -Offset;
12070
12071 MachineMemOperand *MMO = *MI.memoperands_begin();
12072 BuildMI(*BB, MI, dl, TII->get(NewOpc))
12073 .add(MI.getOperand(0)) // Rn_wb
12074 .add(MI.getOperand(1)) // Rt
12075 .add(MI.getOperand(2)) // Rn
12076 .addImm(Offset) // offset (skip GPR==zero_reg)
12077 .add(MI.getOperand(5)) // pred
12078 .add(MI.getOperand(6))
12079 .addMemOperand(MMO);
12080 MI.eraseFromParent();
12081 return BB;
12082 }
12083 case ARM::STRr_preidx:
12084 case ARM::STRBr_preidx:
12085 case ARM::STRH_preidx: {
// Register-offset pre-indexed stores: operand layout is identical, only the
// opcode needs swapping, so just copy every operand across.
12086 unsigned NewOpc;
12087 switch (MI.getOpcode()) {
12088 default: llvm_unreachable("unexpected opcode!");
12089 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
12090 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
12091 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
12092 }
12093 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
12094 for (const MachineOperand &MO : MI.operands())
12095 MIB.add(MO);
12096 MI.eraseFromParent();
12097 return BB;
12098 }
12099
12100 case ARM::tMOVCCr_pseudo: {
12101 // To "insert" a SELECT_CC instruction, we actually have to insert the
12102 // diamond control-flow pattern. The incoming instruction knows the
12103 // destination vreg to set, the condition code register to branch on, the
12104 // true/false values to select between, and a branch opcode to use.
12105 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12107
12108 // thisMBB:
12109 // ...
12110 // TrueVal = ...
12111 // cmpTY ccX, r1, r2
12112 // bCC copy1MBB
12113 // fallthrough --> copy0MBB
12114 MachineBasicBlock *thisMBB = BB;
12115 MachineFunction *F = BB->getParent();
12116 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12117 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12118 F->insert(It, copy0MBB);
12119 F->insert(It, sinkMBB);
12120
12121 // Set the call frame size on entry to the new basic blocks.
12122 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12123 copy0MBB->setCallFrameSize(CallFrameSize);
12124 sinkMBB->setCallFrameSize(CallFrameSize);
12125
12126 // Check whether CPSR is live past the tMOVCCr_pseudo.
12127 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12128 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12129 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12130 copy0MBB->addLiveIn(ARM::CPSR);
12131 sinkMBB->addLiveIn(ARM::CPSR);
12132 }
12133
12134 // Transfer the remainder of BB and its successor edges to sinkMBB.
12135 sinkMBB->splice(sinkMBB->begin(), BB,
12136 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12138
12139 BB->addSuccessor(copy0MBB);
12140 BB->addSuccessor(sinkMBB);
12141
12142 BuildMI(BB, dl, TII->get(ARM::tBcc))
12143 .addMBB(sinkMBB)
12144 .addImm(MI.getOperand(3).getImm())
12145 .addReg(MI.getOperand(4).getReg());
12146
12147 // copy0MBB:
12148 // %FalseValue = ...
12149 // # fallthrough to sinkMBB
12150 BB = copy0MBB;
12151
12152 // Update machine-CFG edges
12153 BB->addSuccessor(sinkMBB);
12154
12155 // sinkMBB:
12156 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12157 // ...
12158 BB = sinkMBB;
12159 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12160 .addReg(MI.getOperand(1).getReg())
12161 .addMBB(copy0MBB)
12162 .addReg(MI.getOperand(2).getReg())
12163 .addMBB(thisMBB)
12164
12165 MI.eraseFromParent(); // The pseudo instruction is gone now.
12166 return BB;
12167 }
12168
12169 case ARM::BCCi64:
12170 case ARM::BCCZi64: {
12171 // If there is an unconditional branch to the other successor, remove it.
12172 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12173
12174 // Compare both parts that make up the double comparison separately for
12175 // equality.
12176 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12177
12178 Register LHS1 = MI.getOperand(1).getReg();
12179 Register LHS2 = MI.getOperand(2).getReg();
12180 if (RHSisZero) {
12181 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12182 .addReg(LHS1)
12183 .addImm(0)
// The second compare is predicated on EQ so CPSR only stays "equal" if both
// halves match.
12185 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12186 .addReg(LHS2).addImm(0)
12187 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12188 } else {
12189 Register RHS1 = MI.getOperand(3).getReg();
12190 Register RHS2 = MI.getOperand(4).getReg();
12191 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12192 .addReg(LHS1)
12193 .addReg(RHS1)
12195 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12196 .addReg(LHS2).addReg(RHS2)
12197 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12198 }
12199
12200 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12201 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
// For NE the pseudo branches on inequality; swapping targets lets us always
// emit the EQ-conditioned branch below.
12202 if (MI.getOperand(0).getImm() == ARMCC::NE)
12203 std::swap(destMBB, exitMBB);
12204
12205 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12206 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12207 if (isThumb2)
12208 BuildMI(BB, dl, TII->get(ARM::t2B))
12209 .addMBB(exitMBB)
12211 else
12212 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12213
12214 MI.eraseFromParent(); // The pseudo instruction is gone now.
12215 return BB;
12216 }
12217
12218 case ARM::Int_eh_sjlj_setjmp:
12219 case ARM::Int_eh_sjlj_setjmp_nofp:
12220 case ARM::tInt_eh_sjlj_setjmp:
12221 case ARM::t2Int_eh_sjlj_setjmp:
12222 case ARM::t2Int_eh_sjlj_setjmp_nofp:
// SjLj setjmp pseudos need no block surgery here.
12223 return BB;
12224
12225 case ARM::Int_eh_sjlj_setup_dispatch:
12226 EmitSjLjDispatchBlock(MI, BB);
12227 return BB;
12228 case ARM::COPY_STRUCT_BYVAL_I32:
12229 ++NumLoopByVals;
12230 return EmitStructByval(MI, BB);
12231 case ARM::WIN__CHKSTK:
12232 return EmitLowered__chkstk(MI, BB);
12233 case ARM::WIN__DBZCHK:
12234 return EmitLowered__dbzchk(MI, BB);
12235 }
12236}
12237
12238/// Attaches vregs to MEMCPY that it will use as scratch registers
12239/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12240/// instead of as a custom inserter because we need the use list from the SDNode.
12241 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12242 MachineInstr &MI, const SDNode *Node) {
// Thumb1 can only use the low registers for LDM/STM, hence the narrower
// register class below.
12243 bool isThumb1 = Subtarget->isThumb1Only();
12244
12245 MachineFunction *MF = MI.getParent()->getParent();
12246 MachineRegisterInfo &MRI = MF->getRegInfo();
12247 MachineInstrBuilder MIB(*MF, MI);
12248
12249 // If the new dst/src is unused mark it as dead.
12250 if (!Node->hasAnyUseOfValue(0)) {
12251 MI.getOperand(0).setIsDead(true);
12252 }
12253 if (!Node->hasAnyUseOfValue(1)) {
12254 MI.getOperand(1).setIsDead(true);
12255 }
12256
12257 // The MEMCPY both defines and kills the scratch registers.
// Operand 4 of the MEMCPY pseudo holds the number of scratch registers to
// attach; each is a fresh virtual register for the allocator to assign.
12258 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12259 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12260 : &ARM::GPRRegClass);
12262 }
12263}
12264
12266 SDNode *Node) const {
// MEMCPY is special-cased: it only needs scratch registers attached.
12267 if (MI.getOpcode() == ARM::MEMCPY) {
12268 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12269 return;
12270 }
12271
12272 const MCInstrDesc *MCID = &MI.getDesc();
12273 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
12274 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12275 // operand is still set to noreg. If needed, set the optional operand's
12276 // register to CPSR, and remove the redundant implicit def.
12277 //
12278 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12279
12280 // Rename pseudo opcodes.
12281 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12282 unsigned ccOutIdx;
12283 if (NewOpc) {
12284 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12285 MCID = &TII->get(NewOpc);
12286
12287 assert(MCID->getNumOperands() ==
12288 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12289 && "converted opcode should be the same except for cc_out"
12290 " (and, on Thumb1, pred)");
12291
12292 MI.setDesc(*MCID);
12293
12294 // Add the optional cc_out operand
12295 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12296
12297 // On Thumb1, move all input operands to the end, then add the predicate
// Rotate the operands: repeatedly append operand 1 and remove it from the
// front, which shifts all inputs after the newly added cc_out.
12298 if (Subtarget->isThumb1Only()) {
12299 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12300 MI.addOperand(MI.getOperand(1));
12301 MI.removeOperand(1);
12302 }
12303
12304 // Restore the ties
// addOperand/removeOperand above dropped any TIED_TO constraints; re-tie
// each use to its def per the new opcode's operand constraints.
12305 for (unsigned i = MI.getNumOperands(); i--;) {
12306 const MachineOperand& op = MI.getOperand(i);
12307 if (op.isReg() && op.isUse()) {
12308 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12309 if (DefIdx != -1)
12310 MI.tieOperands(DefIdx, i);
12311 }
12312 }
12313
12315 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
12316 ccOutIdx = 1;
12317 } else
12318 ccOutIdx = MCID->getNumOperands() - 1;
12319 } else
12320 ccOutIdx = MCID->getNumOperands() - 1;
12321
12322 // Any ARM instruction that sets the 's' bit should specify an optional
12323 // "cc_out" operand in the last operand position.
12324 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12325 assert(!NewOpc && "Optional cc_out operand required");
12326 return;
12327 }
12328 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12329 // since we already have an optional CPSR def.
12330 bool definesCPSR = false;
12331 bool deadCPSR = false;
12332 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12333 ++i) {
12334 const MachineOperand &MO = MI.getOperand(i);
12335 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12336 definesCPSR = true;
12337 if (MO.isDead())
12338 deadCPSR = true;
12339 MI.removeOperand(i);
12340 break;
12341 }
12342 }
12343 if (!definesCPSR) {
12344 assert(!NewOpc && "Optional cc_out operand required");
12345 return;
12346 }
12347 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12348 if (deadCPSR) {
12349 assert(!MI.getOperand(ccOutIdx).getReg() &&
12350 "expect uninitialized optional cc_out operand");
12351 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12352 if (!Subtarget->isThumb1Only())
12353 return;
12354 }
12355
12356 // If this instruction was defined with an optional CPSR def and its dag node
12357 // had a live implicit CPSR def, then activate the optional CPSR def.
12358 MachineOperand &MO = MI.getOperand(ccOutIdx);
12359 MO.setReg(ARM::CPSR);
12360 MO.setIsDef(true);
12361}
12362
12363//===----------------------------------------------------------------------===//
12364// ARM Optimization Hooks
12365//===----------------------------------------------------------------------===//
12366
12367// Helper function that checks if N is a null or all ones constant.
12368 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
// Per the header comment above: tests N against the null constant, or the
// all-ones constant when AllOnes is set.
12370}
12371
12372// Return true if N is conditionally 0 or all ones.
12373// Detects these expressions where cc is an i1 value:
12374//
12375// (select cc 0, y) [AllOnes=0]
12376// (select cc y, 0) [AllOnes=0]
12377// (zext cc) [AllOnes=0]
12378// (sext cc) [AllOnes=0/1]
12379// (select cc -1, y) [AllOnes=1]
12380// (select cc y, -1) [AllOnes=1]
12381//
12382// Invert is set when N is the null/all ones constant when CC is false.
12383// OtherOp is set to the alternative value of N.
12385 SDValue &CC, bool &Invert,
12386 SDValue &OtherOp,
12387 SelectionDAG &DAG) {
// Recognizes the select/zext/sext-of-i1 shapes listed in the comment block
// above; on success fills CC, Invert and OtherOp for the caller's rewrite.
12388 switch (N->getOpcode()) {
12389 default: return false;
12390 case ISD::SELECT: {
12391 CC = N->getOperand(0);
12392 SDValue N1 = N->getOperand(1);
12393 SDValue N2 = N->getOperand(2);
// Identity constant may sit in either select arm; Invert records which.
12394 if (isZeroOrAllOnes(N1, AllOnes)) {
12395 Invert = false;
12396 OtherOp = N2;
12397 return true;
12398 }
12399 if (isZeroOrAllOnes(N2, AllOnes)) {
12400 Invert = true;
12401 OtherOp = N1;
12402 return true;
12403 }
12404 return false;
12405 }
12406 case ISD::ZERO_EXTEND:
12407 // (zext cc) can never be the all ones value.
12408 if (AllOnes)
12409 return false;
12410 [[fallthrough]];
12411 case ISD::SIGN_EXTEND: {
12412 SDLoc dl(N);
12413 EVT VT = N->getValueType(0);
12414 CC = N->getOperand(0);
12415 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12416 return false;
12417 Invert = !AllOnes;
12418 if (AllOnes)
12419 // When looking for an AllOnes constant, N is an sext, and the 'other'
12420 // value is 0.
12421 OtherOp = DAG.getConstant(0, dl, VT);
12422 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12423 // When looking for a 0 constant, N can be zext or sext.
12424 OtherOp = DAG.getConstant(1, dl, VT);
12425 else
12426 OtherOp = DAG.getAllOnesConstant(dl, VT);
12427 return true;
12428 }
12429 }
12430}
12431
12432// Combine a constant select operand into its use:
12433//
12434// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
12435// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
12436// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
12437// (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
12438// (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12439//
12440// The transform is rejected if the select doesn't have a constant operand that
12441// is null, or all ones when AllOnes is set.
12442//
12443// Also recognize sext/zext from i1:
12444//
12445// (add (zext cc), x) -> (select cc (add x, 1), x)
12446// (add (sext cc), x) -> (select cc (add x, -1), x)
12447//
12448// These transformations eventually create predicated instructions.
12449//
12450// @param N The node to transform.
12451// @param Slct The N operand that is a select.
12452// @param OtherOp The other N operand (x above).
12453// @param DCI Context.
12454// @param AllOnes Require the select constant to be all ones instead of null.
12455// @returns The new node, or SDValue() on failure.
12456 static
12459 bool AllOnes = false) {
12460 SelectionDAG &DAG = DCI.DAG;
12461 EVT VT = N->getValueType(0);
12462 SDValue NonConstantVal;
12463 SDValue CCOp;
12464 bool SwapSelectOps;
// Bail out unless Slct matches one of the conditional zero/all-ones shapes
// documented above combineSelectAndUse.
12465 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12466 NonConstantVal, DAG))
12467 return SDValue();
12468
12469 // Slct is now know to be the desired identity constant when CC is true.
// When CC selects the identity constant, N's result is just OtherOp; the
// false arm folds N's operation with the non-constant select value.
12470 SDValue TrueVal = OtherOp;
12471 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12472 OtherOp, NonConstantVal);
12473 // Unless SwapSelectOps says CC should be false.
12474 if (SwapSelectOps)
12475 std::swap(TrueVal, FalseVal);
12476
12477 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12478 CCOp, TrueVal, FalseVal);
12479}
12480
12481// Attempt combineSelectAndUse on each operand of a commutative operator N.
12482 static
12485 SDValue N0 = N->getOperand(0);
12486 SDValue N1 = N->getOperand(1);
// Try the select-fold with each operand playing the "select" role; the
// one-use check keeps the select from being duplicated.
12487 if (N0.getNode()->hasOneUse())
12488 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12489 return Result;
12490 if (N1.getNode()->hasOneUse())
12491 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12492 return Result;
12493 return SDValue();
12494}
12495
// Returns true if N is a node whose two results are the VUZP (unzip) halves
// of its input — either a real VUZP, or the v2i32 VTRN that aliases it.
12497 // VUZP shuffle node.
12498 if (N->getOpcode() == ARMISD::VUZP)
12499 return true;
12500
12501 // "VUZP" on i32 is an alias for VTRN.
12502 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12503 return true;
12504
12505 return false;
12506}
12507
12510 const ARMSubtarget *Subtarget) {
12511 // Look for ADD(VUZP.0, VUZP.1).
// Both operands must be the two results of the SAME unzip node (same SDNode,
// different result values) for the pairwise-add pattern to hold.
12512 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12513 N0 == N1)
12514 return SDValue();
12515
12516 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12517 if (!N->getValueType(0).is64BitVector())
12518 return SDValue();
12519
12520 // Generate vpadd.
12521 SelectionDAG &DAG = DCI.DAG;
12522 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12523 SDLoc dl(N);
12524 SDNode *Unzip = N0.getNode();
12525 EVT VT = N->getValueType(0);
12526
// Emit as a target intrinsic call: vpadd(unzip_in0, unzip_in1).
12528 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12529 TLI.getPointerTy(DAG.getDataLayout())));
12530 Ops.push_back(Unzip->getOperand(0));
12531 Ops.push_back(Unzip->getOperand(1));
12532
12533 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12534}
12535
12538 const ARMSubtarget *Subtarget) {
12539 // Check for two extended operands.
// Both operands must carry the SAME extension kind (both sext or both zext)
// so a single signed/unsigned vpaddl variant applies.
12540 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12541 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12542 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12543 N1.getOpcode() == ISD::ZERO_EXTEND))
12544 return SDValue();
12545
12546 SDValue N00 = N0.getOperand(0);
12547 SDValue N10 = N1.getOperand(0);
12548
12549 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
12550 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12551 N00 == N10)
12552 return SDValue();
12553
12554 // We only recognize Q register paddl here; this can't be reached until
12555 // after type legalization.
12556 if (!N00.getValueType().is64BitVector() ||
12558 return SDValue();
12559
12560 // Generate vpaddl.
12561 SelectionDAG &DAG = DCI.DAG;
12562 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12563 SDLoc dl(N);
12564 EVT VT = N->getValueType(0);
12565
12567 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12568 unsigned Opcode;
12569 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12570 Opcode = Intrinsic::arm_neon_vpaddls;
12571 else
12572 Opcode = Intrinsic::arm_neon_vpaddlu;
12573 Ops.push_back(DAG.getConstant(Opcode, dl,
12574 TLI.getPointerTy(DAG.getDataLayout())));
// vpaddl consumes the full (un-unzipped) input: concatenate the unzip's two
// source vectors back into a vector of twice the element count.
12575 EVT ElemTy = N00.getValueType().getVectorElementType();
12576 unsigned NumElts = VT.getVectorNumElements();
12577 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12578 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12579 N00.getOperand(0), N00.getOperand(1));
12580 Ops.push_back(Concat);
12581
12582 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12583}
12584
12585// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12586// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12587// much easier to match.
12588 static SDValue
12591 const ARMSubtarget *Subtarget) {
12592 // Only perform optimization if after legalize, and if NEON is available. We
12593 // also expected both operands to be BUILD_VECTORs.
12594 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12595 || N0.getOpcode() != ISD::BUILD_VECTOR
12596 || N1.getOpcode() != ISD::BUILD_VECTOR)
12597 return SDValue();
12598
12599 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12600 EVT VT = N->getValueType(0);
12601 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12602 return SDValue();
12603
12604 // Check that the vector operands are of the right form.
12605 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12606 // operands, where N is the size of the formed vector.
12607 // Each EXTRACT_VECTOR should have the same input vector and odd or even
12608 // index such that we have a pair wise add pattern.
12609
12610 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12612 return SDValue();
12613 SDValue Vec = N0->getOperand(0)->getOperand(0);
12614 SDNode *V = Vec.getNode();
12615 unsigned nextIndex = 0;
12616
12617 // For each operands to the ADD which are BUILD_VECTORs,
12618 // check to see if each of their operands are an EXTRACT_VECTOR with
12619 // the same vector and appropriate index.
12620 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12623
12624 SDValue ExtVec0 = N0->getOperand(i);
12625 SDValue ExtVec1 = N1->getOperand(i);
12626
12627 // First operand is the vector, verify its the same.
12628 if (V != ExtVec0->getOperand(0).getNode() ||
12629 V != ExtVec1->getOperand(0).getNode())
12630 return SDValue();
12631
12632 // Second is the constant, verify its correct.
12635
12636 // For the constant, we want to see all the even or all the odd.
// N0's lane must be the even index and N1's the matching odd index — i.e.
// the add pairs up adjacent lanes of Vec.
12637 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12638 || C1->getZExtValue() != nextIndex+1)
12639 return SDValue();
12640
12641 // Increment index.
12642 nextIndex+=2;
12643 } else
12644 return SDValue();
12645 }
12646
12647 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12648 // we're using the entire input vector, otherwise there's a size/legality
12649 // mismatch somewhere.
12650 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12652 return SDValue();
12653
12654 // Create VPADDL node.
12655 SelectionDAG &DAG = DCI.DAG;
12656 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12657
12658 SDLoc dl(N);
12659
12660 // Build operand list.
12662 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12663 TLI.getPointerTy(DAG.getDataLayout())));
12664
12665 // Input is the vector.
12666 Ops.push_back(Vec);
12667
12668 // Get widened type and narrowed type.
12669 MVT widenType;
12670 unsigned numElem = VT.getVectorNumElements();
12671
12672 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12673 switch (inputLaneType.getSimpleVT().SimpleTy) {
12674 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12675 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12676 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12677 default:
12678 llvm_unreachable("Invalid vector element type for padd optimization.");
12679 }
12680
// vpaddl produces double-width lanes; extend or truncate back to the ADD's
// original result type as needed.
12681 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
12682 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12683 return DAG.getNode(ExtOp, dl, VT, tmp);
12684}
12685
// Returns V itself when it is a widening multiply (U/SMUL_LOHI), otherwise
// an empty SDValue.
12687 if (V->getOpcode() == ISD::UMUL_LOHI ||
12688 V->getOpcode() == ISD::SMUL_LOHI)
12689 return V;
12690 return SDValue();
12691}
12692
12693 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12695 const ARMSubtarget *Subtarget) {
// SMLALxy instructions are only available on targets with the DSP extension.
12696 if (!Subtarget->hasBaseDSP())
12697 return SDValue();
12698
12699 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12700 // accumulates the product into a 64-bit value. The 16-bit values will
12701 // be sign extended somehow or SRA'd into 32-bit values
12702 // (addc (adde (mul 16bit, 16bit), lo), hi)
// The mul may appear as either operand of the addc; normalize so Mul/Lo are
// correctly identified.
12703 SDValue Mul = AddcNode->getOperand(0);
12704 SDValue Lo = AddcNode->getOperand(1);
12705 if (Mul.getOpcode() != ISD::MUL) {
12706 Lo = AddcNode->getOperand(0);
12707 Mul = AddcNode->getOperand(1);
12708 if (Mul.getOpcode() != ISD::MUL)
12709 return SDValue();
12710 }
12711
// Same normalization for the adde: one operand is the sign bits of the
// product (sra ... , 31), the other is the high accumulator word.
12712 SDValue SRA = AddeNode->getOperand(0);
12713 SDValue Hi = AddeNode->getOperand(1);
12714 if (SRA.getOpcode() != ISD::SRA) {
12715 SRA = AddeNode->getOperand(1);
12716 Hi = AddeNode->getOperand(0);
12717 if (SRA.getOpcode() != ISD::SRA)
12718 return SDValue();
12719 }
12720 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12721 if (Const->getZExtValue() != 31)
12722 return SDValue();
12723 } else
12724 return SDValue();
12725
12726 if (SRA.getOperand(0) != Mul)
12727 return SDValue();
12728
12729 SelectionDAG &DAG = DCI.DAG;
12730 SDLoc dl(AddcNode);
12731 unsigned Opcode = 0;
12732 SDValue Op0;
12733 SDValue Op1;
12734
// Pick the SMLAL variant by which half (bottom/top) of each operand holds
// the 16-bit value: B = sign-extended 16-bit, T = top half via sra #16.
12735 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12736 Opcode = ARMISD::SMLALBB;
12737 Op0 = Mul.getOperand(0);
12738 Op1 = Mul.getOperand(1);
12739 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12740 Opcode = ARMISD::SMLALBT;
12741 Op0 = Mul.getOperand(0);
12742 Op1 = Mul.getOperand(1).getOperand(0);
12743 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12744 Opcode = ARMISD::SMLALTB;
12745 Op0 = Mul.getOperand(0).getOperand(0);
12746 Op1 = Mul.getOperand(1);
12747 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12748 Opcode = ARMISD::SMLALTT;
12749 Op0 = Mul->getOperand(0).getOperand(0);
12750 Op1 = Mul->getOperand(1).getOperand(0);
12751 }
12752
12753 if (!Op0 || !Op1)
12754 return SDValue();
12755
12756 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12757 Op0, Op1, Lo, Hi);
12758 // Replace the ADDs' nodes uses by the MLA node's values.
12759 SDValue HiMLALResult(SMLAL.getNode(), 1);
12760 SDValue LoMLALResult(SMLAL.getNode(), 0);
12761
12762 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12763 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12764
12765 // Return original node to notify the driver to stop replacing.
12766 SDValue resNode(AddcNode, 0);
12767 return resNode;
12768}
12769
12772 const ARMSubtarget *Subtarget) {
12773 // Look for multiply add opportunities.
12774 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
12775 // each add nodes consumes a value from ISD::UMUL_LOHI and there is
12776 // a glue link from the first add to the second add.
12777 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12778 // a S/UMLAL instruction.
12779 // UMUL_LOHI
12780 // / :lo \ :hi
12781 // V \ [no multiline comment]
12782 // loAdd -> ADDC |
12783 // \ :carry /
12784 // V V
12785 // ADDE <- hiAdd
12786 //
12787 // In the special case where only the higher part of a signed result is used
12788 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12789 // a constant with the exact value of 0x80000000, we recognize we are dealing
12790 // with a "rounded multiply and add" (or subtract) and transform it into
12791 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
12792
12793 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12794 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12795 "Expect an ADDE or SUBE");
12796
12797 assert(AddeSubeNode->getNumOperands() == 3 &&
12798 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12799 "ADDE node has the wrong inputs");
12800
12801 // Check that we are chained to the right ADDC or SUBC node.
12802 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12803 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12804 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12805 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12806 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12807 return SDValue();
12808
12809 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12810 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12811
12812 // Check if the two operands are from the same mul_lohi node.
12813 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12814 return SDValue();
12815
12816 assert(AddcSubcNode->getNumValues() == 2 &&
12817 AddcSubcNode->getValueType(0) == MVT::i32 &&
12818 "Expect ADDC with two result values. First: i32");
12819
12820 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12821 // maybe a SMLAL which multiplies two 16-bit values.
12822 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12823 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12824 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12825 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12826 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12827 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12828
12829 // Check for the triangle shape.
12830 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12831 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12832
12833 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12834 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12835 return SDValue();
12836
12837 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12838 bool IsLeftOperandMUL = false;
12839 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12840 if (MULOp == SDValue())
12841 MULOp = findMUL_LOHI(AddeSubeOp1);
12842 else
12843 IsLeftOperandMUL = true;
12844 if (MULOp == SDValue())
12845 return SDValue();
12846
12847 // Figure out the right opcode.
12848 unsigned Opc = MULOp->getOpcode();
12849 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12850
12851 // Figure out the high and low input values to the MLAL node.
12852 SDValue *HiAddSub = nullptr;
12853 SDValue *LoMul = nullptr;
12854 SDValue *LowAddSub = nullptr;
12855
12856 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12857 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
12858 return SDValue();
12859
// The non-MUL operand of the ADDE/SUBE is the value added to the high half.
12860 if (IsLeftOperandMUL)
12861 HiAddSub = &AddeSubeOp1;
12862 else
12863 HiAddSub = &AddeSubeOp0;
12864
12865 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
12866 // whose low result is fed to the ADDC/SUBC we are checking.
12867
12868 if (AddcSubcOp0 == MULOp.getValue(0)) {
12869 LoMul = &AddcSubcOp0;
12870 LowAddSub = &AddcSubcOp1;
12871 }
12872 if (AddcSubcOp1 == MULOp.getValue(0)) {
12873 LoMul = &AddcSubcOp1;
12874 LowAddSub = &AddcSubcOp0;
12875 }
12876
12877 if (!LoMul)
12878 return SDValue();
12879
12880 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
12881 // the replacement below will create a cycle.
12882 if (AddcSubcNode == HiAddSub->getNode() ||
12883 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12884 return SDValue();
12885
12886 // Create the merged node.
12887 SelectionDAG &DAG = DCI.DAG;
12888
12889 // Start building operand list.
12891 Ops.push_back(LoMul->getOperand(0));
12892 Ops.push_back(LoMul->getOperand(1));
12893
12894 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12895 // the case, we must be doing signed multiplication and only use the higher
12896 // part of the result of the MLAL, furthermore the LowAddSub must be a constant
12897 // addition or subtraction with the value of 0x80000000.
12898 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12899 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12900 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12901 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12902 0x80000000) {
12903 Ops.push_back(*HiAddSub);
12904 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12905 FinalOpc = ARMISD::SMMLSR;
12906 } else {
12907 FinalOpc = ARMISD::SMMLAR;
12908 }
12909 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12910 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
12911
12912 return SDValue(AddeSubeNode, 0);
12913 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12914 // SMMLS is generated during instruction selection and the rest of this
12915 // function can not handle the case where AddcSubcNode is a SUBC.
12916 return SDValue();
12917
12918 // Finish building the operand list for {U/S}MLAL
12919 Ops.push_back(*LowAddSub);
12920 Ops.push_back(*HiAddSub);
12921
12922 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
12923 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12924
12925 // Replace the ADDs' nodes uses by the MLA node's values.
12926 SDValue HiMLALResult(MLALNode.getNode(), 1);
12927 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
12928
12929 SDValue LoMLALResult(MLALNode.getNode(), 0);
12930 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
12931
12932 // Return original node to notify the driver to stop replacing.
12933 return SDValue(AddeSubeNode, 0);
12934}
12935
12938 const ARMSubtarget *Subtarget) {
12939 // UMAAL is similar to UMLAL except that it adds two unsigned values.
12940 // While trying to combine for the other MLAL nodes, first search for the
12941 // chance to use UMAAL. Check if Addc uses a node which has already
12942 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
12943 // as the addend, and it's handled in PerformUMLALCombine.
12944
// UMAAL needs ARMv6 + DSP; otherwise fall back to the plain MLAL combine.
12945 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12946 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12947
12948 // Check that we have a glued ADDC node.
12949 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
12950 if (AddcNode->getOpcode() != ARMISD::ADDC)
12951 return SDValue();
12952
12953 // Find the converted UMAAL or quit if it doesn't exist.
12954 SDNode *UmlalNode = nullptr;
12955 SDValue AddHi;
12956 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
12957 UmlalNode = AddcNode->getOperand(0).getNode();
12958 AddHi = AddcNode->getOperand(1);
12959 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
12960 UmlalNode = AddcNode->getOperand(1).getNode();
12961 AddHi = AddcNode->getOperand(0);
12962 } else {
12963 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12964 }
12965
12966 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
12967 // the ADDC as well as Zero.
12968 if (!isNullConstant(UmlalNode->getOperand(3)))
12969 return SDValue();
12970
12971 if ((isNullConstant(AddeNode->getOperand(0)) &&
12972 AddeNode->getOperand(1).getNode() == UmlalNode) ||
12973 (AddeNode->getOperand(0).getNode() == UmlalNode &&
12974 isNullConstant(AddeNode->getOperand(1)))) {
12975 SelectionDAG &DAG = DCI.DAG;
// UMAAL operands: the UMLAL's two multiplicands, its low accumulate input,
// and the extra high-word addend taken from the ADDC.
12976 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
12977 UmlalNode->getOperand(2), AddHi };
12978 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
12979 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12980
12981 // Replace the ADDs' nodes uses by the UMAAL node's values.
12982 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
12983 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
12984
12985 // Return original node to notify the driver to stop replacing.
12986 return SDValue(AddeNode, 0);
12987 }
12988 return SDValue();
12989}
12990
// Fold a UMLAL whose accumulate inputs come from a zero-initialized
// ADDC/ADDE carry chain into a single ARMISD::UMAAL node. Requires ARMv6+DSP.
12992 const ARMSubtarget *Subtarget) {
12993 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12994 return SDValue();
12995
12996 // Check that we have a pair of ADDC and ADDE as operands.
12997 // Both addends of the ADDE must be zero.
12998 SDNode* AddcNode = N->getOperand(2).getNode();
12999 SDNode* AddeNode = N->getOperand(3).getNode();
13000 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
13001 (AddeNode->getOpcode() == ARMISD::ADDE) &&
13002 isNullConstant(AddeNode->getOperand(0)) &&
13003 isNullConstant(AddeNode->getOperand(1)) &&
13004 (AddeNode->getOperand(2).getNode() == AddcNode))
13005 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
13006 DAG.getVTList(MVT::i32, MVT::i32),
13007 {N->getOperand(0), N->getOperand(1),
13008 AddcNode->getOperand(0), AddcNode->getOperand(1)});
13009 else
13010 return SDValue();
13011}
13012
// DAG combine for ARMISD::ADDC / ARMISD::SUBC. Folds the carry-materializing
// pattern (SUBC (ADDE 0, 0, C), 1) back to C, and on Thumb1 canonicalizes a
// negative immediate RHS by flipping ADDC<->SUBC with the negated constant.
13015 const ARMSubtarget *Subtarget) {
13016 SelectionDAG &DAG(DCI.DAG);
13017
13018 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
13019 // (SUBC (ADDE 0, 0, C), 1) -> C
13020 SDValue LHS = N->getOperand(0);
13021 SDValue RHS = N->getOperand(1);
13022 if (LHS->getOpcode() == ARMISD::ADDE &&
13023 isNullConstant(LHS->getOperand(0)) &&
13024 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
13025 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
13026 }
13027 }
13028
13029 if (Subtarget->isThumb1Only()) {
13030 SDValue RHS = N->getOperand(1);
13032 int32_t imm = C->getSExtValue();
// Exclude INT_MIN: -imm would overflow and not be representable.
13033 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
13034 SDLoc DL(N);
13035 RHS = DAG.getConstant(-imm, DL, MVT::i32);
13036 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
13037 : ARMISD::ADDC;
13038 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
13039 }
13040 }
13041 }
13042
13043 return SDValue();
13044}
13045
// DAG combine for ARMISD::ADDE / ARMISD::SUBE. On Thumb1, a negative
// immediate RHS is rewritten as the opposite with-carry opcode using the
// bitwise-complement constant; otherwise try the 64-bit MLAL combine when
// the RHS is an SMUL_LOHI.
13048 const ARMSubtarget *Subtarget) {
13049 if (Subtarget->isThumb1Only()) {
13050 SelectionDAG &DAG = DCI.DAG;
13051 SDValue RHS = N->getOperand(1);
13053 int64_t imm = C->getSExtValue();
13054 if (imm < 0) {
13055 SDLoc DL(N);
13056
13057 // The with-carry-in form matches bitwise not instead of the negation.
13058 // Effectively, the inverse interpretation of the carry flag already
13059 // accounts for part of the negation.
13060 RHS = DAG.getConstant(~imm, DL, MVT::i32);
13061
13062 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
13063 : ARMISD::ADDE;
13064 return DAG.getNode(Opcode, DL, N->getVTList(),
13065 N->getOperand(0), RHS, N->getOperand(2));
13066 }
13067 }
13068 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
13069 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
13070 }
13071 return SDValue();
13072}
13073
// MVE combine: turn a scalar select/select_cc comparing a value against a
// vector min/max reduction into the accumulating reduction nodes
// ARMISD::VMINV[su] / ARMISD::VMAXV[su], e.g.
//   select(setcc(x, vecreduce_umin(v), ult), x, vecreduce_umin(v))
//     -> VMINVu(x, v)
13076 const ARMSubtarget *Subtarget) {
13077 if (!Subtarget->hasMVEIntegerOps())
13078 return SDValue();
13079
13080 SDLoc dl(N);
13081 SDValue SetCC;
13082 SDValue LHS;
13083 SDValue RHS;
13084 ISD::CondCode CC;
13085 SDValue TrueVal;
13086 SDValue FalseVal;
13087
// Extract compare operands, condition code and select arms from either a
// SELECT-of-SETCC or a SELECT_CC node.
13088 if (N->getOpcode() == ISD::SELECT &&
13089 N->getOperand(0)->getOpcode() == ISD::SETCC) {
13090 SetCC = N->getOperand(0);
13091 LHS = SetCC->getOperand(0);
13092 RHS = SetCC->getOperand(1);
13093 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
13094 TrueVal = N->getOperand(1);
13095 FalseVal = N->getOperand(2);
13096 } else if (N->getOpcode() == ISD::SELECT_CC) {
13097 LHS = N->getOperand(0);
13098 RHS = N->getOperand(1);
13099 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
13100 TrueVal = N->getOperand(2);
13101 FalseVal = N->getOperand(3);
13102 } else {
13103 return SDValue();
13104 }
13105
// Match the reduction kind against a compatible condition code; swap the
// select arms when the condition is inverted so the reduction ends up on
// the false side.
13106 unsigned int Opcode = 0;
13107 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
13108 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
13109 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
13110 Opcode = ARMISD::VMINVu;
13111 if (CC == ISD::SETUGT)
13112 std::swap(TrueVal, FalseVal);
13113 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
13114 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
13115 (CC == ISD::SETLT || CC == ISD::SETGT)) {
13116 Opcode = ARMISD::VMINVs;
13117 if (CC == ISD::SETGT)
13118 std::swap(TrueVal, FalseVal);
13119 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
13120 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13121 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13122 Opcode = ARMISD::VMAXVu;
13123 if (CC == ISD::SETULT)
13124 std::swap(TrueVal, FalseVal);
13125 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13126 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13127 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13128 Opcode = ARMISD::VMAXVs;
13129 if (CC == ISD::SETLT)
13130 std::swap(TrueVal, FalseVal);
13131 } else
13132 return SDValue();
13133
13134 // Normalise to the right hand side being the vector reduction
13135 switch (TrueVal->getOpcode()) {
13140 std::swap(LHS, RHS);
13141 std::swap(TrueVal, FalseVal);
13142 break;
13143 }
13144
// Only the 128-bit MVE integer vector types are supported.
13145 EVT VectorType = FalseVal->getOperand(0).getValueType();
13146
13147 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13148 VectorType != MVT::v4i32)
13149 return SDValue();
13150
13151 EVT VectorScalarType = VectorType.getVectorElementType();
13152
13153 // The values being selected must also be the ones being compared
13154 if (TrueVal != LHS || FalseVal != RHS)
13155 return SDValue();
13156
13157 EVT LeftType = LHS->getValueType(0);
13158 EVT RightType = RHS->getValueType(0);
13159
13160 // The types must match the reduced type too
13161 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13162 return SDValue();
13163
13164 // Legalise the scalar to an i32
13165 if (VectorScalarType != MVT::i32)
13166 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13167
13168 // Generate the reduction as an i32 for legalisation purposes
13169 auto Reduction =
13170 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13171
13172 // The result isn't actually an i32 so truncate it back to its original type
13173 if (VectorScalarType != MVT::i32)
13174 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13175
13176 return Reduction;
13177}
13178
13179// A special combine for the vqdmulh family of instructions. This is one of the
13180// potential set of patterns that could patch this instruction. The base pattern
13181// you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13182// This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13183// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as
13184// the max is unnecessary.
13186 EVT VT = N->getValueType(0);
13187 SDValue Shft;
13188 ConstantSDNode *Clamp;
13189
13190 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13191 return SDValue();
13192
// Accept either an explicit SMIN, or its legalized vselect/setcc(SETLT)
// form (how an i64-element SMIN appears after legalization).
13193 if (N->getOpcode() == ISD::SMIN) {
13194 Shft = N->getOperand(0);
13195 Clamp = isConstOrConstSplat(N->getOperand(1));
13196 } else if (N->getOpcode() == ISD::VSELECT) {
13197 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13198 SDValue Cmp = N->getOperand(0);
13199 if (Cmp.getOpcode() != ISD::SETCC ||
13200 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13201 Cmp.getOperand(0) != N->getOperand(1) ||
13202 Cmp.getOperand(1) != N->getOperand(2))
13203 return SDValue();
13204 Shft = N->getOperand(1);
13205 Clamp = isConstOrConstSplat(N->getOperand(2));
13206 } else
13207 return SDValue();
13208
13209 if (!Clamp)
13210 return SDValue();
13211
// The clamp constant determines the element type: 2^(n-1)-1 for an
// n-bit saturating doubling multiply-high.
13212 MVT ScalarType;
13213 int ShftAmt = 0;
13214 switch (Clamp->getSExtValue()) {
13215 case (1 << 7) - 1:
13216 ScalarType = MVT::i8;
13217 ShftAmt = 7;
13218 break;
13219 case (1 << 15) - 1:
13220 ScalarType = MVT::i16;
13221 ShftAmt = 15;
13222 break;
13223 case (1ULL << 31) - 1:
13224 ScalarType = MVT::i32;
13225 ShftAmt = 31;
13226 break;
13227 default:
13228 return SDValue();
13229 }
13230
// The shift must be an ashr by n-1 of a multiply of two sign-extends.
13231 if (Shft.getOpcode() != ISD::SRA)
13232 return SDValue();
13234 if (!N1 || N1->getSExtValue() != ShftAmt)
13235 return SDValue();
13236
13237 SDValue Mul = Shft.getOperand(0);
13238 if (Mul.getOpcode() != ISD::MUL)
13239 return SDValue();
13240
13241 SDValue Ext0 = Mul.getOperand(0);
13242 SDValue Ext1 = Mul.getOperand(1);
13243 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13244 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13245 return SDValue();
13246 EVT VecVT = Ext0.getOperand(0).getValueType();
13247 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13248 return SDValue();
13249 if (Ext1.getOperand(0).getValueType() != VecVT ||
13250 VecVT.getScalarType() != ScalarType ||
13251 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13252 return SDValue();
13253
13254 SDLoc DL(Mul);
13255 unsigned LegalLanes = 128 / (ShftAmt + 1);
13256 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13257 // For types smaller than legal vectors extend to be legal and only use needed
13258 // lanes.
13259 if (VecVT.getSizeInBits() < 128) {
13260 EVT ExtVecVT =
13262 VecVT.getVectorNumElements());
13263 SDValue Inp0 =
13264 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13265 SDValue Inp1 =
13266 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13267 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13268 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13269 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13270 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13271 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13272 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13273 }
13274
13275 // For larger types, split into legal sized chunks.
13276 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13277 unsigned NumParts = VecVT.getSizeInBits() / 128;
13279 for (unsigned I = 0; I < NumParts; ++I) {
13280 SDValue Inp0 =
13281 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13282 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13283 SDValue Inp1 =
13284 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13285 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13286 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13287 Parts.push_back(VQDMULH);
13288 }
13289 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13290 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13291}
13292
// MVE combine for ISD::VSELECT: first try the VQDMULH pattern, then fold
// vselect(xor(cond, 1), lhs, rhs) -> vselect(cond, rhs, lhs) so a VPNOT+VPSEL
// becomes a single VPSEL.
13295 const ARMSubtarget *Subtarget) {
13296 if (!Subtarget->hasMVEIntegerOps())
13297 return SDValue();
13298
13299 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13300 return V;
13301
13302 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13303 //
13304 // We need to re-implement this optimization here as the implementation in the
13305 // Target-Independent DAGCombiner does not handle the kind of constant we make
13306 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13307 // good reason, allowing truncation there would break other targets).
13308 //
13309 // Currently, this is only done for MVE, as it's the only target that benefits
13310 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
13311 if (N->getOperand(0).getOpcode() != ISD::XOR)
13312 return SDValue();
13313 SDValue XOR = N->getOperand(0);
13314
13315 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13316 // It is important to check with truncation allowed as the BUILD_VECTORs we
13317 // generate in those situations will truncate their operands.
13318 ConstantSDNode *Const =
13319 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13320 /*AllowTruncation*/ true);
13321 if (!Const || !Const->isOne())
13322 return SDValue();
13323
13324 // Rewrite into vselect(cond, rhs, lhs).
13325 SDValue Cond = XOR->getOperand(0);
13326 SDValue LHS = N->getOperand(1);
13327 SDValue RHS = N->getOperand(2);
13328 EVT Type = N->getValueType(0);
13329 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13330}
13331
13332// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
13336 SDValue Op0 = N->getOperand(0);
13337 SDValue Op1 = N->getOperand(1);
13338 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13339 EVT VT = N->getValueType(0);
13340
13341 if (!Subtarget->hasMVEIntegerOps() ||
13343 return SDValue();
13344
// ult(iota, splat(n)) and uge(splat(n), iota) are the same predicate;
// canonicalize to the SETULT form.
13345 if (CC == ISD::SETUGE) {
13346 std::swap(Op0, Op1);
13347 CC = ISD::SETULT;
13348 }
13349
13350 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13352 return SDValue();
13353
13354 // Check first operand is BuildVector of 0,1,2,...
13355 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13356 if (!Op0.getOperand(I).isUndef() &&
13358 Op0.getConstantOperandVal(I) == I))
13359 return SDValue();
13360 }
13361
13362 // The second is a Splat of Op1S
13363 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13364 if (!Op1S)
13365 return SDValue();
13366
// Pick the VCTP intrinsic matching the predicate's lane count.
13367 unsigned Opc;
13368 switch (VT.getVectorNumElements()) {
13369 case 2:
13370 Opc = Intrinsic::arm_mve_vctp64;
13371 break;
13372 case 4:
13373 Opc = Intrinsic::arm_mve_vctp32;
13374 break;
13375 case 8:
13376 Opc = Intrinsic::arm_mve_vctp16;
13377 break;
13378 case 16:
13379 Opc = Intrinsic::arm_mve_vctp8;
13380 break;
13381 default:
13382 return SDValue();
13383 }
13384
13385 SDLoc DL(N);
13386 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13387 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13388 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13389}
13390
13391/// PerformADDECombine - Target-specific dag combine transform from
13392/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13393/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
13396 const ARMSubtarget *Subtarget) {
13397 // Only ARM and Thumb2 support UMLAL/SMLAL.
13398 if (Subtarget->isThumb1Only())
13399 return PerformAddeSubeCombine(N, DCI, Subtarget);
13400
13401 // Only perform the checks after legalize when the pattern is available.
13402 if (DCI.isBeforeLegalize()) return SDValue();
13403
13404 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13405}
13406
13407/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13408/// operands N0 and N1. This is a helper for PerformADDCombine that is
13409/// called with the default operands, and if that fails, with commuted
13410/// operands.
13413 const ARMSubtarget *Subtarget){
13414 // Attempt to create vpadd for this add.
13415 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13416 return Result;
13417
13418 // Attempt to create vpaddl for this add.
13419 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13420 return Result;
13421 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13422 Subtarget))
13423 return Result;
13424
13425 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
// Only when N0 has a single use, so the select isn't duplicated.
13426 if (N0.getNode()->hasOneUse())
13427 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13428 return Result;
13429 return SDValue();
13430}
13431
// Reassociate chains of i32 adds involving vector reductions (VECREDUCE_ADD,
// VADDV, VMLAV) so that each add has at most one reduction operand — this
// lets later selection form accumulating VADDVA-style instructions — and
// order reduction-of-load operands by ascending load offset to help
// hardware prefetching.
13433 EVT VT = N->getValueType(0);
13434 SDValue N0 = N->getOperand(0);
13435 SDValue N1 = N->getOperand(1);
13436 SDLoc dl(N);
13437
// Predicate: is Op one of the scalar-producing vector reductions we care
// about?
13438 auto IsVecReduce = [](SDValue Op) {
13439 switch (Op.getOpcode()) {
13440 case ISD::VECREDUCE_ADD:
13441 case ARMISD::VADDVs:
13442 case ARMISD::VADDVu:
13443 case ARMISD::VMLAVs:
13444 case ARMISD::VMLAVu:
13445 return true;
13446 }
13447 return false;
13448 };
13449
13450 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13451 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13452 // add(add(X, vecreduce(Y)), vecreduce(Z))
13453 // to make better use of vaddva style instructions.
13454 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13455 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13456 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13457 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13458 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13459 }
13460 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13461 // add(add(add(A, C), reduce(B)), reduce(D))
13462 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13463 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
// Find which operand of each inner add is the reduction.
13464 unsigned N0RedOp = 0;
13465 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13466 N0RedOp = 1;
13467 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13468 return SDValue();
13469 }
13470
13471 unsigned N1RedOp = 0;
13472 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13473 N1RedOp = 1;
13474 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13475 return SDValue();
13476
13477 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13478 N1.getOperand(1 - N1RedOp));
13479 SDValue Add1 =
13480 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13481 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13482 }
13483 return SDValue();
13484 };
// Try both operand orders for the distribute pattern.
13485 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13486 return R;
13487 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13488 return R;
13489
13490 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13491 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13492 // by ascending load offsets. This can help cores prefetch if the order of
13493 // loads is more predictable.
13494 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13495 // Check if two reductions are known to load data where one is before/after
13496 // another. Return negative if N0 loads data before N1, positive if N1 is
13497 // before N0 and 0 otherwise if nothing is known.
13498 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13499 // Look through to the first operand of a MUL, for the VMLA case.
13500 // Currently only looks at the first operand, in the hope they are equal.
13501 if (N0.getOpcode() == ISD::MUL)
13502 N0 = N0.getOperand(0);
13503 if (N1.getOpcode() == ISD::MUL)
13504 N1 = N1.getOperand(0);
13505
13506 // Return true if the two operands are loads to the same object and the
13507 // offset of the first is known to be less than the offset of the second.
13508 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13509 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13510 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13511 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13512 Load1->isIndexed())
13513 return 0;
13514
13515 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13516 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13517
13518 if (!BaseLocDecomp0.getBase() ||
13519 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13520 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13521 return 0;
13522 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13523 return -1;
13524 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13525 return 1;
13526 return 0;
13527 };
13528
13529 SDValue X;
13530 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13531 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
// Both inner operands are reductions: keep the earlier-loading one as X.
13532 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13533 N0.getOperand(1).getOperand(0));
13534 if (IsBefore < 0) {
13535 X = N0.getOperand(0);
13536 N0 = N0.getOperand(1);
13537 } else if (IsBefore > 0) {
13538 X = N0.getOperand(1);
13539 N0 = N0.getOperand(0);
13540 } else
13541 return SDValue();
13542 } else if (IsVecReduce(N0.getOperand(0))) {
13543 X = N0.getOperand(1);
13544 N0 = N0.getOperand(0);
13545 } else if (IsVecReduce(N0.getOperand(1))) {
13546 X = N0.getOperand(0);
13547 N0 = N0.getOperand(1);
13548 } else
13549 return SDValue();
13550 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13551 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13552 // Note this is backward to how you would expect. We create
13553 // add(reduce(load + 16), reduce(load + 0)) so that the
13554 // add(reduce(load+16), X) is combined into VADDVA(X, load+16)), leaving
13555 // the X as VADDV(load + 0)
13556 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13557 } else
13558 return SDValue();
13559
13560 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13561 return SDValue();
13562
13563 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13564 return SDValue();
13565
13566 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13567 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13568 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13569 };
13570 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13571 return R;
13572 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13573 return R;
13574 return SDValue();
13575}
13576
13578 const ARMSubtarget *Subtarget) {
13579 if (!Subtarget->hasMVEIntegerOps())
13580 return SDValue();
13581
13583 return R;
13584
13585 EVT VT = N->getValueType(0);
13586 SDValue N0 = N->getOperand(0);
13587 SDValue N1 = N->getOperand(1);
13588 SDLoc dl(N);
13589
13590 if (VT != MVT::i64)
13591 return SDValue();
13592
13593 // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
13594 // will look like:
13595 // t1: i32,i32 = ARMISD::VADDLVs x
13596 // t2: i64 = build_pair t1, t1:1
13597 // t3: i64 = add t2, y
13598 // Otherwise we try to push the add up above VADDLVAx, to potentially allow
13599 // the add to be simplified separately.
13600 // We also need to check for sext / zext and commutative adds.
13601 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13602 SDValue NB) {
13603 if (NB->getOpcode() != ISD::BUILD_PAIR)
13604 return SDValue();
13605 SDValue VecRed = NB->getOperand(0);
13606 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13607 VecRed.getResNo() != 0 ||
13608 NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
13609 return SDValue();
13610
13611 if (VecRed->getOpcode() == OpcodeA) {
13612 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
13613 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13614 VecRed.getOperand(0), VecRed.getOperand(1));
13615 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13616 }
13617
13619 std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);
13620
13621 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13622 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13623 Ops.push_back(VecRed->getOperand(I));
13624 SDValue Red =
13625 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13626 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13627 SDValue(Red.getNode(), 1));
13628 };
13629
13630 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13631 return M;
13632 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13633 return M;
13634 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13635 return M;
13636 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13637 return M;
13638 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13639 return M;
13640 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13641 return M;
13642 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13643 return M;
13644 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13645 return M;
13646 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13647 return M;
13648 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13649 return M;
13650 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13651 return M;
13652 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13653 return M;
13654 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13655 return M;
13656 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13657 return M;
13658 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13659 return M;
13660 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13661 return M;
13662 return SDValue();
13663}
13664
13665bool
13667 CombineLevel Level) const {
13668 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13669 N->getOpcode() == ISD::SRL) &&
13670 "Expected shift op");
13671
13672 SDValue ShiftLHS = N->getOperand(0);
13673 if (!ShiftLHS->hasOneUse())
13674 return false;
13675
13676 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
13677 !ShiftLHS.getOperand(0)->hasOneUse())
13678 return false;
13679
13680 if (Level == BeforeLegalizeTypes)
13681 return true;
13682
13683 if (N->getOpcode() != ISD::SHL)
13684 return true;
13685
13686 if (Subtarget->isThumb1Only()) {
13687 // Avoid making expensive immediates by commuting shifts. (This logic
13688 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13689 // for free.)
13690 if (N->getOpcode() != ISD::SHL)
13691 return true;
13692 SDValue N1 = N->getOperand(0);
13693 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13694 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13695 return true;
13696 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
13697 if (Const->getAPIntValue().ult(256))
13698 return false;
13699 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13700 Const->getAPIntValue().sgt(-256))
13701 return false;
13702 }
13703 return true;
13704 }
13705
13706 // Turn off commute-with-shift transform after legalization, so it doesn't
13707 // conflict with PerformSHLSimplify. (We could try to detect when
13708 // PerformSHLSimplify would trigger more precisely, but it isn't
13709 // really necessary.)
13710 return false;
13711}
13712
13714 const SDNode *N) const {
13715 assert(N->getOpcode() == ISD::XOR &&
13716 (N->getOperand(0).getOpcode() == ISD::SHL ||
13717 N->getOperand(0).getOpcode() == ISD::SRL) &&
13718 "Expected XOR(SHIFT) pattern");
13719
13720 // Only commute if the entire NOT mask is a hidden shifted mask.
13721 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13722 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13723 if (XorC && ShiftC) {
13724 unsigned MaskIdx, MaskLen;
13725 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13726 unsigned ShiftAmt = ShiftC->getZExtValue();
13727 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
13728 if (N->getOperand(0).getOpcode() == ISD::SHL)
13729 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
13730 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13731 }
13732 }
13733
13734 return false;
13735}
13736
13738 const SDNode *N) const {
13739 assert(((N->getOpcode() == ISD::SHL &&
13740 N->getOperand(0).getOpcode() == ISD::SRL) ||
13741 (N->getOpcode() == ISD::SRL &&
13742 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13743 "Expected shift-shift mask");
13744
13745 if (!Subtarget->isThumb1Only())
13746 return true;
13747
13748 EVT VT = N->getValueType(0);
13749 if (VT.getScalarSizeInBits() > 32)
13750 return true;
13751
13752 return false;
13753}
13754
13756 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
13757 SDValue Y) const {
13758 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT) &&
13759 SelectOpcode == ISD::VSELECT;
13760}
13761
13763 if (!Subtarget->hasNEON()) {
13764 if (Subtarget->isThumb1Only())
13765 return VT.getScalarSizeInBits() <= 32;
13766 return true;
13767 }
13768 return VT.isScalarInteger();
13769}
13770
13772 EVT VT) const {
13773 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13774 return false;
13775
13776 switch (FPVT.getSimpleVT().SimpleTy) {
13777 case MVT::f16:
13778 return Subtarget->hasVFP2Base();
13779 case MVT::f32:
13780 return Subtarget->hasVFP2Base();
13781 case MVT::f64:
13782 return Subtarget->hasFP64();
13783 case MVT::v4f32:
13784 case MVT::v8f16:
13785 return Subtarget->hasMVEFloatOps();
13786 default:
13787 return false;
13788 }
13789}
13790
13793 const ARMSubtarget *ST) {
13794 // Allow the generic combiner to identify potential bswaps.
13795 if (DCI.isBeforeLegalize())
13796 return SDValue();
13797
13798 // DAG combiner will fold:
13799 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13800 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
13801 // Other code patterns that can also be modified have the following form:
13802 // b + ((a << 1) | 510)
13803 // b + ((a << 1) & 510)
13804 // b + ((a << 1) ^ 510)
13805 // b + ((a << 1) + 510)
13806
13807 // Many instructions can perform the shift for free, but it requires both
13808 // the operands to be registers. If c1 << c2 is too large, a mov immediate
13809 // instruction will be needed. So, unfold back to the original pattern if:
13810 // - if c1 and c2 are small enough that they don't require mov imms.
13811 // - the user(s) of the node can perform an shl
13812
13813 // No shifted operands for 16-bit instructions.
13814 if (ST->isThumb() && ST->isThumb1Only())
13815 return SDValue();
13816
13817 // Check that all the users could perform the shl themselves.
13818 for (auto *U : N->users()) {
13819 switch(U->getOpcode()) {
13820 default:
13821 return SDValue();
13822 case ISD::SUB:
13823 case ISD::ADD:
13824 case ISD::AND:
13825 case ISD::OR:
13826 case ISD::XOR:
13827 case ISD::SETCC:
13828 case ARMISD::CMP:
13829 // Check that the user isn't already using a constant because there
13830 // aren't any instructions that support an immediate operand and a
13831 // shifted operand.
13832 if (isa<ConstantSDNode>(U->getOperand(0)) ||
13833 isa<ConstantSDNode>(U->getOperand(1)))
13834 return SDValue();
13835
13836 // Check that it's not already using a shift.
13837 if (U->getOperand(0).getOpcode() == ISD::SHL ||
13838 U->getOperand(1).getOpcode() == ISD::SHL)
13839 return SDValue();
13840 break;
13841 }
13842 }
13843
13844 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13845 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13846 return SDValue();
13847
13848 if (N->getOperand(0).getOpcode() != ISD::SHL)
13849 return SDValue();
13850
13851 SDValue SHL = N->getOperand(0);
13852
13853 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
13854 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
13855 if (!C1ShlC2 || !C2)
13856 return SDValue();
13857
13858 APInt C2Int = C2->getAPIntValue();
13859 APInt C1Int = C1ShlC2->getAPIntValue();
13860 unsigned C2Width = C2Int.getBitWidth();
13861 if (C2Int.uge(C2Width))
13862 return SDValue();
13863 uint64_t C2Value = C2Int.getZExtValue();
13864
13865 // Check that performing a lshr will not lose any information.
13866 APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
13867 if ((C1Int & Mask) != C1Int)
13868 return SDValue();
13869
13870 // Shift the first constant.
13871 C1Int.lshrInPlace(C2Int);
13872
13873 // The immediates are encoded as an 8-bit value that can be rotated.
13874 auto LargeImm = [](const APInt &Imm) {
13875 unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
13876 return Imm.getBitWidth() - Zeros > 8;
13877 };
13878
13879 if (LargeImm(C1Int) || LargeImm(C2Int))
13880 return SDValue();
13881
13882 SelectionDAG &DAG = DCI.DAG;
13883 SDLoc dl(N);
13884 SDValue X = SHL.getOperand(0);
13885 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
13886 DAG.getConstant(C1Int, dl, MVT::i32));
13887 // Shift left to compensate for the lshr of C1Int.
13888 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
13889
13890 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
13891 SHL.dump(); N->dump());
13892 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
13893 return Res;
13894}
13895
13896
13897/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
13898///
13901 const ARMSubtarget *Subtarget) {
13902 SDValue N0 = N->getOperand(0);
13903 SDValue N1 = N->getOperand(1);
13904
13905 // Only works one way, because it needs an immediate operand.
13906 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
13907 return Result;
13908
13909 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
13910 return Result;
13911
13912 // First try with the default operand order.
13913 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
13914 return Result;
13915
13916 // If that didn't work, try again with the operands commuted.
13917 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
13918}
13919
13920// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
13921// providing -X is as cheap as X (currently, just a constant).
13923 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
13924 return SDValue();
13925 SDValue CSINC = N->getOperand(1);
13926 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
13927 return SDValue();
13928
13930 if (!X)
13931 return SDValue();
13932
13933 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
13934 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
13935 CSINC.getOperand(0)),
13936 CSINC.getOperand(1), CSINC.getOperand(2),
13937 CSINC.getOperand(3));
13938}
13939
13941 // Free to negate.
13943 return 0;
13944
13945 // Will save one instruction.
13946 if (Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)))
13947 return -1;
13948
13949 // Can freely negate by converting sra <-> srl.
13950 if (Op.getOpcode() == ISD::SRA || Op.getOpcode() == ISD::SRL) {
13951 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13952 if (Op.hasOneUse() && ShiftAmt &&
13953 ShiftAmt->getZExtValue() == Op.getValueType().getScalarSizeInBits() - 1)
13954 return 0;
13955 }
13956
13957 // Will have to create sub.
13958 return 1;
13959}
13960
13961// Try to fold
13962//
13963// (neg (cmov X, Y)) -> (cmov (neg X), (neg Y))
13964//
13965// The folding helps cmov to be matched with csneg without generating
13966// redundant neg instruction.
13968 assert(N->getOpcode() == ISD::SUB);
13969 if (!isNullConstant(N->getOperand(0)))
13970 return SDValue();
13971
13972 SDValue CMov = N->getOperand(1);
13973 if (CMov.getOpcode() != ARMISD::CMOV || !CMov->hasOneUse())
13974 return SDValue();
13975
13976 SDValue N0 = CMov.getOperand(0);
13977 SDValue N1 = CMov.getOperand(1);
13978
13979 // Only perform the fold if we actually save something.
13980 if (getNegationCost(N0) + getNegationCost(N1) > 0)
13981 return SDValue();
13982
13983 SDLoc DL(N);
13984 EVT VT = CMov.getValueType();
13985
13986 SDValue N0N = DAG.getNegative(N0, DL, VT);
13987 SDValue N1N = DAG.getNegative(N1, DL, VT);
13988 return DAG.getNode(ARMISD::CMOV, DL, VT, N0N, N1N, CMov.getOperand(2),
13989 CMov.getOperand(3));
13990}
13991
13992/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
13993///
13996 const ARMSubtarget *Subtarget) {
13997 SDValue N0 = N->getOperand(0);
13998 SDValue N1 = N->getOperand(1);
13999
14000 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
14001 if (N1.getNode()->hasOneUse())
14002 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
14003 return Result;
14004
14005 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
14006 return R;
14007
14008 if (SDValue Val = performNegCMovCombine(N, DCI.DAG))
14009 return Val;
14010
14011 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
14012 return SDValue();
14013
14014 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
14015 // so that we can readily pattern match more mve instructions which can use
14016 // a scalar operand.
14017 SDValue VDup = N->getOperand(1);
14018 if (VDup->getOpcode() != ARMISD::VDUP)
14019 return SDValue();
14020
14021 SDValue VMov = N->getOperand(0);
14022 if (VMov->getOpcode() == ISD::BITCAST)
14023 VMov = VMov->getOperand(0);
14024
14025 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
14026 return SDValue();
14027
14028 SDLoc dl(N);
14029 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
14030 DCI.DAG.getConstant(0, dl, MVT::i32),
14031 VDup->getOperand(0));
14032 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
14033}
14034
14035/// PerformVMULCombine
14036/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
14037/// special multiplier accumulator forwarding.
14038/// vmul d3, d0, d2
14039/// vmla d3, d1, d2
14040/// is faster than
14041/// vadd d3, d0, d1
14042/// vmul d3, d3, d2
14043// However, for (A + B) * (A + B),
14044// vadd d2, d0, d1
14045// vmul d3, d0, d2
14046// vmla d3, d1, d2
14047// is slower than
14048// vadd d2, d0, d1
14049// vmul d3, d2, d2
14052 const ARMSubtarget *Subtarget) {
14053 if (!Subtarget->hasVMLxForwarding())
14054 return SDValue();
14055
14056 SelectionDAG &DAG = DCI.DAG;
14057 SDValue N0 = N->getOperand(0);
14058 SDValue N1 = N->getOperand(1);
14059 unsigned Opcode = N0.getOpcode();
14060 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14061 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
14062 Opcode = N1.getOpcode();
14063 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14064 Opcode != ISD::FADD && Opcode != ISD::FSUB)
14065 return SDValue();
14066 std::swap(N0, N1);
14067 }
14068
14069 if (N0 == N1)
14070 return SDValue();
14071
14072 EVT VT = N->getValueType(0);
14073 SDLoc DL(N);
14074 SDValue N00 = N0->getOperand(0);
14075 SDValue N01 = N0->getOperand(1);
14076 return DAG.getNode(Opcode, DL, VT,
14077 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
14078 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
14079}
14080
14082 const ARMSubtarget *Subtarget) {
14083 EVT VT = N->getValueType(0);
14084 if (VT != MVT::v2i64)
14085 return SDValue();
14086
14087 SDValue N0 = N->getOperand(0);
14088 SDValue N1 = N->getOperand(1);
14089
14090 auto IsSignExt = [&](SDValue Op) {
14091 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
14092 return SDValue();
14093 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
14094 if (VT.getScalarSizeInBits() == 32)
14095 return Op->getOperand(0);
14096 return SDValue();
14097 };
14098 auto IsZeroExt = [&](SDValue Op) {
14099 // Zero extends are a little more awkward. At the point we are matching
14100 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
14101 // That might be before or after a bitcast depending on how the and is
14102 // placed. Because this has to look through bitcasts, it is currently only
14103 // supported on LE.
14104 if (!Subtarget->isLittle())
14105 return SDValue();
14106
14107 SDValue And = Op;
14108 if (And->getOpcode() == ISD::BITCAST)
14109 And = And->getOperand(0);
14110 if (And->getOpcode() != ISD::AND)
14111 return SDValue();
14112 SDValue Mask = And->getOperand(1);
14113 if (Mask->getOpcode() == ISD::BITCAST)
14114 Mask = Mask->getOperand(0);
14115
14116 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
14117 Mask.getValueType() != MVT::v4i32)
14118 return SDValue();
14119 if (isAllOnesConstant(Mask->getOperand(0)) &&
14120 isNullConstant(Mask->getOperand(1)) &&
14121 isAllOnesConstant(Mask->getOperand(2)) &&
14122 isNullConstant(Mask->getOperand(3)))
14123 return And->getOperand(0);
14124 return SDValue();
14125 };
14126
14127 SDLoc dl(N);
14128 if (SDValue Op0 = IsSignExt(N0)) {
14129 if (SDValue Op1 = IsSignExt(N1)) {
14130 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14131 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14132 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
14133 }
14134 }
14135 if (SDValue Op0 = IsZeroExt(N0)) {
14136 if (SDValue Op1 = IsZeroExt(N1)) {
14137 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14138 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14139 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
14140 }
14141 }
14142
14143 return SDValue();
14144}
14145
14148 const ARMSubtarget *Subtarget) {
14149 SelectionDAG &DAG = DCI.DAG;
14150
14151 EVT VT = N->getValueType(0);
14152 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
14153 return PerformMVEVMULLCombine(N, DAG, Subtarget);
14154
14155 if (Subtarget->isThumb1Only())
14156 return SDValue();
14157
14158 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14159 return SDValue();
14160
14161 if (VT.is64BitVector() || VT.is128BitVector())
14162 return PerformVMULCombine(N, DCI, Subtarget);
14163 if (VT != MVT::i32)
14164 return SDValue();
14165
14166 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14167 if (!C)
14168 return SDValue();
14169
14170 int64_t MulAmt = C->getSExtValue();
14171 unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);
14172
14173 ShiftAmt = ShiftAmt & (32 - 1);
14174 SDValue V = N->getOperand(0);
14175 SDLoc DL(N);
14176
14177 SDValue Res;
14178 MulAmt >>= ShiftAmt;
14179
14180 if (MulAmt >= 0) {
14181 if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
14182 // (mul x, 2^N + 1) => (add (shl x, N), x)
14183 Res = DAG.getNode(ISD::ADD, DL, VT,
14184 V,
14185 DAG.getNode(ISD::SHL, DL, VT,
14186 V,
14187 DAG.getConstant(Log2_32(MulAmt - 1), DL,
14188 MVT::i32)));
14189 } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
14190 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14191 Res = DAG.getNode(ISD::SUB, DL, VT,
14192 DAG.getNode(ISD::SHL, DL, VT,
14193 V,
14194 DAG.getConstant(Log2_32(MulAmt + 1), DL,
14195 MVT::i32)),
14196 V);
14197 } else
14198 return SDValue();
14199 } else {
14200 uint64_t MulAmtAbs = -MulAmt;
14201 if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
14202 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
14203 Res = DAG.getNode(ISD::SUB, DL, VT,
14204 V,
14205 DAG.getNode(ISD::SHL, DL, VT,
14206 V,
14207 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
14208 MVT::i32)));
14209 } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
14210 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14211 Res = DAG.getNode(ISD::ADD, DL, VT,
14212 V,
14213 DAG.getNode(ISD::SHL, DL, VT,
14214 V,
14215 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
14216 MVT::i32)));
14217 Res = DAG.getNode(ISD::SUB, DL, VT,
14218 DAG.getConstant(0, DL, MVT::i32), Res);
14219 } else
14220 return SDValue();
14221 }
14222
14223 if (ShiftAmt != 0)
14224 Res = DAG.getNode(ISD::SHL, DL, VT,
14225 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
14226
14227 // Do not add new nodes to DAG combiner worklist.
14228 DCI.CombineTo(N, Res, false);
14229 return SDValue();
14230}
14231
14234 const ARMSubtarget *Subtarget) {
14235 // Allow DAGCombine to pattern-match before we touch the canonical form.
14236 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14237 return SDValue();
14238
14239 if (N->getValueType(0) != MVT::i32)
14240 return SDValue();
14241
14242 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14243 if (!N1C)
14244 return SDValue();
14245
14246 uint32_t C1 = (uint32_t)N1C->getZExtValue();
14247 // Don't transform uxtb/uxth.
14248 if (C1 == 255 || C1 == 65535)
14249 return SDValue();
14250
14251 SDNode *N0 = N->getOperand(0).getNode();
14252 if (!N0->hasOneUse())
14253 return SDValue();
14254
14255 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
14256 return SDValue();
14257
14258 bool LeftShift = N0->getOpcode() == ISD::SHL;
14259
14261 if (!N01C)
14262 return SDValue();
14263
14264 uint32_t C2 = (uint32_t)N01C->getZExtValue();
14265 if (!C2 || C2 >= 32)
14266 return SDValue();
14267
14268 // Clear irrelevant bits in the mask.
14269 if (LeftShift)
14270 C1 &= (-1U << C2);
14271 else
14272 C1 &= (-1U >> C2);
14273
14274 SelectionDAG &DAG = DCI.DAG;
14275 SDLoc DL(N);
14276
14277 // We have a pattern of the form "(and (shl x, c2) c1)" or
14278 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
14279 // transform to a pair of shifts, to save materializing c1.
14280
14281 // First pattern: right shift, then mask off leading bits.
14282 // FIXME: Use demanded bits?
14283 if (!LeftShift && isMask_32(C1)) {
14284 uint32_t C3 = llvm::countl_zero(C1);
14285 if (C2 < C3) {
14286 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14287 DAG.getConstant(C3 - C2, DL, MVT::i32));
14288 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14289 DAG.getConstant(C3, DL, MVT::i32));
14290 }
14291 }
14292
14293 // First pattern, reversed: left shift, then mask off trailing bits.
14294 if (LeftShift && isMask_32(~C1)) {
14295 uint32_t C3 = llvm::countr_zero(C1);
14296 if (C2 < C3) {
14297 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14298 DAG.getConstant(C3 - C2, DL, MVT::i32));
14299 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14300 DAG.getConstant(C3, DL, MVT::i32));
14301 }
14302 }
14303
14304 // Second pattern: left shift, then mask off leading bits.
14305 // FIXME: Use demanded bits?
14306 if (LeftShift && isShiftedMask_32(C1)) {
14307 uint32_t Trailing = llvm::countr_zero(C1);
14308 uint32_t C3 = llvm::countl_zero(C1);
14309 if (Trailing == C2 && C2 + C3 < 32) {
14310 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14311 DAG.getConstant(C2 + C3, DL, MVT::i32));
14312 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14313 DAG.getConstant(C3, DL, MVT::i32));
14314 }
14315 }
14316
14317 // Second pattern, reversed: right shift, then mask off trailing bits.
14318 // FIXME: Handle other patterns of known/demanded bits.
14319 if (!LeftShift && isShiftedMask_32(C1)) {
14320 uint32_t Leading = llvm::countl_zero(C1);
14321 uint32_t C3 = llvm::countr_zero(C1);
14322 if (Leading == C2 && C2 + C3 < 32) {
14323 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14324 DAG.getConstant(C2 + C3, DL, MVT::i32));
14325 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14326 DAG.getConstant(C3, DL, MVT::i32));
14327 }
14328 }
14329
14330 // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
14331 // if "c1 >> c2" is a cheaper immediate than "c1"
14332 if (LeftShift &&
14333 HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {
14334
14335 SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
14336 DAG.getConstant(C1 >> C2, DL, MVT::i32));
14337 return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
14338 DAG.getConstant(C2, DL, MVT::i32));
14339 }
14340
14341 return SDValue();
14342}
14343
14346 const ARMSubtarget *Subtarget) {
14347 // Attempt to use immediate-form VBIC
14348 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14349 SDLoc dl(N);
14350 EVT VT = N->getValueType(0);
14351 SelectionDAG &DAG = DCI.DAG;
14352
14353 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14354 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14355 return SDValue();
14356
14357 APInt SplatBits, SplatUndef;
14358 unsigned SplatBitSize;
14359 bool HasAnyUndefs;
14360 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14361 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14362 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14363 SplatBitSize == 64) {
14364 EVT VbicVT;
14365 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
14366 SplatUndef.getZExtValue(), SplatBitSize,
14367 DAG, dl, VbicVT, VT, OtherModImm);
14368 if (Val.getNode()) {
14369 SDValue Input =
14370 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
14371 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
14372 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
14373 }
14374 }
14375 }
14376
14377 if (!Subtarget->isThumb1Only()) {
14378 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
14379 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
14380 return Result;
14381
14382 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14383 return Result;
14384 }
14385
14386 if (Subtarget->isThumb1Only())
14387 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14388 return Result;
14389
14390 return SDValue();
14391}
14392
14393// Try combining OR nodes to SMULWB, SMULWT.
14396 const ARMSubtarget *Subtarget) {
14397 if (!Subtarget->hasV6Ops() ||
14398 (Subtarget->isThumb() &&
14399 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
14400 return SDValue();
14401
14402 SDValue SRL = OR->getOperand(0);
14403 SDValue SHL = OR->getOperand(1);
14404
14405 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
14406 SRL = OR->getOperand(1);
14407 SHL = OR->getOperand(0);
14408 }
14409 if (!isSRL16(SRL) || !isSHL16(SHL))
14410 return SDValue();
14411
14412 // The first operands to the shifts need to be the two results from the
14413 // same smul_lohi node.
14414 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
14415 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
14416 return SDValue();
14417
14418 SDNode *SMULLOHI = SRL.getOperand(0).getNode();
14419 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
14420 SHL.getOperand(0) != SDValue(SMULLOHI, 1))
14421 return SDValue();
14422
14423 // Now we have:
14424 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
14425 // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
14426 // For SMULWB the 16-bit value will be sign extended somehow.
14427 // For SMULWT only the SRA is required.
14428 // Check both sides of SMUL_LOHI
14429 SDValue OpS16 = SMULLOHI->getOperand(0);
14430 SDValue OpS32 = SMULLOHI->getOperand(1);
14431
14432 SelectionDAG &DAG = DCI.DAG;
14433 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
14434 OpS16 = OpS32;
14435 OpS32 = SMULLOHI->getOperand(0);
14436 }
14437
14438 SDLoc dl(OR);
14439 unsigned Opcode = 0;
14440 if (isS16(OpS16, DAG))
14441 Opcode = ARMISD::SMULWB;
14442 else if (isSRA16(OpS16)) {
14443 Opcode = ARMISD::SMULWT;
14444 OpS16 = OpS16->getOperand(0);
14445 }
14446 else
14447 return SDValue();
14448
14449 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
14450 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
14451 return SDValue(OR, 0);
14452}
14453
14456 const ARMSubtarget *Subtarget) {
14457 // BFI is only available on V6T2+
14458 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14459 return SDValue();
14460
14461 EVT VT = N->getValueType(0);
14462 SDValue N0 = N->getOperand(0);
14463 SDValue N1 = N->getOperand(1);
14464 SelectionDAG &DAG = DCI.DAG;
14465 SDLoc DL(N);
14466 // 1) or (and A, mask), val => ARMbfi A, val, mask
14467 // iff (val & mask) == val
14468 //
14469 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14470 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14471 // && mask == ~mask2
14472 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14473 // && ~mask == mask2
14474 // (i.e., copy a bitfield value into another bitfield of the same width)
14475
14476 if (VT != MVT::i32)
14477 return SDValue();
14478
14479 SDValue N00 = N0.getOperand(0);
14480
14481 // The value and the mask need to be constants so we can verify this is
14482 // actually a bitfield set. If the mask is 0xffff, we can do better
14483 // via a movt instruction, so don't use BFI in that case.
14484 SDValue MaskOp = N0.getOperand(1);
14486 if (!MaskC)
14487 return SDValue();
14488 unsigned Mask = MaskC->getZExtValue();
14489 if (Mask == 0xffff)
14490 return SDValue();
14491 SDValue Res;
14492 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14494 if (N1C) {
14495 unsigned Val = N1C->getZExtValue();
14496 if ((Val & ~Mask) != Val)
14497 return SDValue();
14498
14499 if (ARM::isBitFieldInvertedMask(Mask)) {
14500 Val >>= llvm::countr_zero(~Mask);
14501
14502 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14503 DAG.getConstant(Val, DL, MVT::i32),
14504 DAG.getConstant(Mask, DL, MVT::i32));
14505
14506 DCI.CombineTo(N, Res, false);
14507 // Return value from the original node to inform the combiner that N is
14508 // now dead.
14509 return SDValue(N, 0);
14510 }
14511 } else if (N1.getOpcode() == ISD::AND) {
14512 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14514 if (!N11C)
14515 return SDValue();
14516 unsigned Mask2 = N11C->getZExtValue();
14517
14518 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
14519 // as is to match.
14520 if (ARM::isBitFieldInvertedMask(Mask) &&
14521 (Mask == ~Mask2)) {
14522 // The pack halfword instruction works better for masks that fit it,
14523 // so use that when it's available.
14524 if (Subtarget->hasDSP() &&
14525 (Mask == 0xffff || Mask == 0xffff0000))
14526 return SDValue();
14527 // 2a
14528 unsigned amt = llvm::countr_zero(Mask2);
14529 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14530 DAG.getConstant(amt, DL, MVT::i32));
14531 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14532 DAG.getConstant(Mask, DL, MVT::i32));
14533 DCI.CombineTo(N, Res, false);
14534 // Return value from the original node to inform the combiner that N is
14535 // now dead.
14536 return SDValue(N, 0);
14537 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14538 (~Mask == Mask2)) {
14539 // The pack halfword instruction works better for masks that fit it,
14540 // so use that when it's available.
14541 if (Subtarget->hasDSP() &&
14542 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14543 return SDValue();
14544 // 2b
14545 unsigned lsb = llvm::countr_zero(Mask);
14546 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14547 DAG.getConstant(lsb, DL, MVT::i32));
14548 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14549 DAG.getConstant(Mask2, DL, MVT::i32));
14550 DCI.CombineTo(N, Res, false);
14551 // Return value from the original node to inform the combiner than N is
14552 // now dead.
14553 return SDValue(N, 0);
14554 }
14555 }
14556
14557 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14558 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14560 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14561 // where lsb(mask) == #shamt and masked bits of B are known zero.
14562 SDValue ShAmt = N00.getOperand(1);
14563 unsigned ShAmtC = ShAmt->getAsZExtVal();
14564 unsigned LSB = llvm::countr_zero(Mask);
14565 if (ShAmtC != LSB)
14566 return SDValue();
14567
14568 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14569 DAG.getConstant(~Mask, DL, MVT::i32));
14570
14571 DCI.CombineTo(N, Res, false);
14572 // Return value from the original node to inform the combiner than N is
14573 // now dead.
14574 return SDValue(N, 0);
14575 }
14576
14577 return SDValue();
14578}
14579
14580static bool isValidMVECond(unsigned CC, bool IsFloat) {
14581 switch (CC) {
14582 case ARMCC::EQ:
14583 case ARMCC::NE:
14584 case ARMCC::LE:
14585 case ARMCC::GT:
14586 case ARMCC::GE:
14587 case ARMCC::LT:
14588 return true;
14589 case ARMCC::HS:
14590 case ARMCC::HI:
14591 return !IsFloat;
14592 default:
14593 return false;
14594 };
14595}
14596
  // Extract the condition-code operand of an MVE vector compare node:
  // operand 2 of a VCMP (lhs, rhs, cc) or operand 1 of a VCMPZ (lhs, cc).
  if (N->getOpcode() == ARMISD::VCMP)
    return (ARMCC::CondCodes)N->getConstantOperandVal(2);
  else if (N->getOpcode() == ARMISD::VCMPZ)
    return (ARMCC::CondCodes)N->getConstantOperandVal(1);
  else
    llvm_unreachable("Not a VCMP/VCMPZ!");
}
14605
  // A VCMP/VCMPZ can be freely inverted iff the opposite condition is also
  // representable as an MVE compare for the operand's element type.
  // NOTE(review): CC is computed on a line not visible here (presumably the
  // opposite of the node's condition code) -- confirm against full source.
  return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
}
14610
                                   const ARMSubtarget *Subtarget) {
  // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
  // together with predicates
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // An operand is "freely invertable" when it is an MVE compare whose
  // condition can be flipped without extra instructions.
  auto IsFreelyInvertable = [&](SDValue V) {
    if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
      return CanInvertMVEVCMP(V);
    return false;
  };

  // At least one operand must be freely invertable.
  if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
    return SDValue();

  // Build not(and(not(N0), not(N1))), which is equivalent to or(N0, N1) by
  // De Morgan; the inner NOTs fold into the compares checked above.
  SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
  SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
  SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
  return DAG.getLogicalNOT(DL, And, VT);
}
14635
// Try to form a NEON shift-{right, left}-and-insert (VSRI/VSLI) from:
// (or (and X, splat (i32 C1)), (srl Y, splat (i32 C2))) -> VSRI X, Y, #C2
// (or (and X, splat (i32 C1)), (shl Y, splat (i32 C2))) -> VSLI X, Y, #C2
// where C1 is a mask that preserves the bits not written by the shift/insert,
// i.e. `C1 == (1 << C2) - 1`.
// NOTE(review): the first line of the signature is not visible here; AndOp
// is the (and X, mask) operand and ShiftOp the shift operand of the OR.
                                             SDValue ShiftOp, EVT VT,
                                             SDLoc dl) {
  // Match (and X, Mask)
  if (AndOp.getOpcode() != ISD::AND)
    return SDValue();

  SDValue X = AndOp.getOperand(0);
  SDValue Mask = AndOp.getOperand(1);

  // The mask must be a constant splat (undefs not allowed, truncation ok).
  ConstantSDNode *MaskC = isConstOrConstSplat(Mask, false, true);
  if (!MaskC)
    return SDValue();
  APInt MaskBits =
      MaskC->getAPIntValue().trunc(Mask.getScalarValueSizeInBits());

  // Match shift (srl/shl Y, CntVec)
  int64_t Cnt = 0;
  bool IsShiftRight = false;
  SDValue Y;

  if (ShiftOp.getOpcode() == ARMISD::VSHRuIMM) {
    IsShiftRight = true;
    Y = ShiftOp.getOperand(0);
    Cnt = ShiftOp.getConstantOperandVal(1);
  } else if (ShiftOp.getOpcode() == ARMISD::VSHLIMM) {
    Y = ShiftOp.getOperand(0);
    Cnt = ShiftOp.getConstantOperandVal(1);
  } else {
    return SDValue();
  }

  // The AND mask must keep exactly the bits the shifted value does not
  // write: the high Cnt bits for a right shift, the low Cnt bits for a
  // left shift. Otherwise the insert would clobber live bits of X.
  unsigned ElemBits = VT.getScalarSizeInBits();
  APInt RequiredMask = IsShiftRight
                           ? APInt::getHighBitsSet(ElemBits, (unsigned)Cnt)
                           : APInt::getLowBitsSet(ElemBits, (unsigned)Cnt);
  if (MaskBits != RequiredMask)
    return SDValue();

  unsigned Opc = IsShiftRight ? ARMISD::VSRIIMM : ARMISD::VSLIIMM;
  return DAG.getNode(Opc, dl, VT, X, Y, DAG.getConstant(Cnt, dl, MVT::i32));
}
14683
/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
                                 const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VORR
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // MVE predicate types get their own OR lowering (invert to AND of VPNOTs).
  if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
                                        VT == MVT::v8i1 || VT == MVT::v16i1))
    return PerformORCombine_i1(N, DAG, Subtarget);

  // or X, splat-imm -> VORRIMM when the splat fits a VMOV-style modified
  // immediate encoding.
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
        SplatBitSize == 64) {
      EVT VorrVT;
      SDValue Val =
          isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
                            SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
            DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
        return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;
    if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
      return Result;
  }

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // (or (and X, C1), (srl Y, C2)) -> VSRI X, Y, #C2
  // (or (and X, C1), (shl Y, C2)) -> VSLI X, Y, #C2
  if (VT.isVector() &&
      ((Subtarget->hasNEON() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) ||
       (Subtarget->hasMVEIntegerOps() &&
        (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32)))) {
    // Try both operand orders, since OR is commutative.
    if (SDValue ShiftInsert =
            PerformORCombineToShiftInsert(DAG, N0, N1, VT, dl))
      return ShiftInsert;

    if (SDValue ShiftInsert =
            PerformORCombineToShiftInsert(DAG, N1, N0, VT, dl))
      return ShiftInsert;
  }

  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
  // NOTE(review): the tail of this if-condition and the declarations of
  // BVN0/BVN1 sit on lines elided from this view -- confirm in full source.
  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&

    // The code below optimizes (or (and X, Y), Z).
    // The AND operand needs to have a single user to make these optimizations
    // profitable.
    if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
      return SDValue();

    APInt SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;

    APInt SplatBits0, SplatBits1;
    // Ensure that the second operand of both ands are constants
    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
                                      HasAnyUndefs) && !HasAnyUndefs) {
      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
                                        HasAnyUndefs) && !HasAnyUndefs) {
        // Ensure that the bit width of the constants are the same and that
        // the splat arguments are logical inverses as per the pattern we
        // are trying to simplify.
        if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
            SplatBits0 == ~SplatBits1) {
          // Canonicalize the vector type to make instruction selection
          // simpler.
          EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
          SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
                                       N0->getOperand(1),
                                       N0->getOperand(0),
                                       N1->getOperand(0));
          return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);
        }
      }
    }
  }

  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
  // reasonable.
  if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
    if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
      return Res;
  }

  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  return SDValue();
}
14798
                                 const ARMSubtarget *Subtarget) {
  // Target-specific dag combine xforms for ISD::XOR.
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (!Subtarget->isThumb1Only()) {
    // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;

    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
      return Result;
  }

  if (Subtarget->hasMVEIntegerOps()) {
    // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    const TargetLowering *TLI = Subtarget->getTargetLowering();
    if (TLI->isConstTrueVal(N1) &&
        (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
      if (CanInvertMVEVCMP(N0)) {
        SDLoc DL(N0);

        // Rebuild the compare with the opposite condition code.
        // NOTE(review): declarations of CC (opposite cond) and the Ops vector
        // are on lines elided from this view -- confirm in full source.
        Ops.push_back(N0->getOperand(0));
        if (N0->getOpcode() == ARMISD::VCMP)
          Ops.push_back(N0->getOperand(1));
        Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
        return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
      }
    }
  }

  return SDValue();
}
14840
14841// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14842// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14843// their position in "to" (Rd).
14844static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14845 assert(N->getOpcode() == ARMISD::BFI);
14846
14847 SDValue From = N->getOperand(1);
14848 ToMask = ~N->getConstantOperandAPInt(2);
14849 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14850
14851 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14852 // #C in the base of the SHR.
14853 if (From->getOpcode() == ISD::SRL &&
14854 isa<ConstantSDNode>(From->getOperand(1))) {
14855 APInt Shift = From->getConstantOperandAPInt(1);
14856 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14857 FromMask <<= Shift.getLimitedValue(31);
14858 From = From->getOperand(0);
14859 }
14860
14861 return From;
14862}
14863
14864// If A and B contain one contiguous set of bits, does A | B == A . B?
14865//
14866// Neither A nor B must be zero.
14867static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14868 unsigned LastActiveBitInA = A.countr_zero();
14869 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14870 return LastActiveBitInA - 1 == FirstActiveBitInB;
14871}
14872
  // We have a BFI in N. Find a BFI it can combine with, if one exists.
  APInt ToMask, FromMask;
  SDValue From = ParseBFI(N, ToMask, FromMask);
  SDValue To = N->getOperand(0);

  // Only an immediately-preceding BFI writing into the same base is a
  // candidate.
  SDValue V = To;
  if (V.getOpcode() != ARMISD::BFI)
    return SDValue();

  // Both BFIs must insert bits taken from the same source value.
  APInt NewToMask, NewFromMask;
  SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
  if (NewFrom != From)
    return SDValue();

  // Do the written bits conflict with any we've seen so far?
  if ((NewToMask & ToMask).getBoolValue())
    // Conflicting bits.
    return SDValue();

  // Are the new bits contiguous when combined with the old bits?
  // Both source and destination bit-ranges must concatenate cleanly, in the
  // same order, for a single wider BFI to be equivalent.
  if (BitsProperlyConcatenate(ToMask, NewToMask) &&
      BitsProperlyConcatenate(FromMask, NewFromMask))
    return V;
  if (BitsProperlyConcatenate(NewToMask, ToMask) &&
      BitsProperlyConcatenate(NewFromMask, FromMask))
    return V;

  return SDValue();
}
14903
  // Target-specific dag combine xforms for ARMISD::BFI.
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  if (N1.getOpcode() == ISD::AND) {
    // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
    // the bits being cleared by the AND are not demanded by the BFI.
    // NOTE(review): N11C (constant RHS of the inner AND) is declared on a
    // line elided from this view -- confirm in full source.
    if (!N11C)
      return SDValue();
    unsigned InvMask = N->getConstantOperandVal(2);
    unsigned LSB = llvm::countr_zero(~InvMask);
    unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
    assert(Width <
               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
           "undefined behavior");
    // Mask of the bits the BFI actually inserts, relative to the source.
    unsigned Mask = (1u << Width) - 1;
    unsigned Mask2 = N11C->getZExtValue();
    if ((Mask & (~Mask2)) == 0)
      return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
                         N->getOperand(0), N1.getOperand(0), N->getOperand(2));
    return SDValue();
  }

  // Look for another BFI to combine with.
  if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
    // We've found a BFI.
    APInt ToMask1, FromMask1;
    SDValue From1 = ParseBFI(N, ToMask1, FromMask1);

    APInt ToMask2, FromMask2;
    SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
    assert(From1 == From2);
    (void)From2;

    // Create a new BFI, combining the two together.
    APInt NewFromMask = FromMask1 | FromMask2;
    APInt NewToMask = ToMask1 | ToMask2;

    EVT VT = N->getValueType(0);
    SDLoc dl(N);

    // If the merged source range does not start at bit 0, shift the source
    // down so the combined BFI reads from bit 0 as ParseBFI assumes.
    if (NewFromMask[0] == 0)
      From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
                          DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
    return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
                       DAG.getConstant(~NewToMask, dl, VT));
  }

  // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
  // that lower bit insertions are performed first, providing that M1 and M2
  // do no overlap. This can allow multiple BFI instructions to be combined
  // together by the other folds above.
  if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
    APInt ToMask1 = ~N->getConstantOperandAPInt(2);
    APInt ToMask2 = ~N0.getConstantOperandAPInt(2);

    if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
        ToMask1.countl_zero() < ToMask2.countl_zero())
      return SDValue();

    EVT VT = N->getValueType(0);
    SDLoc dl(N);
    SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
                               N->getOperand(1), N->getOperand(2));
    return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
                       N0.getOperand(2));
  }

  return SDValue();
}
14975
// Check that N is CMPZ(CSINC(0, 0, CC, X)),
// or CMPZ(CMOV(1, 0, CC, X))
// return X if valid.
// NOTE(review): the signature and the assignments to the CC out-parameter sit
// on lines elided from this view -- confirm in full source.
  if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
    return SDValue();
  SDValue CSInc = Cmp->getOperand(0);

  // Ignore any `And 1` nodes that may not yet have been removed. We are
  // looking for a value that produces 1/0, so these have no effect on the
  // code.
  while (CSInc.getOpcode() == ISD::AND &&
         isa<ConstantSDNode>(CSInc.getOperand(1)) &&
         CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
    CSInc = CSInc.getOperand(0);

  // CSINC 0, 0, CC, X produces exactly the 1/0 pattern we are matching.
  if (CSInc.getOpcode() == ARMISD::CSINC &&
      isNullConstant(CSInc.getOperand(0)) &&
      isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
    return CSInc.getOperand(3);
  }
  // CMOV 1, 0, CC, X is the same select-of-1/0 in CMOV form.
  if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
      isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
    return CSInc.getOperand(3);
  }
  // CMOV 0, 1, CC, X matches with the operands (and thus the condition)
  // reversed.
  if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
      isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
    return CSInc.getOperand(3);
  }
  return SDValue();
}
15011
  // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
  //   t92: flags = ARMISD::CMPZ t74, 0
  //   t93: i32 = ARMISD::CSINC 0, 0, 1, t92
  //   t96: flags = ARMISD::CMPZ t93, 0
  //   t114: i32 = ARMISD::CSINV 0, 0, 0, t96
  // NOTE(review): the declaration of Cond (out-param for IsCMPZCSINC) is on a
  // line elided from this view -- confirm in full source.
  if (SDValue C = IsCMPZCSINC(N, Cond))
    if (Cond == ARMCC::EQ)
      return C;
  return SDValue();
}
15024
  // Fold away an unnecessary CMPZ/CSINC
  // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
  // if C1==EQ -> CSXYZ A, B, C2, D
  // if C1==NE -> CSXYZ A, B, NOT(C2), D
  // NOTE(review): the function signature, the declaration of Cond, and the
  // opposite-condition constant on the NE path sit on lines elided from this
  // view -- confirm in full source.
  if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
    if (N->getConstantOperandVal(2) == ARMCC::EQ)
      return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
                         N->getOperand(1),
                         DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
    if (N->getConstantOperandVal(2) == ARMCC::NE)
      return DAG.getNode(
          N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
          N->getOperand(1),
  }
  return SDValue();
}
15044
/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVRRD.
                                     const ARMSubtarget *Subtarget) {
  // vmovrrd(vmovdrr x, y) -> x,y
  SDValue InDouble = N->getOperand(0);
  if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));

  // vmovrrd(load f64) -> (load i32), (load i32)
  SDNode *InNode = InDouble.getNode();
  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
      InNode->getValueType(0) == MVT::f64 &&
      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
      !cast<LoadSDNode>(InNode)->isVolatile()) {
    // TODO: Should this be done for non-FrameIndex operands?
    LoadSDNode *LD = cast<LoadSDNode>(InNode);

    SelectionDAG &DAG = DCI.DAG;
    SDLoc DL(LD);
    SDValue BasePtr = LD->getBasePtr();
    // First i32 load reuses the f64 load's address and memory flags.
    SDValue NewLD1 =
        DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
                    LD->getAlign(), LD->getMemOperand()->getFlags());

    // Second half lives 4 bytes above the first.
    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));

    SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
                                 LD->getPointerInfo().getWithOffset(4),
                                 commonAlignment(LD->getAlign(), 4),
                                 LD->getMemOperand()->getFlags());

    // Redirect the old load's chain users, and swap the halves for
    // big-endian lane order.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
    if (DCI.DAG.getDataLayout().isBigEndian())
      std::swap (NewLD1, NewLD2);
    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
    return Result;
  }

  // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
  // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
  if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      isa<ConstantSDNode>(InDouble.getOperand(1))) {
    SDValue BV = InDouble.getOperand(0);
    // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
    // change lane order under big endian.
    bool BVSwap = BV.getOpcode() == ISD::BITCAST;
    while (
        (BV.getOpcode() == ISD::BITCAST ||
         BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
        (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
      BVSwap = BV.getOpcode() == ISD::BITCAST;
      BV = BV.getOperand(0);
    }
    if (BV.getValueType() != MVT::v4i32)
      return SDValue();

    // Handle buildvectors, pulling out the correct lane depending on
    // endianness.
    unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
    if (BV.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue Op0 = BV.getOperand(Offset);
      SDValue Op1 = BV.getOperand(Offset + 1);
      if (!Subtarget->isLittle() && BVSwap)
        std::swap(Op0, Op1);

      return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
    }

    // A chain of insert_vectors, grabbing the correct value of the chain of
    // inserts.
    SDValue Op0, Op1;
    while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
      if (isa<ConstantSDNode>(BV.getOperand(2))) {
        // Only the outermost insert into each lane is live; keep the first
        // match as we walk from the outside in.
        if (BV.getConstantOperandVal(2) == Offset && !Op0)
          Op0 = BV.getOperand(1);
        if (BV.getConstantOperandVal(2) == Offset + 1 && !Op1)
          Op1 = BV.getOperand(1);
      }
      BV = BV.getOperand(0);
    }
    if (!Subtarget->isLittle() && BVSwap)
      std::swap(Op0, Op1);
    if (Op0 && Op1)
      return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
  }

  return SDValue();
}
15136
/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
  // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  // Look through bitcasts on both halves before matching.
  if (Op0.getOpcode() == ISD::BITCAST)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::BITCAST)
    Op1 = Op1.getOperand(0);
  // Both operands must be the two results, in order, of one VMOVRRD.
  if (Op0.getOpcode() == ARMISD::VMOVRRD &&
      Op0.getNode() == Op1.getNode() &&
      Op0.getResNo() == 0 && Op1.getResNo() == 1)
    return DAG.getNode(ISD::BITCAST, SDLoc(N),
                       N->getValueType(0), Op0.getOperand(0));
  return SDValue();
}
15154
  // Target-specific dag combine xforms for ARMISD::VMOVhr.
  SDValue Op0 = N->getOperand(0);

  // VMOVhr (VMOVrh (X)) -> X
  if (Op0->getOpcode() == ARMISD::VMOVrh)
    return Op0->getOperand(0);

  // FullFP16: half values are passed in S-registers, and we don't
  // need any of the bitcast and moves:
  //
  // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
  //   t5: i32 = bitcast t2
  //     t18: f16 = ARMISD::VMOVhr t5
  // =>
  // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
  if (Op0->getOpcode() == ISD::BITCAST) {
    SDValue Copy = Op0->getOperand(0);
    if (Copy.getValueType() == MVT::f32 &&
        Copy->getOpcode() == ISD::CopyFromReg) {
      // Preserve the optional glue operand/result of the original copy.
      bool HasGlue = Copy->getNumOperands() == 3;
      SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
                       HasGlue ? Copy->getOperand(2) : SDValue()};
      EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
      // NOTE(review): the getNode(ISD::CopyFromReg, ...) call line is elided
      // from this view -- confirm in full source.
      SDValue NewCopy =
                       DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
                       ArrayRef(Ops, HasGlue ? 3 : 2));

      // Update Users, Chains, and Potential Glue.
      DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
      DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
      if (HasGlue)
        DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
                                          NewCopy.getValue(2));

      return NewCopy;
    }
  }

  // fold (VMOVhr (load x)) -> (load (f16*)x)
  if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
    if (LN0->hasOneUse() && LN0->isUnindexed() &&
        LN0->getMemoryVT() == MVT::i16) {
      SDValue Load =
          DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
                          LN0->getBasePtr(), LN0->getMemOperand());
      DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
      DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
      return Load;
    }
  }

  // Only the bottom 16 bits of the source register are used.
  APInt DemandedMask = APInt::getLowBitsSet(32, 16);
  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
    return SDValue(N, 0);

  return SDValue();
}
15216
  // Target-specific dag combine xforms for ARMISD::VMOVrh.
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (VMOVrh (fpconst x)) -> const x
  // NOTE(review): the ConstantFPSDNode guard introducing C is on a line
  // elided from this view -- confirm in full source.
    APFloat V = C->getValueAPF();
    return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
  }

  // fold (VMOVrh (load x)) -> (zextload (i16*)x)
  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);

    SDValue Load =
        DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
                       LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
    DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
    return Load;
  }

  // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
  // NOTE(review): part of this condition is elided from this view.
  if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
    return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
                       N0->getOperand(1));

  return SDValue();
}
15247
/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
/// are normal, non-volatile loads. If so, it is profitable to bitcast an
/// i64 vector to have f64 elements, since the value can then be loaded
/// directly into a VFP register.
  unsigned NumElts = N->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    SDNode *Elt = N->getOperand(i).getNode();
    // One qualifying element is enough to make the f64 form worthwhile.
    if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
      return true;
  }
  return false;
}
15261
/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
/// ISD::BUILD_VECTOR.
                                  const ARMSubtarget *Subtarget) {
  // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
  // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
  // into a pair of GPRs, which is fine when the value is used as a scalar,
  // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
  SelectionDAG &DAG = DCI.DAG;
  if (N->getNumOperands() == 2)
    if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
      return RV;

  // Load i64 elements as f64 values so that type legalization does not split
  // them up into i32 values.
  EVT VT = N->getValueType(0);
  if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
    return SDValue();
  SDLoc dl(N);
  // NOTE(review): the declaration of the Ops vector is on a line elided from
  // this view -- confirm in full source.
  unsigned NumElts = VT.getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
    Ops.push_back(V);
    // Make the DAGCombiner fold the bitcast.
    DCI.AddToWorklist(V.getNode());
  }
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
  SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}
15294
/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static SDValue
  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
  // At that time, we may have inserted bitcasts from integer to float.
  // If these bitcasts have survived DAGCombine, change the lowering of this
  // BUILD_VECTOR in something more vector friendly, i.e., that does not
  // force to use floating point types.

  // Make sure we can change the type of the vector.
  // This is possible iff:
  // 1. The vector is only used in a bitcast to a integer type. I.e.,
  //    1.1. Vector is used only once.
  //    1.2. Use is a bit convert to an integer type.
  // 2. The size of its operands are 32-bits (64-bits are not legal).
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // Check 1.1. and 2.
  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
    return SDValue();

  // By construction, the input type must be float.
  assert(EltVT == MVT::f32 && "Unexpected type!");

  // Check 1.2.
  SDNode *Use = *N->user_begin();
  if (Use->getOpcode() != ISD::BITCAST ||
      Use->getValueType(0).isFloatingPoint())
    return SDValue();

  // Check profitability.
  // Model is, if more than half of the relevant operands are bitcast from
  // i32, turn the build_vector into a sequence of insert_vector_elt.
  // Relevant operands are everything that is not statically
  // (i.e., at compile time) bitcasted.
  unsigned NumOfBitCastedElts = 0;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumOfRelevantElts = NumElts;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    SDValue Elt = N->getOperand(Idx);
    if (Elt->getOpcode() == ISD::BITCAST) {
      // Assume only bit cast to i32 will go away.
      if (Elt->getOperand(0).getValueType() == MVT::i32)
        ++NumOfBitCastedElts;
    } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
      // Constants are statically casted, thus do not count them as
      // relevant operands.
      --NumOfRelevantElts;
  }

  // Check if more than half of the elements require a non-free bitcast.
  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  // Create the new vector type.
  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
  // Check if the type is legal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(VecVT))
    return SDValue();

  // Combine:
  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
  // => BITCAST INSERT_VECTOR_ELT
  //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
  //                      (BITCAST EN), N.
  SDValue Vec = DAG.getUNDEF(VecVT);
  SDLoc dl(N);
  for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
    SDValue V = N->getOperand(Idx);
    if (V.isUndef())
      continue;
    if (V.getOpcode() == ISD::BITCAST &&
        V->getOperand(0).getValueType() == MVT::i32)
      // Fold obvious case.
      V = V.getOperand(0);
    else {
      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
      // Make the DAGCombiner fold the bitcasts.
      DCI.AddToWorklist(V.getNode());
    }
    SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
  }
  Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
  // Make the DAGCombiner fold the bitcasts.
  DCI.AddToWorklist(Vec.getNode());
  return Vec;
}
15386
// Target-specific dag combine xforms for ARMISD::PREDICATE_CAST, which moves
// a value between an i32 GPR form and an MVE predicate (vNi1) form.
static SDValue
  EVT VT = N->getValueType(0);
  SDValue Op = N->getOperand(0);
  SDLoc dl(N);

  // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
  if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
    // If the valuetypes are the same, we can remove the cast entirely.
    if (Op->getOperand(0).getValueType() == VT)
      return Op->getOperand(0);
    return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
  }

  // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
  // more VPNOT which might get folded as else predicates.
  if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
    SDValue X =
        DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
    // 65535 = all 16 predicate lanes set, i.e. the all-ones predicate.
    SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
                                DCI.DAG.getConstant(65535, dl, MVT::i32));
    return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
  }

  // Only the bottom 16 bits of the source register are used.
  if (Op.getValueType() == MVT::i32) {
    APInt DemandedMask = APInt::getLowBitsSet(32, 16);
    const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
    if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
      return SDValue(N, 0);
  }
  return SDValue();
}
15420
                                 const ARMSubtarget *ST) {
  // Target-specific dag combine xforms for ARMISD::VECTOR_REG_CAST.
  EVT VT = N->getValueType(0);
  SDValue Op = N->getOperand(0);
  SDLoc dl(N);

  // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
  if (ST->isLittle())
    return DAG.getNode(ISD::BITCAST, dl, VT, Op);

  // VT VECTOR_REG_CAST (VT Op) -> Op
  if (Op.getValueType() == VT)
    return Op;
  // VECTOR_REG_CAST undef -> undef
  if (Op.isUndef())
    return DAG.getUNDEF(VT);

  // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
  if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
    // If the valuetypes are the same, we can remove the cast entirely.
    if (Op->getOperand(0).getValueType() == VT)
      return Op->getOperand(0);
    return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
  }

  return SDValue();
}
15448
// Target-specific DAG combine for ARMISD::VCMP (MVE only). Canonicalizes
// compares against a zero vector into VCMPZ, and swaps operands (reversing
// the condition) so that a zero or a VDUP splat ends up on the right-hand
// side, where the instruction forms can encode it.
// NOTE(review): the first line of the signature falls outside this chunk.
15450 const ARMSubtarget *Subtarget) {
15451 if (!Subtarget->hasMVEIntegerOps())
15452 return SDValue();
15453
15454 EVT VT = N->getValueType(0);
15455 SDValue Op0 = N->getOperand(0);
15456 SDValue Op1 = N->getOperand(1);
// Operand 2 carries the ARM condition code as a constant.
15457 ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
15458 SDLoc dl(N);
15459
15460 // vcmp X, 0, cc -> vcmpz X, cc
15461 if (isZeroVector(Op1))
15462 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
15463
// Swapping operands requires the reversed condition to be representable as
// an MVE condition for this (int/FP) compare kind.
15464 unsigned SwappedCond = getSwappedCondition(Cond);
15465 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
15466 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
15467 if (isZeroVector(Op0))
15468 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15469 DAG.getConstant(SwappedCond, dl, MVT::i32));
15470 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
15471 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15472 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15473 DAG.getConstant(SwappedCond, dl, MVT::i32));
15474 }
15475
15476 return SDValue();
15477}
15478
15479 /// PerformInsertEltCombine - Target-specific dag combine xforms for
15480 /// ISD::INSERT_VECTOR_ELT.
15483 // Bitcast an i64 load inserted into a vector to f64.
15484 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15485 EVT VT = N->getValueType(0);
15486 SDNode *Elt = N->getOperand(1).getNode();
// Only handle i64-element vectors whose inserted value is a plain
// (non-extending, non-volatile) load; anything else is left alone.
15487 if (VT.getVectorElementType() != MVT::i64 ||
15488 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15489 return SDValue();
15490
15491 SelectionDAG &DAG = DCI.DAG;
15492 SDLoc dl(N);
// Build the f64 vector type with the same element count (the element-count
// argument line is not visible in this chunk).
15493 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
// Bitcast both the vector and the inserted value, perform the insert in the
// f64 domain, then bitcast the result back to the original type.
15495 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
15496 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15497 // Make the DAGCombiner fold the bitcasts.
15498 DCI.AddToWorklist(Vec.getNode());
15499 DCI.AddToWorklist(V.getNode());
15500 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
15501 Vec, V, N->getOperand(2));
15502 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
15503}
15504
15505// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15506// directly or bitcast to an integer if the original is a float vector.
15507// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15508// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
// See the comment block above: converts a pair of extracts of adjacent even/
// odd lanes of the same vector into a single two-result ARMISD::VMOVRRD of an
// f64 lane, replacing both extracts.
// NOTE(review): the signature name line is not visible in this chunk.
15509 static SDValue
15511 EVT VT = N->getValueType(0);
15512 SDLoc dl(N);
15513
// Only valid after DAG legalization, for i32 results, and when f64 is legal
// (i.e. the FP register file is available for the VMOVRRD).
15514 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15515 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15516 return SDValue();
15517
// Look through a bitcast-from-f32 so both the direct and bitcast forms in
// the header comment are handled uniformly.
15518 SDValue Ext = SDValue(N, 0);
15519 if (Ext.getOpcode() == ISD::BITCAST &&
15520 Ext.getOperand(0).getValueType() == MVT::f32)
15521 Ext = Ext.getOperand(0);
// Require an extract of an even lane (a lane-index check line is not
// visible in this chunk).
15522 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15524 Ext.getConstantOperandVal(1) % 2 != 0)
15525 return SDValue();
// Don't break an extract that feeds a scalar int->fp conversion; that
// pairing has its own efficient lowering.
15526 if (Ext->hasOneUse() && (Ext->user_begin()->getOpcode() == ISD::SINT_TO_FP ||
15527 Ext->user_begin()->getOpcode() == ISD::UINT_TO_FP))
15528 return SDValue();
15529
15530 SDValue Op0 = Ext.getOperand(0);
15531 EVT VecVT = Op0.getValueType();
15532 unsigned ResNo = Op0.getResNo();
15533 unsigned Lane = Ext.getConstantOperandVal(1);
// Restrict to 4-element (128-bit with i32 lanes) vectors, so lane/2 maps
// onto a v2f64 lane below.
15534 if (VecVT.getVectorNumElements() != 4)
15535 return SDValue();
15536
15537 // Find another extract, of Lane + 1
15538 auto OtherIt = find_if(Op0->users(), [&](SDNode *V) {
15539 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15540 isa<ConstantSDNode>(V->getOperand(1)) &&
15541 V->getConstantOperandVal(1) == Lane + 1 &&
15542 V->getOperand(0).getResNo() == ResNo;
15543 });
15544 if (OtherIt == Op0->users().end())
15545 return SDValue();
15546
15547 // For float extracts, we need to be converting to a i32 for both vector
15548 // lanes.
15549 SDValue OtherExt(*OtherIt, 0);
15550 if (OtherExt.getValueType() != MVT::i32) {
// The partner extract must be consumed by exactly one bitcast-to-i32;
// retarget OtherExt at that bitcast so it can be replaced below.
15551 if (!OtherExt->hasOneUse() ||
15552 OtherExt->user_begin()->getOpcode() != ISD::BITCAST ||
15553 OtherExt->user_begin()->getValueType(0) != MVT::i32)
15554 return SDValue();
15555 OtherExt = SDValue(*OtherExt->user_begin(), 0);
15556 }
15557
15558 // Convert the type to a f64 and extract with a VMOVRRD.
15559 SDValue F64 = DCI.DAG.getNode(
15560 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15561 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15562 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15563 SDValue VMOVRRD =
15564 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15565
// Replace the partner extract with the second VMOVRRD result; the caller
// replaces N itself with the first result.
15566 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
15567 return VMOVRRD;
15568}
15569
// Target-specific DAG combine for ISD::EXTRACT_VECTOR_ELT. Peephole folds:
// extract-of-VDUP, extract-of-ARM BUILD_VECTOR, extract-through-bitcast of a
// VMOVDRR pair, the even/odd-lane VMOVRRD combine, and extract-of-MVETRUNC.
// NOTE(review): the first line(s) of the signature fall outside this chunk.
15572 const ARMSubtarget *ST) {
15573 SDValue Op0 = N->getOperand(0);
15574 EVT VT = N->getValueType(0);
15575 SDLoc dl(N);
15576
15577 // extract (vdup x) -> x
15578 if (Op0->getOpcode() == ARMISD::VDUP) {
15579 SDValue X = Op0->getOperand(0);
// Mismatched scalar types need an explicit register-file move/bitcast.
15580 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15581 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
15582 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15583 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
15584 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15585 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
15586
// Look through a chain of bitcasts to find the original scalar.
15587 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15588 X = X->getOperand(0);
15589 if (X.getValueType() == VT)
15590 return X;
15591 }
15592
15593 // extract ARM_BUILD_VECTOR -> x
15594 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15595 isa<ConstantSDNode>(N->getOperand(1)) &&
15596 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
15597 return Op0.getOperand(N->getConstantOperandVal(1));
15598 }
15599
15600 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15601 if (Op0.getValueType() == MVT::v4i32 &&
15602 isa<ConstantSDNode>(N->getOperand(1)) &&
15603 Op0.getOpcode() == ISD::BITCAST &&
15605 Op0.getOperand(0).getValueType() == MVT::v2f64) {
15606 SDValue BV = Op0.getOperand(0);
15607 unsigned Offset = N->getConstantOperandVal(1);
// Each v2f64 lane covers two i32 lanes; pick the VMOVDRR feeding the
// requested half, then select its low/high GPR by endianness.
15608 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
15609 if (MOV.getOpcode() == ARMISD::VMOVDRR)
15610 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15611 }
15612
15613 // extract x, n; extract x, n+1 -> VMOVRRD x
15614 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15615 return R;
15616
15617 // extract (MVETrunc(x)) -> extract x
15618 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
15619 unsigned Idx = N->getConstantOperandVal(1);
// Map the lane index onto the corresponding MVETRUNC source operand and
// the lane within it (the divisor/modulus lines are not visible here).
15620 unsigned Vec =
15622 unsigned SubIdx =
15624 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15625 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15626 }
15627
15628 return SDValue();
15629}
15630
// Target-specific DAG combine for ISD::SIGN_EXTEND_INREG: replace a
// sign-extend-in-register of an unsigned lane read (VGETLANEu) with the
// signed lane read (VGETLANEs), which performs the extension itself.
// NOTE(review): the signature line is not visible in this chunk.
15632 SDValue Op = N->getOperand(0);
15633 EVT VT = N->getValueType(0);
15634
15635 // sext_inreg(VGETLANEu) -> VGETLANEs
// Only when the in-register type matches the source vector's element type,
// so the signed lane read extends from exactly the same width.
15636 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15637 cast<VTSDNode>(N->getOperand(1))->getVT() ==
15638 Op.getOperand(0).getValueType().getScalarType())
15639 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15640 Op.getOperand(1));
15641
15642 return SDValue();
15643}
15644
// Target-specific DAG combine for ISD::INSERT_SUBVECTOR: rewrite an aligned
// half-width subvector insertion as a CONCAT_VECTORS of the subvector and
// the untouched half of the destination.
// NOTE(review): the signature name line is not visible in this chunk.
15645 static SDValue
15647 SDValue Vec = N->getOperand(0);
15648 SDValue SubVec = N->getOperand(1);
15649 uint64_t IdxVal = N->getConstantOperandVal(2);
15650 EVT VecVT = Vec.getValueType();
15651 EVT SubVT = SubVec.getValueType();
15652
15653 // Only do this for legal fixed vector types.
// (The legality check for the subvector type is on a line not visible in
// this chunk.)
15654 if (!VecVT.isFixedLengthVector() ||
15655 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15657 return SDValue();
15658
15659 // Ignore widening patterns.
15660 if (IdxVal == 0 && Vec.isUndef())
15661 return SDValue();
15662
15663 // Subvector must be half the width and an "aligned" insertion.
15664 unsigned NumSubElts = SubVT.getVectorNumElements();
15665 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15666 (IdxVal != 0 && IdxVal != NumSubElts))
15667 return SDValue();
15668
15669 // Fold insert_subvector -> concat_vectors
15670 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15671 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15672 SDLoc DL(N);
15673 SDValue Lo, Hi;
15674 if (IdxVal == 0) {
15675 Lo = SubVec;
15676 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15677 DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15678 } else {
15679 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15680 DCI.DAG.getVectorIdxConstant(0, DL));
15681 Hi = SubVec;
15682 }
15683 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15684}
15685
15686// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
// Fold a shuffle of an ARMISD::MVETRUNC into a single ARMISD::VMOVN when the
// shuffle mask is an interleaving truncate mask (either operand order).
// NOTE(review): the first line of the signature falls outside this chunk.
15688 SelectionDAG &DAG) {
15689 SDValue Trunc = N->getOperand(0);
15690 EVT VT = Trunc.getValueType();
// Only a one-input shuffle (operand 1 undef) of an MVETRUNC qualifies.
15691 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15692 return SDValue();
15693
15694 SDLoc DL(Trunc);
// The two isVMOVNTruncMask queries distinguish which truncate source lands
// in the even lanes; the VMOVN operand order is swapped accordingly.
15695 if (isVMOVNTruncMask(N->getMask(), VT, false))
15696 return DAG.getNode(
15697 ARMISD::VMOVN, DL, VT,
15698 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15699 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15700 DAG.getConstant(1, DL, MVT::i32));
15701 else if (isVMOVNTruncMask(N->getMask(), VT, true))
15702 return DAG.getNode(
15703 ARMISD::VMOVN, DL, VT,
15704 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15705 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15706 DAG.getConstant(1, DL, MVT::i32));
15707 return SDValue();
15708}
15709
15710/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15711/// ISD::VECTOR_SHUFFLE.
// NOTE(review): the function header and the guarded call producing R (the
// PerformShuffleVMOVNCombine attempt) are on lines not visible in this chunk.
15714 return R;
15715
15716 // The LLVM shufflevector instruction does not require the shuffle mask
15717 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15718 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15719 // operands do not match the mask length, they are extended by concatenating
15720 // them with undef vectors. That is probably the right thing for other
15721 // targets, but for NEON it is better to concatenate two double-register
15722 // size vector operands into a single quad-register size vector. Do that
15723 // transformation here:
15724 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15725 // shuffle(concat(v1, v2), undef)
15726 SDValue Op0 = N->getOperand(0);
15727 SDValue Op1 = N->getOperand(1);
15728 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15729 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15730 Op0.getNumOperands() != 2 ||
15731 Op1.getNumOperands() != 2)
15732 return SDValue();
// Both concats must be padding with undef in their upper half.
15733 SDValue Concat0Op1 = Op0.getOperand(1);
15734 SDValue Concat1Op1 = Op1.getOperand(1);
15735 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15736 return SDValue();
15737 // Skip the transformation if any of the types are illegal.
15738 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15739 EVT VT = N->getValueType(0);
15740 if (!TLI.isTypeLegal(VT) ||
15741 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15742 !TLI.isTypeLegal(Concat1Op1.getValueType()))
15743 return SDValue();
15744
15745 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15746 Op0.getOperand(0), Op1.getOperand(0));
15747 // Translate the shuffle mask.
// Elements selecting the (undef) upper half of either original operand map
// to -1 (undef) in the new mask.
15748 SmallVector<int, 16> NewMask;
15749 unsigned NumElts = VT.getVectorNumElements();
15750 unsigned HalfElts = NumElts/2;
15752 for (unsigned n = 0; n < NumElts; ++n) {
15753 int MaskElt = SVN->getMaskElt(n);
15754 int NewElt = -1;
15755 if (MaskElt < (int)HalfElts)
15756 NewElt = MaskElt;
15757 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15758 NewElt = HalfElts + MaskElt - NumElts;
15759 NewMask.push_back(NewElt);
15760 }
15761 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15762 DAG.getUNDEF(VT), NewMask);
15763}
15764
15765 /// Load/store instruction that can be merged with a base address
15766 /// update
// NOTE(review): the struct's opening and earlier fields are on lines not
// visible in this chunk.
// Index of the address operand within the target node's operand list.
15771 unsigned AddrOpIdx;
15772 };
15773
// NOTE(review): the struct's opening and the fields described by the first
// two comments are on lines not visible in this chunk.
15775 /// Instruction that updates a pointer
15777 /// Pointer increment operand
15779 /// Pointer increment value if it is a constant, or 0 otherwise
15780 unsigned ConstInc;
15781 };
15782
// Returns true when folding User into N's base update cannot create a cycle
// in the DAG. NOTE(review): the signature and the Visited/Worklist
// declarations are on lines not visible in this chunk.
15784 // Check that the add is independent of the load/store.
15785 // Otherwise, folding it would create a cycle. Search through Addr
15786 // as well, since the User may not be a direct user of Addr and
15787 // only share a base pointer.
15790 Worklist.push_back(N);
15791 Worklist.push_back(User);
// Cap the traversal so pathological DAGs don't make this quadratic.
15792 const unsigned MaxSteps = 1024;
15793 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
15794 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
15795 return false;
15796 return true;
15797}
15798
// Attempts to fold one candidate pointer-update (User) into the load/store/
// intrinsic described by Target, producing the corresponding _UPD node.
// Returns true (and rewrites both nodes via DCI.CombineTo) on success.
// When SimpleConstIncOnly is set, only an increment exactly matching the
// access size is accepted. NOTE(review): the first and last lines of the
// signature fall outside this chunk.
15800 struct BaseUpdateUser &User,
15801 bool SimpleConstIncOnly,
15803 SelectionDAG &DAG = DCI.DAG;
15804 SDNode *N = Target.N;
15805 MemSDNode *MemN = cast<MemSDNode>(N);
15806 SDLoc dl(N);
15807
15808 // Find the new opcode for the updating load/store.
15809 bool isLoadOp = true;
15810 bool isLaneOp = false;
15811 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15812 // as an operand.
15813 bool hasAlignment = true;
15814 unsigned NewOpc = 0;
15815 unsigned NumVecs = 0;
15816 if (Target.isIntrinsic) {
// Intrinsic path: map each NEON load/store intrinsic onto its _UPD opcode
// and record vector count / lane-op / alignment-operand properties.
15817 unsigned IntNo = N->getConstantOperandVal(1);
15818 switch (IntNo) {
15819 default:
15820 llvm_unreachable("unexpected intrinsic for Neon base update");
15821 case Intrinsic::arm_neon_vld1:
15822 NewOpc = ARMISD::VLD1_UPD;
15823 NumVecs = 1;
15824 break;
15825 case Intrinsic::arm_neon_vld2:
15826 NewOpc = ARMISD::VLD2_UPD;
15827 NumVecs = 2;
15828 break;
15829 case Intrinsic::arm_neon_vld3:
15830 NewOpc = ARMISD::VLD3_UPD;
15831 NumVecs = 3;
15832 break;
15833 case Intrinsic::arm_neon_vld4:
15834 NewOpc = ARMISD::VLD4_UPD;
15835 NumVecs = 4;
15836 break;
15837 case Intrinsic::arm_neon_vld1x2:
15838 NewOpc = ARMISD::VLD1x2_UPD;
15839 NumVecs = 2;
15840 hasAlignment = false;
15841 break;
15842 case Intrinsic::arm_neon_vld1x3:
15843 NewOpc = ARMISD::VLD1x3_UPD;
15844 NumVecs = 3;
15845 hasAlignment = false;
15846 break;
15847 case Intrinsic::arm_neon_vld1x4:
15848 NewOpc = ARMISD::VLD1x4_UPD;
15849 NumVecs = 4;
15850 hasAlignment = false;
15851 break;
15852 case Intrinsic::arm_neon_vld2dup:
15853 NewOpc = ARMISD::VLD2DUP_UPD;
15854 NumVecs = 2;
15855 break;
15856 case Intrinsic::arm_neon_vld3dup:
15857 NewOpc = ARMISD::VLD3DUP_UPD;
15858 NumVecs = 3;
15859 break;
15860 case Intrinsic::arm_neon_vld4dup:
15861 NewOpc = ARMISD::VLD4DUP_UPD;
15862 NumVecs = 4;
15863 break;
15864 case Intrinsic::arm_neon_vld2lane:
15865 NewOpc = ARMISD::VLD2LN_UPD;
15866 NumVecs = 2;
15867 isLaneOp = true;
15868 break;
15869 case Intrinsic::arm_neon_vld3lane:
15870 NewOpc = ARMISD::VLD3LN_UPD;
15871 NumVecs = 3;
15872 isLaneOp = true;
15873 break;
15874 case Intrinsic::arm_neon_vld4lane:
15875 NewOpc = ARMISD::VLD4LN_UPD;
15876 NumVecs = 4;
15877 isLaneOp = true;
15878 break;
15879 case Intrinsic::arm_neon_vst1:
15880 NewOpc = ARMISD::VST1_UPD;
15881 NumVecs = 1;
15882 isLoadOp = false;
15883 break;
15884 case Intrinsic::arm_neon_vst2:
15885 NewOpc = ARMISD::VST2_UPD;
15886 NumVecs = 2;
15887 isLoadOp = false;
15888 break;
15889 case Intrinsic::arm_neon_vst3:
15890 NewOpc = ARMISD::VST3_UPD;
15891 NumVecs = 3;
15892 isLoadOp = false;
15893 break;
15894 case Intrinsic::arm_neon_vst4:
15895 NewOpc = ARMISD::VST4_UPD;
15896 NumVecs = 4;
15897 isLoadOp = false;
15898 break;
15899 case Intrinsic::arm_neon_vst2lane:
15900 NewOpc = ARMISD::VST2LN_UPD;
15901 NumVecs = 2;
15902 isLoadOp = false;
15903 isLaneOp = true;
15904 break;
15905 case Intrinsic::arm_neon_vst3lane:
15906 NewOpc = ARMISD::VST3LN_UPD;
15907 NumVecs = 3;
15908 isLoadOp = false;
15909 isLaneOp = true;
15910 break;
15911 case Intrinsic::arm_neon_vst4lane:
15912 NewOpc = ARMISD::VST4LN_UPD;
15913 NumVecs = 4;
15914 isLoadOp = false;
15915 isLaneOp = true;
15916 break;
15917 case Intrinsic::arm_neon_vst1x2:
15918 NewOpc = ARMISD::VST1x2_UPD;
15919 NumVecs = 2;
15920 isLoadOp = false;
15921 hasAlignment = false;
15922 break;
15923 case Intrinsic::arm_neon_vst1x3:
15924 NewOpc = ARMISD::VST1x3_UPD;
15925 NumVecs = 3;
15926 isLoadOp = false;
15927 hasAlignment = false;
15928 break;
15929 case Intrinsic::arm_neon_vst1x4:
15930 NewOpc = ARMISD::VST1x4_UPD;
15931 NumVecs = 4;
15932 isLoadOp = false;
15933 hasAlignment = false;
15934 break;
15935 }
15936 } else {
// Non-intrinsic path: ARMISD VLDxDUP nodes and generic vector load/store.
15937 isLaneOp = true;
15938 switch (N->getOpcode()) {
15939 default:
15940 llvm_unreachable("unexpected opcode for Neon base update");
15941 case ARMISD::VLD1DUP:
15942 NewOpc = ARMISD::VLD1DUP_UPD;
15943 NumVecs = 1;
15944 break;
15945 case ARMISD::VLD2DUP:
15946 NewOpc = ARMISD::VLD2DUP_UPD;
15947 NumVecs = 2;
15948 break;
15949 case ARMISD::VLD3DUP:
15950 NewOpc = ARMISD::VLD3DUP_UPD;
15951 NumVecs = 3;
15952 break;
15953 case ARMISD::VLD4DUP:
15954 NewOpc = ARMISD::VLD4DUP_UPD;
15955 NumVecs = 4;
15956 break;
15957 case ISD::LOAD:
15958 NewOpc = ARMISD::VLD1_UPD;
15959 NumVecs = 1;
15960 isLaneOp = false;
15961 break;
15962 case ISD::STORE:
15963 NewOpc = ARMISD::VST1_UPD;
15964 NumVecs = 1;
15965 isLaneOp = false;
15966 isLoadOp = false;
15967 break;
15968 }
15969 }
15970
15971 // Find the size of memory referenced by the load/store.
15972 EVT VecTy;
15973 if (isLoadOp) {
15974 VecTy = N->getValueType(0);
15975 } else if (Target.isIntrinsic) {
15976 VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
15977 } else {
15978 assert(Target.isStore &&
15979 "Node has to be a load, a store, or an intrinsic!");
15980 VecTy = N->getOperand(1).getValueType();
15981 }
15982
15983 bool isVLDDUPOp =
15984 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
15985 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
15986
// Lane and dup forms only touch one element per vector, so the memory
// footprint is NumVecs elements, not NumVecs full vectors.
15987 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15988 if (isLaneOp || isVLDDUPOp)
15989 NumBytes /= VecTy.getVectorNumElements();
15990
15991 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
15992 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
15993 // separate instructions that make it harder to use a non-constant update.
15994 return false;
15995 }
15996
15997 if (SimpleConstIncOnly && User.ConstInc != NumBytes)
15998 return false;
15999
16000 if (!isValidBaseUpdate(N, User.N))
16001 return false;
16002
16003 // OK, we found an ADD we can fold into the base update.
16004 // Now, create a _UPD node, taking care of not breaking alignment.
16005
16006 EVT AlignedVecTy = VecTy;
16007 Align Alignment = MemN->getAlign();
16008
16009 // If this is a less-than-standard-aligned load/store, change the type to
16010 // match the standard alignment.
16011 // The alignment is overlooked when selecting _UPD variants; and it's
16012 // easier to introduce bitcasts here than fix that.
16013 // There are 3 ways to get to this base-update combine:
16014 // - intrinsics: they are assumed to be properly aligned (to the standard
16015 // alignment of the memory type), so we don't need to do anything.
16016 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
16017 // intrinsics, so, likewise, there's nothing to do.
16018 // - generic load/store instructions: the alignment is specified as an
16019 // explicit operand, rather than implicitly as the standard alignment
16020 // of the memory type (like the intrinsics). We need to change the
16021 // memory type to match the explicit alignment. That way, we don't
16022 // generate non-standard-aligned ARMISD::VLDx nodes.
16023 if (isa<LSBaseSDNode>(N)) {
16024 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
16025 MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
16026 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
16027 assert(!isLaneOp && "Unexpected generic load/store lane.");
16028 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
16029 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
16030 }
16031 // Don't set an explicit alignment on regular load/stores that we want
16032 // to transform to VLD/VST 1_UPD nodes.
16033 // This matches the behavior of regular load/stores, which only get an
16034 // explicit alignment if the MMO alignment is larger than the standard
16035 // alignment of the memory type.
16036 // Intrinsics, however, always get an explicit alignment, set to the
16037 // alignment of the MMO.
16038 Alignment = Align(1);
16039 }
16040
16041 // Create the new updating load/store node.
16042 // First, create an SDVTList for the new updating node's results.
// Result list: NumResultVecs vector values, the written-back pointer (i32),
// then the chain.
16043 EVT Tys[6];
16044 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16045 unsigned n;
16046 for (n = 0; n < NumResultVecs; ++n)
16047 Tys[n] = AlignedVecTy;
16048 Tys[n++] = MVT::i32;
16049 Tys[n] = MVT::Other;
16050 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16051
16052 // Then, gather the new node's operands.
// (The Ops SmallVector declaration is on a line not visible in this chunk.)
16054 Ops.push_back(N->getOperand(0)); // incoming chain
16055 Ops.push_back(N->getOperand(Target.AddrOpIdx));
16056 Ops.push_back(User.Inc);
16057
16058 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
16059 // Try to match the intrinsic's signature
16060 Ops.push_back(StN->getValue());
16061 } else {
16062 // Loads (and of course intrinsics) match the intrinsics' signature,
16063 // so just add all but the alignment operand.
16064 unsigned LastOperand =
16065 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
16066 for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
16067 Ops.push_back(N->getOperand(i));
16068 }
16069
16070 // For all node types, the alignment operand is always the last one.
16071 Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
16072
16073 // If this is a non-standard-aligned STORE, the penultimate operand is the
16074 // stored value. Bitcast it to the aligned type.
16075 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
16076 SDValue &StVal = Ops[Ops.size() - 2];
16077 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
16078 }
16079
16080 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
16081 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
16082 MemN->getMemOperand());
16083
16084 // Update the uses.
16085 SmallVector<SDValue, 5> NewResults;
16086 for (unsigned i = 0; i < NumResultVecs; ++i)
16087 NewResults.push_back(SDValue(UpdN.getNode(), i));
16088
16089 // If this is an non-standard-aligned LOAD, the first result is the loaded
16090 // value. Bitcast it to the expected result type.
16091 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
16092 SDValue &LdVal = NewResults[0];
16093 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
16094 }
16095
// Rewire: N's users get the new results, and the old add/or (User.N) is
// replaced by the written-back pointer result.
16096 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16097 DCI.CombineTo(N, NewResults);
16098 DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
16099
16100 return true;
16101}
16102
16103 // If (opcode ptr inc) is an ADD-like instruction, return the constant
16104 // increment value. Otherwise return 0.
// NOTE(review): the dyn_cast<ConstantSDNode> initialization of CInc is on a
// line not visible in this chunk.
16105 static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
16106 SDValue Inc, const SelectionDAG &DAG) {
16108 if (!CInc)
16109 return 0;
16110
16111 switch (Opcode) {
// VLD1_UPD already carries its increment; treat it like a plain ADD.
16112 case ARMISD::VLD1_UPD:
16113 case ISD::ADD:
16114 return CInc->getZExtValue();
16115 case ISD::OR: {
16116 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
16117 // (OR ptr inc) is the same as (ADD ptr inc)
16118 return CInc->getZExtValue();
16119 }
16120 return 0;
16121 }
16122 default:
16123 return 0;
16124 }
16125}
16126
// Decomposes N into (pointer, constant-increment) when N is an ADD/OR with a
// constant RHS, or a VLD1_UPD with a constant increment operand; returns
// true and fills *Ptr / *CInc on success. NOTE(review): the signature line
// is not visible in this chunk.
16128 switch (N->getOpcode()) {
16129 case ISD::ADD:
16130 case ISD::OR: {
16131 if (isa<ConstantSDNode>(N->getOperand(1))) {
16132 *Ptr = N->getOperand(0);
16133 *CInc = N->getOperand(1);
16134 return true;
16135 }
16136 return false;
16137 }
16138 case ARMISD::VLD1_UPD: {
// For VLD1_UPD the base pointer is operand 1 and the increment operand 2.
16139 if (isa<ConstantSDNode>(N->getOperand(2))) {
16140 *Ptr = N->getOperand(1);
16141 *CInc = N->getOperand(2);
16142 return true;
16143 }
16144 return false;
16145 }
16146 default:
16147 return false;
16148 }
16149}
16150
16151 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
16152 /// NEON load/store intrinsics, and generic vector load/stores, to merge
16153 /// base address updates.
16154 /// For generic load/stores, the memory type is assumed to be a vector.
16155 /// The caller is assumed to have checked legality.
// NOTE(review): the function signature and the BaseUpdates SmallVector
// declaration are on lines not visible in this chunk.
16158 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
16159 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
16160 const bool isStore = N->getOpcode() == ISD::STORE;
// Intrinsics and stores carry chain + (intrinsic-id/value) before the
// address; plain loads have the address at operand 1.
16161 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
16162 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
16163
16164 // Limit the number of possible base-updates we look at to prevent degenerate
16165 // cases.
16166 unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;
16167
16168 SDValue Addr = N->getOperand(AddrOpIdx);
16169
16171
16172 // Search for a use of the address operand that is an increment.
16173 for (SDUse &Use : Addr->uses()) {
16174 SDNode *User = Use.getUser();
16175 if (Use.getResNo() != Addr.getResNo() || User->getNumOperands() != 2)
16176 continue;
16177
// The increment is the add-like user's other operand.
16178 SDValue Inc = User->getOperand(Use.getOperandNo() == 1 ? 0 : 1);
16179 unsigned ConstInc =
16180 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
16181
// Keep constant increments of any ADD-like op, plus non-constant ADDs.
16182 if (ConstInc || User->getOpcode() == ISD::ADD) {
16183 BaseUpdates.push_back({User, Inc, ConstInc});
16184 if (BaseUpdates.size() >= MaxBaseUpdates)
16185 break;
16186 }
16187 }
16188
16189 // If the address is a constant pointer increment itself, find
16190 // another constant increment that has the same base operand
16191 SDValue Base;
16192 SDValue CInc;
16193 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
16194 unsigned Offset =
16195 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
16196 for (SDUse &Use : Base->uses()) {
16197
16198 SDNode *User = Use.getUser();
16199 if (Use.getResNo() != Base.getResNo() || User == Addr.getNode() ||
16200 User->getNumOperands() != 2)
16201 continue;
16202
16203 SDValue UserInc = User->getOperand(Use.getOperandNo() == 0 ? 1 : 0);
16204 unsigned UserOffset =
16205 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
16206
// Only increments strictly past Addr's own offset can be re-expressed as
// an increment relative to Addr.
16207 if (!UserOffset || UserOffset <= Offset)
16208 continue;
16209
16210 unsigned NewConstInc = UserOffset - Offset;
16211 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16212 BaseUpdates.push_back({User, NewInc, NewConstInc});
16213 if (BaseUpdates.size() >= MaxBaseUpdates)
16214 break;
16215 }
16216 }
16217
16218 // Try to fold the load/store with an update that matches memory
16219 // access size. This should work well for sequential loads.
16220 unsigned NumValidUpd = BaseUpdates.size();
16221 for (unsigned I = 0; I < NumValidUpd; I++) {
16222 BaseUpdateUser &User = BaseUpdates[I];
16223 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16224 return SDValue();
16225 }
16226
16227 // Try to fold with other users. Non-constant updates are considered
16228 // first, and constant updates are sorted to not break a sequence of
16229 // strided accesses (if there is any).
16230 llvm::stable_sort(BaseUpdates,
16231 [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16232 return LHS.ConstInc < RHS.ConstInc;
16233 });
16234 for (BaseUpdateUser &User : BaseUpdates) {
16235 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16236 return SDValue();
16237 }
// Either way the rewriting happens through DCI; this combine never returns
// a replacement value directly.
16238 return SDValue();
16239}
16240
// Thin legality gate around CombineBaseUpdate: only run after legalization
// and not re-entrantly from the legalizer. NOTE(review): the signature is on
// lines not visible in this chunk.
16243 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16244 return SDValue();
16245
16246 return CombineBaseUpdate(N, DCI);
16247}
16248
// Merges a pointer increment into an MVE vld2q/vld4q/vst2q/vst4q intrinsic,
// producing the post-incrementing _UPD form. NOTE(review): the signature and
// some declarations (Visited/Worklist, CInc, Ops) are on lines not visible
// in this chunk.
16251 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16252 return SDValue();
16253
16254 SelectionDAG &DAG = DCI.DAG;
16255 SDValue Addr = N->getOperand(2);
16256 MemSDNode *MemN = cast<MemSDNode>(N);
16257 SDLoc dl(N);
16258
16259 // For the stores, where there are multiple intrinsics we only actually want
16260 // to post-inc the last of them.
16261 unsigned IntNo = N->getConstantOperandVal(1);
16262 if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16263 return SDValue();
16264 if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16265 return SDValue();
16266
16267 // Search for a use of the address operand that is an increment.
16268 for (SDUse &Use : Addr->uses()) {
16269 SDNode *User = Use.getUser();
16270 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
16271 continue;
16272
16273 // Check that the add is independent of the load/store. Otherwise, folding
16274 // it would create a cycle. We can avoid searching through Addr as it's a
16275 // predecessor to both.
16278 Visited.insert(Addr.getNode());
16279 Worklist.push_back(N);
16280 Worklist.push_back(User);
16281 const unsigned MaxSteps = 1024;
16282 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
16283 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
16284 continue;
16285
16286 // Find the new opcode for the updating load/store.
16287 bool isLoadOp = true;
16288 unsigned NewOpc = 0;
16289 unsigned NumVecs = 0;
16290 switch (IntNo) {
16291 default:
16292 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16293 case Intrinsic::arm_mve_vld2q:
16294 NewOpc = ARMISD::VLD2_UPD;
16295 NumVecs = 2;
16296 break;
16297 case Intrinsic::arm_mve_vld4q:
16298 NewOpc = ARMISD::VLD4_UPD;
16299 NumVecs = 4;
16300 break;
16301 case Intrinsic::arm_mve_vst2q:
16302 NewOpc = ARMISD::VST2_UPD;
16303 NumVecs = 2;
16304 isLoadOp = false;
16305 break;
16306 case Intrinsic::arm_mve_vst4q:
16307 NewOpc = ARMISD::VST4_UPD;
16308 NumVecs = 4;
16309 isLoadOp = false;
16310 break;
16311 }
16312
16313 // Find the size of memory referenced by the load/store.
16314 EVT VecTy;
16315 if (isLoadOp) {
16316 VecTy = N->getValueType(0);
16317 } else {
16318 VecTy = N->getOperand(3).getValueType();
16319 }
16320
16321 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16322
16323 // If the increment is a constant, it must match the memory ref size.
16324 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
// (The ConstantSDNode cast of Inc is on a line not visible in this chunk.)
16326 if (!CInc || CInc->getZExtValue() != NumBytes)
16327 continue;
16328
16329 // Create the new updating load/store node.
16330 // First, create an SDVTList for the new updating node's results.
// Results: NumResultVecs vectors, the written-back pointer (i32), chain.
16331 EVT Tys[6];
16332 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16333 unsigned n;
16334 for (n = 0; n < NumResultVecs; ++n)
16335 Tys[n] = VecTy;
16336 Tys[n++] = MVT::i32;
16337 Tys[n] = MVT::Other;
16338 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16339
16340 // Then, gather the new node's operands.
16342 Ops.push_back(N->getOperand(0)); // incoming chain
16343 Ops.push_back(N->getOperand(2)); // ptr
16344 Ops.push_back(Inc);
16345
// Copy across the remaining intrinsic operands (values, lane counts, ...).
16346 for (unsigned i = 3; i < N->getNumOperands(); ++i)
16347 Ops.push_back(N->getOperand(i));
16348
16349 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
16350 MemN->getMemOperand());
16351
16352 // Update the uses.
16353 SmallVector<SDValue, 5> NewResults;
16354 for (unsigned i = 0; i < NumResultVecs; ++i)
16355 NewResults.push_back(SDValue(UpdN.getNode(), i));
16356
16357 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16358 DCI.CombineTo(N, NewResults);
// The old ADD is replaced by the written-back pointer result.
16359 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
16360
16361 break;
16362 }
16363
16364 return SDValue();
16365}
16366
16367/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16368/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16369/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16370/// return true.
16372 SelectionDAG &DAG = DCI.DAG;
16373 EVT VT = N->getValueType(0);
16374 // vldN-dup instructions only support 64-bit vectors for N > 1.
16375 if (!VT.is64BitVector())
16376 return false;
16377
16378 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
16379 SDNode *VLD = N->getOperand(0).getNode();
16380 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16381 return false;
16382 unsigned NumVecs = 0;
16383 unsigned NewOpc = 0;
16384 unsigned IntNo = VLD->getConstantOperandVal(1);
16385 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16386 NumVecs = 2;
16387 NewOpc = ARMISD::VLD2DUP;
16388 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16389 NumVecs = 3;
16390 NewOpc = ARMISD::VLD3DUP;
16391 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16392 NumVecs = 4;
16393 NewOpc = ARMISD::VLD4DUP;
16394 } else {
16395 return false;
16396 }
16397
16398 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16399 // numbers match the load.
// The lane number of the vldN-lane intrinsic lives at operand NumVecs + 3 of
// the INTRINSIC_W_CHAIN node.
16400 unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16401 for (SDUse &Use : VLD->uses()) {
16402 // Ignore uses of the chain result.
16403 if (Use.getResNo() == NumVecs)
16404 continue;
16405 SDNode *User = Use.getUser();
16406 if (User->getOpcode() != ARMISD::VDUPLANE ||
16407 VLDLaneNo != User->getConstantOperandVal(1))
16408 return false;
16409 }
16410
16411 // Create the vldN-dup node.
// Result list is NumVecs vectors of type VT followed by the chain, so at most
// 4 vector results + MVT::Other fit in Tys[5].
16412 EVT Tys[5];
16413 unsigned n;
16414 for (n = 0; n < NumVecs; ++n)
16415 Tys[n] = VT;
16416 Tys[n] = MVT::Other;
16417 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
16418 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
16420 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16421 Ops, VLDMemInt->getMemoryVT(),
16422 VLDMemInt->getMemOperand());
16423
16424 // Update the uses.
16425 for (SDUse &Use : VLD->uses()) {
16426 unsigned ResNo = Use.getResNo();
16427 // Ignore uses of the chain result.
16428 if (ResNo == NumVecs)
16429 continue;
16430 DCI.CombineTo(Use.getUser(), SDValue(VLDDup.getNode(), ResNo));
16431 }
16432
16433 // Now the vldN-lane intrinsic is dead except for its chain result.
16434 // Update uses of the chain.
16435 std::vector<SDValue> VLDDupResults;
16436 for (unsigned n = 0; n < NumVecs; ++n)
16437 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16438 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16439 DCI.CombineTo(VLD, VLDDupResults);
16440
16441 return true;
16442}
16443
16444/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16445/// ARMISD::VDUPLANE.
16448 const ARMSubtarget *Subtarget) {
16449 SDValue Op = N->getOperand(0);
16450 EVT VT = N->getValueType(0);
16451
16452 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16453 if (Subtarget->hasMVEIntegerOps()) {
16454 EVT ExtractVT = VT.getVectorElementType();
16455 // We need to ensure we are creating a legal type.
16456 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16457 ExtractVT = MVT::i32;
16458 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
16459 N->getOperand(0), N->getOperand(1));
16460 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
16461 }
16462
16463 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16464 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
16465 if (CombineVLDDUP(N, DCI))
16466 return SDValue(N, 0);
16467
16468 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16469 // redundant. Ignore bit_converts for now; element sizes are checked below.
16470 while (Op.getOpcode() == ISD::BITCAST)
16471 Op = Op.getOperand(0);
16472 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16473 return SDValue();
16474
16475 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16476 unsigned EltSize = Op.getScalarValueSizeInBits();
16477 // The canonical VMOV for a zero vector uses a 32-bit element size.
16478 unsigned Imm = Op.getConstantOperandVal(0);
16479 unsigned EltBits;
// NOTE(review): a decoded immediate of 0 (an all-zero splat) is treated as
// 8-bit elements so the size check below always passes for zero vectors —
// confirm against ARM_AM::decodeVMOVModImm's return-value semantics.
16480 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
16481 EltSize = 8;
16482 if (EltSize > VT.getScalarSizeInBits())
16483 return SDValue();
16484
16485 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
16486}
16487
16488/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
16490 const ARMSubtarget *Subtarget) {
16491 SDValue Op = N->getOperand(0);
16492 SDLoc dl(N);
16493
16494 if (Subtarget->hasMVEIntegerOps()) {
16495 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16496 // need to come from a GPR.
16497 if (Op.getValueType() == MVT::f32)
16498 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16499 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
16500 else if (Op.getValueType() == MVT::f16)
16501 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16502 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16503 }
16504
16505 if (!Subtarget->hasNEON())
16506 return SDValue();
16507
16508 // Match VDUP(LOAD) -> VLD1DUP.
16509 // We match this pattern here rather than waiting for isel because the
16510 // transform is only legal for unindexed loads.
// Only a single-use, unindexed load whose memory type exactly matches the
// duplicated element type qualifies.
16511 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
16512 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16513 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
16514 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16515 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16516 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16517 SDValue VLDDup =
16519 LD->getMemoryVT(), LD->getMemOperand());
// The old load's chain users are redirected to the new node's chain result.
16520 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
16521 return VLDDup;
16522 }
16523
16524 return SDValue();
16525}
16526
16529 const ARMSubtarget *Subtarget) {
// Load combines: currently only the NEON VLD1_UPD base-update transform below.
16530 EVT VT = N->getValueType(0);
16531
16532 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16533 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16535 return CombineBaseUpdate(N, DCI);
16536
16537 return SDValue();
16538}
16539
16540// Optimize trunc store (of multiple scalars) to shuffle and store. First,
16541// pack all of the elements in one place. Next, store to memory in fewer
16542// chunks.
16544 SelectionDAG &DAG) {
16545 SDValue StVal = St->getValue();
16546 EVT VT = StVal.getValueType();
16547 if (!St->isTruncatingStore() || !VT.isVector())
16548 return SDValue();
16549 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16550 EVT StVT = St->getMemoryVT();
16551 unsigned NumElems = VT.getVectorNumElements();
16552 assert(StVT != VT && "Cannot truncate to the same type");
16553 unsigned FromEltSz = VT.getScalarSizeInBits();
16554 unsigned ToEltSz = StVT.getScalarSizeInBits();
16555
16556 // From, To sizes and ElemCount must be pow of two
16557 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
16558 return SDValue();
16559
16560 // We are going to use the original vector elt for storing.
16561 // Accumulated smaller vector elements must be a multiple of the store size.
16562 if (0 != (NumElems * FromEltSz) % ToEltSz)
16563 return SDValue();
16564
16565 unsigned SizeRatio = FromEltSz / ToEltSz;
16566 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16567
16568 // Create a type on which we perform the shuffle.
16569 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
16570 NumElems * SizeRatio);
16571 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16572
16573 SDLoc DL(St);
16574 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
// Mask that packs the kept (truncated) sub-element of each group to the front;
// big-endian keeps the last sub-element of each SizeRatio group, little-endian
// the first. Unused lanes stay -1 (undef).
16575 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16576 for (unsigned i = 0; i < NumElems; ++i)
16577 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16578 : i * SizeRatio;
16579
16580 // Can't shuffle using an illegal type.
16581 if (!TLI.isTypeLegal(WideVecVT))
16582 return SDValue();
16583
16584 SDValue Shuff = DAG.getVectorShuffle(
16585 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
16586 // At this point all of the data is stored at the bottom of the
16587 // register. We now need to save it to mem.
16588
16589 // Find the largest store unit
16590 MVT StoreType = MVT::i8;
16591 for (MVT Tp : MVT::integer_valuetypes()) {
16592 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16593 StoreType = Tp;
16594 }
16595 // Didn't find a legal store type.
16596 if (!TLI.isTypeLegal(StoreType))
16597 return SDValue();
16598
16599 // Bitcast the original vector into a vector of store-size units
16600 EVT StoreVecVT =
16601 EVT::getVectorVT(*DAG.getContext(), StoreType,
16602 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16603 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16604 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
16606 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
16607 TLI.getPointerTy(DAG.getDataLayout()));
16608 SDValue BasePtr = St->getBasePtr();
16609
16610 // Perform one or more big stores into memory.
// E = number of StoreType-sized chunks needed to cover the truncated data;
// each iteration extracts one chunk, stores it, and bumps the base pointer.
16611 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16612 for (unsigned I = 0; I < E; I++) {
16613 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
16614 ShuffWide, DAG.getIntPtrConstant(I, DL));
16615 SDValue Ch =
16616 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
16617 St->getAlign(), St->getMemOperand()->getFlags())
16618 BasePtr =
16619 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
16620 Chains.push_back(Ch);
16621 }
// Merge the individual store chains into a single token for the caller.
16622 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16623}
16624
16625// Try taking a single vector store from an fpround (which would otherwise turn
16626// into an expensive buildvector) and splitting it into a series of narrowing
16627// stores.
16629 SelectionDAG &DAG) {
16630 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16631 return SDValue();
16632 SDValue Trunc = St->getValue();
16633 if (Trunc->getOpcode() != ISD::FP_ROUND)
16634 return SDValue();
16635 EVT FromVT = Trunc->getOperand(0).getValueType();
16636 EVT ToVT = Trunc.getValueType();
16637 if (!ToVT.isVector())
16638 return SDValue();
16640 EVT ToEltVT = ToVT.getVectorElementType();
16641 EVT FromEltVT = FromVT.getVectorElementType();
16642
// Only the f32 -> f16 narrowing case is handled here.
16643 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16644 return SDValue();
16645
// Process the source in slices of 4 f32 lanes; each slice is narrowed with
// VCVTN below and stored as a 4-element truncating integer store.
16646 unsigned NumElements = 4;
16647 if (FromVT.getVectorNumElements() % NumElements != 0)
16648 return SDValue();
16649
16650 // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
16651 // use the VMOVN over splitting the store. We are looking for patterns of:
16652 // !rev: 0 N 1 N+1 2 N+2 ...
16653 // rev: N 0 N+1 1 N+2 2 ...
16654 // The shuffle may either be a single source (in which case N = NumElts/2) or
16655 // two inputs extended with concat to the same size (in which case N =
16656 // NumElts).
16657 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16658 ArrayRef<int> M = SVN->getMask();
16659 unsigned NumElts = ToVT.getVectorNumElements();
16660 if (SVN->getOperand(1).isUndef())
16661 NumElts /= 2;
16662
16663 unsigned Off0 = Rev ? NumElts : 0;
16664 unsigned Off1 = Rev ? 0 : NumElts;
16665
16666 for (unsigned I = 0; I < NumElts; I += 2) {
16667 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16668 return false;
16669 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16670 return false;
16671 }
16672
16673 return true;
16674 };
16675
16676 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
16677 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16678 return SDValue();
16679
16680 LLVMContext &C = *DAG.getContext();
16681 SDLoc DL(St);
16682 // Details about the old store
16683 SDValue Ch = St->getChain();
16684 SDValue BasePtr = St->getBasePtr();
16685 Align Alignment = St->getBaseAlign();
16687 AAMDNodes AAInfo = St->getAAInfo();
16688
16689 // We split the store into slices of NumElements. fp16 trunc stores are vcvt
16690 // and then stored as truncating integer stores.
16691 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
16692 EVT NewToVT = EVT::getVectorVT(
16693 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
16694
16696 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16697 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16698 SDValue NewPtr =
16699 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16700
16701 SDValue Extract =
16702 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16703 DAG.getConstant(i * NumElements, DL, MVT::i32));
16704
16705 SDValue FPTrunc =
16706 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16707 Extract, DAG.getConstant(0, DL, MVT::i32));
16708 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16709
16710 SDValue Store = DAG.getTruncStore(
16711 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16712 NewToVT, Alignment, MMOFlags, AAInfo);
16713 Stores.push_back(Store);
16714 }
// Merge the per-slice store chains into one token.
16715 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16716}
16717
16718// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16719// into an expensive buildvector) and splitting it into a series of narrowing
16720// stores.
16722 SelectionDAG &DAG) {
16723 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16724 return SDValue();
16725 SDValue Trunc = St->getValue();
16726 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16727 return SDValue();
16728 EVT FromVT = Trunc->getOperand(0).getValueType();
16729 EVT ToVT = Trunc.getValueType();
16730
16731 LLVMContext &C = *DAG.getContext();
16732 SDLoc DL(St);
16733 // Details about the old store
16734 SDValue Ch = St->getChain();
16735 SDValue BasePtr = St->getBasePtr();
16736 Align Alignment = St->getBaseAlign();
16738 AAMDNodes AAInfo = St->getAAInfo();
16739
16740 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
16741 FromVT.getVectorNumElements());
16742
// Emit one truncating store per MVETRUNC operand, each at its own byte
// offset from the original base pointer.
16744 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
16745 unsigned NewOffset =
16746 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16747 SDValue NewPtr =
16748 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16749
16750 SDValue Extract = Trunc.getOperand(i);
16751 SDValue Store = DAG.getTruncStore(
16752 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16753 NewToVT, Alignment, MMOFlags, AAInfo);
16754 Stores.push_back(Store);
16755 }
// Merge the per-operand store chains into one token.
16756 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16757}
16758
16759// Given a floating point store from an extracted vector, with an integer
16760// VGETLANE that already exists, store the existing VGETLANEu directly. This can
16761// help reduce fp register pressure, doesn't require the fp extract and allows
16762// use of more integer post-inc stores not available with vstr.
16764 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16765 return SDValue();
16766 SDValue Extract = St->getValue();
16767 EVT VT = Extract.getValueType();
16768 // For now only uses f16. This may be useful for f32 too, but that will
16769 // be bitcast(extract), not the VGETLANEu we currently check here.
16770 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16771 return SDValue();
16772
// Only reuse an integer VGETLANEu of the same vector/lane if one already
// exists in the DAG; getNodeIfExists never creates a new node.
16773 SDNode *GetLane =
16774 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16775 {Extract.getOperand(0), Extract.getOperand(1)});
16776 if (!GetLane)
16777 return SDValue();
16778
16779 LLVMContext &C = *DAG.getContext();
16780 SDLoc DL(St);
16781 // Create a new integer store to replace the existing floating point version.
16782 SDValue Ch = St->getChain();
16783 SDValue BasePtr = St->getBasePtr();
16784 Align Alignment = St->getBaseAlign();
16786 AAMDNodes AAInfo = St->getAAInfo();
// Store the i32 lane value truncated to the fp value's width (16 bits here).
16787 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
16788 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
16789 St->getPointerInfo(), NewToVT, Alignment,
16790 MMOFlags, AAInfo);
16791
16792 return Store;
16793}
16794
16795/// PerformSTORECombine - Target-specific dag combine xforms for
16796/// ISD::STORE.
16799 const ARMSubtarget *Subtarget) {
// Volatile stores must be left untouched by all of the combines below.
16801 if (St->isVolatile())
16802 return SDValue();
16803 SDValue StVal = St->getValue();
16804 EVT VT = StVal.getValueType();
16805
16806 if (Subtarget->hasNEON())
16807 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16808 return Store;
16809
16810 if (Subtarget->hasMVEFloatOps())
16811 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16812 return NewToken;
16813
16814 if (Subtarget->hasMVEIntegerOps()) {
16815 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16816 return NewChain;
16817 if (SDValue NewToken =
16819 return NewToken;
16820 }
16821
// The remaining combines apply only to normal (non-truncating, unindexed)
// stores.
16822 if (!ISD::isNormalStore(St))
16823 return SDValue();
16824
16825 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16826 // ARM stores of arguments in the same cache line.
16827 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16828 StVal.getNode()->hasOneUse()) {
16829 SelectionDAG &DAG = DCI.DAG;
16830 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16831 SDLoc DL(St);
16832 SDValue BasePtr = St->getBasePtr();
// Endianness picks which VMOVDRR half goes to the lower address.
16833 SDValue NewST1 = DAG.getStore(
16834 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
16835 BasePtr, St->getPointerInfo(), St->getBaseAlign(),
16836 St->getMemOperand()->getFlags());
16837
16838 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16839 DAG.getConstant(4, DL, MVT::i32));
16840 return DAG.getStore(NewST1.getValue(0), DL,
16841 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
16842 OffsetPtr, St->getPointerInfo().getWithOffset(4),
16843 St->getBaseAlign(), St->getMemOperand()->getFlags());
16844 }
16845
16846 if (StVal.getValueType() == MVT::i64 &&
16848
16849 // Bitcast an i64 store extracted from a vector to f64.
16850 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16851 SelectionDAG &DAG = DCI.DAG;
16852 SDLoc dl(StVal);
16853 SDValue IntVec = StVal.getOperand(0);
16854 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16856 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16857 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16858 Vec, StVal.getOperand(1));
16859 dl = SDLoc(N);
16860 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16861 // Make the DAGCombiner fold the bitcasts.
16862 DCI.AddToWorklist(Vec.getNode());
16863 DCI.AddToWorklist(ExtElt.getNode());
16864 DCI.AddToWorklist(V.getNode());
16865 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16866 St->getPointerInfo(), St->getAlign(),
16867 St->getMemOperand()->getFlags(), St->getAAInfo());
16868 }
16869
16870 // If this is a legal vector store, try to combine it into a VST1_UPD.
16871 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16873 return CombineBaseUpdate(N, DCI);
16874
16875 return SDValue();
16876}
16877
16878/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16879/// can replace combinations of VMUL and VCVT (floating-point to integer)
16880/// when the VMUL has a constant operand that is a power of 2.
16881///
16882/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16883/// vmul.f32 d16, d17, d16
16884/// vcvt.s32.f32 d16, d16
16885/// becomes:
16886/// vcvt.s32.f32 d16, d16, #3
16888 const ARMSubtarget *Subtarget) {
16889 if (!Subtarget->hasNEON())
16890 return SDValue();
16891
16892 SDValue Op = N->getOperand(0);
16893 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16894 Op.getOpcode() != ISD::FMUL)
16895 return SDValue();
16896
// The multiplier must be a build_vector constant (splat checked below).
16897 SDValue ConstVec = Op->getOperand(1);
16898 if (!isa<BuildVectorSDNode>(ConstVec))
16899 return SDValue();
16900
16901 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16902 uint32_t FloatBits = FloatTy.getSizeInBits();
16903 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
16904 uint32_t IntBits = IntTy.getSizeInBits();
16905 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16906 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16907 // These instructions only exist converting from f32 to i32. We can handle
16908 // smaller integers by generating an extra truncate, but larger ones would
16909 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16910 // these instructions only support v2i32/v4i32 types.
16911 return SDValue();
16912 }
16913
16914 BitVector UndefElements;
16916 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
16917 if (C == -1 || C == 0 || C > 32)
16918 return SDValue();
16919
// C is the log2 of the splatted power-of-two multiplier and becomes the
// #fbits operand of the fixed-point conversion intrinsic (valid range 1..32).
16920 SDLoc dl(N);
16921 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16922 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16923 Intrinsic::arm_neon_vcvtfp2fxu;
16924 SDValue FixConv = DAG.getNode(
16925 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16926 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16927 DAG.getConstant(C, dl, MVT::i32));
16928
// Narrower result types are produced via an extra truncate.
16929 if (IntBits < FloatBits)
16930 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
16931
16932 return FixConv;
16933}
16934
16936 const ARMSubtarget *Subtarget) {
16937 if (!Subtarget->hasMVEFloatOps())
16938 return SDValue();
16939
16940 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
16941 // The second form can be more easily turned into a predicated vadd, and
16942 // possibly combined into a fma to become a predicated vfma.
16943 SDValue Op0 = N->getOperand(0);
16944 SDValue Op1 = N->getOperand(1);
16945 EVT VT = N->getValueType(0);
16946 SDLoc DL(N);
16947
16948 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
16949 // which these VMOV's represent.
16950 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
16951 if (Op.getOpcode() != ISD::BITCAST ||
16952 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
16953 return false;
16954 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
// 1664 / 2688 are taken to be the VMOVIMM-encoded -0.0 splats for v4f32 /
// v8f16, while ImmVal == 0 (+0.0) only counts as an identity under nsz —
// NOTE(review): confirm the immediates against the VMOV modified-immediate
// encoding.
16955 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
16956 return true;
16957 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
16958 return true;
16959 return false;
16960 };
16961
// Canonicalize so the vselect (if any) is Op1.
16962 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
16963 std::swap(Op0, Op1);
16964
16965 if (Op1.getOpcode() != ISD::VSELECT)
16966 return SDValue();
16967
16968 SDNodeFlags FaddFlags = N->getFlags();
16969 bool NSZ = FaddFlags.hasNoSignedZeros();
16970 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
16971 return SDValue();
16972
16973 SDValue FAdd =
16974 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
16975 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
16976}
16977
16979 SDValue LHS = N->getOperand(0);
16980 SDValue RHS = N->getOperand(1);
16981 EVT VT = N->getValueType(0);
16982 SDLoc DL(N);
16983
// Reassociation is only legal when the fadd carries the reassoc flag.
16984 if (!N->getFlags().hasAllowReassociation())
16985 return SDValue();
16986
16987 // Combine fadd(x, vcmla(acc, a, b)) -> vcmla(fadd(x, acc), a, b)
// A must be an arm_mve_vcmlaq intrinsic call; the extra addend B is folded
// into its accumulator operand (operand 2), keeping the other operands.
16988 auto ReassocComplex = [&](SDValue A, SDValue B) {
16989 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
16990 return SDValue();
16991 unsigned Opc = A.getConstantOperandVal(0);
16992 if (Opc != Intrinsic::arm_mve_vcmlaq)
16993 return SDValue();
16994 SDValue VCMLA = DAG.getNode(
16995 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
16996 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
16997 A.getOperand(3), A.getOperand(4));
16998 VCMLA->setFlags(A->getFlags());
16999 return VCMLA;
17000 };
// Try both operand orders since fadd is commutative.
17001 if (SDValue R = ReassocComplex(LHS, RHS))
17002 return R;
17003 if (SDValue R = ReassocComplex(RHS, LHS))
17004 return R;
17005
17006 return SDValue();
17007}
17008
17010 const ARMSubtarget *Subtarget) {
// Dispatch to the two fadd combines; the first that fires wins.
17011 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
17012 return S;
17013 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
17014 return S;
17015 return SDValue();
17016}
17017
17018/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
17019/// can replace combinations of VCVT (integer to floating-point) and VMUL
17020/// when the VMUL has a constant operand that is a power of 2.
17021///
17022/// Example (assume d17 = <float 0.125, float 0.125>):
17023/// vcvt.f32.s32 d16, d16
17024/// vmul.f32 d16, d16, d17
17025/// becomes:
17026/// vcvt.f32.s32 d16, d16, #3
17028 const ARMSubtarget *Subtarget) {
17029 if (!Subtarget->hasNEON())
17030 return SDValue();
17031
17032 SDValue Op = N->getOperand(0);
17033 unsigned OpOpcode = Op.getNode()->getOpcode();
17034 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
17035 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
17036 return SDValue();
17037
17038 SDValue ConstVec = N->getOperand(1);
17039 if (!isa<BuildVectorSDNode>(ConstVec))
17040 return SDValue();
17041
17042 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17043 uint32_t FloatBits = FloatTy.getSizeInBits();
17044 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17045 uint32_t IntBits = IntTy.getSizeInBits();
17046 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17047 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
17048 // These instructions only exist converting from i32 to f32. We can handle
17049 // smaller integers by generating an extra extend, but larger ones would
17050 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
17051 // these instructions only support v2i32/v4i32 types.
17052 return SDValue();
17053 }
17054
// The multiplier must be a constant splat whose exact reciprocal converts
// exactly to an integer power of two (log2 in [1, 32], checked below).
17055 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
17056 APFloat Recip(0.0f);
17057 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
17058 return SDValue();
17059
17060 bool IsExact;
17061 APSInt IntVal(33);
17062 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
17063 APFloat::opOK ||
17064 !IsExact)
17065 return SDValue();
17066
17067 int32_t C = IntVal.exactLogBase2();
17068 if (C == -1 || C == 0 || C > 32)
17069 return SDValue();
17070
17071 SDLoc DL(N);
17072 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
17073 SDValue ConvInput = Op.getOperand(0);
// Narrower integer sources are widened to i32 lanes first.
17074 if (IntBits < FloatBits)
17075 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17076 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
17077
17078 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
17079 : Intrinsic::arm_neon_vcvtfxu2fp;
17080 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17081 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17082 DAG.getConstant(C, DL, MVT::i32));
17083}
17084
17086 const ARMSubtarget *ST) {
17087 if (!ST->hasMVEIntegerOps())
17088 return SDValue();
17089
17090 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
17091 EVT ResVT = N->getValueType(0);
17092 SDValue N0 = N->getOperand(0);
17093 SDLoc dl(N);
17094
17095 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
17096 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
17097 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
17098 N0.getValueType() == MVT::v16i8)) {
17099 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
17100 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
17101 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
17102 }
17103
17104 // We are looking for something that will have illegal types if left alone,
17105 // but that we can convert to a single instruction under MVE. For example
17106 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
17107 // or
17108 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
17109
17110 // The legal cases are:
17111 // VADDV u/s 8/16/32
17112 // VMLAV u/s 8/16/32
17113 // VADDLV u/s 32
17114 // VMLALV u/s 16/32
17115
17116 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
17117 // extend it and use v4i32 instead.
17118 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
17119 EVT AVT = A.getValueType();
17120 return any_of(ExtTypes, [&](MVT Ty) {
17121 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
17122 AVT.bitsLE(Ty);
17123 });
17124 };
17125 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
17126 EVT AVT = A.getValueType();
17127 if (!AVT.is128BitVector())
17128 A = DAG.getNode(
17129 ExtendCode, dl,
17131 *DAG.getContext(),
17133 A);
17134 return A;
17135 };
17136 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
17137 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
17138 return SDValue();
17139 SDValue A = N0->getOperand(0);
17140 if (ExtTypeMatches(A, ExtTypes))
17141 return ExtendIfNeeded(A, ExtendCode);
17142 return SDValue();
17143 };
17144 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
17145 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
17146 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17148 return SDValue();
17149 Mask = N0->getOperand(0);
17150 SDValue Ext = N0->getOperand(1);
17151 if (Ext->getOpcode() != ExtendCode)
17152 return SDValue();
17153 SDValue A = Ext->getOperand(0);
17154 if (ExtTypeMatches(A, ExtTypes))
17155 return ExtendIfNeeded(A, ExtendCode);
17156 return SDValue();
17157 };
17158 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17159 SDValue &A, SDValue &B) {
17160 // For a vmla we are trying to match a larger pattern:
17161 // ExtA = sext/zext A
17162 // ExtB = sext/zext B
17163 // Mul = mul ExtA, ExtB
17164 // vecreduce.add Mul
17165 // There might also be en extra extend between the mul and the addreduce, so
17166 // long as the bitwidth is high enough to make them equivalent (for example
17167 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
17168 if (ResVT != RetTy)
17169 return false;
17170 SDValue Mul = N0;
17171 if (Mul->getOpcode() == ExtendCode &&
17172 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17173 ResVT.getScalarSizeInBits())
17174 Mul = Mul->getOperand(0);
17175 if (Mul->getOpcode() != ISD::MUL)
17176 return false;
17177 SDValue ExtA = Mul->getOperand(0);
17178 SDValue ExtB = Mul->getOperand(1);
17179 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17180 return false;
17181 A = ExtA->getOperand(0);
17182 B = ExtB->getOperand(0);
17183 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17184 A = ExtendIfNeeded(A, ExtendCode);
17185 B = ExtendIfNeeded(B, ExtendCode);
17186 return true;
17187 }
17188 return false;
17189 };
17190 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17191 SDValue &A, SDValue &B, SDValue &Mask) {
17192 // Same as the pattern above with a select for the zero predicated lanes
17193 // ExtA = sext/zext A
17194 // ExtB = sext/zext B
17195 // Mul = mul ExtA, ExtB
17196 // N0 = select Mask, Mul, 0
17197 // vecreduce.add N0
17198 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17200 return false;
17201 Mask = N0->getOperand(0);
17202 SDValue Mul = N0->getOperand(1);
17203 if (Mul->getOpcode() == ExtendCode &&
17204 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17205 ResVT.getScalarSizeInBits())
17206 Mul = Mul->getOperand(0);
17207 if (Mul->getOpcode() != ISD::MUL)
17208 return false;
17209 SDValue ExtA = Mul->getOperand(0);
17210 SDValue ExtB = Mul->getOperand(1);
17211 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17212 return false;
17213 A = ExtA->getOperand(0);
17214 B = ExtB->getOperand(0);
17215 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17216 A = ExtendIfNeeded(A, ExtendCode);
17217 B = ExtendIfNeeded(B, ExtendCode);
17218 return true;
17219 }
17220 return false;
17221 };
17222 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17223 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17224 // reductions. The operands are extended with MVEEXT, but as they are
17225 // reductions the lane orders do not matter. MVEEXT may be combined with
17226 // loads to produce two extending loads, or else they will be expanded to
17227 // VREV/VMOVL.
17228 EVT VT = Ops[0].getValueType();
17229 if (VT == MVT::v16i8) {
17230 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17231 "Unexpected illegal long reduction opcode");
17232 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17233
17234 SDValue Ext0 =
17235 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17236 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17237 SDValue Ext1 =
17238 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17239 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17240
17241 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17242 Ext0, Ext1);
17243 SDValue MLA1 =
17244 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17245 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17246 Ext0.getValue(1), Ext1.getValue(1));
17247 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17248 }
17249 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17250 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17251 SDValue(Node.getNode(), 1));
17252 };
17253
17254 SDValue A, B;
17255 SDValue Mask;
17256 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17257 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17258 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17259 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17260 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17261 A, B))
17262 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17263 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17264 A, B))
17265 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17266 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17267 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17268 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17269 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17270 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17271 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17272
17273 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17274 Mask))
17275 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17276 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17277 Mask))
17278 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17279 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17280 Mask))
17281 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17282 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17283 Mask))
17284 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17285 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17286 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17287 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17288 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17289 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17290 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17291
17292 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17293 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17294 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17295 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17296 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17297 return Create64bitNode(ARMISD::VADDLVs, {A});
17298 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17299 return Create64bitNode(ARMISD::VADDLVu, {A});
17300 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17301 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17302 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17303 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17304 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17305 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17306
17307 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17308 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17309 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17310 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17311 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17312 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17313 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17314 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17315 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17316 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17317 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17318 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17319 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17320 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17321
17322 // Some complications. We can get a case where the two inputs of the mul are
17323 // the same, then the output sext will have been helpfully converted to a
17324 // zext. Turn it back.
17325 SDValue Op = N0;
17326 if (Op->getOpcode() == ISD::VSELECT)
17327 Op = Op->getOperand(1);
17328 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17329 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17330 SDValue Mul = Op->getOperand(0);
17331 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17332 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
17333 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
17334 if (Op != N0)
17335 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17336 N0->getOperand(0), Ext, N0->getOperand(2));
17337 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17338 }
17339 }
17340
17341 return SDValue();
17342}
17343
17344 // Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17345 // the lanes are used. Due to the reduction being commutative the shuffle can be
17346 // removed.
// NOTE(review): the function signature line (17347) is missing from this
// listing; the body below takes a reduction node N and a SelectionDAG &DAG.
// The vector input is operand 0 for plain reductions, or operand 2 for the
// accumulating forms whose first operands are the scalar accumulator.
17348 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
// Only single-input shuffles (second operand undef) are candidates.
17349 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17350 if (!Shuf || !Shuf->getOperand(1).isUndef())
17351 return SDValue();
17352
17353 // Check all elements are used once in the mask.
// A bitset over mask indices: each in-range index is recorded, and the mask
// is accepted only if it is a permutation (every lane set exactly once).
17354 ArrayRef<int> Mask = Shuf->getMask();
17355 APInt SetElts(Mask.size(), 0);
17356 for (int E : Mask) {
17357 if (E < 0 || E >= (int)Mask.size())
17358 return SDValue();
17359 SetElts.setBit(E);
17360 }
17361 if (!SetElts.isAllOnes())
17362 return SDValue();
17363
// Two-vector reductions (vmlav-style) need the second input shuffled with an
// identical mask so lanes still pair up after the shuffles are stripped.
17364 if (N->getNumOperands() != VecOp + 1) {
17365 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17366 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17367 return SDValue();
17368 }
17369
// Rebuild the reduction with each shuffle replaced by its input vector.
// NOTE(review): line 17370 (presumably the SmallVector<SDValue, ...> Ops
// declaration) is missing from this listing.
17371 for (SDValue Op : N->ops()) {
17372 if (Op.getValueType().isVector())
17373 Ops.push_back(Op.getOperand(0));
17374 else
17375 Ops.push_back(Op);
17376 }
17377 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17378 }
17379
// NOTE(review): the function signature lines (17380-17381) are missing from
// this listing; judging by the folds below this is the DAG-combine for the
// ARMISD VMOVN node (insert narrowed lanes into top/bottom half).
17382 SDValue Op0 = N->getOperand(0);
17383 SDValue Op1 = N->getOperand(1);
// Operand 2 is a constant flag: non-zero = insert into the top lanes.
17384 unsigned IsTop = N->getConstantOperandVal(2);
17385
17386 // VMOVNT a undef -> a
17387 // VMOVNB a undef -> a
17388 // VMOVNB undef a -> a
17389 if (Op1->isUndef())
17390 return Op0;
17391 if (Op0->isUndef() && !IsTop)
17392 return Op1;
17393
17394 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17395 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
// A bottom-lane VQMOVN feeding this node can absorb the insertion directly,
// reusing its saturating-narrow and taking over this node's top/bottom flag.
17396 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17397 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17398 Op1->getConstantOperandVal(2) == 0)
17399 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17400 Op0, Op1->getOperand(1), N->getOperand(2));
17401
17402 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17403 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17404 // into the top or bottom lanes.
17405 unsigned NumElts = N->getValueType(0).getVectorNumElements();
// The demanded-elements masks are splats of a 2-bit pattern, selecting every
// even (low bit) or every odd (high bit) lane of the interleaved result.
17406 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17407 APInt Op0DemandedElts =
17408 IsTop ? Op1DemandedElts
17409 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17410
17411 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
// If either operand simplifies under the demanded-lane analysis, return the
// node itself to signal that a combine happened.
17412 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17413 return SDValue(N, 0);
17414 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17415 return SDValue(N, 0);
17416
17417 return SDValue();
17418 }
17419
// NOTE(review): the function signature lines (17420-17421) are missing from
// this listing; this is the body of the VQMOVN DAG-combine.
17422 SDValue Op0 = N->getOperand(0);
// Operand 2 flags whether the narrowed lanes land in the top half.
17423 unsigned IsTop = N->getConstantOperandVal(2);
17424
// Only half the lanes of the Qd input survive: the bottom lanes when
// inserting into the top, the top lanes otherwise (splat of a 2-bit mask).
17425 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17426 APInt Op0DemandedElts =
17427 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17428 : APInt::getHighBitsSet(2, 1));
17429
17430 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17431 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17432 return SDValue(N, 0);
17433 return SDValue();
17434 }
17435
// NOTE(review): the function signature lines (17436-17437) are missing from
// this listing; this is the body of the VQDMULH DAG-combine.
17438 EVT VT = N->getValueType(0);
17439 SDValue LHS = N->getOperand(0);
17440 SDValue RHS = N->getOperand(1);
17441
17442 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17443 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17444 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
// Valid only when both operands are single-input shuffles with identical
// masks, so the lane-wise multiply commutes with the shuffle. The one-use
// (or LHS == RHS) check avoids duplicating shuffles that are shared.
17445 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17446 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17447 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17448 SDLoc DL(N);
17449 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17450 LHS.getOperand(0), RHS.getOperand(0));
17451 SDValue UndefV = LHS.getOperand(1);
17452 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17453 }
17454 return SDValue();
17455 }
17456
// NOTE(review): the function signature line (17457) is missing from this
// listing; this is the body of the long-shift (ARMISD::LSLL / ARMISD::LSRL)
// DAG-combine, where Op0/Op1 are the low/high halves of the 64-bit value.
17458 SDLoc DL(N);
17459 SDValue Op0 = N->getOperand(0);
17460 SDValue Op1 = N->getOperand(1);
17461
17462 // Turn X << -C -> X >> C and viceversa. The negative shifts can come up from
17463 // uses of the intrinsics.
17464 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17465 int ShiftAmt = C->getSExtValue();
// A zero shift is the identity: forward both result halves to all users.
17466 if (ShiftAmt == 0) {
17467 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17468 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17469 return SDValue();
17470 }
17471
// For negative amounts in [-32, -1], negate the amount and flip the shift
// direction (LSLL <-> LSRL).
17472 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17473 unsigned NewOpcode =
17474 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17475 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17476 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17477 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17478 return NewShift;
17479 }
17480 }
17481
17482 return SDValue();
17483 }
17484
17485 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
// NOTE(review): line 17486 (the start of the signature, presumably
// "SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,") is
// missing from this listing.
17487 DAGCombinerInfo &DCI) const {
17488 SelectionDAG &DAG = DCI.DAG;
// Operand 0 of an INTRINSIC_WO_CHAIN-style node is the intrinsic ID.
17489 unsigned IntNo = N->getConstantOperandVal(0);
17490 switch (IntNo) {
17491 default:
17492 // Don't do anything for most intrinsics.
17493 break;
17494
17495 // Vector shifts: check for immediate versions and lower them.
17496 // Note: This is done during DAG combining instead of DAG legalizing because
17497 // the build_vectors for 64-bit vector element shift counts are generally
17498 // not legal, and it is hard to see their values after they get legalized to
17499 // loads from a constant pool.
17500 case Intrinsic::arm_neon_vshifts:
17501 case Intrinsic::arm_neon_vshiftu:
17502 case Intrinsic::arm_neon_vrshifts:
17503 case Intrinsic::arm_neon_vrshiftu:
17504 case Intrinsic::arm_neon_vrshiftn:
17505 case Intrinsic::arm_neon_vqshifts:
17506 case Intrinsic::arm_neon_vqshiftu:
17507 case Intrinsic::arm_neon_vqshiftsu:
17508 case Intrinsic::arm_neon_vqshiftns:
17509 case Intrinsic::arm_neon_vqshiftnu:
17510 case Intrinsic::arm_neon_vqshiftnsu:
17511 case Intrinsic::arm_neon_vqrshiftns:
17512 case Intrinsic::arm_neon_vqrshiftnu:
17513 case Intrinsic::arm_neon_vqrshiftnsu: {
17514 EVT VT = N->getOperand(1).getValueType();
17515 int64_t Cnt;
17516 unsigned VShiftOpc = 0;
17517
// First switch: validate that operand 2 is a legal immediate shift amount
// for this intrinsic (left vs right, widening vs narrowing); bail out with
// SDValue() when the amount is not an acceptable immediate.
17518 switch (IntNo) {
17519 case Intrinsic::arm_neon_vshifts:
17520 case Intrinsic::arm_neon_vshiftu:
17521 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17522 VShiftOpc = ARMISD::VSHLIMM;
17523 break;
17524 }
17525 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17526 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17527 : ARMISD::VSHRuIMM);
17528 break;
17529 }
17530 return SDValue();
17531
17532 case Intrinsic::arm_neon_vrshifts:
17533 case Intrinsic::arm_neon_vrshiftu:
17534 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17535 break;
17536 return SDValue();
17537
17538 case Intrinsic::arm_neon_vqshifts:
17539 case Intrinsic::arm_neon_vqshiftu:
17540 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17541 break;
17542 return SDValue();
17543
17544 case Intrinsic::arm_neon_vqshiftsu:
// vqshlu only exists in immediate form, so a non-immediate count is a bug.
17545 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17546 break;
17547 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17548
17549 case Intrinsic::arm_neon_vrshiftn:
17550 case Intrinsic::arm_neon_vqshiftns:
17551 case Intrinsic::arm_neon_vqshiftnu:
17552 case Intrinsic::arm_neon_vqshiftnsu:
17553 case Intrinsic::arm_neon_vqrshiftns:
17554 case Intrinsic::arm_neon_vqrshiftnu:
17555 case Intrinsic::arm_neon_vqrshiftnsu:
17556 // Narrowing shifts require an immediate right shift.
17557 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17558 break;
17559 llvm_unreachable("invalid shift count for narrowing vector shift "
17560 "intrinsic");
17561
17562 default:
17563 llvm_unreachable("unhandled vector shift");
17564 }
17565
// Second switch: map the intrinsic onto its immediate-form target opcode
// (vshifts/vshiftu already chose VShiftOpc above).
17566 switch (IntNo) {
17567 case Intrinsic::arm_neon_vshifts:
17568 case Intrinsic::arm_neon_vshiftu:
17569 // Opcode already set above.
17570 break;
17571 case Intrinsic::arm_neon_vrshifts:
17572 VShiftOpc = ARMISD::VRSHRsIMM;
17573 break;
17574 case Intrinsic::arm_neon_vrshiftu:
17575 VShiftOpc = ARMISD::VRSHRuIMM;
17576 break;
17577 case Intrinsic::arm_neon_vrshiftn:
17578 VShiftOpc = ARMISD::VRSHRNIMM;
17579 break;
17580 case Intrinsic::arm_neon_vqshifts:
17581 VShiftOpc = ARMISD::VQSHLsIMM;
17582 break;
17583 case Intrinsic::arm_neon_vqshiftu:
17584 VShiftOpc = ARMISD::VQSHLuIMM;
17585 break;
17586 case Intrinsic::arm_neon_vqshiftsu:
17587 VShiftOpc = ARMISD::VQSHLsuIMM;
17588 break;
17589 case Intrinsic::arm_neon_vqshiftns:
17590 VShiftOpc = ARMISD::VQSHRNsIMM;
17591 break;
17592 case Intrinsic::arm_neon_vqshiftnu:
17593 VShiftOpc = ARMISD::VQSHRNuIMM;
17594 break;
17595 case Intrinsic::arm_neon_vqshiftnsu:
17596 VShiftOpc = ARMISD::VQSHRNsuIMM;
17597 break;
17598 case Intrinsic::arm_neon_vqrshiftns:
17599 VShiftOpc = ARMISD::VQRSHRNsIMM;
17600 break;
17601 case Intrinsic::arm_neon_vqrshiftnu:
17602 VShiftOpc = ARMISD::VQRSHRNuIMM;
17603 break;
17604 case Intrinsic::arm_neon_vqrshiftnsu:
17605 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17606 break;
17607 }
17608
// Emit the immediate-shift node with the validated count as an i32 constant.
17609 SDLoc dl(N);
17610 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17611 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17612 }
17613
17614 case Intrinsic::arm_neon_vshiftins: {
17615 EVT VT = N->getOperand(1).getValueType();
17616 int64_t Cnt;
17617 unsigned VShiftOpc = 0;
17618
// vsli takes a left-shift immediate, vsri a right-shift immediate.
17619 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17620 VShiftOpc = ARMISD::VSLIIMM;
17621 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17622 VShiftOpc = ARMISD::VSRIIMM;
17623 else {
17624 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17625 }
17626
17627 SDLoc dl(N);
17628 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17629 N->getOperand(1), N->getOperand(2),
17630 DAG.getConstant(Cnt, dl, MVT::i32));
17631 }
17632
17633 case Intrinsic::arm_neon_vqrshifts:
17634 case Intrinsic::arm_neon_vqrshiftu:
17635 // No immediate versions of these to check for.
17636 break;
17637
17638 case Intrinsic::arm_neon_vbsl: {
// vbsl maps directly onto the target's bitwise-select pseudo node.
17639 SDLoc dl(N);
17640 return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
17641 N->getOperand(2), N->getOperand(3));
17642 }
17643 case Intrinsic::arm_mve_vqdmlah:
17644 case Intrinsic::arm_mve_vqdmlash:
17645 case Intrinsic::arm_mve_vqrdmlah:
17646 case Intrinsic::arm_mve_vqrdmlash:
17647 case Intrinsic::arm_mve_vmla_n_predicated:
17648 case Intrinsic::arm_mve_vmlas_n_predicated:
17649 case Intrinsic::arm_mve_vqdmlah_predicated:
17650 case Intrinsic::arm_mve_vqdmlash_predicated:
17651 case Intrinsic::arm_mve_vqrdmlah_predicated:
17652 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17653 // These intrinsics all take an i32 scalar operand which is narrowed to the
17654 // size of a single lane of the vector type they return. So we don't need
17655 // any bits of that operand above that point, which allows us to eliminate
17656 // uxth/sxth.
17657 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17658 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17659 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17660 return SDValue();
17661 break;
17662 }
17663
17664 case Intrinsic::arm_mve_minv:
17665 case Intrinsic::arm_mve_maxv:
17666 case Intrinsic::arm_mve_minav:
17667 case Intrinsic::arm_mve_maxav:
17668 case Intrinsic::arm_mve_minv_predicated:
17669 case Intrinsic::arm_mve_maxv_predicated:
17670 case Intrinsic::arm_mve_minav_predicated:
17671 case Intrinsic::arm_mve_maxav_predicated: {
17672 // These intrinsics all take an i32 scalar operand which is narrowed to the
17673 // size of a single lane of the vector type they take as the other input.
17674 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17675 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17676 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17677 return SDValue();
17678 break;
17679 }
17680
17681 case Intrinsic::arm_mve_addv: {
17682 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17683 // which allow PerformADDVecReduce to turn it into VADDLV when possible.
17684 bool Unsigned = N->getConstantOperandVal(2);
17685 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17686 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17687 }
17688
17689 case Intrinsic::arm_mve_addlv:
17690 case Intrinsic::arm_mve_addlv_predicated: {
17691 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17692 // which recombines the two outputs into an i64
17693 bool Unsigned = N->getConstantOperandVal(2);
17694 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17695 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
17696 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
17697
// Copy all operands except the intrinsic ID (0) and the unsigned flag (2).
// NOTE(review): line 17698 (presumably the SmallVector<SDValue, ...> Ops
// declaration) is missing from this listing.
17699 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17700 if (i != 2) // skip the unsigned flag
17701 Ops.push_back(N->getOperand(i));
17702
// VADDLV produces its i64 result as two i32 halves; glue them back together.
17703 SDLoc dl(N);
17704 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17705 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17706 val.getValue(1));
17707 }
17708 }
17709
17710 return SDValue();
17711 }
17712
17713 /// PerformShiftCombine - Checks for immediate versions of vector shifts and
17714 /// lowers them. As with the vector shift intrinsics, this is done during DAG
17715 /// combining instead of DAG legalizing because the build_vectors for 64-bit
17716 /// vector element shift counts are generally not legal, and it is hard to see
17717 /// their values after they get legalized to loads from a constant pool.
// NOTE(review): the first signature lines (17718-17719) are missing from this
// listing; the visible tail takes (..., TargetLowering::DAGCombinerInfo &DCI,
// const ARMSubtarget *ST) judging by the uses below.
17720 const ARMSubtarget *ST) {
17721 SelectionDAG &DAG = DCI.DAG;
17722 EVT VT = N->getValueType(0);
17723
// Thumb1-only scalar special case: fold (shl (and x, Mask), Amt) into a
// shl/srl pair, which avoids materialising the mask constant.
17724 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17725 N->getOperand(0)->getOpcode() == ISD::AND &&
17726 N->getOperand(0)->hasOneUse()) {
17727 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17728 return SDValue();
17729 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17730 // usually show up because instcombine prefers to canonicalize it to
17731 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17732 // out of GEP lowering in some cases.
17733 SDValue N0 = N->getOperand(0);
17734 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17735 if (!ShiftAmtNode)
17736 return SDValue();
17737 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17738 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17739 if (!AndMaskNode)
17740 return SDValue();
17741 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17742 // Don't transform uxtb/uxth.
17743 if (AndMask == 255 || AndMask == 65535)
17744 return SDValue();
// Only contiguous low-bit masks qualify; the mask is then expressible as
// shifting left past the masked-off bits and shifting back right.
17745 if (isMask_32(AndMask)) {
17746 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17747 if (MaskedBits > ShiftAmt) {
17748 SDLoc DL(N);
17749 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17750 DAG.getConstant(MaskedBits, DL, MVT::i32));
17751 return DAG.getNode(
17752 ISD::SRL, DL, MVT::i32, SHL,
17753 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17754 }
17755 }
17756 }
17757
17758 // Nothing to be done for scalar shifts.
17759 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17760 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17761 return SDValue();
// MVE handles vector shifts elsewhere; the immediate forms below are NEON.
17762 if (ST->hasMVEIntegerOps())
17763 return SDValue();
17764
17765 int64_t Cnt;
17766
17767 switch (N->getOpcode()) {
17768 default: llvm_unreachable("unexpected shift opcode");
17769
17770 case ISD::SHL:
17771 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17772 SDLoc dl(N);
17773 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17774 DAG.getConstant(Cnt, dl, MVT::i32));
17775 }
17776 break;
17777
17778 case ISD::SRA:
17779 case ISD::SRL:
17780 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17781 unsigned VShiftOpc =
17782 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17783 SDLoc dl(N);
17784 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17785 DAG.getConstant(Cnt, dl, MVT::i32));
17786 }
17787 }
17788 return SDValue();
17789 }
17790
17791 // Look for a sign/zero/fpextend extend of a larger than legal load. This can be
17792 // split into multiple extending loads, which are simpler to deal with than an
17793 // arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17794 // to convert the type to an f32.
// NOTE(review): the function signature line (17795) is missing from this
// listing; the body takes an extend node N and a SelectionDAG &DAG.
17796 SDValue N0 = N->getOperand(0);
17797 if (N0.getOpcode() != ISD::LOAD)
17798 return SDValue();
// NOTE(review): line 17799 (presumably the LoadSDNode *LD cast of N0) is
// missing from this listing.
// Only simple, sole-use, non-indexed, non-extending loads are split.
17800 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17801 LD->getExtensionType() != ISD::NON_EXTLOAD)
17802 return SDValue();
17803 EVT FromVT = LD->getValueType(0);
17804 EVT ToVT = N->getValueType(0);
17805 if (!ToVT.isVector())
17806 return SDValue();
// NOTE(review): line 17807 (an additional early-out check on FromVT/ToVT)
// is missing from this listing.
17808 EVT ToEltVT = ToVT.getVectorElementType();
17809 EVT FromEltVT = FromVT.getVectorElementType();
17810
// Supported splits: i8 -> i32 and f16 -> f32, four elements per piece.
17811 unsigned NumElements = 0;
17812 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17813 NumElements = 4;
17814 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17815 NumElements = 4;
17816 if (NumElements == 0 ||
17817 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17818 FromVT.getVectorNumElements() % NumElements != 0 ||
17819 !isPowerOf2_32(NumElements))
17820 return SDValue();
17821
17822 LLVMContext &C = *DAG.getContext();
17823 SDLoc DL(LD);
17824 // Details about the old load
17825 SDValue Ch = LD->getChain();
17826 SDValue BasePtr = LD->getBasePtr();
17827 Align Alignment = LD->getBaseAlign();
17828 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17829 AAMDNodes AAInfo = LD->getAAInfo();
17830
// Pick sext vs zext loads based on the extend being combined; fp extends
// fall into the zext path and are fixed up with VCVTL below.
17831 ISD::LoadExtType NewExtType =
17832 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17833 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17834 EVT NewFromVT = EVT::getVectorVT(
17835 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17836 EVT NewToVT = EVT::getVectorVT(
17837 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17838
// Emit one extending load per NumElements-sized slice of the original load.
// NOTE(review): lines 17839-17840 (presumably the SmallVector declarations
// of Loads and Chains) are missing from this listing.
17841 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17842 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17843 SDValue NewPtr =
17844 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17845
17846 SDValue NewLoad =
17847 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17848 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17849 Alignment, MMOFlags, AAInfo);
17850 Loads.push_back(NewLoad);
17851 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17852 }
17853
17854 // Float truncs need to extended with VCVTB's into their floating point types.
17855 if (FromEltVT == MVT::f16) {
// NOTE(review): line 17856 (presumably the SmallVector declaration of
// Extends) is missing from this listing.
17857
// Reinterpret each loaded piece as v8f16 and widen the bottom lanes to f32.
17858 for (unsigned i = 0; i < Loads.size(); i++) {
17859 SDValue LoadBC =
17860 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17861 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17862 DAG.getConstant(0, DL, MVT::i32));
17863 Extends.push_back(FPExt);
17864 }
17865
17866 Loads = Extends;
17867 }
17868
// Tie the split loads' chains together and rejoin the pieces into ToVT.
17869 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17870 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17871 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
17872 }
17873
17874 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
17875 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
// NOTE(review): the first signature line (17876) is missing from this
// listing; the visible tail takes (..., SelectionDAG &DAG, const
// ARMSubtarget *ST).
17877 const ARMSubtarget *ST) {
17878 SDValue N0 = N->getOperand(0);
17879
17880 // Check for sign- and zero-extensions of vector extract operations of 8- and
17881 // 16-bit vector elements. NEON and MVE support these directly. They are
17882 // handled during DAG combining because type legalization will promote them
17883 // to 32-bit types and it is messy to recognize the operations after that.
17884 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
// NOTE(review): line 17885 (presumably the check that N0 is an
// EXTRACT_VECTOR_ELT) is missing from this listing.
17886 SDValue Vec = N0.getOperand(0);
17887 SDValue Lane = N0.getOperand(1);
17888 EVT VT = N->getValueType(0);
17889 EVT EltVT = N0.getValueType();
17890 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17891
// Only i8/i16 lanes extended to i32, from a legal vector type, with a
// constant lane index, map onto the VGETLANE nodes.
17892 if (VT == MVT::i32 &&
17893 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
17894 TLI.isTypeLegal(Vec.getValueType()) &&
17895 isa<ConstantSDNode>(Lane)) {
17896
17897 unsigned Opc = 0;
17898 switch (N->getOpcode()) {
17899 default: llvm_unreachable("unexpected opcode");
17900 case ISD::SIGN_EXTEND:
17901 Opc = ARMISD::VGETLANEs;
17902 break;
17903 case ISD::ZERO_EXTEND:
17904 case ISD::ANY_EXTEND:
17905 Opc = ARMISD::VGETLANEu;
17906 break;
17907 }
17908 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
17909 }
17910 }
17911
// On MVE, a wider-than-legal extended load can be split into several
// extending loads instead.
17912 if (ST->hasMVEIntegerOps())
17913 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17914 return NewLoad;
17915
17916 return SDValue();
17917 }
17918
// NOTE(review): the first signature line (17919) is missing from this
// listing; this is the fp-extend analogue of the combine above, splitting a
// larger-than-legal extending load when MVE float ops are available.
17920 const ARMSubtarget *ST) {
17921 if (ST->hasMVEFloatOps())
17922 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17923 return NewLoad;
17924
17925 return SDValue();
17926 }
17927
17928 // Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
17929 // constant bounds.
// NOTE(review): the function signature line (17930) is missing from this
// listing; the body takes (SDValue Op, SelectionDAG &DAG, const ARMSubtarget
// *Subtarget).
17931 const ARMSubtarget *Subtarget) {
// ssat/usat need ARM-mode v6+ or Thumb2.
17932 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
17933 !Subtarget->isThumb2())
17934 return SDValue();
17935
17936 EVT VT = Op.getValueType();
17937 SDValue Op0 = Op.getOperand(0);
17938
// Require i32 smin/smax of smax/smin with constant bounds on both levels.
17939 if (VT != MVT::i32 ||
17940 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
17941 !isa<ConstantSDNode>(Op.getOperand(1)) ||
// NOTE(review): line 17942 (presumably the constant check on Op0's second
// operand) is missing from this listing.
17943 return SDValue();
17944
// Normalise so Min is the smin and Max the smax, regardless of nesting order.
17945 SDValue Min = Op;
17946 SDValue Max = Op0;
17947 SDValue Input = Op0.getOperand(0);
17948 if (Min.getOpcode() == ISD::SMAX)
17949 std::swap(Min, Max);
17950
17951 APInt MinC = Min.getConstantOperandAPInt(1);
17952 APInt MaxC = Max.getConstantOperandAPInt(1);
17953
// The upper bound must be 2^k - 1 for some k for either ssat or usat.
17954 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
17955 !(MinC + 1).isPowerOf2())
17956 return SDValue();
17957
17958 SDLoc DL(Op);
// Symmetric bounds [-2^k, 2^k - 1] -> ssat; [0, 2^k - 1] -> usat.
17959 if (MinC == ~MaxC)
17960 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
17961 DAG.getConstant(MinC.countr_one(), DL, VT));
17962 if (MaxC == 0)
17963 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
17964 DAG.getConstant(MinC.countr_one(), DL, VT));
17965
17966 return SDValue();
17967 }
17968
17969 /// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
17970 /// saturates.
// NOTE(review): the first signature line (17971) is missing from this
// listing; the visible tail takes (..., SelectionDAG &DAG, const
// ARMSubtarget *ST).
17972 const ARMSubtarget *ST) {
17973 EVT VT = N->getValueType(0);
17974 SDValue N0 = N->getOperand(0);
17975
// Scalar i32 min/max pairs may fold to ssat/usat instead.
17976 if (VT == MVT::i32)
17977 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
17978
17979 if (!ST->hasMVEIntegerOps())
17980 return SDValue();
17981
17982 if (SDValue V = PerformVQDMULHCombine(N, DAG))
17983 return V;
17984
17985 if (VT != MVT::v4i32 && VT != MVT::v8i16)
17986 return SDValue();
17987
// Recognises smin(smax(x, -2^(k-1)), 2^(k-1)-1) (in either nesting order)
// clamping lanes to the signed range of the half-width element type.
17988 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
17989 // Check one is a smin and the other is a smax
17990 if (Min->getOpcode() != ISD::SMIN)
17991 std::swap(Min, Max);
17992 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
17993 return false;
17994
17995 APInt SaturateC;
17996 if (VT == MVT::v4i32)
17997 SaturateC = APInt(32, (1 << 15) - 1, true);
17998 else //if (VT == MVT::v8i16)
17999 SaturateC = APInt(16, (1 << 7) - 1, true);
18000
// Both bounds must be constant splats of exactly the saturating values.
18001 APInt MinC, MaxC;
18002 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18003 MinC != SaturateC)
18004 return false;
18005 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
18006 MaxC != ~SaturateC)
18007 return false;
18008 return true;
18009 };
18010
18011 if (IsSignedSaturate(N, N0.getNode())) {
18012 SDLoc DL(N);
18013 MVT ExtVT, HalfVT;
18014 if (VT == MVT::v4i32) {
18015 HalfVT = MVT::v8i16;
18016 ExtVT = MVT::v4i16;
18017 } else { // if (VT == MVT::v8i16)
18018 HalfVT = MVT::v16i8;
18019 ExtVT = MVT::v8i8;
18020 }
18021
18022 // Create a VQMOVNB with undef top lanes, then signed extended into the top
18023 // half. That extend will hopefully be removed if only the bottom bits are
18024 // demanded (though a truncating store, for example).
18025 SDValue VQMOVN =
18026 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
18027 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
18028 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18029 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
18030 DAG.getValueType(ExtVT));
18031 }
18032
18033 auto IsUnsignedSaturate = [&](SDNode *Min) {
18034 // For unsigned, we just need to check for <= 0xffff
18035 if (Min->getOpcode() != ISD::UMIN)
18036 return false;
18037
18038 APInt SaturateC;
18039 if (VT == MVT::v4i32)
18040 SaturateC = APInt(32, (1 << 16) - 1, true);
18041 else //if (VT == MVT::v8i16)
18042 SaturateC = APInt(16, (1 << 8) - 1, true);
18043
18044 APInt MinC;
18045 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18046 MinC != SaturateC)
18047 return false;
18048 return true;
18049 };
18050
18051 if (IsUnsignedSaturate(N)) {
18052 SDLoc DL(N);
18053 MVT HalfVT;
18054 unsigned ExtConst;
18055 if (VT == MVT::v4i32) {
18056 HalfVT = MVT::v8i16;
18057 ExtConst = 0x0000FFFF;
18058 } else { //if (VT == MVT::v8i16)
18059 HalfVT = MVT::v16i8;
18060 ExtConst = 0x00FF;
18061 }
18062
18063 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
18064 // an AND. That extend will hopefully be removed if only the bottom bits are
18065 // demanded (though a truncating store, for example).
18066 SDValue VQMOVN =
18067 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
18068 DAG.getConstant(0, DL, MVT::i32))
18069 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18070 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
18071 DAG.getConstant(ExtConst, DL, VT));
18072 }
18073
18074 return SDValue();
18075 }
18076
// Helper (visible tail only): given a ConstantSDNode *C extracted from an
// SDValue, return a pointer to its APInt payload when that value is a power
// of two, otherwise nullptr. The returned pointer aliases the node's own
// storage, so it remains valid for the node's lifetime.
// NOTE(review): the signature and the dyn_cast that produces C (lines
// 18077-18078) are elided from this extraction — confirm against upstream.
18079 if (!C)
18080 return nullptr;
18081 const APInt *CV = &C->getAPIntValue();
18082 return CV->isPowerOf2() ? CV : nullptr;
18083}
18084
// Combine a CMOV whose selected value is (y | CM), guarded by a single-bit
// test (x & CN), into a chain of ARM BFI (bit-field insert) instructions —
// one BFI per set bit of CM. Requires all bits of CM to be known zero in y.
// NOTE(review): the function signature (line 18085) is elided from this
// extraction; it is presumed to take the CMOV node plus DAG/Subtarget.
18086 // If we have a CMOV, OR and AND combination such as:
18087 // if (x & CN)
18088 // y |= CM;
18089 //
18090 // And:
18091 // * CN is a single bit;
18092 // * All bits covered by CM are known zero in y
18093 //
18094 // Then we can convert this into a sequence of BFI instructions. This will
18095 // always be a win if CM is a single bit, will always be no worse than the
18096 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
18097 // three bits (due to the extra IT instruction).
18098
18099 SDValue Op0 = CMOV->getOperand(0);
18100 SDValue Op1 = CMOV->getOperand(1);
18101 auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
18102 SDValue CmpZ = CMOV->getOperand(3);
18103
18104 // The compare must be against zero.
18105 if (!isNullConstant(CmpZ->getOperand(1)))
18106 return SDValue();
18107
18108 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
18109 SDValue And = CmpZ->getOperand(0);
18110 if (And->getOpcode() != ISD::AND)
18111 return SDValue();
// CN must be a single bit (power of two); AndC is its APInt if so.
18112 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
18113 if (!AndC)
18114 return SDValue();
18115 SDValue X = And->getOperand(0);
18116
18117 if (CC == ARMCC::EQ) {
18118 // We're performing an "equal to zero" compare. Swap the operands so we
18119 // canonicalize on a "not equal to zero" compare.
18120 std::swap(Op0, Op1);
18121 } else {
18122 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
18123 }
18124
18125 if (Op1->getOpcode() != ISD::OR)
18126 return SDValue();
18127
// NOTE(review): line 18128 (the definition of OrC, presumably a
// dyn_cast<ConstantSDNode> of Op1->getOperand(1)) is elided here — confirm.
18129 if (!OrC)
18130 return SDValue();
18131 SDValue Y = Op1->getOperand(0);
18132
// The untaken value must be the OR's other operand, i.e. the pattern is
// really "y |= CM when the bit is set, else y unchanged".
18133 if (Op0 != Y)
18134 return SDValue();
18135
18136 // Now, is it profitable to continue?
18137 APInt OrCI = OrC->getAPIntValue();
// Thumb tolerates one more set bit because the TST&OR alternative needs an
// extra IT instruction there (see the header comment above).
18138 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
18139 if (OrCI.popcount() > Heuristic)
18140 return SDValue();
18141
18142 // Lastly, can we determine that the bits defined by OrCI
18143 // are zero in Y?
18144 KnownBits Known = DAG.computeKnownBits(Y);
18145 if ((OrCI & Known.Zero) != OrCI)
18146 return SDValue();
18147
18148 // OK, we can do the combine.
18149 SDValue V = Y;
18150 SDLoc dl(X);
18151 EVT VT = X.getValueType();
// Position of the single tested bit; shift it down to bit 0 if needed so BFI
// can insert it at each destination position.
18152 unsigned BitInX = AndC->logBase2();
18153
18154 if (BitInX != 0) {
18155 // We must shift X first.
18156 X = DAG.getNode(ISD::SRL, dl, VT, X,
18157 DAG.getConstant(BitInX, dl, VT));
18158 }
18159
// Emit one BFI per set bit of CM, each inserting the tested bit of X into
// the corresponding position of V.
18160 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
18161 BitInY < NumActiveBits; ++BitInY) {
18162 if (OrCI[BitInY] == 0)
18163 continue;
18164 APInt Mask(VT.getSizeInBits(), 0);
18165 Mask.setBit(BitInY);
18166 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
18167 // Confusingly, the operand is an *inverted* mask.
18168 DAG.getConstant(~Mask, dl, VT));
18169 }
18170
18171 return V;
18172}
18173
18174 // Given N, the value controlling the conditional branch, search for the loop
18175 // intrinsic, returning it, along with how the value is used. We need to handle
18176 // patterns such as the following:
18177 // (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
18178 // (brcond (setcc (loop.decrement), 0, eq), exit)
18179 // (brcond (setcc (loop.decrement), 0, ne), header)
// Recursively peels XOR-with-1 (recording the inversion in Negate) and SETCC
// (recording its constant operand in Imm and its predicate in CC) until the
// hwloop intrinsic itself is found; returns SDValue() if no match.
// NOTE(review): the first signature line (18180) is elided from this
// extraction; only the trailing "bool &Negate" parameter is visible.
18181 bool &Negate) {
18182 switch (N->getOpcode()) {
18183 default:
18184 break;
18185 case ISD::XOR: {
// (xor cond, 1) logically negates the condition; record and recurse.
18186 if (!isa<ConstantSDNode>(N.getOperand(1)))
18187 return SDValue();
18188 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
18189 return SDValue();
18190 Negate = !Negate;
18191 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
18192 }
18193 case ISD::SETCC: {
// Only compares against 0 or 1 are recognised; the constant is recorded in
// Imm and the predicate replaces CC before recursing into the LHS.
18194 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
18195 if (!Const)
18196 return SDValue();
18197 if (Const->isZero())
18198 Imm = 0;
18199 else if (Const->isOne())
18200 Imm = 1;
18201 else
18202 return SDValue();
18203 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
18204 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
18205 }
// NOTE(review): the case label on line 18206 (presumably
// ISD::INTRINSIC_W_CHAIN) is elided from this extraction — confirm.
18207 unsigned IntOp = N.getConstantOperandVal(1);
18208 if (IntOp != Intrinsic::test_start_loop_iterations &&
18209 IntOp != Intrinsic::loop_decrement_reg)
18210 return SDValue();
18211 return N;
18212 }
18213 }
18214 return SDValue();
18215}
18216
// Rewrite a BRCOND/BR_CC that consumes a hardware-loop intrinsic
// (test.start.loop.iterations or loop.decrement.reg) into the dedicated
// ARMISD::WLS / ARMISD::LE (while-loop-start / loop-end) nodes, retargeting
// the accompanying unconditional branch when the branch sense is reversed.
// NOTE(review): the signature lines (18217-18218) are elided from this
// extraction; only the trailing "const ARMSubtarget *ST" parameter is visible.
18219 const ARMSubtarget *ST) {
18220
18221 // The hwloop intrinsics that we're interested are used for control-flow,
18222 // either for entering or exiting the loop:
18223 // - test.start.loop.iterations will test whether its operand is zero. If it
18224 // is zero, the proceeding branch should not enter the loop.
18225 // - loop.decrement.reg also tests whether its operand is zero. If it is
18226 // zero, the proceeding branch should not branch back to the beginning of
18227 // the loop.
18228 // So here, we need to check that how the brcond is using the result of each
18229 // of the intrinsics to ensure that we're branching to the right place at the
18230 // right time.
18231
18232 ISD::CondCode CC;
18233 SDValue Cond;
18234 int Imm = 1;
18235 bool Negate = false;
18236 SDValue Chain = N->getOperand(0);
18237 SDValue Dest;
18238
// Normalise BRCOND and BR_CC into (CC, Cond, Imm, Dest) form. BR_CC is only
// handled when its RHS is the constant 0 or 1.
18239 if (N->getOpcode() == ISD::BRCOND) {
18240 CC = ISD::SETEQ;
18241 Cond = N->getOperand(1);
18242 Dest = N->getOperand(2);
18243 } else {
18244 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18245 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
18246 Cond = N->getOperand(2);
18247 Dest = N->getOperand(4);
18248 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
18249 if (!Const->isOne() && !Const->isZero())
18250 return SDValue();
18251 Imm = Const->getZExtValue();
18252 } else
18253 return SDValue();
18254 }
18255
18256 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
18257 if (!Int)
18258 return SDValue();
18259
// An odd number of XOR-with-1 wrappers flips the effective predicate.
18260 if (Negate)
18261 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18262
// Classify the (CC, Imm) pair: does the branch fire when the intrinsic's
// result (the remaining iteration count) is zero, or when it is non-zero?
18263 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18264 return (CC == ISD::SETEQ && Imm == 0) ||
18265 (CC == ISD::SETNE && Imm == 1) ||
18266 (CC == ISD::SETLT && Imm == 1) ||
18267 (CC == ISD::SETULT && Imm == 1);
18268 };
18269
18270 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18271 return (CC == ISD::SETEQ && Imm == 1) ||
18272 (CC == ISD::SETNE && Imm == 0) ||
18273 (CC == ISD::SETGT && Imm == 0) ||
18274 (CC == ISD::SETUGT && Imm == 0) ||
18275 (CC == ISD::SETGE && Imm == 1) ||
18276 (CC == ISD::SETUGE && Imm == 1);
18277 };
18278
18279 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18280 "unsupported condition");
18281
18282 SDLoc dl(Int);
18283 SelectionDAG &DAG = DCI.DAG;
18284 SDValue Elements = Int.getOperand(2);
18285 unsigned IntOp = Int->getConstantOperandVal(1);
// The conditional branch must fall through to a lone unconditional BR; its
// target is the "other" destination used when the sense is reversed.
18286 assert((N->hasOneUse() && N->user_begin()->getOpcode() == ISD::BR) &&
18287 "expected single br user");
18288 SDNode *Br = *N->user_begin();
18289 SDValue OtherTarget = Br->getOperand(1);
18290
18291 // Update the unconditional branch to branch to the given Dest.
18292 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18293 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
18294 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18295 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
18296 };
18297
18298 if (IntOp == Intrinsic::test_start_loop_iterations) {
18299 SDValue Res;
18300 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18301 // We expect this 'instruction' to branch when the counter is zero.
18302 if (IsTrueIfZero(CC, Imm)) {
18303 SDValue Ops[] = {Chain, Setup, Dest};
18304 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18305 } else {
18306 // The logic is the reverse of what we need for WLS, so find the other
18307 // basic block target: the target of the proceeding br.
18308 UpdateUncondBr(Br, Dest, DAG);
18309
18310 SDValue Ops[] = {Chain, Setup, OtherTarget};
18311 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18312 }
18313 // Update LR count to the new value
18314 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
18315 // Update chain
18316 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
18317 return Res;
18318 } else {
// loop.decrement.reg: lower to ARMISD::LOOP_DEC feeding an ARMISD::LE
// (loop-end) branch that loops back while the count is non-zero.
18319 SDValue Size =
18320 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18321 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
18322 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18323 DAG.getVTList(MVT::i32, MVT::Other), Args);
18324 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
18325
18326 // We expect this instruction to branch when the count is not zero.
18327 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18328
18329 // Update the unconditional branch to target the loop preheader if we've
18330 // found the condition has been reversed.
18331 if (Target == OtherTarget)
18332 UpdateUncondBr(Br, Dest, DAG);
18333
18334 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18335 SDValue(LoopDec.getNode(), 1), Chain);
18336
18337 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18338 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18339 }
// Unreachable: both arms above return, kept for defensive completeness.
18340 return SDValue();
18341}
18342
18343 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
18344 SDValue
// NOTE(review): the signature line (18345, presumably
// ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG)
// const {) is elided from this extraction — confirm against upstream.
18346 SDValue Cmp = N->getOperand(3);
18347 if (Cmp.getOpcode() != ARMISD::CMPZ)
18348 // Only looking at NE cases.
18349 return SDValue();
18350
18351 SDLoc dl(N);
18352 SDValue LHS = Cmp.getOperand(0);
18353 SDValue RHS = Cmp.getOperand(1);
18354 SDValue Chain = N->getOperand(0);
18355 SDValue BB = N->getOperand(1);
18356 SDValue ARMcc = N->getOperand(2);
// NOTE(review): line 18357 (the definition of CC, presumably extracting the
// ARMCC condition code from ARMcc) is elided from this extraction.
18358
// Fold a branch on "(cmov 0, 1, CC) & 1 != 0" into a direct branch on CC:
18359 // (brcond Chain BB ne (cmpz (and (cmov 0 1 CC Flags) 1) 0))
18360 // -> (brcond Chain BB CC Flags)
18361 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18362 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
18363 LHS->getOperand(0)->hasOneUse() &&
18364 isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
18365 isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
18366 isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
18367 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, BB,
18368 LHS->getOperand(0)->getOperand(2),
18369 LHS->getOperand(0)->getOperand(3));
18370 }
18371
18372 return SDValue();
18373}
18374
18375 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
18376 SDValue
// Tries, in order: forming CSINV/CSINC/CSNEG-style nodes, the CMOV->BFI
// combine, redundant-move simplifications, CMPZ/CMOV folds, and several
// branchless materialisations of boolean comparisons (CLZ-shift and
// carry-arithmetic forms, plus Thumb1-specific power-of-two variants).
// NOTE(review): the signature line (18377, presumably
// ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG)
// const {) is elided from this extraction.
18378 SDLoc dl(N);
18379 EVT VT = N->getValueType(0);
18380 SDValue FalseVal = N->getOperand(0);
18381 SDValue TrueVal = N->getOperand(1);
18382 SDValue ARMcc = N->getOperand(2);
18383 SDValue Cmp = N->getOperand(3);
18384
18385 // Try to form CSINV etc.
18386 unsigned Opcode;
18387 bool InvertCond;
18388 if (SDValue CSetOp =
18389 matchCSET(Opcode, InvertCond, TrueVal, FalseVal, Subtarget)) {
18390 if (InvertCond) {
18391 ARMCC::CondCodes CondCode =
18392 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
18393 CondCode = ARMCC::getOppositeCondition(CondCode);
18394 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
18395 }
18396 return DAG.getNode(Opcode, dl, VT, CSetOp, CSetOp, ARMcc, Cmp);
18397 }
18398
18399 if (Cmp.getOpcode() != ARMISD::CMPZ)
18400 // Only looking at EQ and NE cases.
18401 return SDValue();
18402
18403 SDValue LHS = Cmp.getOperand(0);
18404 SDValue RHS = Cmp.getOperand(1);
// NOTE(review): line 18405 (the definition of CC, presumably extracting the
// ARMCC condition code from ARMcc) is elided from this extraction.
18406
18407 // BFI is only available on V6T2+.
18408 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
// NOTE(review): line 18409 (defining R, presumably a call to
// PerformCMOVToBFICombine(N, DAG)) is elided from this extraction.
18410 if (R)
18411 return R;
18412 }
18413
18414 // Simplify
18415 // mov r1, r0
18416 // cmp r1, x
18417 // mov r0, y
18418 // moveq r0, x
18419 // to
18420 // cmp r0, x
18421 // movne r0, y
18422 //
18423 // mov r1, r0
18424 // cmp r1, x
18425 // mov r0, x
18426 // movne r0, y
18427 // to
18428 // cmp r0, x
18429 // movne r0, y
18430 /// FIXME: Turn this into a target neutral optimization?
18431 SDValue Res;
18432 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18433 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, Cmp);
18434 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
18435 SDValue ARMcc;
18436 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
18437 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, NewCmp);
18438 }
18439
// Peel a boolean-materialising inner CMOV feeding this one's compare.
18440 // (cmov F T ne (cmpz (cmov 0 1 CC Flags) 0))
18441 // -> (cmov F T CC Flags)
18442 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18443 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18444 isNullConstant(RHS)) {
18445 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
18446 LHS->getOperand(2), LHS->getOperand(3));
18447 }
18448
18449 if (!VT.isInteger())
18450 return SDValue();
18451
18452 // Fold away an unnecessary CMPZ/CMOV
18453 // CMOV A, B, C1, (CMPZ (CMOV 1, 0, C2, D), 0) ->
18454 // if C1==EQ -> CMOV A, B, C2, D
18455 // if C1==NE -> CMOV A, B, NOT(C2), D
18456 if (N->getConstantOperandVal(2) == ARMCC::EQ ||
18457 N->getConstantOperandVal(2) == ARMCC::NE) {
// NOTE(review): line 18458 (defining Cond, presumably an ARMCC::CondCodes
// out-parameter for IsCMPZCSINC) is elided from this extraction.
18459 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
18460 if (N->getConstantOperandVal(2) == ARMCC::NE)
// NOTE(review): line 18461 (inverting Cond, presumably via
// ARMCC::getOppositeCondition) is elided from this extraction.
18462 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18463 N->getOperand(1),
18464 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
18465 }
18466 }
18467
18468 // Materialize a boolean comparison for integers so we can avoid branching.
18469 if (isNullConstant(FalseVal)) {
18470 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
18471 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
18472 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
18473 // right 5 bits will make that 32 be 1, otherwise it will be 0.
18474 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
18475 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18476 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18477 DAG.getConstant(5, dl, MVT::i32));
18478 } else {
18479 // CMOV 0, 1, ==, (CMPZ x, y) ->
18480 // (UADDO_CARRY (SUB x, y), t:0, t:1)
18481 // where t = (USUBO_CARRY 0, (SUB x, y), 0)
18482 //
18483 // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
18484 // x != y. In other words, a carry C == 1 when x == y, C == 0
18485 // otherwise.
18486 // The final UADDO_CARRY computes
18487 // x - y + (0 - (x - y)) + C == C
18488 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18489 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18490 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
18491 // ISD::USUBO_CARRY returns a borrow but we want the carry here
18492 // actually.
18493 SDValue Carry =
18494 DAG.getNode(ISD::SUB, dl, MVT::i32,
18495 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18496 Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
18497 }
18498 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
18499 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
18500 // This seems pointless but will allow us to combine it further below.
18501 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18502 SDValue Sub =
18503 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18504 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
18505 Sub.getValue(1));
// FalseVal is updated so the Thumb1 power-of-two combine below can match
// the freshly-created pattern in the same pass.
18506 FalseVal = Sub;
18507 }
18508 } else if (isNullConstant(TrueVal)) {
18509 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
18510 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
18511 // This seems pointless but will allow us to combine it further below
18512 // Note that we change == for != as this is the dual for the case above.
18513 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18514 SDValue Sub =
18515 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18516 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18517 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18518 Sub.getValue(1));
18519 FalseVal = Sub;
18520 }
18521 }
18522
18523 // On Thumb1, the DAG above may be further combined if z is a power of 2
18524 // (z == 2 ^ K).
18525 // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
18526 // t1 = (USUBO (SUB x, y), 1)
18527 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18528 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18529 //
18530 // This also handles the special case of comparing against zero; it's
18531 // essentially, the same pattern, except there's no SUBC:
18532 // CMOV x, z, !=, (CMPZ x, 0) ->
18533 // t1 = (USUBO x, 1)
18534 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18535 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18536 const APInt *TrueConst;
18537 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18538 ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
18539 FalseVal.getOperand(1) == RHS) ||
18540 (FalseVal == LHS && isNullConstant(RHS))) &&
18541 (TrueConst = isPowerOf2Constant(TrueVal))) {
18542 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18543 unsigned ShiftAmount = TrueConst->logBase2();
// Normalise z to 1 first; the final SHL restores the 2^K scaling.
18544 if (ShiftAmount)
18545 TrueVal = DAG.getConstant(1, dl, VT);
18546 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
18547 Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
18548 Subc.getValue(1));
18549
18550 if (ShiftAmount)
18551 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18552 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18553 }
18554
18555 if (Res.getNode()) {
18556 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
18557 // Capture demanded bits information that would be otherwise lost.
18558 if (Known.Zero == 0xfffffffe)
18559 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18560 DAG.getValueType(MVT::i1));
18561 else if (Known.Zero == 0xffffff00)
18562 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18563 DAG.getValueType(MVT::i8));
18564 else if (Known.Zero == 0xffff0000)
18565 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18566 DAG.getValueType(MVT::i16));
18567 }
18568
18569 return Res;
18570}
18571
// DAG combine for ISD::BITCAST on ARM: folds bitcasts of VDUP under MVE,
// skips already-combined VECTOR_REG_CASTs, avoids redundant VREVs on
// big-endian for immediate-vector moves, and tries the extract->VMOVRRD fold.
// NOTE(review): the signature lines (18572-18573) are elided from this
// extraction; only the trailing "const ARMSubtarget *ST" parameter is
// visible.
18574 const ARMSubtarget *ST) {
18575 SelectionDAG &DAG = DCI.DAG;
18576 SDValue Src = N->getOperand(0);
18577 EVT DstVT = N->getValueType(0);
18578
18579 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18580 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18581 EVT SrcVT = Src.getValueType();
// Only legal when the lane width is unchanged, so the dup'd scalar still
// fills each destination lane exactly.
18582 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18583 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
18584 }
18585
18586 // We may have a bitcast of something that has already had this bitcast
18587 // combine performed on it, so skip past any VECTOR_REG_CASTs.
18588 if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
18589 Src.getOperand(0).getValueType().getScalarSizeInBits() <=
18590 Src.getValueType().getScalarSizeInBits())
18591 Src = Src.getOperand(0);
18592
18593 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18594 // would be generated is at least the width of the element type.
18595 EVT SrcVT = Src.getValueType();
18596 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18597 Src.getOpcode() == ARMISD::VMVNIMM ||
18598 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18599 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18600 DAG.getDataLayout().isBigEndian())
18601 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
18602
18603 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18604 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18605 return R;
18606
18607 return SDValue();
18608}
18609
18610 // Some combines for the MVETrunc truncations legalizer helper. Also lowers the
18611 // node into stack operations after legalizeOps.
// Tries, in order: undef folding, merging nested MVETRUNCs, recognising
// shuffle pairs as VMOVN, expanding buildvector/shuffle inputs into a plain
// BUILD_VECTOR of extracts, and (after legalization) a truncating
// stack-store + full-width reload.
// NOTE(review): the signature lines (18612-18613) are elided from this
// extraction — presumed PerformMVETruncCombine(SDNode *N, DCI).
18614 SelectionDAG &DAG = DCI.DAG;
18615 EVT VT = N->getValueType(0);
18616 SDLoc DL(N);
18617
18618 // MVETrunc(Undef, Undef) -> Undef
18619 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
18620 return DAG.getUNDEF(VT);
18621
18622 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
18623 if (N->getNumOperands() == 2 &&
18624 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
18625 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
18626 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
18627 N->getOperand(0).getOperand(1),
18628 N->getOperand(1).getOperand(0),
18629 N->getOperand(1).getOperand(1));
18630
18631 // MVETrunc(shuffle, shuffle) -> VMOVN
18632 if (N->getNumOperands() == 2 &&
18633 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18634 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18635 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
18636 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
18637
// Both shuffles must draw from the same two sources so the concatenated
// mask describes one coherent truncation pattern.
18638 if (S0->getOperand(0) == S1->getOperand(0) &&
18639 S0->getOperand(1) == S1->getOperand(1)) {
18640 // Construct complete shuffle mask
18641 SmallVector<int, 8> Mask(S0->getMask());
18642 Mask.append(S1->getMask().begin(), S1->getMask().end());
18643
18644 if (isVMOVNTruncMask(Mask, VT, false))
18645 return DAG.getNode(
18646 ARMISD::VMOVN, DL, VT,
18647 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18648 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18649 DAG.getConstant(1, DL, MVT::i32));
18650 if (isVMOVNTruncMask(Mask, VT, true))
18651 return DAG.getNode(
18652 ARMISD::VMOVN, DL, VT,
18653 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18654 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18655 DAG.getConstant(1, DL, MVT::i32));
18656 }
18657 }
18658
18659 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18660 // truncate to a buildvector to allow the generic optimisations to kick in.
18661 if (all_of(N->ops(), [](SDValue Op) {
18662 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18663 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18664 (Op.getOpcode() == ISD::BITCAST &&
18665 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
18666 })) {
18667 SmallVector<SDValue, 8> Extracts;
18668 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18669 SDValue O = N->getOperand(Op);
18670 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18671 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18672 DAG.getConstant(i, DL, MVT::i32));
18673 Extracts.push_back(Ext);
18674 }
18675 }
18676 return DAG.getBuildVector(VT, DL, Extracts);
18677 }
18678
18679 // If we are late in the legalization process and nothing has optimised
18680 // the trunc to anything better, lower it to a stack store and reload,
18681 // performing the truncation whilst keeping the lanes in the correct order:
18682 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
18683 if (!DCI.isAfterLegalizeDAG())
18684 return SDValue();
18685
18686 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18687 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18688 int NumIns = N->getNumOperands();
18689 assert((NumIns == 2 || NumIns == 4) &&
18690 "Expected 2 or 4 inputs to an MVETrunc");
// Each truncating store writes half (or quarter) of the 16-byte slot.
18691 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18692 if (N->getNumOperands() == 4)
18693 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
18694
18695 SmallVector<SDValue> Chains;
18696 for (int I = 0; I < NumIns; I++) {
18697 SDValue Ptr = DAG.getNode(
18698 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18699 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
// NOTE(review): line 18700 (defining MPI, presumably
// MachinePointerInfo::getFixedStack with the per-input offset) is elided
// from this extraction.
18701 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
18702 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
18703 Ptr, MPI, StoreVT, Align(4));
18704 Chains.push_back(Ch);
18705 }
18706
18707 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18708 MachinePointerInfo MPI =
// NOTE(review): line 18709 (the MachinePointerInfo::getFixedStack
// initialiser for the reload) is elided from this extraction.
18710 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
18711}
18712
18713 // Take a MVEEXT(load x) and split that into (extload x, extload x+8)
// Splits a single wide load feeding an MVESEXT/MVEZEXT into several
// narrower extending loads (one per output vector), replaces the original
// load's chain users, and returns the merged results.
// NOTE(review): the first signature line (18714) is elided from this
// extraction; only the trailing "SelectionDAG &DAG" parameter is visible.
18715 SelectionDAG &DAG) {
18716 SDValue N0 = N->getOperand(0);
// NOTE(review): line 18717 (defining LD, presumably
// dyn_cast<LoadSDNode>(N0.getNode())) is elided from this extraction.
18718 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18719 return SDValue();
18720
18721 EVT FromVT = LD->getMemoryVT();
18722 EVT ToVT = N->getValueType(0);
18723 if (!ToVT.isVector())
18724 return SDValue();
18725 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18726 EVT ToEltVT = ToVT.getVectorElementType();
18727 EVT FromEltVT = FromVT.getVectorElementType();
18728
// Choose the per-load element count matching the MVE extending-load forms
// (VLDRH.32 / VLDRB.32 / VLDRB.16).
18729 unsigned NumElements = 0;
18730 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18731 NumElements = 4;
18732 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18733 NumElements = 8;
18734 assert(NumElements != 0);
18735
// The new loads extend with the signedness of the MVEEXT; the old load may
// only be a non-extending load or one extending compatibly.
18736 ISD::LoadExtType NewExtType =
18737 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18738 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18739 LD->getExtensionType() != ISD::EXTLOAD &&
18740 LD->getExtensionType() != NewExtType)
18741 return SDValue();
18742
18743 LLVMContext &C = *DAG.getContext();
18744 SDLoc DL(LD);
18745 // Details about the old load
18746 SDValue Ch = LD->getChain();
18747 SDValue BasePtr = LD->getBasePtr();
18748 Align Alignment = LD->getBaseAlign();
18749 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18750 AAMDNodes AAInfo = LD->getAAInfo();
18751
18752 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
18753 EVT NewFromVT = EVT::getVectorVT(
18754 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
18755 EVT NewToVT = EVT::getVectorVT(
18756 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
18757
// NOTE(review): lines 18758-18759 (the Loads and Chains SmallVector
// declarations) are elided from this extraction.
18760 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
18761 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18762 SDValue NewPtr =
18763 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
18764
18765 SDValue NewLoad =
18766 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
18767 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
18768 Alignment, MMOFlags, AAInfo);
18769 Loads.push_back(NewLoad);
18770 Chains.push_back(SDValue(NewLoad.getNode(), 1));
18771 }
18772
// Redirect all chain users of the old load to the combined new chains, then
// hand the data results back as a merged node.
18773 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18774 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18775 return DAG.getMergeValues(Loads, DL);
18776}
18777
18778 // Perform combines for MVEEXT. If it has not be optimized to anything better
18779 // before lowering, it gets converted to stack store and extloads performing the
18780 // extend whilst still keeping the same lane ordering.
// Tries, in order: MVEEXT(VDUP) -> extend-in-reg, MVEEXT(shuffle) ->
// extend-in-reg with optional VREV, MVEEXT(load) -> split extending loads,
// and finally (after legalization) a full-width stack store + extending
// reloads.
// NOTE(review): the signature lines (18781-18782) are elided from this
// extraction — presumed PerformMVEExtCombine(SDNode *N, DCI).
18783 SelectionDAG &DAG = DCI.DAG;
18784 EVT VT = N->getValueType(0);
18785 SDLoc DL(N);
18786 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18787 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18788
18789 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18790 *DAG.getContext());
// Re-interpret the input at the output width, then extend in-register with
// the signedness of the MVEEXT node.
18791 auto Extend = [&](SDValue V) {
18792 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
18793 return N->getOpcode() == ARMISD::MVESEXT
18794 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
18795 DAG.getValueType(ExtVT))
18796 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
18797 };
18798
18799 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
18800 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
18801 SDValue Ext = Extend(N->getOperand(0));
18802 return DAG.getMergeValues({Ext, Ext}, DL);
18803 }
18804
18805 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18806 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
18807 ArrayRef<int> Mask = SVN->getMask();
18808 assert(Mask.size() == 2 * VT.getVectorNumElements());
18809 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18810 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18811 SDValue Op0 = SVN->getOperand(0);
18812 SDValue Op1 = SVN->getOperand(1);
18813
// True if Mask[Start..Start+NumElts) selects every other lane starting at
// Offset (undef lanes are treated as matching anything).
18814 auto CheckInregMask = [&](int Start, int Offset) {
18815 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18816 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18817 return false;
18818 return true;
18819 };
// Defaults to the original results; only replaced when a half matches.
18820 SDValue V0 = SDValue(N, 0);
18821 SDValue V1 = SDValue(N, 1);
18822 if (CheckInregMask(0, 0))
18823 V0 = Extend(Op0);
18824 else if (CheckInregMask(0, 1))
18825 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18826 else if (CheckInregMask(0, Mask.size()))
18827 V0 = Extend(Op1);
18828 else if (CheckInregMask(0, Mask.size() + 1))
18829 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18830
18831 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
18832 V1 = Extend(Op1);
18833 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
18834 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18835 else if (CheckInregMask(VT.getVectorNumElements(), 0))
18836 V1 = Extend(Op0);
18837 else if (CheckInregMask(VT.getVectorNumElements(), 1))
18838 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18839
// Only report success if at least one of the halves was rewritten.
18840 if (V0.getNode() != N || V1.getNode() != N)
18841 return DAG.getMergeValues({V0, V1}, DL);
18842 }
18843
18844 // MVEEXT(load) -> extload, extload
18845 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
// NOTE(review): line 18846 (presumably "if (SDValue L =
// PerformSplittingMVEEXTToWideningLoad(N, DAG))") is elided from this
// extraction.
18847 return L;
18848
18849 if (!DCI.isAfterLegalizeDAG())
18850 return SDValue();
18851
18852 // Lower to a stack store and reload:
18853 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
18854 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18855 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18856 int NumOuts = N->getNumValues();
18857 assert((NumOuts == 2 || NumOuts == 4) &&
18858 "Expected 2 or 4 outputs to an MVEEXT");
18859 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18860 *DAG.getContext());
18861 if (N->getNumOperands() == 4)
18862 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
18863
18864 MachinePointerInfo MPI =
// NOTE(review): line 18865 (the MachinePointerInfo::getFixedStack
// initialiser for the store) is elided from this extraction.
18866 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
18867 StackPtr, MPI, Align(4));
18868
// NOTE(review): line 18869 (the Loads SmallVector declaration) is elided
// from this extraction.
18870 for (int I = 0; I < NumOuts; I++) {
18871 SDValue Ptr = DAG.getNode(
18872 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18873 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
// NOTE(review): line 18874 (the per-load MachinePointerInfo definition) is
// elided from this extraction.
18875 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
18876 SDValue Load = DAG.getExtLoad(
18877 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
18878 VT, Chain, Ptr, MPI, LoadVT, Align(4));
18879 Loads.push_back(Load);
18880 }
18881
18882 return DAG.getMergeValues(Loads, DL);
18883}
18884
18886 DAGCombinerInfo &DCI) const {
18887 switch (N->getOpcode()) {
18888 default: break;
18889 case ISD::SELECT_CC:
18890 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
18891 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
18892 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
18893 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
18894 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
18895 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
18896 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
18897 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
18898 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
18899 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
18900 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
18901 case ISD::BRCOND:
18902 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
18903 case ARMISD::ADDC:
18904 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
18905 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
18906 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
18907 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
18908 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
18909 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
18910 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
18911 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
18912 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
18915 return PerformExtractEltCombine(N, DCI, Subtarget);
18919 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
18920 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
18921 case ISD::FP_TO_SINT:
18922 case ISD::FP_TO_UINT:
18923 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
18924 case ISD::FADD:
18925 return PerformFADDCombine(N, DCI.DAG, Subtarget);
18926 case ISD::FMUL:
18927 return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
18929 return PerformIntrinsicCombine(N, DCI);
18930 case ISD::SHL:
18931 case ISD::SRA:
18932 case ISD::SRL:
18933 return PerformShiftCombine(N, DCI, Subtarget);
18934 case ISD::SIGN_EXTEND:
18935 case ISD::ZERO_EXTEND:
18936 case ISD::ANY_EXTEND:
18937 return PerformExtendCombine(N, DCI.DAG, Subtarget);
18938 case ISD::FP_EXTEND:
18939 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
18940 case ISD::SMIN:
18941 case ISD::UMIN:
18942 case ISD::SMAX:
18943 case ISD::UMAX:
18944 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
18945 case ARMISD::CMOV:
18946 return PerformCMOVCombine(N, DCI.DAG);
18947 case ARMISD::BRCOND:
18948 return PerformBRCONDCombine(N, DCI.DAG);
18949 case ARMISD::CMPZ:
18950 return PerformCMPZCombine(N, DCI.DAG);
18951 case ARMISD::CSINC:
18952 case ARMISD::CSINV:
18953 case ARMISD::CSNEG:
18954 return PerformCSETCombine(N, DCI.DAG);
18955 case ISD::LOAD:
18956 return PerformLOADCombine(N, DCI, Subtarget);
18957 case ARMISD::VLD1DUP:
18958 case ARMISD::VLD2DUP:
18959 case ARMISD::VLD3DUP:
18960 case ARMISD::VLD4DUP:
18961 return PerformVLDCombine(N, DCI);
18963 return PerformARMBUILD_VECTORCombine(N, DCI);
18964 case ISD::BITCAST:
18965 return PerformBITCASTCombine(N, DCI, Subtarget);
18966 case ARMISD::PREDICATE_CAST:
18967 return PerformPREDICATE_CASTCombine(N, DCI);
18968 case ARMISD::VECTOR_REG_CAST:
18969 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
18970 case ARMISD::MVETRUNC:
18971 return PerformMVETruncCombine(N, DCI);
18972 case ARMISD::MVESEXT:
18973 case ARMISD::MVEZEXT:
18974 return PerformMVEExtCombine(N, DCI);
18975 case ARMISD::VCMP:
18976 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
18977 case ISD::VECREDUCE_ADD:
18978 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
18979 case ARMISD::VADDVs:
18980 case ARMISD::VADDVu:
18981 case ARMISD::VADDLVs:
18982 case ARMISD::VADDLVu:
18983 case ARMISD::VADDLVAs:
18984 case ARMISD::VADDLVAu:
18985 case ARMISD::VMLAVs:
18986 case ARMISD::VMLAVu:
18987 case ARMISD::VMLALVs:
18988 case ARMISD::VMLALVu:
18989 case ARMISD::VMLALVAs:
18990 case ARMISD::VMLALVAu:
18991 return PerformReduceShuffleCombine(N, DCI.DAG);
18992 case ARMISD::VMOVN:
18993 return PerformVMOVNCombine(N, DCI);
18994 case ARMISD::VQMOVNs:
18995 case ARMISD::VQMOVNu:
18996 return PerformVQMOVNCombine(N, DCI);
18997 case ARMISD::VQDMULH:
18998 return PerformVQDMULHCombine(N, DCI);
18999 case ARMISD::ASRL:
19000 case ARMISD::LSRL:
19001 case ARMISD::LSLL:
19002 return PerformLongShiftCombine(N, DCI.DAG);
19003 case ARMISD::SMULWB: {
19004 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19005 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19006 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19007 return SDValue();
19008 break;
19009 }
19010 case ARMISD::SMULWT: {
19011 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19012 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19013 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19014 return SDValue();
19015 break;
19016 }
19017 case ARMISD::SMLALBB:
19018 case ARMISD::QADD16b:
19019 case ARMISD::QSUB16b:
19020 case ARMISD::UQADD16b:
19021 case ARMISD::UQSUB16b: {
19022 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19023 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19024 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19025 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19026 return SDValue();
19027 break;
19028 }
19029 case ARMISD::SMLALBT: {
19030 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
19031 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19032 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
19033 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19034 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
19035 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
19036 return SDValue();
19037 break;
19038 }
19039 case ARMISD::SMLALTB: {
19040 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
19041 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19042 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
19043 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19044 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
19045 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
19046 return SDValue();
19047 break;
19048 }
19049 case ARMISD::SMLALTT: {
19050 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19051 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19052 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19053 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19054 return SDValue();
19055 break;
19056 }
19057 case ARMISD::QADD8b:
19058 case ARMISD::QSUB8b:
19059 case ARMISD::UQADD8b:
19060 case ARMISD::UQSUB8b: {
19061 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19062 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
19063 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19064 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19065 return SDValue();
19066 break;
19067 }
19068 case ARMISD::VBSP:
19069 if (N->getOperand(1) == N->getOperand(2))
19070 return N->getOperand(1);
19071 return SDValue();
19074 switch (N->getConstantOperandVal(1)) {
19075 case Intrinsic::arm_neon_vld1:
19076 case Intrinsic::arm_neon_vld1x2:
19077 case Intrinsic::arm_neon_vld1x3:
19078 case Intrinsic::arm_neon_vld1x4:
19079 case Intrinsic::arm_neon_vld2:
19080 case Intrinsic::arm_neon_vld3:
19081 case Intrinsic::arm_neon_vld4:
19082 case Intrinsic::arm_neon_vld2lane:
19083 case Intrinsic::arm_neon_vld3lane:
19084 case Intrinsic::arm_neon_vld4lane:
19085 case Intrinsic::arm_neon_vld2dup:
19086 case Intrinsic::arm_neon_vld3dup:
19087 case Intrinsic::arm_neon_vld4dup:
19088 case Intrinsic::arm_neon_vst1:
19089 case Intrinsic::arm_neon_vst1x2:
19090 case Intrinsic::arm_neon_vst1x3:
19091 case Intrinsic::arm_neon_vst1x4:
19092 case Intrinsic::arm_neon_vst2:
19093 case Intrinsic::arm_neon_vst3:
19094 case Intrinsic::arm_neon_vst4:
19095 case Intrinsic::arm_neon_vst2lane:
19096 case Intrinsic::arm_neon_vst3lane:
19097 case Intrinsic::arm_neon_vst4lane:
19098 return PerformVLDCombine(N, DCI);
19099 case Intrinsic::arm_mve_vld2q:
19100 case Intrinsic::arm_mve_vld4q:
19101 case Intrinsic::arm_mve_vst2q:
19102 case Intrinsic::arm_mve_vst4q:
19103 return PerformMVEVLDCombine(N, DCI);
19104 default: break;
19105 }
19106 break;
19107 }
19108 return SDValue();
19109}
19110
19112 EVT VT) const {
19113 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
19114}
19115
19117 Align Alignment,
19119 unsigned *Fast) const {
19120 // Depends what it gets converted into if the type is weird.
19121 if (!VT.isSimple())
19122 return false;
19123
19124 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
19125 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
19126 auto Ty = VT.getSimpleVT().SimpleTy;
19127
19128 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
19129 // Unaligned access can use (for example) LRDB, LRDH, LDR
19130 if (AllowsUnaligned) {
19131 if (Fast)
19132 *Fast = Subtarget->hasV7Ops();
19133 return true;
19134 }
19135 }
19136
19137 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
19138 // For any little-endian targets with neon, we can support unaligned ld/st
19139 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
19140 // A big-endian target may also explicitly support unaligned accesses
19141 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
19142 if (Fast)
19143 *Fast = 1;
19144 return true;
19145 }
19146 }
19147
19148 if (!Subtarget->hasMVEIntegerOps())
19149 return false;
19150
19151 // These are for predicates
19152 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
19153 Ty == MVT::v2i1)) {
19154 if (Fast)
19155 *Fast = 1;
19156 return true;
19157 }
19158
19159 // These are for truncated stores/narrowing loads. They are fine so long as
19160 // the alignment is at least the size of the item being loaded
19161 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
19162 Alignment >= VT.getScalarSizeInBits() / 8) {
19163 if (Fast)
19164 *Fast = true;
19165 return true;
19166 }
19167
19168 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
19169 // VSTRW.U32 all store the vector register in exactly the same format, and
19170 // differ only in the range of their immediate offset field and the required
19171 // alignment. So there is always a store that can be used, regardless of
19172 // actual type.
19173 //
19174 // For big endian, that is not the case. But can still emit a (VSTRB.U8;
19175 // VREV64.8) pair and get the same effect. This will likely be better than
19176 // aligning the vector through the stack.
19177 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
19178 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
19179 Ty == MVT::v2f64) {
19180 if (Fast)
19181 *Fast = 1;
19182 return true;
19183 }
19184
19185 return false;
19186}
19187
19189 LLVMContext &Context, const MemOp &Op,
19190 const AttributeList &FuncAttributes) const {
19191 // See if we can use NEON instructions for this...
19192 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
19193 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
19194 unsigned Fast;
19195 if (Op.size() >= 16 &&
19196 (Op.isAligned(Align(16)) ||
19197 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
19199 Fast))) {
19200 return MVT::v2f64;
19201 } else if (Op.size() >= 8 &&
19202 (Op.isAligned(Align(8)) ||
19204 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
19205 Fast))) {
19206 return MVT::f64;
19207 }
19208 }
19209
19210 // Let the target-independent logic figure it out.
19211 return MVT::Other;
19212}
19213
19214// 64-bit integers are split into their high and low parts and held in two
19215// different registers, so the trunc is free since the low register can just
19216// be used.
19217bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19218 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19219 return false;
19220 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19221 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19222 return (SrcBits == 64 && DestBits == 32);
19223}
19224
19226 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19227 !DstVT.isInteger())
19228 return false;
19229 unsigned SrcBits = SrcVT.getSizeInBits();
19230 unsigned DestBits = DstVT.getSizeInBits();
19231 return (SrcBits == 64 && DestBits == 32);
19232}
19233
19235 if (Val.getOpcode() != ISD::LOAD)
19236 return false;
19237
19238 EVT VT1 = Val.getValueType();
19239 if (!VT1.isSimple() || !VT1.isInteger() ||
19240 !VT2.isSimple() || !VT2.isInteger())
19241 return false;
19242
19243 switch (VT1.getSimpleVT().SimpleTy) {
19244 default: break;
19245 case MVT::i1:
19246 case MVT::i8:
19247 case MVT::i16:
19248 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19249 return true;
19250 }
19251
19252 return false;
19253}
19254
19256 if (!VT.isSimple())
19257 return false;
19258
19259 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19260 // negate values directly (fneg is free). So, we don't want to let the DAG
19261 // combiner rewrite fneg into xors and some other instructions. For f16 and
19262 // FullFP16 argument passing, some bitcast nodes may be introduced,
19263 // triggering this DAG combine rewrite, so we are avoiding that with this.
19264 switch (VT.getSimpleVT().SimpleTy) {
19265 default: break;
19266 case MVT::f16:
19267 return Subtarget->hasFullFP16();
19268 }
19269
19270 return false;
19271}
19272
19274 if (!Subtarget->hasMVEIntegerOps())
19275 return nullptr;
19276 Type *SVIType = SVI->getType();
19277 Type *ScalarType = SVIType->getScalarType();
19278
19279 if (ScalarType->isFloatTy())
19280 return Type::getInt32Ty(SVIType->getContext());
19281 if (ScalarType->isHalfTy())
19282 return Type::getInt16Ty(SVIType->getContext());
19283 return nullptr;
19284}
19285
19287 EVT VT = ExtVal.getValueType();
19288
19289 if (!isTypeLegal(VT))
19290 return false;
19291
19292 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19293 if (Ld->isExpandingLoad())
19294 return false;
19295 }
19296
19297 if (Subtarget->hasMVEIntegerOps())
19298 return true;
19299
19300 // Don't create a loadext if we can fold the extension into a wide/long
19301 // instruction.
19302 // If there's more than one user instruction, the loadext is desirable no
19303 // matter what. There can be two uses by the same instruction.
19304 if (ExtVal->use_empty() ||
19305 !ExtVal->user_begin()->isOnlyUserOf(ExtVal.getNode()))
19306 return true;
19307
19308 SDNode *U = *ExtVal->user_begin();
19309 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19310 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19311 return false;
19312
19313 return true;
19314}
19315
19317 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19318 return false;
19319
19320 if (!isTypeLegal(EVT::getEVT(Ty1)))
19321 return false;
19322
19323 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19324
19325 // Assuming the caller doesn't have a zeroext or signext return parameter,
19326 // truncation all the way down to i1 is valid.
19327 return true;
19328}
19329
19330/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19331/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19332/// expanded to FMAs when this method returns true, otherwise fmuladd is
19333/// expanded to fmul + fadd.
19334///
19335/// ARM supports both fused and unfused multiply-add operations; we already
19336/// lower a pair of fmul and fadd to the latter so it's not clear that there
19337/// would be a gain or that the gain would be worthwhile enough to risk
19338/// correctness bugs.
19339///
19340/// For MVE, we set this to true as it helps simplify the need for some
19341/// patterns (and we don't have the non-fused floating point instruction).
19342bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19343 EVT VT) const {
19344 if (Subtarget->useSoftFloat())
19345 return false;
19346
19347 if (!VT.isSimple())
19348 return false;
19349
19350 switch (VT.getSimpleVT().SimpleTy) {
19351 case MVT::v4f32:
19352 case MVT::v8f16:
19353 return Subtarget->hasMVEFloatOps();
19354 case MVT::f16:
19355 return Subtarget->useFPVFMx16();
19356 case MVT::f32:
19357 return Subtarget->useFPVFMx();
19358 case MVT::f64:
19359 return Subtarget->useFPVFMx64();
19360 default:
19361 break;
19362 }
19363
19364 return false;
19365}
19366
19367static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19368 if (V < 0)
19369 return false;
19370
19371 unsigned Scale = 1;
19372 switch (VT.getSimpleVT().SimpleTy) {
19373 case MVT::i1:
19374 case MVT::i8:
19375 // Scale == 1;
19376 break;
19377 case MVT::i16:
19378 // Scale == 2;
19379 Scale = 2;
19380 break;
19381 default:
19382 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19383 // Scale == 4;
19384 Scale = 4;
19385 break;
19386 }
19387
19388 if ((V & (Scale - 1)) != 0)
19389 return false;
19390 return isUInt<5>(V / Scale);
19391}
19392
19393static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19394 const ARMSubtarget *Subtarget) {
19395 if (!VT.isInteger() && !VT.isFloatingPoint())
19396 return false;
19397 if (VT.isVector() && Subtarget->hasNEON())
19398 return false;
19399 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19400 !Subtarget->hasMVEFloatOps())
19401 return false;
19402
19403 bool IsNeg = false;
19404 if (V < 0) {
19405 IsNeg = true;
19406 V = -V;
19407 }
19408
19409 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19410
19411 // MVE: size * imm7
19412 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19413 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19414 case MVT::i32:
19415 case MVT::f32:
19416 return isShiftedUInt<7,2>(V);
19417 case MVT::i16:
19418 case MVT::f16:
19419 return isShiftedUInt<7,1>(V);
19420 case MVT::i8:
19421 return isUInt<7>(V);
19422 default:
19423 return false;
19424 }
19425 }
19426
19427 // half VLDR: 2 * imm8
19428 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19429 return isShiftedUInt<8, 1>(V);
19430 // VLDR and LDRD: 4 * imm8
19431 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19432 return isShiftedUInt<8, 2>(V);
19433
19434 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19435 // + imm12 or - imm8
19436 if (IsNeg)
19437 return isUInt<8>(V);
19438 return isUInt<12>(V);
19439 }
19440
19441 return false;
19442}
19443
19444/// isLegalAddressImmediate - Return true if the integer value can be used
19445/// as the offset of the target addressing mode for load / store of the
19446/// given type.
19447static bool isLegalAddressImmediate(int64_t V, EVT VT,
19448 const ARMSubtarget *Subtarget) {
19449 if (V == 0)
19450 return true;
19451
19452 if (!VT.isSimple())
19453 return false;
19454
19455 if (Subtarget->isThumb1Only())
19456 return isLegalT1AddressImmediate(V, VT);
19457 else if (Subtarget->isThumb2())
19458 return isLegalT2AddressImmediate(V, VT, Subtarget);
19459
19460 // ARM mode.
19461 if (V < 0)
19462 V = - V;
19463 switch (VT.getSimpleVT().SimpleTy) {
19464 default: return false;
19465 case MVT::i1:
19466 case MVT::i8:
19467 case MVT::i32:
19468 // +- imm12
19469 return isUInt<12>(V);
19470 case MVT::i16:
19471 // +- imm8
19472 return isUInt<8>(V);
19473 case MVT::f32:
19474 case MVT::f64:
19475 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19476 return false;
19477 return isShiftedUInt<8, 2>(V);
19478 }
19479}
19480
19482 EVT VT) const {
19483 int Scale = AM.Scale;
19484 if (Scale < 0)
19485 return false;
19486
19487 switch (VT.getSimpleVT().SimpleTy) {
19488 default: return false;
19489 case MVT::i1:
19490 case MVT::i8:
19491 case MVT::i16:
19492 case MVT::i32:
19493 if (Scale == 1)
19494 return true;
19495 // r + r << imm
19496 Scale = Scale & ~1;
19497 return Scale == 2 || Scale == 4 || Scale == 8;
19498 case MVT::i64:
19499 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19500 // version in Thumb mode.
19501 // r + r
19502 if (Scale == 1)
19503 return true;
19504 // r * 2 (this can be lowered to r + r).
19505 if (!AM.HasBaseReg && Scale == 2)
19506 return true;
19507 return false;
19508 case MVT::isVoid:
19509 // Note, we allow "void" uses (basically, uses that aren't loads or
19510 // stores), because arm allows folding a scale into many arithmetic
19511 // operations. This should be made more precise and revisited later.
19512
19513 // Allow r << imm, but the imm has to be a multiple of two.
19514 if (Scale & 1) return false;
19515 return isPowerOf2_32(Scale);
19516 }
19517}
19518
19520 EVT VT) const {
19521 const int Scale = AM.Scale;
19522
19523 // Negative scales are not supported in Thumb1.
19524 if (Scale < 0)
19525 return false;
19526
19527 // Thumb1 addressing modes do not support register scaling excepting the
19528 // following cases:
19529 // 1. Scale == 1 means no scaling.
19530 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19531 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19532}
19533
19534/// isLegalAddressingMode - Return true if the addressing mode represented
19535/// by AM is legal for this target, for a load/store of the specified type.
19537 const AddrMode &AM, Type *Ty,
19538 unsigned AS, Instruction *I) const {
19539 EVT VT = getValueType(DL, Ty, true);
19540 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19541 return false;
19542
19543 // Can never fold addr of global into load/store.
19544 if (AM.BaseGV)
19545 return false;
19546
19547 switch (AM.Scale) {
19548 case 0: // no scale reg, must be "r+i" or "r", or "i".
19549 break;
19550 default:
19551 // ARM doesn't support any R+R*scale+imm addr modes.
19552 if (AM.BaseOffs)
19553 return false;
19554
19555 if (!VT.isSimple())
19556 return false;
19557
19558 if (Subtarget->isThumb1Only())
19559 return isLegalT1ScaledAddressingMode(AM, VT);
19560
19561 if (Subtarget->isThumb2())
19562 return isLegalT2ScaledAddressingMode(AM, VT);
19563
19564 int Scale = AM.Scale;
19565 switch (VT.getSimpleVT().SimpleTy) {
19566 default: return false;
19567 case MVT::i1:
19568 case MVT::i8:
19569 case MVT::i32:
19570 if (Scale < 0) Scale = -Scale;
19571 if (Scale == 1)
19572 return true;
19573 // r + r << imm
19574 return isPowerOf2_32(Scale & ~1);
19575 case MVT::i16:
19576 case MVT::i64:
19577 // r +/- r
19578 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19579 return true;
19580 // r * 2 (this can be lowered to r + r).
19581 if (!AM.HasBaseReg && Scale == 2)
19582 return true;
19583 return false;
19584
19585 case MVT::isVoid:
19586 // Note, we allow "void" uses (basically, uses that aren't loads or
19587 // stores), because arm allows folding a scale into many arithmetic
19588 // operations. This should be made more precise and revisited later.
19589
19590 // Allow r << imm, but the imm has to be a multiple of two.
19591 if (Scale & 1) return false;
19592 return isPowerOf2_32(Scale);
19593 }
19594 }
19595 return true;
19596}
19597
19598/// isLegalICmpImmediate - Return true if the specified immediate is legal
19599/// icmp immediate, that is the target has icmp instructions which can compare
19600/// a register against the immediate without having to materialize the
19601/// immediate into a register.
19603 // Thumb2 and ARM modes can use cmn for negative immediates.
19604 if (!Subtarget->isThumb())
19605 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19606 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19607 if (Subtarget->isThumb2())
19608 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19609 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19610 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19611 return Imm >= 0 && Imm <= 255;
19612}
19613
19614/// isLegalAddImmediate - Return true if the specified immediate is a legal add
19615/// *or sub* immediate, that is the target has add or sub instructions which can
19616/// add a register with the immediate without having to materialize the
19617/// immediate into a register.
19619 // Same encoding for add/sub, just flip the sign.
19620 uint64_t AbsImm = AbsoluteValue(Imm);
19621 if (!Subtarget->isThumb())
19622 return ARM_AM::getSOImmVal(AbsImm) != -1;
19623 if (Subtarget->isThumb2())
19624 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19625 // Thumb1 only has 8-bit unsigned immediate.
19626 return AbsImm <= 255;
19627}
19628
19629// Return false to prevent folding
19630// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19631// if the folding leads to worse code.
19633 SDValue ConstNode) const {
19634 // Let the DAGCombiner decide for vector types and large types.
19635 const EVT VT = AddNode.getValueType();
19636 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19637 return true;
19638
19639 // It is worse if c0 is legal add immediate, while c1*c0 is not
19640 // and has to be composed by at least two instructions.
19641 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19642 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19643 const int64_t C0 = C0Node->getSExtValue();
19644 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
19646 return true;
19647 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19648 return false;
19649
19650 // Default to true and let the DAGCombiner decide.
19651 return true;
19652}
19653
19655 bool isSEXTLoad, SDValue &Base,
19656 SDValue &Offset, bool &isInc,
19657 SelectionDAG &DAG) {
19658 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19659 return false;
19660
19661 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19662 // AddressingMode 3
19663 Base = Ptr->getOperand(0);
19665 int RHSC = (int)RHS->getZExtValue();
19666 if (RHSC < 0 && RHSC > -256) {
19667 assert(Ptr->getOpcode() == ISD::ADD);
19668 isInc = false;
19669 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19670 return true;
19671 }
19672 }
19673 isInc = (Ptr->getOpcode() == ISD::ADD);
19674 Offset = Ptr->getOperand(1);
19675 return true;
19676 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19677 // AddressingMode 2
19679 int RHSC = (int)RHS->getZExtValue();
19680 if (RHSC < 0 && RHSC > -0x1000) {
19681 assert(Ptr->getOpcode() == ISD::ADD);
19682 isInc = false;
19683 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19684 Base = Ptr->getOperand(0);
19685 return true;
19686 }
19687 }
19688
19689 if (Ptr->getOpcode() == ISD::ADD) {
19690 isInc = true;
19691 ARM_AM::ShiftOpc ShOpcVal=
19693 if (ShOpcVal != ARM_AM::no_shift) {
19694 Base = Ptr->getOperand(1);
19695 Offset = Ptr->getOperand(0);
19696 } else {
19697 Base = Ptr->getOperand(0);
19698 Offset = Ptr->getOperand(1);
19699 }
19700 return true;
19701 }
19702
19703 isInc = (Ptr->getOpcode() == ISD::ADD);
19704 Base = Ptr->getOperand(0);
19705 Offset = Ptr->getOperand(1);
19706 return true;
19707 }
19708
19709 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19710 return false;
19711}
19712
19714 bool isSEXTLoad, SDValue &Base,
19715 SDValue &Offset, bool &isInc,
19716 SelectionDAG &DAG) {
19717 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19718 return false;
19719
19720 Base = Ptr->getOperand(0);
19722 int RHSC = (int)RHS->getZExtValue();
19723 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19724 assert(Ptr->getOpcode() == ISD::ADD);
19725 isInc = false;
19726 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19727 return true;
19728 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19729 isInc = Ptr->getOpcode() == ISD::ADD;
19730 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19731 return true;
19732 }
19733 }
19734
19735 return false;
19736}
19737
19738static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19739 bool isSEXTLoad, bool IsMasked, bool isLE,
19741 bool &isInc, SelectionDAG &DAG) {
19742 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19743 return false;
19744 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19745 return false;
19746
19747 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19748 // as opposed to a vldrw.32). This can allow extra addressing modes or
19749 // alignments for what is otherwise an equivalent instruction.
19750 bool CanChangeType = isLE && !IsMasked;
19751
19753 int RHSC = (int)RHS->getZExtValue();
19754
19755 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19756 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19757 assert(Ptr->getOpcode() == ISD::ADD);
19758 isInc = false;
19759 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19760 return true;
19761 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19762 isInc = Ptr->getOpcode() == ISD::ADD;
19763 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19764 return true;
19765 }
19766 return false;
19767 };
19768
19769 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19770 // (in BE/masked) type.
19771 Base = Ptr->getOperand(0);
19772 if (VT == MVT::v4i16) {
19773 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19774 return true;
19775 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19776 if (IsInRange(RHSC, 0x80, 1))
19777 return true;
19778 } else if (Alignment >= 4 &&
19779 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19780 IsInRange(RHSC, 0x80, 4))
19781 return true;
19782 else if (Alignment >= 2 &&
19783 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19784 IsInRange(RHSC, 0x80, 2))
19785 return true;
19786 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19787 return true;
19788 return false;
19789}
19790
19791/// getPreIndexedAddressParts - returns true by value, base pointer and
19792/// offset pointer and addressing mode by reference if the node's address
19793/// can be legally represented as pre-indexed load / store address.
19794bool
19796 SDValue &Offset,
19798 SelectionDAG &DAG) const {
 // Thumb-1 has no pre-indexed addressing modes at all.
19799 if (Subtarget->isThumb1Only())
19800 return false;
19801
 // Collect the memory operand's properties from whichever flavour of memory
 // node N is: plain load, plain store, masked (MVE) load, or masked store.
19802 EVT VT;
19803 SDValue Ptr;
19804 Align Alignment;
19805 unsigned AS = 0;
19806 bool isSEXTLoad = false;
19807 bool IsMasked = false;
19808 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19809 Ptr = LD->getBasePtr();
19810 VT = LD->getMemoryVT();
19811 Alignment = LD->getAlign();
19812 AS = LD->getAddressSpace();
19813 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19814 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19815 Ptr = ST->getBasePtr();
19816 VT = ST->getMemoryVT();
19817 Alignment = ST->getAlign();
19818 AS = ST->getAddressSpace();
19819 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19820 Ptr = LD->getBasePtr();
19821 VT = LD->getMemoryVT();
19822 Alignment = LD->getAlign();
19823 AS = LD->getAddressSpace();
19824 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19825 IsMasked = true;
19827 Ptr = ST->getBasePtr();
19828 VT = ST->getMemoryVT();
19829 Alignment = ST->getAlign();
19830 AS = ST->getAddressSpace();
19831 IsMasked = true;
19832 } else
19833 return false;
19834
19835 unsigned Fast = 0;
19836 if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment,
19838 // Only generate post-increment or pre-increment forms when a real
19839 // hardware instruction exists for them. Do not emit postinc/preinc
19840 // if the operation will end up as a libcall.
19841 return false;
19842 }
19843
 // Ask the appropriate sub-target helper whether this address splits into a
 // legal base + pre-indexed offset, and whether it increments or decrements.
19844 bool isInc;
19845 bool isLegal = false;
19846 if (VT.isVector())
19847 isLegal = Subtarget->hasMVEIntegerOps() &&
19849 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19850 Subtarget->isLittle(), Base, Offset, isInc, DAG);
19851 else {
19852 if (Subtarget->isThumb2())
19853 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19854 Offset, isInc, DAG);
19855 else
19856 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19857 Offset, isInc, DAG);
19858 }
19859 if (!isLegal)
19860 return false;
19861
 // Report whether the pre-index adds or subtracts the offset.
19862 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
19863 return true;
19864}
19865
19866/// getPostIndexedAddressParts - returns true by value, base pointer and
19867/// offset pointer and addressing mode by reference if this node can be
19868/// combined with a load / store to form a post-indexed load / store.
19870 SDValue &Base,
19871 SDValue &Offset,
19873 SelectionDAG &DAG) const {
 // Collect the memory operand's properties from whichever flavour of memory
 // node N is: plain load, plain store, masked (MVE) load, or masked store.
19874 EVT VT;
19875 SDValue Ptr;
19876 Align Alignment;
19877 bool isSEXTLoad = false, isNonExt;
19878 bool IsMasked = false;
19879 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19880 VT = LD->getMemoryVT();
19881 Ptr = LD->getBasePtr();
19882 Alignment = LD->getAlign();
19883 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19884 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19885 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19886 VT = ST->getMemoryVT();
19887 Ptr = ST->getBasePtr();
19888 Alignment = ST->getAlign();
19889 isNonExt = !ST->isTruncatingStore();
19890 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19891 VT = LD->getMemoryVT();
19892 Ptr = LD->getBasePtr();
19893 Alignment = LD->getAlign();
19894 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19895 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19896 IsMasked = true;
19898 VT = ST->getMemoryVT();
19899 Ptr = ST->getBasePtr();
19900 Alignment = ST->getAlign();
19901 isNonExt = !ST->isTruncatingStore();
19902 IsMasked = true;
19903 } else
19904 return false;
19905
19906 if (Subtarget->isThumb1Only()) {
19907 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
19908 // must be non-extending/truncating, i32, with an offset of 4.
19909 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
19910 if (Op->getOpcode() != ISD::ADD || !isNonExt)
19911 return false;
19912 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
19913 if (!RHS || RHS->getZExtValue() != 4)
19914 return false;
19915 if (Alignment < Align(4))
19916 return false;
19917
19918 Offset = Op->getOperand(1);
19919 Base = Op->getOperand(0);
19920 AM = ISD::POST_INC;
19921 return true;
19922 }
19923
 // Ask the appropriate sub-target helper whether Op can serve as a legal
 // post-indexed base update, and whether it increments or decrements.
19924 bool isInc;
19925 bool isLegal = false;
19926 if (VT.isVector())
19927 isLegal = Subtarget->hasMVEIntegerOps() &&
19928 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
19929 Subtarget->isLittle(), Base, Offset,
19930 isInc, DAG);
19931 else {
19932 if (Subtarget->isThumb2())
19933 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19934 isInc, DAG);
19935 else
19936 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19937 isInc, DAG);
19938 }
19939 if (!isLegal)
19940 return false;
19941
19942 if (Ptr != Base) {
19943 // Swap base ptr and offset to catch more post-index load / store when
19944 // it's legal. In Thumb2 mode, offset must be an immediate.
19945 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
19946 !Subtarget->isThumb2())
19948
19949 // Post-indexed load / store update the base pointer.
19950 if (Ptr != Base)
19951 return false;
19952 }
19953
 // Report whether the post-index adds or subtracts the offset.
19954 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
19955 return true;
19956}
19957
19959 KnownBits &Known,
19960 const APInt &DemandedElts,
19961 const SelectionDAG &DAG,
19962 unsigned Depth) const {
 // Compute known-zero/known-one bits for ARM-specific ISD nodes; default is
 // "nothing known".
19963 unsigned BitWidth = Known.getBitWidth();
19964 Known.resetAll();
19965 switch (Op.getOpcode()) {
19966 default: break;
19967 case ARMISD::ADDC:
19968 case ARMISD::ADDE:
19969 case ARMISD::SUBC:
19970 case ARMISD::SUBE:
19971 // Special cases when we convert a carry to a boolean.
19972 if (Op.getResNo() == 0) {
19973 SDValue LHS = Op.getOperand(0);
19974 SDValue RHS = Op.getOperand(1);
19975 // (ADDE 0, 0, C) will give us a single bit.
19976 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
19977 isNullConstant(RHS)) {
19979 return;
19980 }
19981 }
19982 break;
19983 case ARMISD::CMOV: {
19984 // Bits are known zero/one if known on the LHS and RHS.
19985 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
19986 if (Known.isUnknown())
19987 return;
19988
 // Only bits agreed upon by both arms of the conditional move survive.
19989 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
19990 Known = Known.intersectWith(KnownRHS);
19991 return;
19992 }
19994 Intrinsic::ID IntID =
19995 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
19996 switch (IntID) {
19997 default: return;
19998 case Intrinsic::arm_ldaex:
19999 case Intrinsic::arm_ldrex: {
 // Exclusive loads zero-extend from the memory width, so all bits above
 // the loaded width are known zero.
20000 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
20001 unsigned MemBits = VT.getScalarSizeInBits();
20002 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
20003 return;
20004 }
20005 }
20006 }
20007 case ARMISD::BFI: {
20008 // Conservatively, we can recurse down the first operand
20009 // and just mask out all affected bits.
20010 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20011
20012 // The operand to BFI is already a mask suitable for removing the bits it
20013 // sets.
20014 const APInt &Mask = Op.getConstantOperandAPInt(2);
20015 Known.Zero &= Mask;
20016 Known.One &= Mask;
20017 return;
20018 }
20019 case ARMISD::VGETLANEs:
20020 case ARMISD::VGETLANEu: {
20021 const SDValue &SrcSV = Op.getOperand(0);
20022 EVT VecVT = SrcSV.getValueType();
20023 assert(VecVT.isVector() && "VGETLANE expected a vector type");
20024 const unsigned NumSrcElts = VecVT.getVectorNumElements();
20025 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
20026 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
20027 "VGETLANE index out of bounds");
 // Only the extracted lane contributes to the known bits of the result.
20028 unsigned Idx = Pos->getZExtValue();
20029 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
20030 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
20031
 // Widen the lane's known bits to the scalar result type, following the
 // signed (sext) or unsigned (zext) semantics of the VGETLANE variant.
20032 EVT VT = Op.getValueType();
20033 const unsigned DstSz = VT.getScalarSizeInBits();
20034 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
20035 (void)SrcSz;
20036 assert(SrcSz == Known.getBitWidth());
20037 assert(DstSz > SrcSz);
20038 if (Op.getOpcode() == ARMISD::VGETLANEs)
20039 Known = Known.sext(DstSz);
20040 else {
20041 Known = Known.zext(DstSz);
20042 }
20043 assert(DstSz == Known.getBitWidth());
20044 break;
20045 }
20046 case ARMISD::VMOVrh: {
 // VMOVrh moves a half-precision value into the low 16 bits of a core
 // register, zeroing the upper half.
20047 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20048 assert(KnownOp.getBitWidth() == 16);
20049 Known = KnownOp.zext(32);
20050 break;
20051 }
20052 case ARMISD::CSINC:
20053 case ARMISD::CSINV:
20054 case ARMISD::CSNEG: {
20055 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20056 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
20057
20058 // The result is either:
20059 // CSINC: KnownOp0 or KnownOp1 + 1
20060 // CSINV: KnownOp0 or ~KnownOp1
20061 // CSNEG: KnownOp0 or KnownOp1 * -1
20062 if (Op.getOpcode() == ARMISD::CSINC)
20063 KnownOp1 =
20064 KnownBits::add(KnownOp1, KnownBits::makeConstant(APInt(32, 1)))
20065 else if (Op.getOpcode() == ARMISD::CSINV)
20066 std::swap(KnownOp1.Zero, KnownOp1.One);
20067 else if (Op.getOpcode() == ARMISD::CSNEG)
20068 KnownOp1 = KnownBits::mul(KnownOp1,
20070
 // Either arm may be selected at run time, so only agreed bits are known.
20071 Known = KnownOp0.intersectWith(KnownOp1);
20072 break;
20073 }
20074 case ARMISD::VORRIMM:
20075 case ARMISD::VBICIMM: {
20076 unsigned Encoded = Op.getConstantOperandVal(1);
20077 unsigned DecEltBits = 0;
20078 uint64_t DecodedVal = ARM_AM::decodeVMOVModImm(Encoded, DecEltBits);
20079
20080 unsigned EltBits = Op.getScalarValueSizeInBits();
20081 if (EltBits != DecEltBits) {
20082 // Be conservative: only update Known when EltBits == DecEltBits.
20083 // This is believed to always be true for VORRIMM/VBICIMM today, but if
20084 // that changes in the future, doing nothing here is safer than risking
20085 // subtle bugs.
20086 break;
20087 }
20088
20089 KnownBits KnownLHS = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20090 bool IsVORR = Op.getOpcode() == ARMISD::VORRIMM;
20091 APInt Imm(DecEltBits, DecodedVal);
20092
 // ORR with an immediate forces the Imm bits to one; BIC forces them to
 // zero. The remaining bits keep whatever was known about the LHS.
20093 Known.One = IsVORR ? (KnownLHS.One | Imm) : (KnownLHS.One & ~Imm);
20094 Known.Zero = IsVORR ? (KnownLHS.Zero & ~Imm) : (KnownLHS.Zero | Imm);
20095 break;
20096 }
20097 }
20098}
20099
20101 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
20102 TargetLoweringOpt &TLO) const {
 // Try to replace the constant mask of an i32 AND with one that encodes more
 // cheaply on ARM/Thumb (uxtb/uxth or a small movs/bics immediate), given
 // that only DemandedBits of the result are actually read.
20103 // Delay optimization, so we don't have to deal with illegal types, or block
20104 // optimizations.
20105 if (!TLO.LegalOps)
20106 return false;
20107
20108 // Only optimize AND for now.
20109 if (Op.getOpcode() != ISD::AND)
20110 return false;
20111
20112 EVT VT = Op.getValueType();
20113
20114 // Ignore vectors.
20115 if (VT.isVector())
20116 return false;
20117
20118 assert(VT == MVT::i32 && "Unexpected integer type");
20119
20120 // Make sure the RHS really is a constant.
20121 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20122 if (!C)
20123 return false;
20124
20125 unsigned Mask = C->getZExtValue();
20126
 // ShrunkMask clears non-demanded bits; ExpandedMask sets them. Any mask
 // lying between the two is semantically equivalent to the original.
20127 unsigned Demanded = DemandedBits.getZExtValue();
20128 unsigned ShrunkMask = Mask & Demanded;
20129 unsigned ExpandedMask = Mask | ~Demanded;
20130
20131 // If the mask is all zeros, let the target-independent code replace the
20132 // result with zero.
20133 if (ShrunkMask == 0)
20134 return false;
20135
20136 // If the mask is all ones, erase the AND. (Currently, the target-independent
20137 // code won't do this, so we have to do it explicitly to avoid an infinite
20138 // loop in obscure cases.)
20139 if (ExpandedMask == ~0U)
20140 return TLO.CombineTo(Op, Op.getOperand(0));
20141
 // A candidate is usable iff it covers every demanded set bit and sets no
 // bit outside ExpandedMask.
20142 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
20143 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
20144 };
20145 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
20146 if (NewMask == Mask)
20147 return true;
20148 SDLoc DL(Op);
20149 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
20150 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
20151 return TLO.CombineTo(Op, NewOp);
20152 };
20153
20154 // Prefer uxtb mask.
20155 if (IsLegalMask(0xFF))
20156 return UseMask(0xFF);
20157
20158 // Prefer uxth mask.
20159 if (IsLegalMask(0xFFFF))
20160 return UseMask(0xFFFF);
20161
20162 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
20163 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20164 if (ShrunkMask < 256)
20165 return UseMask(ShrunkMask);
20166
20167 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
20168 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20169 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
20170 return UseMask(ExpandedMask);
20171
20172 // Potential improvements:
20173 //
20174 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
20175 // We could try to prefer Thumb1 immediates which can be lowered to a
20176 // two-instruction sequence.
20177 // We could try to recognize more legal ARM/Thumb2 immediates here.
20178
20179 return false;
20180}
20181
20183 SDValue Op, const APInt &OriginalDemandedBits,
20184 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
20185 unsigned Depth) const {
 // ARM-specific demanded-bits simplifications for long shifts and VBICIMM.
20186 unsigned Opc = Op.getOpcode();
20187
20188 switch (Opc) {
20189 case ARMISD::ASRL:
20190 case ARMISD::LSRL: {
20191 // If this is result 0 and the other result is unused, see if the demand
20192 // bits allow us to shrink this long shift into a standard small shift in
20193 // the opposite direction.
20194 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
20195 isa<ConstantSDNode>(Op->getOperand(2))) {
20196 unsigned ShAmt = Op->getConstantOperandVal(2);
20197 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
20198 << (32 - ShAmt)))
20199 return TLO.CombineTo(
20200 Op, TLO.DAG.getNode(
20201 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
20202 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
20203 }
20204 break;
20205 }
20206 case ARMISD::VBICIMM: {
 // If none of the bits the BIC immediate would clear are demanded, the
 // BIC is a no-op and can be replaced by its input.
20207 SDValue Op0 = Op.getOperand(0);
20208 unsigned ModImm = Op.getConstantOperandVal(1);
20209 unsigned EltBits = 0;
20210 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
20211 if ((OriginalDemandedBits & Mask) == 0)
20212 return TLO.CombineTo(Op, Op0);
20213 }
20214 }
20215
 // Fall back to the generic target-independent handling.
20217 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
20218}
20219
20220//===----------------------------------------------------------------------===//
20221// ARM Inline Assembly Support
20222//===----------------------------------------------------------------------===//
20223
20224const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
20225 // At this point, we have to lower this constraint to something else, so we
20226 // lower it to an "r" or "w". However, by doing this we will force the result
20227 // to be in register, while the X constraint is much more permissive.
20228 //
20229 // Although we are correct (we are free to emit anything, without
20230 // constraints), we might break use cases that would expect us to be more
20231 // efficient and emit something else.
20232 if (!Subtarget->hasVFP2Base())
20233 return "r";
20234 if (ConstraintVT.isFloatingPoint())
20235 return "w";
20236 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20237 (ConstraintVT.getSizeInBits() == 64 ||
20238 ConstraintVT.getSizeInBits() == 128))
20239 return "w";
20240
20241 return "r";
20242}
20243
20244/// getConstraintType - Given a constraint letter, return the type of
20245/// constraint it is for this target.
20248 unsigned S = Constraint.size();
20249 if (S == 1) {
 // Single-letter GCC ARM constraints.
20250 switch (Constraint[0]) {
20251 default: break;
20252 case 'l': return C_RegisterClass;
20253 case 'w': return C_RegisterClass;
20254 case 'h': return C_RegisterClass;
20255 case 'x': return C_RegisterClass;
20256 case 't': return C_RegisterClass;
20257 case 'j': return C_Immediate; // Constant for movw.
20258 // An address with a single base register. Due to the way we
20259 // currently handle addresses it is the same as an 'r' memory constraint.
20260 case 'Q': return C_Memory;
20261 }
20262 } else if (S == 2) {
 // Two-letter constraints: 'Te'/'To' register classes and 'U*' addresses.
20263 switch (Constraint[0]) {
20264 default: break;
20265 case 'T': return C_RegisterClass;
20266 // All 'U+' constraints are addresses.
20267 case 'U': return C_Memory;
20268 }
20269 }
 // Anything else is handled by the generic implementation.
20270 return TargetLowering::getConstraintType(Constraint);
20271}
20272
20273/// Examine constraint type and operand type and determine a weight value.
20274/// This object must already have been set up with the operand type
20275/// and the current alternative constraint selected.
20278 AsmOperandInfo &info, const char *constraint) const {
20280 Value *CallOperandVal = info.CallOperandVal;
20281 // If we don't have a value, we can't do a match,
20282 // but allow it at the lowest weight.
20283 if (!CallOperandVal)
20284 return CW_Default;
20285 Type *type = CallOperandVal->getType();
20286 // Look at the constraint type.
20287 switch (*constraint) {
20288 default:
20290 break;
20291 case 'l':
 // 'l' (low registers) matches integers; it is a strong (specific) match
 // on Thumb where only the low registers are generally usable.
20292 if (type->isIntegerTy()) {
20293 if (Subtarget->isThumb())
20294 weight = CW_SpecificReg;
20295 else
20296 weight = CW_Register;
20297 }
20298 break;
20299 case 'w':
 // 'w' (VFP/NEON registers) only matches floating-point operands.
20300 if (type->isFloatingPointTy())
20301 weight = CW_Register;
20302 break;
20303 }
20304 return weight;
20305}
20306
20307static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
20308 if (PR == 0 || VT == MVT::Other)
20309 return false;
20310 if (ARM::SPRRegClass.contains(PR))
20311 return VT != MVT::f32 && VT != MVT::f16 && VT != MVT::i32;
20312 if (ARM::DPRRegClass.contains(PR))
20313 return VT != MVT::f64 && !VT.is64BitVector();
20314 return false;
20315}
20316
// Shorthand for the (register number, register class) pair produced when
// resolving an inline-asm register constraint.
20317using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20318
20320 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
 // Map an inline-asm register constraint to a (register, register class)
 // pair; class-only results use register number 0.
20321 switch (Constraint.size()) {
20322 case 1:
20323 // GCC ARM Constraint Letters
20324 switch (Constraint[0]) {
20325 case 'l': // Low regs or general regs.
20326 if (Subtarget->isThumb())
20327 return RCPair(0U, &ARM::tGPRRegClass);
20328 return RCPair(0U, &ARM::GPRRegClass);
20329 case 'h': // High regs or no regs.
20330 if (Subtarget->isThumb())
20331 return RCPair(0U, &ARM::hGPRRegClass);
20332 break;
20333 case 'r':
20334 if (Subtarget->isThumb1Only())
20335 return RCPair(0U, &ARM::tGPRRegClass);
20336 return RCPair(0U, &ARM::GPRRegClass);
20337 case 'w':
 // 'w': any VFP/NEON register, selected by the operand's size.
20338 if (VT == MVT::Other)
20339 break;
20340 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20341 return RCPair(0U, &ARM::SPRRegClass);
20342 if (VT.getSizeInBits() == 64)
20343 return RCPair(0U, &ARM::DPRRegClass);
20344 if (VT.getSizeInBits() == 128)
20345 return RCPair(0U, &ARM::QPRRegClass);
20346 break;
20347 case 'x':
 // 'x': restricted to the lower VFP/NEON registers.
20348 if (VT == MVT::Other)
20349 break;
20350 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20351 return RCPair(0U, &ARM::SPR_8RegClass);
20352 if (VT.getSizeInBits() == 64)
20353 return RCPair(0U, &ARM::DPR_8RegClass);
20354 if (VT.getSizeInBits() == 128)
20355 return RCPair(0U, &ARM::QPR_8RegClass);
20356 break;
20357 case 't':
 // 't': VFP2-compatible registers (also allows an i32 in an SPR).
20358 if (VT == MVT::Other)
20359 break;
20360 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20361 return RCPair(0U, &ARM::SPRRegClass);
20362 if (VT.getSizeInBits() == 64)
20363 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20364 if (VT.getSizeInBits() == 128)
20365 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20366 break;
20367 }
20368 break;
20369
20370 case 2:
 // 'Te'/'To': even/odd-numbered low GPRs (used by some MVE sequences).
20371 if (Constraint[0] == 'T') {
20372 switch (Constraint[1]) {
20373 default:
20374 break;
20375 case 'e':
20376 return RCPair(0U, &ARM::tGPREvenRegClass);
20377 case 'o':
20378 return RCPair(0U, &ARM::tGPROddRegClass);
20379 }
20380 }
20381 break;
20382
20383 default:
20384 break;
20385 }
20386
20387 if (StringRef("{cc}").equals_insensitive(Constraint))
20388 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20389
20390 // r14 is an alias of lr.
20391 if (StringRef("{r14}").equals_insensitive(Constraint))
20392 return std::make_pair(unsigned(ARM::LR), getRegClassFor(MVT::i32));
20393
 // Defer to the generic resolver, but reject combinations where the chosen
 // physical register cannot actually hold a value of type VT.
20394 auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20395 if (isIncompatibleReg(RCP.first, VT))
20396 return {0, nullptr};
20397 return RCP;
20398}
20399
20400/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20401/// vector. If it is invalid, don't add anything to Ops.
20403 StringRef Constraint,
20404 std::vector<SDValue> &Ops,
20405 SelectionDAG &DAG) const {
20406 SDValue Result;
20407
20408 // Currently only support length 1 constraints.
20409 if (Constraint.size() != 1)
20410 return;
20411
20412 char ConstraintLetter = Constraint[0];
20413 switch (ConstraintLetter) {
20414 default: break;
20415 case 'j':
20416 case 'I': case 'J': case 'K': case 'L':
20417 case 'M': case 'N': case 'O':
20419 if (!C)
20420 return;
20421
20422 int64_t CVal64 = C->getSExtValue();
20423 int CVal = (int) CVal64;
20424 // None of these constraints allow values larger than 32 bits. Check
20425 // that the value fits in an int.
20426 if (CVal != CVal64)
20427 return;
20428
 // Validate the constant against the specific constraint letter; each
 // "break" below accepts the value, each "return" rejects it.
20429 switch (ConstraintLetter) {
20430 case 'j':
20431 // Constant suitable for movw, must be between 0 and
20432 // 65535.
20433 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20434 if (CVal >= 0 && CVal <= 65535)
20435 break;
20436 return;
20437 case 'I':
20438 if (Subtarget->isThumb1Only()) {
20439 // This must be a constant between 0 and 255, for ADD
20440 // immediates.
20441 if (CVal >= 0 && CVal <= 255)
20442 break;
20443 } else if (Subtarget->isThumb2()) {
20444 // A constant that can be used as an immediate value in a
20445 // data-processing instruction.
20446 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20447 break;
20448 } else {
20449 // A constant that can be used as an immediate value in a
20450 // data-processing instruction.
20451 if (ARM_AM::getSOImmVal(CVal) != -1)
20452 break;
20453 }
20454 return;
20455
20456 case 'J':
20457 if (Subtarget->isThumb1Only()) {
20458 // This must be a constant between -255 and -1, for negated ADD
20459 // immediates. This can be used in GCC with an "n" modifier that
20460 // prints the negated value, for use with SUB instructions. It is
20461 // not useful otherwise but is implemented for compatibility.
20462 if (CVal >= -255 && CVal <= -1)
20463 break;
20464 } else {
20465 // This must be a constant between -4095 and 4095. This is suitable
20466 // for use as the immediate offset field in LDR and STR instructions
20467 // such as LDR r0,[r1,#offset].
20468 if (CVal >= -4095 && CVal <= 4095)
20469 break;
20470 }
20471 return;
20472
20473 case 'K':
20474 if (Subtarget->isThumb1Only()) {
20475 // A 32-bit value where only one byte has a nonzero value. Exclude
20476 // zero to match GCC. This constraint is used by GCC internally for
20477 // constants that can be loaded with a move/shift combination.
20478 // It is not useful otherwise but is implemented for compatibility.
20479 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20480 break;
20481 } else if (Subtarget->isThumb2()) {
20482 // A constant whose bitwise inverse can be used as an immediate
20483 // value in a data-processing instruction. This can be used in GCC
20484 // with a "B" modifier that prints the inverted value, for use with
20485 // BIC and MVN instructions. It is not useful otherwise but is
20486 // implemented for compatibility.
20487 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20488 break;
20489 } else {
20490 // A constant whose bitwise inverse can be used as an immediate
20491 // value in a data-processing instruction. This can be used in GCC
20492 // with a "B" modifier that prints the inverted value, for use with
20493 // BIC and MVN instructions. It is not useful otherwise but is
20494 // implemented for compatibility.
20495 if (ARM_AM::getSOImmVal(~CVal) != -1)
20496 break;
20497 }
20498 return;
20499
20500 case 'L':
20501 if (Subtarget->isThumb1Only()) {
20502 // This must be a constant between -7 and 7,
20503 // for 3-operand ADD/SUB immediate instructions.
20504 if (CVal >= -7 && CVal < 7)
20505 break;
20506 } else if (Subtarget->isThumb2()) {
20507 // A constant whose negation can be used as an immediate value in a
20508 // data-processing instruction. This can be used in GCC with an "n"
20509 // modifier that prints the negated value, for use with SUB
20510 // instructions. It is not useful otherwise but is implemented for
20511 // compatibility.
20512 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20513 break;
20514 } else {
20515 // A constant whose negation can be used as an immediate value in a
20516 // data-processing instruction. This can be used in GCC with an "n"
20517 // modifier that prints the negated value, for use with SUB
20518 // instructions. It is not useful otherwise but is implemented for
20519 // compatibility.
20520 if (ARM_AM::getSOImmVal(-CVal) != -1)
20521 break;
20522 }
20523 return;
20524
20525 case 'M':
20526 if (Subtarget->isThumb1Only()) {
20527 // This must be a multiple of 4 between 0 and 1020, for
20528 // ADD sp + immediate.
20529 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20530 break;
20531 } else {
20532 // A power of two or a constant between 0 and 32. This is used in
20533 // GCC for the shift amount on shifted register operands, but it is
20534 // useful in general for any shift amounts.
20535 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20536 break;
20537 }
20538 return;
20539
20540 case 'N':
20541 if (Subtarget->isThumb1Only()) {
20542 // This must be a constant between 0 and 31, for shift amounts.
20543 if (CVal >= 0 && CVal <= 31)
20544 break;
20545 }
20546 return;
20547
20548 case 'O':
20549 if (Subtarget->isThumb1Only()) {
20550 // This must be a multiple of 4 between -508 and 508, for
20551 // ADD/SUB sp = sp + immediate.
20552 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20553 break;
20554 }
20555 return;
20556 }
 // The constant passed validation; materialize it as a target constant.
20557 Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType());
20558 break;
20559 }
20560
20561 if (Result.getNode()) {
20562 Ops.push_back(Result);
20563 return;
20564 }
 // Not handled here; let the generic implementation have a go.
20565 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20566}
20567
20568static RTLIB::Libcall getDivRemLibcall(
20569 const SDNode *N, MVT::SimpleValueType SVT) {
20570 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20571 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20572 "Unhandled Opcode in getDivRemLibcall");
20573 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20574 N->getOpcode() == ISD::SREM;
20575 RTLIB::Libcall LC;
20576 switch (SVT) {
20577 default: llvm_unreachable("Unexpected request for libcall!");
20578 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20579 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20580 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20581 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20582 }
20583 return LC;
20584}
20585
20587 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
 // Build the call-lowering argument list for a divmod libcall from the
 // operands of a [SU]DIVREM / [SU]REM node.
20588 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20589 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20590 "Unhandled Opcode in getDivRemArgList");
20591 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20592 N->getOpcode() == ISD::SREM;
 // One argument entry per operand, extended according to signedness.
20594 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20595 EVT ArgVT = N->getOperand(i).getValueType();
20596 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20597 TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy);
20598 Entry.IsSExt = isSigned;
20599 Entry.IsZExt = !isSigned;
20600 Args.push_back(Entry);
20601 }
 // The Windows divmod helpers take their operands swapped relative to the
 // AEABI ones.
20602 if (Subtarget->getTargetTriple().isOSWindows() && Args.size() >= 2)
20603 std::swap(Args[0], Args[1]);
20604 return Args;
20605}
20606
20607SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
 // Lower [SU]DIVREM either via constant expansion, hardware divide, or a
 // divmod runtime helper that returns {div, rem} in registers.
20608 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20609 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20610 Subtarget->isTargetFuchsia() || Subtarget->isTargetWindows()) &&
20611 "Register-based DivRem lowering only");
20612 unsigned Opcode = Op->getOpcode();
20613 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20614 "Invalid opcode for Div/Rem lowering");
20615 bool isSigned = (Opcode == ISD::SDIVREM);
20616 EVT VT = Op->getValueType(0);
20617 SDLoc dl(Op);
20618
 // i64 divrem by a constant can often be expanded into i32 operations
 // instead of a libcall; the four i32 parts are re-paired into two i64s.
20619 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20621 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20622 SDValue Res0 =
20623 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20624 SDValue Res1 =
20625 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20626 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20627 {Res0, Res1});
20628 }
20629 }
20630
20631 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
20632
20633 // If the target has hardware divide, use divide + multiply + subtract:
20634 // div = a / b
20635 // rem = a - b * div
20636 // return {div, rem}
20637 // This should be lowered into UDIV/SDIV + MLS later on.
20638 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20639 : Subtarget->hasDivideInARMMode();
20640 if (hasDivide && Op->getValueType(0).isSimple() &&
20641 Op->getSimpleValueType(0) == MVT::i32) {
20642 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20643 const SDValue Dividend = Op->getOperand(0);
20644 const SDValue Divisor = Op->getOperand(1);
20645 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20646 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20647 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
20648
20649 SDValue Values[2] = {Div, Rem};
20650 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20651 }
20652
 // Otherwise call the divmod runtime helper for this width/signedness.
20653 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20654 VT.getSimpleVT().SimpleTy);
20655 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
20656
20657 SDValue InChain = DAG.getEntryNode();
20658
20660 DAG.getContext(),
20661 Subtarget);
20662
20663 SDValue Callee =
20664 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
20665
 // The helper returns the {div, rem} pair as a two-element struct.
20666 Type *RetTy = StructType::get(Ty, Ty);
20667
 // Windows requires an explicit divide-by-zero check before the call.
20668 if (getTM().getTargetTriple().isOSWindows())
20669 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
20670
20671 TargetLowering::CallLoweringInfo CLI(DAG);
20672 CLI.setDebugLoc(dl)
20673 .setChain(InChain)
20674 .setCallee(DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy,
20675 Callee, std::move(Args))
20676 .setInRegister()
20677 .setSExtResult(isSigned)
20678 .setZExtResult(!isSigned);
20679
20680 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20681 return CallInfo.first;
20682}
20683
20684// Lowers REM using divmod helpers
20685// see RTABI section 4.2/4.3
20686SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20687 EVT VT = N->getValueType(0);
20688
 // i64 rem by a constant can often be expanded into i32 operations rather
 // than a libcall.
20689 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20691 if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20692 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20693 Result[0], Result[1]);
20694 }
20695
20696 // Build return types (div and rem)
20697 std::vector<Type*> RetTyParams;
20698 Type *RetTyElement;
20699
20700 switch (VT.getSimpleVT().SimpleTy) {
20701 default: llvm_unreachable("Unexpected request for libcall!");
20702 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20703 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20704 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20705 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20706 }
20707
 // The divmod helper returns the {div, rem} pair as a two-element struct.
20708 RetTyParams.push_back(RetTyElement);
20709 RetTyParams.push_back(RetTyElement);
20710 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20711 Type *RetTy = StructType::get(*DAG.getContext(), ret);
20712
20713 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20714 SimpleTy);
20715 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
20716 SDValue InChain = DAG.getEntryNode();
20718 Subtarget);
20719 bool isSigned = N->getOpcode() == ISD::SREM;
20720
20721 SDValue Callee =
20722 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
20723
 // Windows requires an explicit divide-by-zero check before the call.
20724 if (getTM().getTargetTriple().isOSWindows())
20725 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20726
20727 // Lower call
20728 CallLoweringInfo CLI(DAG);
20729 CLI.setChain(InChain)
20730 .setCallee(DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy,
20731 Callee, std::move(Args))
20732 .setSExtResult(isSigned)
20733 .setZExtResult(!isSigned)
20734 .setDebugLoc(SDLoc(N));
20735 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20736
20737 // Return second (rem) result operand (first contains div)
20738 SDNode *ResNode = CallResult.first.getNode();
20739 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20740 return ResNode->getOperand(1);
20741}
20742
20743SDValue
20744ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
 // Windows-only lowering of dynamic stack allocation: either a plain SP
 // adjustment (when stack probing is disabled) or a __chkstk-style call.
20745 assert(getTM().getTargetTriple().isOSWindows() &&
20746 "unsupported target platform");
20747 SDLoc DL(Op);
20748
20749 // Get the inputs.
20750 SDValue Chain = Op.getOperand(0);
20751 SDValue Size = Op.getOperand(1);
20752
 // With stack probing disabled, simply subtract Size from SP and apply the
 // requested alignment, if any.
20754 "no-stack-arg-probe")) {
20755 MaybeAlign Align =
20756 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
20757 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20758 Chain = SP.getValue(1);
20759 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
20760 if (Align)
20761 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20762 DAG.getSignedConstant(-Align->value(), DL, MVT::i32));
20763 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20764 SDValue Ops[2] = { SP, Chain };
20765 return DAG.getMergeValues(Ops, DL);
20766 }
20767
 // The chkstk helper expects the allocation size in words (bytes >> 2),
 // passed in R4, and performs the SP adjustment itself.
20768 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20769 DAG.getConstant(2, DL, MVT::i32));
20770
20771 SDValue Glue;
20772 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20773 Glue = Chain.getValue(1);
20774
20775 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20776 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
20777
 // Read back the adjusted stack pointer as the allocation's address.
20778 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20779 Chain = NewSP.getValue(1);
20780
20781 SDValue Ops[2] = { NewSP, Chain };
20782 return DAG.getMergeValues(Ops, DL);
20783}
20784
// Custom-lower (STRICT_)FP_EXTEND for configurations where the extension is
// not a single legal instruction: widen one step at a time (f16->f32->f64),
// emitting a native extend when the step is supported by the FPU and a
// libcall otherwise.
20785SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20786  bool IsStrict = Op->isStrictFPOpcode();
20787  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20788  const unsigned DstSz = Op.getValueType().getSizeInBits();
20789  const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
20790  assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
20791         "Unexpected type for custom-lowering FP_EXTEND");
20792
20793  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20794         "With both FP DP and 16, any FP conversion is legal!");
20795
20796  assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
20797         "With FP16, 16 to 32 conversion is legal!");
20798
20799  // Converting from 32 -> 64 is valid if we have FP64.
20800  if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
20801    // FIXME: Remove this when we have strict fp instruction selection patterns
20802    if (IsStrict) {
      // Drop to a non-strict extend but keep the incoming chain alive in the
      // merged result. (The getNode line is elided in this rendering.)
20803      SDLoc Loc(Op);
20805                                   Loc, Op.getValueType(), SrcVal);
20806      return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
20807    }
20808    return Op;
20809  }
20810
20811  // Either we are converting from 16 -> 64, without FP16 and/or
20812  // FP.double-precision or without Armv8-fp. So we must do it in two
20813  // steps.
20814  // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
20815  // without FP16. So we must do a function call.
20816  SDLoc Loc(Op);
20817  RTLIB::Libcall LC;
20818  MakeLibCallOptions CallOptions;
20819  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  // Each iteration widens SrcVal by one size class; Sz doubles (16 -> 32).
20820  for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
20821    bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
20822    MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
20823    MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
20824    if (Supported) {
20825      if (IsStrict) {
20826        SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
20827                             {DstVT, MVT::Other}, {Chain, SrcVal});
20828        Chain = SrcVal.getValue(1);
20829      } else {
20830        SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
20831      }
20832    } else {
20833      LC = RTLIB::getFPEXT(SrcVT, DstVT);
20834      assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20835             "Unexpected type for custom-lowering FP_EXTEND");
20836      std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20837                                            Loc, Chain);
20838    }
20839  }
20840
20841  return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
20842}
20843
// Custom-lower (STRICT_)FP_ROUND: 32->16 is a single instruction when FP16
// is available; every other narrowing handled here becomes a libcall chosen
// by RTLIB::getFPROUND.
20844SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20845  bool IsStrict = Op->isStrictFPOpcode();
20846
20847  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20848  EVT SrcVT = SrcVal.getValueType();
20849  EVT DstVT = Op.getValueType();
20850  const unsigned DstSz = Op.getValueType().getSizeInBits();
20851  const unsigned SrcSz = SrcVT.getSizeInBits();
  // DstSz is only used inside the assert below; silence -Wunused in NDEBUG.
20852  (void)DstSz;
20853  assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
20854         "Unexpected type for custom-lowering FP_ROUND");
20855
20856  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20857         "With both FP DP and 16, any FP conversion is legal!");
20858
20859  SDLoc Loc(Op);
20860
20861  // Instruction from 32 -> 16 if hasFP16 is valid
20862  if (SrcSz == 32 && Subtarget->hasFP16())
20863    return Op;
20864
20865  // Lib call from 32 -> 16 / 64 -> [32, 16]
20866  RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
20867  assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20868         "Unexpected type for custom-lowering FP_ROUND");
20869  MakeLibCallOptions CallOptions;
20870  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  // (The declaration of Result is elided in this rendering.)
20872  std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20873                                        Loc, Chain);
20874  return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
20875}
20876
// isOffsetFoldingLegal (signature line elided in this rendering): ARM never
// folds constant offsets into global-address nodes.
20877bool
20879  // The ARM target isn't yet aware of offsets.
20880  return false;
20881}
20882
// isBitFieldInvertedMask (signature line elided in this rendering): returns
// true when ~v is a single contiguous run of set bits, i.e. v is all-ones
// except for one contiguous run of zeros — presumably the mask shape a
// bitfield-clear/insert can encode (TODO confirm against callers).
20884  if (v == 0xffffffff)
20885    return false;
20886
20887  // there can be 1's on either or both "outsides", all the "inside"
20888  // bits must be 0's
20889  return isShiftedMask_32(~v);
20890}
20891
20892/// isFPImmLegal - Returns true if the target can instruction select the
20893/// specified FP immediate natively. If false, the legalizer will
20894/// materialize the FP immediate as a load from a constant pool.
20896                                     bool ForCodeSize) const {
  // FP immediates require at least a VFPv3 base (VMOV-immediate encoding).
20897  if (!Subtarget->hasVFP3Base())
20898    return false;
20899  if (VT == MVT::f16 && Subtarget->hasFullFP16())
20900    return ARM_AM::getFP16Imm(Imm) != -1;
  // An f32 value exactly representable as an f16 immediate is also legal
  // when full FP16 is available.
20901  if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
20902      ARM_AM::getFP32FP16Imm(Imm) != -1)
20903    return true;
20904  if (VT == MVT::f32)
20905    return ARM_AM::getFP32Imm(Imm) != -1;
20906  if (VT == MVT::f64 && Subtarget->hasFP64())
20907    return ARM_AM::getFP64Imm(Imm) != -1;
20908  return false;
20909}
20910
20911/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
20912/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
20913/// specified in the intrinsic calls.
// For each recognized memory intrinsic, an IntrinsicInfo describing the
// access (opcode, memVT, pointer, alignment, flags) is appended to Infos;
// unrecognized intrinsics fall through and append nothing.
20916                                           MachineFunction &MF, unsigned Intrinsic) const {
20917  IntrinsicInfo Info;
20918  switch (Intrinsic) {
20919  case Intrinsic::arm_neon_vld1:
20920  case Intrinsic::arm_neon_vld2:
20921  case Intrinsic::arm_neon_vld3:
20922  case Intrinsic::arm_neon_vld4:
20923  case Intrinsic::arm_neon_vld2lane:
20924  case Intrinsic::arm_neon_vld3lane:
20925  case Intrinsic::arm_neon_vld4lane:
20926  case Intrinsic::arm_neon_vld2dup:
20927  case Intrinsic::arm_neon_vld3dup:
20928  case Intrinsic::arm_neon_vld4dup: {
20929    Info.opc = ISD::INTRINSIC_W_CHAIN;
20930    // Conservatively set memVT to the entire set of vectors loaded.
20931    auto &DL = I.getDataLayout();
20932    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20933    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20934    Info.ptrVal = I.getArgOperand(0);
20935    Info.offset = 0;
    // These intrinsics carry the alignment as their last argument.
20936    Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20937    Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20938    // volatile loads with NEON intrinsics not supported
20939    Info.flags = MachineMemOperand::MOLoad;
20940    Infos.push_back(Info);
20941    return;
20942  }
20943  case Intrinsic::arm_neon_vld1x2:
20944  case Intrinsic::arm_neon_vld1x3:
20945  case Intrinsic::arm_neon_vld1x4: {
20946    Info.opc = ISD::INTRINSIC_W_CHAIN;
20947    // Conservatively set memVT to the entire set of vectors loaded.
20948    auto &DL = I.getDataLayout();
20949    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20950    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    // Unlike vldN, the pointer is the *last* argument here.
20951    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
20952    Info.offset = 0;
20953    Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne();
20954    // volatile loads with NEON intrinsics not supported
20955    Info.flags = MachineMemOperand::MOLoad;
20956    Infos.push_back(Info);
20957    return;
20958  }
20959  case Intrinsic::arm_neon_vst1:
20960  case Intrinsic::arm_neon_vst2:
20961  case Intrinsic::arm_neon_vst3:
20962  case Intrinsic::arm_neon_vst4:
20963  case Intrinsic::arm_neon_vst2lane:
20964  case Intrinsic::arm_neon_vst3lane:
20965  case Intrinsic::arm_neon_vst4lane: {
20966    Info.opc = ISD::INTRINSIC_VOID;
20967    // Conservatively set memVT to the entire set of vectors stored.
20968    auto &DL = I.getDataLayout();
20969    unsigned NumElts = 0;
    // Sum the sizes of the trailing vector operands (args after the pointer).
20970    for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20971      Type *ArgTy = I.getArgOperand(ArgI)->getType();
20972      if (!ArgTy->isVectorTy())
20973        break;
20974      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20975    }
20976    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20977    Info.ptrVal = I.getArgOperand(0);
20978    Info.offset = 0;
20979    Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20980    Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20981    // volatile stores with NEON intrinsics not supported
20982    Info.flags = MachineMemOperand::MOStore;
20983    Infos.push_back(Info);
20984    return;
20985  }
20986  case Intrinsic::arm_neon_vst1x2:
20987  case Intrinsic::arm_neon_vst1x3:
20988  case Intrinsic::arm_neon_vst1x4: {
20989    Info.opc = ISD::INTRINSIC_VOID;
20990    // Conservatively set memVT to the entire set of vectors stored.
20991    auto &DL = I.getDataLayout();
20992    unsigned NumElts = 0;
20993    for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20994      Type *ArgTy = I.getArgOperand(ArgI)->getType();
20995      if (!ArgTy->isVectorTy())
20996        break;
20997      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20998    }
20999    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21000    Info.ptrVal = I.getArgOperand(0);
21001    Info.offset = 0;
21002    Info.align = I.getParamAlign(0).valueOrOne();
21003    // volatile stores with NEON intrinsics not supported
21004    Info.flags = MachineMemOperand::MOStore;
21005    Infos.push_back(Info);
21006    return;
21007  }
21008  case Intrinsic::arm_mve_vld2q:
21009  case Intrinsic::arm_mve_vld4q: {
21010    Info.opc = ISD::INTRINSIC_W_CHAIN;
21011    // Conservatively set memVT to the entire set of vectors loaded.
21012    Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
21013    unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
    // Each MVE Q register is 128 bits = 2 x i64 elements.
21014    Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21015    Info.ptrVal = I.getArgOperand(0);
21016    Info.offset = 0;
21017    Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21018    // volatile loads with MVE intrinsics not supported
21019    Info.flags = MachineMemOperand::MOLoad;
21020    Infos.push_back(Info);
21021    return;
21022  }
21023  case Intrinsic::arm_mve_vst2q:
21024  case Intrinsic::arm_mve_vst4q: {
21025    Info.opc = ISD::INTRINSIC_VOID;
21026    // Conservatively set memVT to the entire set of vectors stored.
21027    Type *VecTy = I.getArgOperand(1)->getType();
21028    unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
21029    Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21030    Info.ptrVal = I.getArgOperand(0);
21031    Info.offset = 0;
21032    Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21033    // volatile stores with MVE intrinsics not supported
21034    Info.flags = MachineMemOperand::MOStore;
21035    Infos.push_back(Info);
21036    return;
21037  }
21038  case Intrinsic::arm_mve_vldr_gather_base:
21039  case Intrinsic::arm_mve_vldr_gather_base_predicated: {
21040    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Gathers have no single base pointer to record.
21041    Info.ptrVal = nullptr;
21042    Info.memVT = MVT::getVT(I.getType());
21043    Info.align = Align(1);
21044    Info.flags |= MachineMemOperand::MOLoad;
21045    Infos.push_back(Info);
21046    return;
21047  }
21048  case Intrinsic::arm_mve_vldr_gather_base_wb:
21049  case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
21050    Info.opc = ISD::INTRINSIC_W_CHAIN;
21051    Info.ptrVal = nullptr;
    // Writeback form returns {data, updated base}; element 0 is the data.
21052    Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
21053    Info.align = Align(1);
21054    Info.flags |= MachineMemOperand::MOLoad;
21055    Infos.push_back(Info);
21056    return;
21057  }
21058  case Intrinsic::arm_mve_vldr_gather_offset:
21059  case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
21060    Info.opc = ISD::INTRINSIC_W_CHAIN;
21061    Info.ptrVal = nullptr;
21062    MVT DataVT = MVT::getVT(I.getType());
    // Arg 2 is the in-memory element size in bits (may differ from DataVT).
21063    unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
21064    Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21065                                  DataVT.getVectorNumElements());
21066    Info.align = Align(1);
21067    Info.flags |= MachineMemOperand::MOLoad;
21068    Infos.push_back(Info);
21069    return;
21070  }
21071  case Intrinsic::arm_mve_vstr_scatter_base:
21072  case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
21073    Info.opc = ISD::INTRINSIC_VOID;
21074    Info.ptrVal = nullptr;
21075    Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21076    Info.align = Align(1);
21077    Info.flags |= MachineMemOperand::MOStore;
21078    Infos.push_back(Info);
21079    return;
21080  }
21081  case Intrinsic::arm_mve_vstr_scatter_base_wb:
21082  case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
    // Writeback scatter produces the updated base, hence W_CHAIN not VOID.
21083    Info.opc = ISD::INTRINSIC_W_CHAIN;
21084    Info.ptrVal = nullptr;
21085    Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21086    Info.align = Align(1);
21087    Info.flags |= MachineMemOperand::MOStore;
21088    Infos.push_back(Info);
21089    return;
21090  }
21091  case Intrinsic::arm_mve_vstr_scatter_offset:
21092  case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
21093    Info.opc = ISD::INTRINSIC_VOID;
21094    Info.ptrVal = nullptr;
21095    MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
21096    unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
21097    Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21098                                  DataVT.getVectorNumElements());
21099    Info.align = Align(1);
21100    Info.flags |= MachineMemOperand::MOStore;
21101    Infos.push_back(Info);
21102    return;
21103  }
21104  case Intrinsic::arm_ldaex:
21105  case Intrinsic::arm_ldrex: {
21106    auto &DL = I.getDataLayout();
    // The accessed type is carried by the elementtype attribute on arg 0.
21107    Type *ValTy = I.getParamElementType(0);
21108    Info.opc = ISD::INTRINSIC_W_CHAIN;
21109    Info.memVT = MVT::getVT(ValTy);
21110    Info.ptrVal = I.getArgOperand(0);
21111    Info.offset = 0;
21112    Info.align = DL.getABITypeAlign(ValTy);
    // (flags assignment elided in this rendering)
21114    Infos.push_back(Info);
21115    return;
21116  }
21117  case Intrinsic::arm_stlex:
21118  case Intrinsic::arm_strex: {
21119    auto &DL = I.getDataLayout();
21120    Type *ValTy = I.getParamElementType(1);
21121    Info.opc = ISD::INTRINSIC_W_CHAIN;
21122    Info.memVT = MVT::getVT(ValTy);
21123    Info.ptrVal = I.getArgOperand(1);
21124    Info.offset = 0;
21125    Info.align = DL.getABITypeAlign(ValTy);
    // (flags assignment elided in this rendering)
21127    Infos.push_back(Info);
21128    return;
21129  }
21130  case Intrinsic::arm_stlexd:
21131  case Intrinsic::arm_strexd:
21132    Info.opc = ISD::INTRINSIC_W_CHAIN;
21133    Info.memVT = MVT::i64;
21134    Info.ptrVal = I.getArgOperand(2);
21135    Info.offset = 0;
21136    Info.align = Align(8);
    // (flags assignment elided in this rendering)
21138    Infos.push_back(Info);
21139    return;
21140
21141  case Intrinsic::arm_ldaexd:
21142  case Intrinsic::arm_ldrexd:
21143    Info.opc = ISD::INTRINSIC_W_CHAIN;
21144    Info.memVT = MVT::i64;
21145    Info.ptrVal = I.getArgOperand(0);
21146    Info.offset = 0;
21147    Info.align = Align(8);
    // (flags assignment elided in this rendering)
21149    Infos.push_back(Info);
21150    return;
21151
21152  default:
21153    break;
21154  }
21155}
21156
21157/// Returns true if it is beneficial to convert a load of a constant
21158/// to just the constant itself.
// Only integer constants of 1..32 bits are worth rematerializing as
// immediates on ARM; wider or zero-sized types keep the constant-pool load.
21160                                                          Type *Ty) const {
21161  assert(Ty->isIntegerTy());
21162
21163  unsigned Bits = Ty->getPrimitiveSizeInBits();
21164  if (Bits == 0 || Bits > 32)
21165    return false;
21166  return true;
21167}
21168
// isExtractSubvectorCheap (signature and guard condition partially elided in
// this rendering): a subvector extract is considered cheap only at index 0
// or at ResVT.getVectorNumElements() — presumably the natural subregister
// boundaries of the wider register (TODO confirm).
21170                                                unsigned Index) const {
21172    return false;
21173
21174  return (Index == 0 || Index == ResVT.getVectorNumElements());
21175}
21176
// makeDMB (signature line elided in this rendering): emit a data memory
// barrier for the requested domain, falling back to the CP15 MCR encoding on
// ARMv6 targets that lack the DMB instruction.
21178                                        ARM_MB::MemBOpt Domain) const {
21179  // First, if the target has no DMB, see what fallback we can use.
21180  if (!Subtarget->hasDataBarrier()) {
21181    // Some ARMv6 cpus can support data barriers with an mcr instruction.
21182    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
21183    // here.
21184    if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
      // Encodes "mcr p15, 0, <Rt>, c7, c10, 5" — the CP15 barrier operation
      // (see ARM ARM DDI 0406C, CP15 c7 system control encodings).
21185      Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
21186                        Builder.getInt32(0), Builder.getInt32(7),
21187                        Builder.getInt32(10), Builder.getInt32(5)};
21188      return Builder.CreateIntrinsic(Intrinsic::arm_mcr, args);
21189    } else {
21190      // Instead of using barriers, atomic accesses on these subtargets use
21191      // libcalls.
21192      llvm_unreachable("makeDMB on a target so old that it has no barriers");
21193    }
21194  } else {
21195    // Only a full system barrier exists in the M-class architectures.
21196    Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
21197    Constant *CDomain = Builder.getInt32(Domain);
21198    return Builder.CreateIntrinsic(Intrinsic::arm_dmb, CDomain);
21199  }
21200}
21201
21202// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
// emitLeadingFence: barrier inserted *before* an atomic operation. The
// switch's case labels are elided in this rendering; the arms are, in order:
// invalid orderings, orderings needing nothing, the store-only case, and the
// case that emits a DMB (ISHST when preferred, else ISH).
21204                                                 Instruction *Inst,
21205                                                 AtomicOrdering Ord) const {
21206  switch (Ord) {
21209    llvm_unreachable("Invalid fence: unordered/non-atomic");
21212    return nullptr; // Nothing to do
21214    if (!Inst->hasAtomicStore())
21215      return nullptr; // Nothing to do
21216    [[fallthrough]];
21219    if (Subtarget->preferISHSTBarriers())
21220      return makeDMB(Builder, ARM_MB::ISHST);
21221    // FIXME: add a comment with a link to documentation justifying this.
21222    else
21223      return makeDMB(Builder, ARM_MB::ISH);
21224  }
21225  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
21226}
21227
// emitTrailingFence: barrier inserted *after* an atomic operation (case
// labels elided in this rendering) — either nothing or a full DMB ISH.
21229                                                  Instruction *Inst,
21230                                                  AtomicOrdering Ord) const {
21231  switch (Ord) {
21234    llvm_unreachable("Invalid fence: unordered/not-atomic");
21237    return nullptr; // Nothing to do
21241    return makeDMB(Builder, ARM_MB::ISH);
21242  }
21243  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
21244}
21245
21246// Loads and stores less than 64-bits are already atomic; ones above that
21247// are doomed anyway, so defer to the default libcall and blame the OS when
21248// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21249// anything for those.
// shouldExpandAtomicStoreInIR (signature elided in this rendering): 64-bit
// stores expand to an ll/sc loop when the subtarget has strexd.
21252  bool has64BitAtomicStore;
21253  if (Subtarget->isMClass())
21254    has64BitAtomicStore = false;
21255  else if (Subtarget->isThumb())
21256    has64BitAtomicStore = Subtarget->hasV7Ops();
21257  else
21258    has64BitAtomicStore = Subtarget->hasV6Ops();
21259
21260  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21261  return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21263}
21264
21265// Loads and stores less than 64-bits are already atomic; ones above that
21266// are doomed anyway, so defer to the default libcall and blame the OS when
21267// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21268// anything for those.
21269// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21270// guarantee, see DDI0406C ARM architecture reference manual,
21271// sections A8.8.72-74 LDRD)
// shouldExpandAtomicLoadInIR (signature elided in this rendering): 64-bit
// loads use a bare load-linked (LLOnly) when the subtarget has ldrexd.
21274  bool has64BitAtomicLoad;
21275  if (Subtarget->isMClass())
21276    has64BitAtomicLoad = false;
21277  else if (Subtarget->isThumb())
21278    has64BitAtomicLoad = Subtarget->hasV7Ops();
21279  else
21280    has64BitAtomicLoad = Subtarget->hasV6Ops();
21281
21282  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21283  return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21285}
21286
21287// For the real atomic operations, we have ldrex/strex up to 32 bits,
21288// and up to 64 bits on the non-M profiles
// shouldExpandAtomicRMWInIR (signature and some return lines elided in this
// rendering): choose between ll/sc expansion, a CAS loop at -O0, and the
// default handling.
21291  if (AI->isFloatingPointOperation())
21293
21294  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21295  bool hasAtomicRMW;
21296  if (Subtarget->isMClass())
21297    hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21298  else if (Subtarget->isThumb())
21299    hasAtomicRMW = Subtarget->hasV7Ops();
21300  else
21301    hasAtomicRMW = Subtarget->hasV6Ops();
21302  if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21303    // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21304    // implement atomicrmw without spilling. If the target address is also on
21305    // the stack and close enough to the spill slot, this can lead to a
21306    // situation where the monitor always gets cleared and the atomic operation
21307    // can never succeed. So at -O0 lower this operation to a CAS loop.
21308    if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21311  }
21313}
21314
21315// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21316// bits, and up to 64 bits on the non-M profiles.
// (Return lines elided in this rendering: ll/sc expansion when supported and
// optimizing; otherwise the default.)
21319    const AtomicCmpXchgInst *AI) const {
21320  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21321  // implement cmpxchg without spilling. If the address being exchanged is also
21322  // on the stack and close enough to the spill slot, this can lead to a
21323  // situation where the monitor always gets cleared and the atomic operation
21324  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21325  unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
21326  bool HasAtomicCmpXchg;
21327  if (Subtarget->isMClass())
21328    HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21329  else if (Subtarget->isThumb())
21330    HasAtomicCmpXchg = Subtarget->hasV7Ops();
21331  else
21332    HasAtomicCmpXchg = Subtarget->hasV6Ops();
21333  if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21334      HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21337}
21338
// shouldInsertFencesForAtomic (signature partially elided): defer to the
// InsertFencesForAtomic flag computed at target-lowering construction.
21340                                         const Instruction *I) const {
21341  return InsertFencesForAtomic;
21342}
21343
// (Signature line elided in this rendering — presumably useLoadStackGuardNode;
// TODO confirm.) Stack-guard loading via the special node is disabled under
// ROPI/RWPI position-independence modes.
21345  // ROPI/RWPI are not supported currently.
21346  return !Subtarget->isROPI() && !Subtarget->isRWPI();
21347}
21348
// insertSSPDeclarations (signature line elided in this rendering): declare
// the MSVC CRT stack-protector cookie global and checker function when the
// libcall configuration provides them, then fall through to the base-class
// declarations (elided tail call).
21350    Module &M, const LibcallLoweringInfo &Libcalls) const {
21351  // MSVC CRT provides functionalities for stack protection.
21352  RTLIB::LibcallImpl SecurityCheckCookieLibcall =
21353      Libcalls.getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21354
21355  RTLIB::LibcallImpl SecurityCookieVar =
21356      Libcalls.getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
21357  if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
21358      SecurityCookieVar != RTLIB::Unsupported) {
21359    // MSVC CRT has a global variable holding security cookie.
21360    M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
21361                        PointerType::getUnqual(M.getContext()));
21362
21363    // MSVC CRT has a function to validate security cookie.
21364    FunctionCallee SecurityCheckCookie =
21365        M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
21366                              Type::getVoidTy(M.getContext()),
21367                              PointerType::getUnqual(M.getContext()));
    // The cookie is passed in a register, matching the MSVC convention.
21368    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21369      F->addParamAttr(0, Attribute::AttrKind::InReg);
21370  }
21371
21373}
21374
// canCombineStoreAndExtract (signature partially elided in this rendering):
// returns true (with Cost = 0) when a store of an extracted element can be
// folded into a single NEON store — integer vectors with a constant lane
// index that fit exactly in a D (64-bit) or Q (128-bit) register.
21376                                                  unsigned &Cost) const {
21377  // If we do not have NEON, vector types are not natively supported.
21378  if (!Subtarget->hasNEON())
21379    return false;
21380
21381  // Floating point values and vector values map to the same register file.
21382  // Therefore, although we could do a store extract of a vector type, this is
21383  // better to leave at float as we have more freedom in the addressing mode for
21384  // those.
21385  if (VectorTy->isFPOrFPVectorTy())
21386    return false;
21387
21388  // If the index is unknown at compile time, this is very expensive to lower
21389  // and it is not possible to combine the store with the extract.
21390  if (!isa<ConstantInt>(Idx))
21391    return false;
21392
21393  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21394  unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21395  // We can do a store + vector extract on any vector that fits perfectly in a D
21396  // or Q register.
21397  if (BitWidth == 64 || BitWidth == 128) {
21398    Cost = 0;
21399    return true;
21400  }
21401  return false;
21402}
21403
// canCreateUndefOrPoisonForTargetNode (signature partially elided): the
// immediate-form VORR/VBIC nodes never introduce undef/poison; everything
// else defers to the (elided) TargetLowering base implementation.
21405    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
21406    bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
21407  unsigned Opcode = Op.getOpcode();
21408  switch (Opcode) {
21409  case ARMISD::VORRIMM:
21410  case ARMISD::VBICIMM:
21411    return false;
21412  }
21414      Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
21415}
21416
// (Signature line elided in this rendering — by position presumably
// isCheapToSpeculateCttz; TODO confirm.) Cheap when CLZ is usable:
// ARMv5T+ and not Thumb1-only.
21418  return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21419}
21420
// (Signature line elided in this rendering — by position presumably
// isCheapToSpeculateCtlz; TODO confirm.) Same condition as the cttz hook.
21422  return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21423}
21424
// isMaskAndCmp0FoldingBeneficial (signature partially elided; the
// dyn_cast<ConstantInt> line defining Mask is also elided): sink the `and`
// next to the compare only on v7+ and only when the mask encodes as a
// Thumb2/ARM modified immediate.
21426                                                       const Instruction &AndI) const {
21427  if (!Subtarget->hasV7Ops())
21428    return false;
21429
21430  // Sink the `and` instruction only if the mask would fit into a modified
21431  // immediate operand.
21433  if (!Mask || Mask->getValue().getBitWidth() > 32u)
21434    return false;
21435  auto MaskVal = unsigned(Mask->getValue().getZExtValue());
21436  return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21437                                : ARM_AM::getSOImmVal(MaskVal)) != -1;
21438}
21439
// preferredShiftLegalizationStrategy (signature and the two return lines
// elided in this rendering): under minsize on non-Windows targets a wide
// shift is lowered to a libcall; otherwise the base-class choice is used.
21442    SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21443  if (Subtarget->hasMinSize() && !getTM().getTargetTriple().isOSWindows())
21446                                                            ExpansionFactor);
21447}
21448
// emitLoadLinked (signature partially elided): emit ldrex/ldaex (or the
// 64-bit ldrexd/ldaexd pair-returning form) for the given address and
// ordering, reassembling the i64 result from the {i32, i32} pieces.
21450                                               Value *Addr,
21451                                               AtomicOrdering Ord) const {
21452  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21453  bool IsAcquire = isAcquireOrStronger(Ord);
21454
21455  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21456  // intrinsic must return {i32, i32} and we have to recombine them into a
21457  // single i64 here.
21458  if (ValueTy->getPrimitiveSizeInBits() == 64) {
21460        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21461
21462    Value *LoHi =
21463        Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
21464
21465    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21466    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    // The register pair order flips on big-endian targets.
21467    if (!Subtarget->isLittle())
21468      std::swap (Lo, Hi);
21469    Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21470    Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21471    return Builder.CreateOr(
21472        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21473  }
21474
21475  Type *Tys[] = { Addr->getType() };
21476  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21477  CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
21478
  // Record the accessed type on the pointer argument (elementtype attr).
21479  CI->addParamAttr(
21480      0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21481  return Builder.CreateTruncOrBitCast(CI, ValueTy);
21482}
21483
// emitAtomicCmpXchgNoStoreLLBalance (signature partially elided): clear the
// exclusive monitor (CLREX) on the no-store path of a cmpxchg; CLREX needs
// v7, so older targets emit nothing.
21485    IRBuilderBase &Builder) const {
21486  if (!Subtarget->hasV7Ops())
21487    return;
21488  Builder.CreateIntrinsic(Intrinsic::arm_clrex, {});
21489}
21490
// emitStoreConditional (signature partially elided; the Intrinsic::ID
// declaration for the 64-bit case and the getOrInsertDeclaration for Strex
// are also elided): emit strex/stlex (or the two-word strexd/stlexd form),
// returning the i32 success flag from the intrinsic call.
21492                                               Value *Val, Value *Addr,
21493                                               AtomicOrdering Ord) const {
21494  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21495  bool IsRelease = isReleaseOrStronger(Ord);
21496
21497  // Since the intrinsics must have legal type, the i64 intrinsics take two
21498  // parameters: "i32, i32". We must marshal Val into the appropriate form
21499  // before the call.
21500  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
21502        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21503    Type *Int32Ty = Type::getInt32Ty(M->getContext());
21504
21505    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21506    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
    // The register pair order flips on big-endian targets.
21507    if (!Subtarget->isLittle())
21508      std::swap(Lo, Hi);
21509    return Builder.CreateIntrinsic(Int, {Lo, Hi, Addr});
21510  }
21511
21512  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21513  Type *Tys[] = { Addr->getType() };
21515
21516  CallInst *CI = Builder.CreateCall(
21517      Strex, {Builder.CreateZExtOrBitCast(
21518                  Val, Strex->getFunctionType()->getParamType(0)),
21519              Addr});
  // Record the accessed type on the pointer argument (elementtype attr).
21520  CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21521                                     Val->getType()));
21522  return CI;
21523}
21524
21525
// (Signature line elided in this rendering — presumably
// alignLoopsWithOptSize; TODO confirm.) Enabled only on M-class cores.
21527  return Subtarget->isMClass();
21528}
21529
21530/// A helper function for determining the number of interleaved accesses we
21531/// will generate when lowering accesses of the given type.
// Rounds the vector's bit size up to the number of 128-bit chunks.
// (The parameter line of the signature is elided in this rendering.)
21532unsigned
21534                                             const DataLayout &DL) const {
21535  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21536}
21537
// isLegalInterleavedAccessType (signature first line elided): decide whether
// an interleaved access of Factor x VecTy can be lowered to NEON vldN/vstN
// or MVE vld2q/vld4q, possibly split into multiple 128-bit accesses.
21539    unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21540    const DataLayout &DL) const {
21541
21542  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21543  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21544
21545  if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21546    return false;
21547
21548  // Ensure the vector doesn't have f16 elements. Even though we could do an
21549  // i16 vldN, we can't hold the f16 vectors and will end up converting via
21550  // f32.
21551  if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21552    return false;
  // MVE has no 3-way deinterleaving load/store instruction.
21553  if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21554    return false;
21555
21556  // Ensure the number of vector elements is greater than 1.
21557  if (VecTy->getNumElements() < 2)
21558    return false;
21559
21560  // Ensure the element type is legal.
21561  if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21562    return false;
21563  // And the alignment if high enough under MVE.
21564  if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21565    return false;
21566
21567  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21568  // 128 will be split into multiple interleaved accesses.
21569  if (Subtarget->hasNEON() && VecSize == 64)
21570    return true;
21571  return VecSize % 128 == 0;
21572}
21573
// getMaxSupportedInterleaveFactor (signature and the MVE/default return lines
// elided in this rendering): NEON supports vld1-vld4, i.e. factor 4.
21575  if (Subtarget->hasNEON())
21576    return 4;
21577  if (Subtarget->hasMVEIntegerOps())
21580}
21581
21582/// Lower an interleaved load into a vldN intrinsic.
21583///
21584/// E.g. Lower an interleaved load (Factor = 2):
21585/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21586/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21587/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21588///
21589/// Into:
21590/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21591/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
21592/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
21594 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
21595 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
// NOTE(review): tail of ARMTargetLowering::lowerInterleavedLoad — the
// function signature lies above this excerpt. It matches a load feeding a set
// of de-interleaving shufflevectors and rewrites the group as NEON
// vld2/vld3/vld4 or MVE vld2q/vld4q intrinsic calls. Returns false to decline
// (plain legality bail-outs), true once the shuffles have been replaced.
21596 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21597 "Invalid interleave factor");
21598 assert(!Shuffles.empty() && "Empty shufflevector input");
21599 assert(Shuffles.size() == Indices.size() &&
21600 "Unmatched number of shufflevectors and indices");
21601
// Only plain (non-masked, non-vp) loads are handled here.
21602 auto *LI = dyn_cast<LoadInst>(Load);
21603 if (!LI)
21604 return false;
21605 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
21606
21607 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21608 Type *EltTy = VecTy->getElementType();
21609
21610 const DataLayout &DL = LI->getDataLayout();
21611 Align Alignment = LI->getAlign();
21612
21613 // Skip if we do not have NEON and skip illegal vector types. We can
21614 // "legalize" wide vector types into multiple interleaved accesses as long as
21615 // the vector types are divisible by 128.
21616 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21617 return false;
21618
// Number of legal-width intrinsic calls the wide access is split into.
21619 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21620
21621 // A pointer vector can not be the return type of the ldN intrinsics. Need to
21622 // load integer vectors first and then convert to pointer vectors.
21623 if (EltTy->isPointerTy())
21624 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21625
21626 IRBuilder<> Builder(LI);
21627
21628 // The base address of the load.
21629 Value *BaseAddr = LI->getPointerOperand();
21630
21631 if (NumLoads > 1) {
21632 // If we're going to generate more than one load, reset the sub-vector type
21633 // to something legal.
21634 VecTy = FixedVectorType::get(VecTy->getElementType(),
21635 VecTy->getNumElements() / NumLoads);
21636 }
21637
21638 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21639
// Emits one vldN call at the given address; NEON takes an explicit alignment
// operand, MVE does not.
21640 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21641 if (Subtarget->hasNEON()) {
21642 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21643 Type *Tys[] = {VecTy, PtrTy};
// Factor is 2..4 here, so LoadInts[Factor - 2] below is in range.
21644 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21645 Intrinsic::arm_neon_vld3,
21646 Intrinsic::arm_neon_vld4};
21647
// NOTE(review): extraction gap — the declaration of 'Ops' (render line
// 21648, a small operand vector) is missing from this excerpt.
21649 Ops.push_back(BaseAddr);
21650 Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21651
21652 return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops,
21653 /*FMFSource=*/nullptr, "vldN");
21654 } else {
21655 assert((Factor == 2 || Factor == 4) &&
21656 "expected interleave factor of 2 or 4 for MVE");
21657 Intrinsic::ID LoadInts =
21658 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21659 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21660 Type *Tys[] = {VecTy, PtrTy};
21661
// NOTE(review): extraction gap — 'Ops' declaration (render line 21662)
// is missing here as well.
21663 Ops.push_back(BaseAddr);
21664 return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr,
21665 "vldN");
21666 }
21667 };
21668
21669 // Holds sub-vectors extracted from the load intrinsic return values. The
21670 // sub-vectors are associated with the shufflevector instructions they will
21671 // replace.
// NOTE(review): extraction gap — the 'SubVecs' map declaration (render line
// 21672) is missing; it is used below as SubVecs[SV].
21673
21674 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21675 // If we're generating more than one load, compute the base address of
21676 // subsequent loads as an offset from the previous.
21677 if (LoadCount > 0)
21678 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21679 VecTy->getNumElements() * Factor);
21680
21681 CallInst *VldN = createLoadIntrinsic(BaseAddr);
21682
21683 // Replace uses of each shufflevector with the corresponding vector loaded
21684 // by ldN.
21685 for (unsigned i = 0; i < Shuffles.size(); i++) {
21686 ShuffleVectorInst *SV = Shuffles[i];
21687 unsigned Index = Indices[i];
21688
// The vldN intrinsic returns a struct; element Index is this shuffle's lane.
21689 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21690
21691 // Convert the integer vector to pointer vector if the element is pointer.
21692 if (EltTy->isPointerTy())
21693 SubVec = Builder.CreateIntToPtr(
21694 SubVec,
// NOTE(review): extraction gap — the destination pointer-vector type
// argument (render line 21695) is missing from this excerpt.
21696
21697 SubVecs[SV].push_back(SubVec);
21698 }
21699 }
21700
21701 // Replace uses of the shufflevector instructions with the sub-vectors
21702 // returned by the load intrinsic. If a shufflevector instruction is
21703 // associated with more than one sub-vector, those sub-vectors will be
21704 // concatenated into a single wide vector.
21705 for (ShuffleVectorInst *SVI : Shuffles) {
21706 auto &SubVec = SubVecs[SVI];
21707 auto *WideVec =
21708 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21709 SVI->replaceAllUsesWith(WideVec);
21710 }
21711
21712 return true;
21713}
21714
21715/// Lower an interleaved store into a vstN intrinsic.
21716///
21717/// E.g. Lower an interleaved store (Factor = 3):
21718/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21719/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21720/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21721///
21722/// Into:
21723/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21724/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21725/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21726/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21727///
21728/// Note that the new shufflevectors will be removed and we'll only generate one
21729/// vst3 instruction in CodeGen.
21730///
21731/// Example for a more general valid mask (Factor 3). Lower:
21732/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21733/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21734/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21735///
21736/// Into:
21737/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21738/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21739/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21740/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
// NOTE(review): extraction gap — the first signature line (render line 21741,
// presumably 'bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store,')
// is missing from this excerpt; only the trailing parameters are visible.
21742 Value *LaneMask,
21743 ShuffleVectorInst *SVI,
21744 unsigned Factor,
21745 const APInt &GapMask) const {
21746 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21747 "Invalid interleave factor");
// Only plain (non-masked, non-vp) stores are handled here.
21748 auto *SI = dyn_cast<StoreInst>(Store);
21749 if (!SI)
21750 return false;
21751 assert(!LaneMask && GapMask.popcount() == Factor &&
21752 "Unexpected mask on store");
21753
21754 auto *VecTy = cast<FixedVectorType>(SVI->getType());
21755 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21756
// LaneLen = elements written per interleaved field.
21757 unsigned LaneLen = VecTy->getNumElements() / Factor;
21758 Type *EltTy = VecTy->getElementType();
21759 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
21760
21761 const DataLayout &DL = SI->getDataLayout();
21762 Align Alignment = SI->getAlign();
21763
21764 // Skip if we do not have NEON and skip illegal vector types. We can
21765 // "legalize" wide vector types into multiple interleaved accesses as long as
21766 // the vector types are divisible by 128.
21767 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
21768 return false;
21769
21770 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
21771
21772 Value *Op0 = SVI->getOperand(0);
21773 Value *Op1 = SVI->getOperand(1);
21774 IRBuilder<> Builder(SI);
21775
21776 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21777 // vectors to integer vectors.
21778 if (EltTy->isPointerTy()) {
21779 Type *IntTy = DL.getIntPtrType(EltTy);
21780
21781 // Convert to the corresponding integer vector.
21782 auto *IntVecTy =
// NOTE(review): extraction gap — the initializer's continuation (render line
// 21783, the FixedVectorType::get(...) call) is missing from this excerpt.
21784 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
21785 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
21786
21787 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
21788 }
21789
21790 // The base address of the store.
21791 Value *BaseAddr = SI->getPointerOperand();
21792
21793 if (NumStores > 1) {
21794 // If we're going to generate more than one store, reset the lane length
21795 // and sub-vector type to something legal.
21796 LaneLen /= NumStores;
21797 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
21798 }
21799
21800 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
21801
21802 auto Mask = SVI->getShuffleMask();
21803
// Emits one vstN call for the given address and per-field sub-vectors; NEON
// takes the alignment as a trailing operand, MVE emits one call per field
// with an explicit field index.
21804 auto createStoreIntrinsic = [&](Value *BaseAddr,
21805 SmallVectorImpl<Value *> &Shuffles) {
21806 if (Subtarget->hasNEON()) {
// Factor is 2..4 here, so StoreInts[Factor - 2] below is in range.
21807 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
21808 Intrinsic::arm_neon_vst3,
21809 Intrinsic::arm_neon_vst4};
21810 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21811 Type *Tys[] = {PtrTy, SubVecTy};
21812
// NOTE(review): extraction gap — the 'Ops' declaration (render line 21813)
// is missing from this excerpt.
21814 Ops.push_back(BaseAddr);
21815 append_range(Ops, Shuffles);
21816 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
21817 Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops);
21818 } else {
21819 assert((Factor == 2 || Factor == 4) &&
21820 "expected interleave factor of 2 or 4 for MVE");
21821 Intrinsic::ID StoreInts =
21822 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
21823 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21824 Type *Tys[] = {PtrTy, SubVecTy};
21825
// NOTE(review): extraction gap — the 'Ops' declaration (render line 21826)
// is missing from this excerpt.
21827 Ops.push_back(BaseAddr);
21828 append_range(Ops, Shuffles);
// MVE vstNq takes the field number as its last operand; reuse Ops by
// pushing/popping the index per field.
21829 for (unsigned F = 0; F < Factor; F++) {
21830 Ops.push_back(Builder.getInt32(F));
21831 Builder.CreateIntrinsic(StoreInts, Tys, Ops);
21832 Ops.pop_back();
21833 }
21834 }
21835 };
21836
21837 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
21838 // If we generating more than one store, we compute the base address of
21839 // subsequent stores as an offset from the previous.
21840 if (StoreCount > 0)
21841 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
21842 BaseAddr, LaneLen * Factor);
21843
21844 SmallVector<Value *, 4> Shuffles;
21845
21846 // Split the shufflevector operands into sub vectors for the new vstN call.
21847 for (unsigned i = 0; i < Factor; i++) {
21848 unsigned IdxI = StoreCount * LaneLen * Factor + i;
21849 if (Mask[IdxI] >= 0) {
21850 Shuffles.push_back(Builder.CreateShuffleVector(
21851 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
21852 } else {
// First lane is undef: scan later lanes of this field for a defined mask
// element to derive the sequential start index.
21853 unsigned StartMask = 0;
21854 for (unsigned j = 1; j < LaneLen; j++) {
21855 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
21856 if (Mask[IdxJ * Factor + IdxI] >= 0) {
21857 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
21858 break;
21859 }
21860 }
21861 // Note: If all elements in a chunk are undefs, StartMask=0!
21862 // Note: Filling undef gaps with random elements is ok, since
21863 // those elements were being written anyway (with undefs).
21864 // In the case of all undefs we're defaulting to using elems from 0
21865 // Note: StartMask cannot be negative, it's checked in
21866 // isReInterleaveMask
21867 Shuffles.push_back(Builder.CreateShuffleVector(
21868 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
21869 }
21870 }
21871
21872 createStoreIntrinsic(BaseAddr, Shuffles);
21873 }
21874 return true;
21875}
21876
// NOTE(review): extraction gap — the declarations preceding this function
// (render lines 21877-21883, the HABaseType enumeration used below) and the
// first signature line (21885, 'static bool isHomogeneousAggregate(Type *Ty,
// HABaseType &Base,') are missing from this excerpt.
//
// Recursively decides whether Ty is an AAPCS-VFP homogeneous aggregate:
// every leaf element must share one base kind (float, double, 64-bit vector,
// or 128-bit vector, tracked in Base) and the total leaf count (accumulated
// in Members) must be 1..4.
21884
21886 uint64_t &Members) {
21887 if (auto *ST = dyn_cast<StructType>(Ty)) {
// Struct: every field must be homogeneous with the same Base; member
// counts of the fields add up.
21888 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
21889 uint64_t SubMembers = 0;
21890 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
21891 return false;
21892 Members += SubMembers;
21893 }
21894 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
// Array: element type checked once, member count scaled by array length.
21895 uint64_t SubMembers = 0;
21896 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
21897 return false;
21898 Members += SubMembers * AT->getNumElements();
21899 } else if (Ty->isFloatTy()) {
21900 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
21901 return false;
21902 Members = 1;
21903 Base = HA_FLOAT;
21904 } else if (Ty->isDoubleTy()) {
21905 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
21906 return false;
21907 Members = 1;
21908 Base = HA_DOUBLE;
21909 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
// Vector leaf: only 64- or 128-bit vectors qualify, and they must match
// the vector width already recorded in Base (if any).
21910 Members = 1;
21911 switch (Base) {
21912 case HA_FLOAT:
21913 case HA_DOUBLE:
21914 return false;
21915 case HA_VECT64:
21916 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
21917 case HA_VECT128:
21918 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
21919 case HA_UNKNOWN:
21920 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
21921 case 64:
21922 Base = HA_VECT64;
21923 return true;
21924 case 128:
21925 Base = HA_VECT128;
21926 return true;
21927 default:
21928 return false;
21929 }
21930 }
21931 }
21932
// Any other leaf type falls through with Members unchanged; an HA has
// between one and four members in total.
21933 return (Members > 0 && Members <= 4);
21934}
21935
21936/// Return the correct alignment for the current calling convention.
// NOTE(review): extraction gap — the first signature line (render line 21937,
// presumably 'Align ARMTargetLowering::getABIAlignmentForCallingConv(') is
// missing from this excerpt.
21938 Type *ArgTy, const DataLayout &DL) const {
// Non-vector arguments simply use the data layout's ABI alignment.
21939 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
21940 if (!ArgTy->isVectorTy())
21941 return ABITypeAlign;
21942
21943 // Avoid over-aligning vector parameters. It would require realigning the
21944 // stack and waste space for no real benefit.
21945 MaybeAlign StackAlign = DL.getStackAlignment();
21946 assert(StackAlign && "data layout string is missing stack alignment");
// Cap vector argument alignment at the stack alignment.
21947 return std::min(ABITypeAlign, *StackAlign);
21948}
21949
21950/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
21951/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
21952/// passing according to AAPCS rules.
// NOTE(review): extraction gap — the first signature line (render line 21953)
// is missing from this excerpt; visible parameters continue below.
21954 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
21955 const DataLayout &DL) const {
// Only the AAPCS-VFP calling convention passes HAs in consecutive registers.
// NOTE(review): extraction gap — the comparison's right-hand side (render
// line 21957, the effective calling-convention constant) is missing here.
21956 if (getEffectiveCallingConv(CallConv, isVarArg) !=
21958 return false;
21959
// NOTE(review): extraction gap — the 'Base' declaration (render line 21960,
// initialized to HA_UNKNOWN) is missing from this excerpt.
21961 uint64_t Members = 0;
21962 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
21963 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
21964
// Integer arrays ([N x i32]/[N x i64]) also qualify, per the doc comment.
21965 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
21966 return IsHA || IsIntArray;
21967}
21968
// NOTE(review): extraction gaps — the first signature line (render line
// 21969, the getExceptionPointerRegister declaration) and line 21973 (which
// initializes 'EM' with the target's exception-handling model) are missing
// from this excerpt.
21970 const Constant *PersonalityFn) const {
21971 // Platforms which do not use SjLj EH may return values in these registers
21972 // via the personality function.
// Under SjLj there is no dedicated exception-pointer register (empty
// Register); otherwise the landing pad receives the exception pointer in R0.
21974 return EM == ExceptionHandling::SjLj ? Register() : ARM::R0;
21975}
21976
// NOTE(review): extraction gaps — the first signature line (render line
// 21977, the getExceptionSelectorRegister declaration) and line 21981 (which
// initializes 'EM' with the target's exception-handling model) are missing
// from this excerpt.
21978 const Constant *PersonalityFn) const {
21979 // Platforms which do not use SjLj EH may return values in these registers
21980 // via the personality function.
// Under SjLj there is no dedicated selector register (empty Register);
// otherwise the landing pad receives the type-index selector in R1.
21982 return EM == ExceptionHandling::SjLj ? Register() : ARM::R1;
21983}
21984
21985void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
21986 // Update IsSplitCSR in ARMFunctionInfo.
21987 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
21988 AFI->setIsSplitCSR(true);
21989}
21990
21991void ARMTargetLowering::insertCopiesSplitCSR(
21992 MachineBasicBlock *Entry,
21993 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
21994 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
21995 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
21996 if (!IStart)
21997 return;
21998
21999 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
22000 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
22001 MachineBasicBlock::iterator MBBI = Entry->begin();
22002 for (const MCPhysReg *I = IStart; *I; ++I) {
22003 const TargetRegisterClass *RC = nullptr;
22004 if (ARM::GPRRegClass.contains(*I))
22005 RC = &ARM::GPRRegClass;
22006 else if (ARM::DPRRegClass.contains(*I))
22007 RC = &ARM::DPRRegClass;
22008 else
22009 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
22010
22011 Register NewVR = MRI->createVirtualRegister(RC);
22012 // Create copy from CSR to a virtual register.
22013 // FIXME: this currently does not emit CFI pseudo-instructions, it works
22014 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
22015 // nounwind. If we want to generalize this later, we may need to emit
22016 // CFI pseudo-instructions.
22017 assert(Entry->getParent()->getFunction().hasFnAttribute(
22018 Attribute::NoUnwind) &&
22019 "Function should be nounwind in insertCopiesSplitCSR!");
22020 Entry->addLiveIn(*I);
22021 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
22022 .addReg(*I);
22023
22024 // Insert the copy-back instructions right before the terminator.
22025 for (auto *Exit : Exits)
22026 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
22027 TII->get(TargetOpcode::COPY), *I)
22028 .addReg(NewVR);
22029 }
22030}
22031
// NOTE(review): extraction gap — the signature of this predicate (render
// line 22037, presumably
// 'bool ARMTargetLowering::isComplexDeinterleavingSupported() const {')
// is missing from this excerpt; only the body is visible.
22036
// Complex deinterleaving support is gated on the MVE integer extension.
22038 return Subtarget->hasMVEIntegerOps();
22039}
22040
// NOTE(review): extraction gap — the signature (render lines 22041-22042 of
// isComplexDeinterleavingOperationSupported, introducing 'OperationType' and
// 'Ty') is missing from this excerpt.
// Checks whether the given complex-arithmetic operation on type Ty can be
// lowered to MVE complex instructions.
22043 auto *VTy = dyn_cast<FixedVectorType>(Ty);
22044 if (!VTy)
22045 return false;
22046
22047 auto *ScalarTy = VTy->getScalarType();
22048 unsigned NumElements = VTy->getNumElements();
22049
// Only power-of-two vectors of at least 128 bits are supported (wider ones
// are split by createComplexDeinterleavingIR).
22050 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
22051 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
22052 return false;
22053
22054 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
22055 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
22056 return Subtarget->hasMVEFloatOps();
22057
// NOTE(review): extraction gap — the condition guarding this early return
// (render line 22058, an OperationType check) is missing from this excerpt.
22059 return false;
22060
// Remaining (integer) case: MVE integer ops on i8/i16/i32 elements.
22061 return Subtarget->hasMVEIntegerOps() &&
22062 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
22063 ScalarTy->isIntegerTy(32));
22064}
22065
// NOTE(review): extraction gap — the signature of this accessor (render line
// 22066, the getRoundingControlRegisters declaration) is missing from this
// excerpt; only the body is visible.
// Returns the register(s) carrying rounding-mode control state: the
// rounding-mode view of FPSCR.
22067 static const MCPhysReg RCRegs[] = {ARM::FPSCR_RM};
22068 return RCRegs;
22069}
22070
// NOTE(review): extraction gap — the first signature lines (render lines
// 22071-22072, 'Value *ARMTargetLowering::createComplexDeinterleavingIR(' and
// the IRBuilderBase/OperationType parameters) are missing from this excerpt.
// Emits MVE complex-arithmetic intrinsics (vcmulq/vcmlaq/vcaddq) for the
// requested operation, recursively splitting vectors wider than 128 bits.
22073 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
22074 Value *Accumulator) const {
22075
// NOTE(review): extraction gap — the declaration of 'Ty' (render line 22076,
// the FixedVectorType of InputA) is missing from this excerpt.
22077
22078 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
22079
22080 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
22081
// Wider than one MVE vector: split inputs (and accumulator) in half,
// recurse on each half, then rejoin with a shuffle.
22082 if (TyWidth > 128) {
22083 int Stride = Ty->getNumElements() / 2;
22084 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
22085 auto SplitSeqVec = llvm::to_vector(SplitSeq);
22086 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
22087 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
22088
22089 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
22090 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
22091 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
22092 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
22093 Value *LowerSplitAcc = nullptr;
22094 Value *UpperSplitAcc = nullptr;
22095
22096 if (Accumulator) {
22097 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
22098 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
22099 }
22100
22101 auto *LowerSplitInt = createComplexDeinterleavingIR(
22102 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
22103 auto *UpperSplitInt = createComplexDeinterleavingIR(
22104 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
22105
// The identity mask 0..N-1 concatenates the two half-width results.
22106 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
22107 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
22108 }
22109
22110 auto *IntTy = Type::getInt32Ty(B.getContext());
22111
22112 ConstantInt *ConstRotation = nullptr;
22113 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
// The rotation enum value is passed straight through as the intrinsic's
// rotation operand; vcmlaq is used when accumulating, vcmulq otherwise.
22114 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
22115
22116 if (Accumulator)
22117 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
22118 {ConstRotation, Accumulator, InputB, InputA});
22119 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
22120 {ConstRotation, InputB, InputA});
22121 }
22122
22123 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
22124 // 1 means the value is not halved.
22125 auto *ConstHalving = ConstantInt::get(IntTy, 1);
22126
// NOTE(review): extraction gap — the rotation checks guarding these two
// assignments (render lines 22127 and 22129, which test the Rotation
// parameter) are missing from this excerpt; vcaddq encodes its two legal
// rotations as operand values 0 and 1.
22128 ConstRotation = ConstantInt::get(IntTy, 0);
22130 ConstRotation = ConstantInt::get(IntTy, 1);
22131
22132 if (!ConstRotation)
22133 return nullptr; // Invalid rotation for arm_mve_vcaddq
22134
22135 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
22136 {ConstHalving, ConstRotation, InputA, InputB});
22137 }
22138
// Unsupported operation kind: signal failure to the caller.
22139 return nullptr;
22140}
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
return SDValue()
static const MCPhysReg GPRArgRegs[]
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static bool isConstant(const MachineInstr &MI)
constexpr LLT F64
constexpr LLT S1
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG)
static bool isStore(int Opcode)
static bool isThumb(const MCSubtargetInfo &STI)
static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT)
static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total value size to 64 bits.
static cl::opt< unsigned > ConstpoolPromotionMaxSize("arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), cl::init(64))
static bool isZeroOrAllOnes(SDValue N, bool AllOnes)
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isVTBLMask(ArrayRef< int > M, EVT VT)
static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
static cl::opt< bool > EnableConstpoolPromotion("arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), cl::init(false))
static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG)
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformExtractEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static const APInt * isPowerOf2Constant(SDValue V)
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) can replace combinations of ...
static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static bool isValidMVECond(unsigned CC, bool IsFloat)
static SDValue PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC)
IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSTORECombine - Target-specific dag combine xforms for ISD::STORE.
static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, SelectionDAG &DAG)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isGTorGE(ISD::CondCode CC)
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) intrinsic,...
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask)
static bool isReverseMask(ArrayRef< int > M, EVT VT)
static bool isVZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of "vector_shuffle v,...
static SDValue PerformSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD) can replace combinations...
static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0, SDValue V1)
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG)
static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc)
static bool isVTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool CanInvertMVEVCMP(SDValue N)
static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG)
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
PerformShiftCombine - Checks for immediate versions of vector shifts and lowers them.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2)
FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static EVT getVectorTyFromPredicateVector(EVT VT)
static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG)
static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg, SelectionDAG &DAG, const SDLoc &DL)
static SDValue PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static bool isSRL16(const SDValue &Op)
static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC)
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, SDValue Inc, const SelectionDAG &DAG)
static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static Register genTPEntry(MachineBasicBlock *TpEntry, MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpExit, Register OpSizeReg, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI)
Adds logic in loop entry MBB to calculate loop iteration count and adds t2WhileLoopSetup and t2WhileL...
static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V)
static bool isLTorLE(ISD::CondCode CC)
static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, SelectionDAG &DAG)
static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static SDValue PerformBITCASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG)
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG)
static bool hasNormalLoadOperand(SDNode *N)
hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node are normal,...
static SDValue PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
PerformInsertEltCombine - Target-specific dag combine xforms for ISD::INSERT_VECTOR_ELT.
static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVDUPLANECombine - Target-specific dag combine xforms for ARMISD::VDUPLANE.
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static cl::opt< unsigned > ConstpoolPromotionMaxTotal("arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128))
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static RTLIB::Libcall getDivRemLibcall(const SDNode *N, MVT::SimpleValueType SVT)
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG)
SkipLoadExtensionForVMULL - return a load of the original vector size that does not do any sign/zero ...
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, SelectionDAG &DAG)
static SDValue matchCSET(unsigned &Opcode, bool &InvertCond, SDValue TrueVal, SDValue FalseVal, const ARMSubtarget *Subtarget)
static bool isVZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformORCombineToSMULWBT(SDNode *OR, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isVTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of "vector_shuffle v,...
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue FindBFIToCombineWith(SDNode *N)
static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, SelectionDAG &DAG)
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps)
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isS16(const SDValue &Op, SelectionDAG &DAG)
static bool isSRA16(const SDValue &Op)
static SDValue AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue LowerInterruptReturn(SmallVectorImpl< SDValue > &RetOps, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getInvertedARMCondCode(SDValue ARMcc, SelectionDAG &DAG)
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG)
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2)
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isSHL16(const SDValue &Op)
static bool isVEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseVEXT, unsigned &Imm)
static SDValue PerformMVEVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
cl::opt< unsigned > ArmMaxBaseUpdatesToCheck("arm-max-base-updates-to-check", cl::Hidden, cl::desc("Maximum number of base-updates to check generating postindex."), cl::init(64))
static bool isTruncMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2)
Return the load opcode for a given load size.
static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
static bool isLegalMVEShuffleOp(unsigned PFEntry)
static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, SelectionDAG &DAG)
static bool isVUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG)
PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for ISD::VECTOR_SHUFFLE.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG)
SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, ANY_EXTEND,...
static int getNegationCost(SDValue Op)
static bool isVMOVNTruncMask(ArrayRef< int > M, EVT ToVT, bool rev)
static SDValue PerformVQMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformAddcSubcCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static TargetLowering::ArgListTy getDivRemArgList(const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget)
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static ARMCC::CondCodes getVCMPCondCode(SDValue N)
static cl::opt< bool > ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true))
static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformORCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVSetCCToVCTPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isZeroVector(SDValue N)
static SDValue PerformAddeSubeCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K)
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG)
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment store operation with given size.
static bool isVMOVNMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, NEON load/store intrinsics,...
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMOVRRDCombine - Target-specific dag combine xforms for ARMISD::VMOVRRD.
static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain)
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMULCombine Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the special multi...
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformORCombine - Target-specific dag combine xforms for ISD::OR.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG)
static unsigned SelectPairHalf(unsigned Elements, ArrayRef< int > Mask, unsigned Index)
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment load operation with given size.
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG)
static bool isValidBaseUpdate(SDNode *N, SDNode *User)
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl)
static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op)
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
std::pair< unsigned, const TargetRegisterClass * > RCPair
static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes=false)
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,...
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type)
isVMOVModifiedImm - Check if the specified splat value corresponds to a valid vector constant for a N...
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG)
BC is a bitcast that is about to be turned into a VMOVDRR.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl)
static unsigned isNEONTwoResultShuffleMask(ArrayRef< int > ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF)
Check if ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), and return the corresponding AR...
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B)
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, struct BaseUpdateUser &User, bool SimpleConstIncOnly, TargetLowering::DAGCombinerInfo &DCI)
static bool allUsersAreInFunction(const Value *V, const Function *F)
Return true if all users of V are within function F, looking through ConstantExprs.
static bool isSingletonVEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG)
PerformVMOVDRRCombine - Target-specific dag combine xforms for ARMISD::VMOVDRR.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK)
static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
isLegalAddressImmediate - Return true if the integer value can be used as the offset of the target ad...
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isLegalT1AddressImmediate(int64_t V, EVT VT)
static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDECombine - Target-specific dag combine transform from ARMISD::ADDC, ARMISD::ADDE,...
static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformORCombineToShiftInsert(SelectionDAG &DAG, SDValue AndOp, SDValue ShiftOp, EVT VT, SDLoc dl)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
This file a TargetTransformInfoImplBase conforming object specific to the ARM target machine.
Function Alias Analysis false
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< bool > isBigEndian(const SmallDenseMap< int64_t, int64_t, 8 > &MemOffset2Idx, int64_t LowestIdx)
Given a map from byte offsets in memory to indices in a load/store, determine if that map corresponds...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, dxil::ResourceTypeInfo &RTI)
static void createStoreIntrinsic(IntrinsicInst *II, StoreInst *SI, dxil::ResourceTypeInfo &RTI)
This file defines the DenseMap class.
#define Check(C,...)
#define op(i)
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
nvptx lower args
uint64_t High
uint64_t IntrinsicInst * II
PowerPC Reduce CR logical Operation
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
SI Lower i1 Copies
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
static cl::opt< unsigned > MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192), cl::desc("DAG combiner limit number of steps when searching DAG " "for predecessor nodes"))
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static bool isIntrinsic(const CallBase &Call, Intrinsic::ID ID)
The Input class is used to parse a yaml document into in-memory structs and vectors.
static constexpr roundingMode rmTowardZero
Definition APFloat.h:348
LLVM_ABI bool getExactInverse(APFloat *Inv) const
If this value is normal and has an exact, normal, multiplicative inverse, store it in inv and return ...
Definition APFloat.cpp:5918
APInt bitcastToAPInt() const
Definition APFloat.h:1408
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition APFloat.h:1387
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:424
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1685
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1044
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1345
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition APInt.h:1208
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1118
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1654
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1613
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
unsigned logBase2() const
Definition APInt.h:1776
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:476
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1264
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:865
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:858
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1671
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
const ARMBaseRegisterInfo & getRegisterInfo() const
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
const Triple & getTargetTriple() const
const ARMBaseInstrInfo * getInstrInfo() const override
bool isThumb1Only() const
bool useFPVFMx() const
bool isThumb2() const
bool hasBaseDSP() const
const ARMTargetLowering * getTargetLowering() const override
const ARMBaseRegisterInfo * getRegisterInfo() const override
bool hasVFP2Base() const
bool useFPVFMx64() const
bool isLittle() const
bool useFPVFMx16() const
bool isMClass() const
bool useMulOps() const
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isReadOnly(const GlobalValue *GV) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode representing by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &Infos, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) add (add x, 1), y The variant with two add's is IR...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo, const LibcallLoweringInfo *libcallLowering) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void insertSSPDeclarations(Module &M, const LibcallLoweringInfo &Libcalls) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a vstN intrinsic.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a vldN intrinsic.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool supportKCFIBundles() const override
Return true if the target supports kcfi operand bundles.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy,Idx).
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from Ty1 to Ty2 is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
const ARMBaseTargetMachine & getTM() const
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
bool isFloatingPointOperation() const
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
static LLVM_ABI BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
The address of a basic block.
Definition Constants.h:1065
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void addInRegsParamInfo(unsigned RegBegin, unsigned RegEnd)
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
unsigned getValNo() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:859
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
MachineConstantPoolValue * getMachineCPVal() const
const Constant * getConstVal() const
LLVM_ABI Type * getType() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:215
bool isBigEndian() const
Definition DataLayout.h:216
MaybeAlign getStackAlignment() const
Returns the natural stack alignment, or MaybeAlign() if one wasn't specified.
Definition DataLayout.h:248
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
StringRef getInternalSymbolPrefix() const
Definition DataLayout.h:306
LLVM_ABI Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:123
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
arg_iterator arg_begin()
Definition Function.h:868
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition Function.h:695
const Argument * const_arg_iterator
Definition Function.h:74
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition Function.h:229
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:728
const GlobalValue * getGlobal() const
bool isDSOLocal() const
bool hasExternalWeakLinkage() const
bool hasDLLImportStorageClass() const
Module * getParent()
Get the module that this global value is contained inside of...
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2811
LLVM_ABI bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
Tracks which library functions to use for a particular subtarget.
LLVM_ABI CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const
Get the CallingConv that should be used for the specified libcall.
LLVM_ABI RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Return the lowering's selection of implementation call for Call.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
bool is64BitVector() const
Return true if this is a 64-bit vector type.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI void moveAfter(MachineBasicBlock *NewBefore)
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
Properties which a MachineFunction may have at a given point in time.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
LLVM_ABI void setIsRenamable(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
LLVM_ABI void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags, bool AllowCommute=false)
Get the specified node if it's already available, or else return NULL.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
const LibcallLoweringInfo & getLibcalls() const
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
DenormalMode getDenormalMode(EVT VT) const
Return the current function's default denormal handling kind for the given floating point type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
bool empty() const
Definition SmallSet.h:169
bool erase(const T &V)
Definition SmallSet.h:200
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
const unsigned char * bytes_end() const
Definition StringRef.h:124
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:143
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:137
const unsigned char * bytes_begin() const
Definition StringRef.h:121
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:483
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize tihs SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
virtual void insertSSPDeclarations(Module &M, const LibcallLoweringInfo &Libcalls) const
Inserts necessary declarations for SSP (stack protection) purpose.
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
bool isConstTrueVal(SDValue N) const
Return if the N is a constant or constant vector equal to the true value from getBooleanContents().
virtual ArrayRef< MCPhysReg > getRoundingControlRegisters() const
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::LibcallImpl LibcallImpl, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
void setTypeIdForCallsiteInfo(const CallBase *CB, MachineFunction &MF, MachineFunction::CallSiteInfo &CSInfo) const
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
ExceptionHandling getExceptionModel() const
Return the ExceptionHandling to use, considering TargetOptions and the Triple's default.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition Triple.h:446
bool isOSWindows() const
Tests whether the OS is Windows.
Definition Triple.h:709
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:312
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:227
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:207
unsigned getNumOperands() const
Definition User.h:229
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
Base class of all SIMD vector types.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition ARMBaseInfo.h:48
@ SECREL
Thread Pointer Offset.
@ SBREL
Section Relative (Windows TLS)
@ GOTTPOFF
Global Offset Table, PC Relative.
@ TPOFF
Global Offset Table, Thread Pointer Offset.
TOF
Target Operand Flag enum.
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
const unsigned FPStatusBits
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo, const LibcallLoweringInfo *libcallLowering)
const unsigned FPReservedBits
const unsigned RoundingBitsPos
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall funtion.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attemps to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:261
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:511
@ SET_FPENV
Sets the current floating-point environment.
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:168
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:538
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ RESET_FPENV
Set floating-point environment to default state.
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ FMODF
FMODF - Decomposes the operand into integral and fractional parts, each having the same type and sign...
@ FATAN2
FATAN2 - atan2, inspired by libm.
@ FSINCOSPI
FSINCOSPI - Compute both the sine and cosine times pi more accurately than FSINCOS(pi*x),...
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:172
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:438
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:975
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:715
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ BR
Control flow instructions. These all have token chains.
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:787
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:827
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ BR_JT
BR_JT - Jumptable branch.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:792
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:970
@ STRICT_FP_TO_FP16
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ STRICT_FP16_TO_FP
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:139
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:811
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:805
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:464
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:478
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:477
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition ISDOpcodes.h:735
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:710
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:427
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:458
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:945
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:162
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:833
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:722
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
@ Length
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Define
Register definition.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition bit.h:293
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
ExceptionHandling
Definition CodeGen.h:53
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1530
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit ver...
Definition MathExtras.h:267
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
bool isReleaseOrStronger(AtomicOrdering AO)
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:236
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
CombineLevel
Definition DAGCombine.h:15
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register,...
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr U AbsoluteValue(T X)
Return the absolute value of a signed integer, converted to the corresponding unsigned integer type.
Definition MathExtras.h:592
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2019
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
unsigned gettBLXrOpcode(const MachineFunction &MF)
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N-bit number shifted left by S.
Definition MathExtras.h:198
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:90
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:292
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:308
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
ElementCount getVectorElementCount() const
Definition ValueTypes.h:358
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:471
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:367
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:478
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
EVT changeVectorElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:98
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:215
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:389
bool isFixedLengthVector() const
Definition ValueTypes.h:189
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:55
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:331
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:316
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:461
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:210
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
EVT ArgVT
Usually the non-legalized type of the argument, which is the EVT corresponding to the OrigTy IR type.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:317
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:178
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:327
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:186
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:363
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:138
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...